├── 03-data-lake ├── .gitignore ├── configuration-overrides.json ├── payload-2024-11.json ├── reviews-per-listing.py └── README.md ├── 06-data-eng-with-llms ├── .gitignore ├── 02-structured-output.py ├── 01-simple-transformers.py ├── 03-spark-llms.py └── README.md ├── .gitignore ├── 04-orchestration-with-airflow ├── .gitignore ├── exercises │ ├── .gitignore │ ├── 02-process-customer-reviews-exercise │ │ ├── docker-compose.yaml │ │ ├── spark_etl_reviews.py │ │ ├── customer_reviews_dag.py │ │ └── README.md │ ├── 02-process-customer-reviews-solution │ │ ├── docker-compose.yaml │ │ ├── spark_etl_reviews.py │ │ ├── customer_reviews_dag.py │ │ └── README.md │ ├── 01-data-validation-solution │ │ ├── README.md │ │ └── data_validation_dag.py │ ├── 01-data-validation-exercise │ │ ├── README.md │ │ └── data_validation_dag.py │ └── README.md ├── docker-compose.yaml ├── dags │ ├── bookings_per_listing_spark.py │ ├── 01-average_page_visits.py │ ├── 02-average_page_visits_with_failures.py │ ├── 03-bookings_per_listing.py │ ├── 04-bookings_per_listing_with_sensor.py │ └── 05-bookings_per_listing_with_postgres.py └── README.md ├── 05-ml-with-spark ├── .gitignore ├── README.md └── 04-pyspark-pipeline.ipynb ├── 07-kafka-streaming ├── exercises │ ├── 01-wikipedia-stream-exercise │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── docker-compose.yml │ │ ├── wiki-consumer.py │ │ └── wiki-producer.py │ ├── 01-wikipedia-stream-solution │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── docker-compose.yml │ │ ├── wiki-consumer.py │ │ └── wiki-producer.py │ ├── 02-kafka-connect-exercise │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── config_debezium.json │ │ ├── kafka-connect-consumer.py │ │ ├── docker-compose.yaml │ │ └── README.md │ └── 02-kafka-connect-solution │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── config_debezium.json │ │ ├── kafka-connect-consumer.py │ │ └── docker-compose.yaml ├── config_debezium.json ├── docker-compose.yml ├── README.md ├── docker-compose-schema-registry.yaml ├── 03-kafka-connect-consumer.py ├── 02-kafka-consumer.py ├── 01-kafka-producer.py ├── docker-compose-kafka-connect.yaml ├── order.py ├── 05-kafka-schema-registry-consumer.py └── 04-kafka-schema-registry-producer.py ├── 08-flink-stream-processing ├── .gitignore ├── exercises │ ├── 01-payments-data-exercise │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── flink-app.py │ │ ├── payments-producer.py │ │ └── README.md │ ├── 01-payments-data-solution │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── payments-producer.py │ │ ├── flink-app.py │ │ └── README.md │ ├── 02-anomalities-detector-exercise │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── payments-producer.py │ │ ├── README.md │ │ └── flink-app.py │ └── 02-anomalities-detector-solution │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── payments-producer.py │ │ ├── README.md │ │ └── flink-app.py ├── 01-flink-hello-world.py ├── docker-compose.yml ├── products.json ├── products-producer.py ├── orders-producer.py ├── README.md ├── variable-spend-producer.py ├── popular-products-producer.py ├── late-events-producer.py ├── 02-orders-processing.py ├── 05-local-state.py ├── 03-windows-aggregation.py ├── 04-late-events-processing.py └── 06-connecting-streams.py ├── 02-data-processing-with-spark ├── .gitignore ├── data │ └── download_data.sh ├── reviews-per-listing.py ├── 02-reading-airbnb-data.ipynb ├── README.md ├── 01-test-pyspark-app.ipynb ├── exercises │ ├── 
02-aggregation-functions.ipynb │ └── 03-advanced-spark.ipynb └── 03-processing-airbnb-data.ipynb ├── README.md └── 01-introduction └── docker-compose.yaml /03-data-lake/.gitignore: -------------------------------------------------------------------------------- 1 | data/ -------------------------------------------------------------------------------- /06-data-eng-with-llms/.gitignore: -------------------------------------------------------------------------------- 1 | ./venv 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | **/.ipynb_checkpoints/ 3 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | __pycache__/ -------------------------------------------------------------------------------- /05-ml-with-spark/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | venv 3 | .ipynb_checkpoints -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | dags/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | ex-1-venv/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/.gitignore: -------------------------------------------------------------------------------- 1 | ex-1-venv/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | ex-2-venv/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/.gitignore: -------------------------------------------------------------------------------- 1 | ex-2-venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /02-data-processing-with-spark/.gitignore: -------------------------------------------------------------------------------- 1 | artifacts 2 | data 3 | venv 4 | .ipynb_checkpoints -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | confluent-kafka==2.6.1 2 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | confluent-kafka==2.6.1 2 | -------------------------------------------------------------------------------- 
/08-flink-stream-processing/exercises/01-payments-data-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZTM - Data Engineering Bootcamp 2 | 3 | Source code for the [Data Engineering Bootcamp](https://academy.zerotomastery.io/a/aff_nbp8km09/external?affcode=441520_ehcbjdb9) on ZTM. 4 | -------------------------------------------------------------------------------- /03-data-lake/configuration-overrides.json: -------------------------------------------------------------------------------- 1 | { 2 | "monitoringConfiguration": { 3 | "s3MonitoringConfiguration": { 4 | "logUri": "s3://ztm-data-engineering-bootcamp/logs" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2024.8.30 2 | charset-normalizer==3.4.0 3 | confluent-kafka==2.6.1 4 | idna==3.10 5 | requests==2.32.3 6 | six==1.17.0 7 | sseclient==0.0.27 8 | urllib3==2.2.3 9 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2024.8.30 2 | charset-normalizer==3.4.0 3 | confluent-kafka==2.6.1 4 | idna==3.10 5 | requests==2.32.3 6 | six==1.17.0 7 | sseclient==0.0.27 8 | urllib3==2.2.3 9 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | postgres: 4 | image: postgres:15 5 | container_name: postgres 6 | environment: 7 | POSTGRES_USER: user 8 | POSTGRES_PASSWORD: password 9 | POSTGRES_DB: rental_site 10 | ports: 11 | - "5432:5432" -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-exercise/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | postgres: 4 | image: postgres:15 5 | container_name: postgres 6 | environment: 7 | POSTGRES_USER: user 8 | POSTGRES_PASSWORD: password 9 | POSTGRES_DB: 
rental_site 10 | ports: 11 | - "5432:5432" -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | postgres: 4 | image: postgres:15 5 | container_name: postgres 6 | environment: 7 | POSTGRES_USER: user 8 | POSTGRES_PASSWORD: password 9 | POSTGRES_DB: rental_site 10 | ports: 11 | - "5432:5432" -------------------------------------------------------------------------------- /08-flink-stream-processing/01-flink-hello-world.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | 3 | def main(): 4 | env = StreamExecutionEnvironment.get_execution_environment() 5 | data_stream = env.from_collection([1, 2, 3, 4, 5]) 6 | 7 | mapped_stream = data_stream.map(lambda x: x * 2) 8 | 9 | mapped_stream.print() 10 | 11 | env.execute("Flink Hello World") 12 | 13 | if __name__ == "__main__": 14 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/config_debezium.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "postgres-debezium-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "plugin.name": "pgoutput", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "user", 9 | "database.password": "password", 10 | "database.dbname": "onlineshop", 11 | "topic.prefix": "postgres-" 12 | } 13 | } -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/config_debezium.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "postgres-debezium-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "plugin.name": "pgoutput", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "user", 9 | "database.password": "password", 10 | "database.dbname": "onlineshop", 11 | "topic.prefix": "postgres-" 12 | } 13 | } -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/config_debezium.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "postgres-debezium-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "plugin.name": "pgoutput", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "user", 9 | "database.password": "password", 10 | "database.dbname": "onlineshop", 11 | "topic.prefix": "postgres-" 12 | } 13 | } -------------------------------------------------------------------------------- /03-data-lake/payload-2024-11.json: -------------------------------------------------------------------------------- 1 | { 2 | "sparkSubmit": { 3 | "entryPoint": "s3://ztm-data-engineering-bootcamp/apps/reviews-per-listing.py", 4 | "entryPointArguments": [ 5 | "--listings", 6 | "s3://ztm-data-engineering-bootcamp/listings/date=2024-11/listings.csv.gz", 7 | "--reviews", 8 | "s3://ztm-data-engineering-bootcamp/reviews/date=2024-11/reviews.csv.gz", 9 | "--output", 10 | 
"s3://ztm-data-engineering-bootcamp/reviews_per_listing/date=2024-11" 11 | ] 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/data/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f "$0" ]; then script=$0; else script=$(command -v -- "$0"); fi 4 | dir=$(dirname -- "$script") 5 | 6 | echo "Writing data to $dir" 7 | 8 | wget -O "$dir/reviews.csv.gz" https://data.insideairbnb.com/united-kingdom/england/london/2024-09-06/data/reviews.csv.gz 9 | wget -O "$dir/calendar.csv.gz" https://data.insideairbnb.com/united-kingdom/england/london/2024-09-06/data/calendar.csv.gz 10 | wget -O "$dir/listings.csv.gz" https://data.insideairbnb.com/united-kingdom/england/london/2024-09-06/data/listings.csv.gz 11 | -------------------------------------------------------------------------------- /01-introduction/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | mongo: 3 | image: mongo:latest 4 | environment: 5 | MONGO_INITDB_ROOT_USERNAME: admin 6 | MONGO_INITDB_ROOT_PASSWORD: secret 7 | ports: 8 | - "27017:27017" 9 | volumes: 10 | - mongo-data:/data/db 11 | 12 | mongo-express: 13 | image: mongo-express:latest 14 | depends_on: 15 | - mongo 16 | environment: 17 | ME_CONFIG_MONGODB_ADMINUSERNAME: admin 18 | ME_CONFIG_MONGODB_ADMINPASSWORD: secret 19 | ME_CONFIG_MONGODB_SERVER: mongo 20 | ME_CONFIG_BASICAUTH_ENABLED: 'false' 21 | ports: 22 | - "8081:8081" 23 | 24 | volumes: 25 | mongo-data: {} 26 | 27 | -------------------------------------------------------------------------------- /07-kafka-streaming/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 
-------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 
1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT 17 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 18 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 19 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT 17 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 18 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 19 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/products.json: -------------------------------------------------------------------------------- 1 | {"product_id":"product-1","name":"Cotton T-Shirt","category":"Clothing"} 2 | {"product_id":"product-2","name":"Wireless Earbuds","category":"Electronics"} 3 | {"product_id":"product-3","name":"Ceramic Coffee Mug","category":"Home & Kitchen"} 4 | {"product_id":"product-4","name":"Stainless Steel Water Bottle","category":"Outdoor & Travel"} 5 | {"product_id":"product-5","name":"Yoga Mat","category":"Sports & Fitness"} 6 | {"product_id":"product-6","name":"Leather Wallet","category":"Accessories"} 7 | {"product_id":"product-7","name":"Laptop Stand","category":"Office Supplies"} 8 | {"product_id":"product-8","name":"Running Shoes","category":"Footwear"} 9 | {"product_id":"product-9","name":"Bluetooth Speaker","category":"Electronics"} 10 | {"product_id":"product-10","name":"Scented Candle","category":"Home & Kitchen"} 11 | 12 | 13 | 
{"product_id":"product-6","name":"Cotton Wallet","category":"Accessories"} -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-exercise/spark_etl_reviews.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import avg 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--customer_reviews", required=True, help="Input CSV file path") 8 | parser.add_argument("--output_path", required=True, help="Output CSV file path") 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession \ 12 | .builder \ 13 | .appName("CustomerReviews") \ 14 | .getOrCreate() 15 | 16 | # TODO: Read input data 17 | customer_reviews = None 18 | 19 | customer_reviews = customer_reviews \ 20 | .withColumn("review_score", customer_reviews["review_score"].cast("float")) 21 | 22 | # TODO: Calculate an average review score per listing ID 23 | 24 | # TODO: Write the result to an output path 25 | 26 | if __name__ == "__main__": 27 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | 
-------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-solution/README.md: -------------------------------------------------------------------------------- 1 | 2 | This is a README for the first exercise in this section. 3 | 4 | # 0. Follow the instruction in the README.md file in the "exercises" folder 5 | 6 | Before following steps in this README, follow the steps in the `README.md` file in the `exercises` folder to set up your local Airflow. 7 | 8 | # 1. Copy the DAG's starter code 9 | 10 | Copy the starter code for the DAG to the `dags` folder you've created while setting up Airflow locally 11 | 12 | # 2. Restart the scheduler 13 | 14 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 15 | 16 | After this, start it again using the following command: 17 | 18 | ```sh 19 | airflow scheduler 20 | ``` 21 | 22 | # 3. Implement the TODOs in the 23 | 24 | Now implement the TODO comments in the starter code. 25 | 26 | 27 | # 4. 
Start the DAG 28 | 29 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | This is a README for the first exercise in this section. 3 | 4 | # 0. Follow the instruction in the README.md file in the "exercises" folder 5 | 6 | Before following steps in this README, follow the steps in the `README.md` file in the `exercises` folder to set up your local Airflow. 7 | 8 | # 1. Copy the DAG's starter code 9 | 10 | Copy the starter code for the DAG to the `dags` folder you've created while setting up Airflow locally 11 | 12 | # 2. Restart the scheduler 13 | 14 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 15 | 16 | After this, start it again using the following command: 17 | 18 | ```sh 19 | airflow scheduler 20 | ``` 21 | 22 | # 3. Implement the TODOs in the code 23 | 24 | Now implement the TODO comments in the starter code. 25 | 26 | 27 | # 4. Start the DAG 28 | 29 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. -------------------------------------------------------------------------------- /05-ml-with-spark/README.md: -------------------------------------------------------------------------------- 1 | # Install Spark 2 | 3 | To install Spark locally on macOS, run the following command: 4 | 5 | ```sh 6 | brew install apache-spark 7 | ``` 8 | 9 | To check that it was installed correctly, you can run: 10 | 11 | ```sh 12 | pyspark --version 13 | ``` 14 | 15 | --- 16 | 17 | ## Set Up Virtual Environment 18 | 19 | Create and activate a virtual environment: 20 | 21 | ```sh 22 | python3 -m venv . 
23 | source bin/activate 24 | ``` 25 | 26 | --- 27 | 28 | ## Install Required Packages 29 | 30 | Install Jupyter and NumPy (used for numerical operations in machine learning): 31 | 32 | ```sh 33 | pip install jupyter numpy 34 | ``` 35 | 36 | --- 37 | 38 | ## Configure PySpark to Use Jupyter Notebooks 39 | 40 | Set environment variables so PySpark launches in Jupyter Lab: 41 | 42 | ```sh 43 | export PYSPARK_DRIVER_PYTHON=jupyter 44 | export PYSPARK_DRIVER_PYTHON_OPTS='lab' 45 | ``` 46 | 47 | --- 48 | 49 | ## Launch PySpark with Jupyter 50 | 51 | Start the interactive Spark environment in Jupyter Lab: 52 | 53 | ```sh 54 | pyspark 55 | ``` 56 | 57 | --- -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/spark_etl_reviews.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import avg 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--customer_reviews", required=True, help="Input CSV file path") 8 | parser.add_argument("--output_path", required=True, help="Output CSV file path") 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession \ 12 | .builder \ 13 | .appName("CustomerReviews") \ 14 | .getOrCreate() 15 | 16 | customer_reviews = spark.read.csv( 17 | args.customer_reviews, 18 | header=True, 19 | ) 20 | 21 | customer_reviews = customer_reviews \ 22 | .withColumn("review_score", customer_reviews["review_score"].cast("float")) 23 | 24 | result = customer_reviews \ 25 | .groupBy("listing_id") \ 26 | .agg( 27 | avg("review_score").alias("avg_review_score") 28 | ) 29 | 30 | result.write.mode("overwrite").csv(args.output_path) 31 | 32 | spark.stop() 33 | 34 | if __name__ == "__main__": 35 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/products-producer.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer 2 | import sys 3 | import textwrap 4 | 5 | def delivery_callback(err, msg): 6 | if err: 7 | print("ERROR: Message failed delivery: {}".format(err)) 8 | else: 9 | print( 10 | textwrap.dedent( 11 | f""" 12 | Produced event to topic {msg.topic()}: 13 | value = {msg.value().decode('utf-8')} 14 | """) 15 | ) 16 | 17 | def main(): 18 | producer_config = { 19 | 'bootstrap.servers': 'localhost:9092', 20 | } 21 | producer = Producer(producer_config) 22 | 23 | print("Enter products's data") 24 | try: 25 | while True: 26 | json_line = input("> ").strip() 27 | if json_line: 28 | producer.produce( 29 | "products", 30 | key=None, 31 | value=json_line, 32 | callback=delivery_callback 33 | ) 34 | producer.poll(1) 35 | finally: 36 | producer.flush() 37 | producer.close() 38 | 39 | if __name__ == "__main__": 40 | main() -------------------------------------------------------------------------------- /06-data-eng-with-llms/02-structured-output.py: -------------------------------------------------------------------------------- 1 | %pip install outlines 2 | 3 | from huggingface_hub import login 4 | 5 | login(token="hf_...") 6 | 7 | import torch 8 | import json 9 | import outlines 10 | 11 | model_name = "mistralai/Mistral-7B-Instruct-v0.3" 12 | 13 | generator = outlines.models.transformers( 14 | model_name, 15 | device="cuda", 16 | model_kwargs={ 17 | "torch_dtype": torch.float16, 18 | } 19 | ) 20 | 21 | 
schema = json.dumps({ 22 | "type": "object", 23 | "properties": { 24 | "sentiment": { 25 | "type": "string", 26 | "enum": ["positive", "negative"] 27 | } 28 | }, 29 | "required": ["sentiment"] 30 | }) 31 | 32 | generate_json = outlines.generate.json(generator, schema) 33 | 34 | def classify_review(review): 35 | prompt = ( 36 | "Classify the following customer review as positive or negative.\n\n" 37 | f"Review:\n{review}\n" 38 | ) 39 | 40 | output_json = generate_json(prompt, max_tokens=40) 41 | 42 | return output_json 43 | 44 | 45 | print(classify_review("This is absolutely delightful!")) 46 | 47 | print(classify_review("This was the worst hotel I've ever seen")) -------------------------------------------------------------------------------- /07-kafka-streaming/README.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | 3 | - [Docker](https://www.docker.com/) installed and running 4 | - Docker Compose (included with Docker Desktop) 5 | 6 | --- 7 | 8 | ## Start Kafka Broker 9 | 10 | Run the following command to start Kafka using the provided `docker-compose.yml` file: 11 | 12 | ```sh 13 | docker-compose up 14 | ``` 15 | 16 | --- 17 | 18 | ## Install Kafka CLI Tools 19 | 20 | To interact with Kafka from the command line, install the Kafka tools: 21 | 22 | ```sh 23 | brew install kafka 24 | ``` 25 | 26 | --- 27 | 28 | ## List Kafka Topics 29 | 30 | Use the following command to list all topics in the Kafka cluster: 31 | 32 | ```sh 33 | kafka-topics --list --bootstrap-server localhost:9092 34 | ``` 35 | 36 | --- 37 | 38 | ## Create a Kafka Topic 39 | 40 | Create a topic named `orders` with 4 partitions and a replication factor of 1: 41 | 42 | ```sh 43 | kafka-topics --create \ 44 | --bootstrap-server localhost:9092 \ 45 | --topic orders \ 46 | --replication-factor 1 \ 47 | --partitions 4 48 | ``` 49 | 50 | --- 51 | 52 | ## Verify Topic Creation 53 | 54 | List topics again to verify the new topic was created: 55 | 56 | ```sh 57 | kafka-topics --list --bootstrap-server localhost:9092 58 | ``` -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/wiki-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from confluent_kafka import Consumer 5 | 6 | consumer_conf = { 7 | "bootstrap.servers": "localhost:9092", 8 | "group.id": "wiki-consumer-group", 9 | "auto.offset.reset": "earliest", 10 | } 11 | kafka_topic = "wikipedia-changes" 12 | 13 | 14 | def main(): 15 | consumer = Consumer(consumer_conf) 16 | consumer.subscribe([kafka_topic]) 17 | 18 | print(f"Consuming messages from topic '{kafka_topic}'") 19 | 20 | try: 21 | while True: 22 | msg = consumer.poll(timeout=1.0) 23 | if msg is None: 24 | continue 25 | 26 | if msg.error(): 27 | print(f"Error: {msg.error()}", file=sys.stderr) 28 | continue 29 | 30 | # TODO: Print a message about a Wikipedia edit if two conditions are true: 31 | # * If a change was made by a bot 32 | # * If a change is not minor 33 | # 34 | # The printed messages should include the name of an author making a change and 35 | # the title of a changed page 36 | 37 | finally: 38 | consumer.close() 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/flink-app.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import StreamExecutionEnvironment 8 | from pyflink.datastream.connectors.kafka import KafkaSink, KafkaSource, KafkaRecordSerializationSchema 9 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 10 | 11 | # TODO: In this exercise you will need to 12 | # 13 | # * Read data written by the "payments-producer.py" 14 | # * Filter payments with amount greater than 500 15 | # * Output new records with only two fields: "payment_id" and "amount" 16 | # * Write output to another Kafka topic 17 | 18 | # TODO: Implement any functions and types that you need 19 | 20 | 21 | def main(): 22 | env = StreamExecutionEnvironment.get_execution_environment() 23 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 24 | 25 | # TODO: Create a Kafka source 26 | # TODO: Create a payments stream 27 | 28 | # TODO: Implement stream processing logic 29 | 30 | # TODO: Write resulting data to Kafka 31 | 32 | env.execute("Payments stream processing") 33 | 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/wiki-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from confluent_kafka import Consumer 5 | 6 | consumer_conf = { 7 | "bootstrap.servers": "localhost:9092", 8 | "group.id": "wiki-consumer-group", 9 | "auto.offset.reset": "earliest", 10 | } 11 | kafka_topic = "wikipedia-changes" 12 | 13 | 14 | def main(): 15 | consumer = Consumer(consumer_conf) 16 | consumer.subscribe([kafka_topic]) 17 | 18 | print(f"Consuming messages from topic '{kafka_topic}'") 19 | 20 | try: 21 | while True: 22 | msg = consumer.poll(timeout=1.0) 23 | if msg is None: 24 | continue 25 | 26 | if msg.error(): 27 | print(f"Error: {msg.error()}", file=sys.stderr) 28 | continue 29 | 30 | message_value = msg.value().decode("utf-8") 31 | 32 | event = json.loads(message_value) 33 | 34 | bot = event.get("bot", False) 35 | minor = event.get("minor", True) 36 | title = event.get("title", "Unknown") 37 | user = event.get("user", "Unknown") 38 | 39 | if bot and not minor: 40 | print(f"Major bot edit detected: User '{user}' edited '{title}'") 41 | 42 | finally: 43 | consumer.close() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/reviews-per-listing.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | import pyspark.sql.functions as F 4 | 5 | parser = argparse.ArgumentParser(description='Most popular listings parameters') 6 | parser.add_argument('--listings', help='Path to the listings dataset') 7 | parser.add_argument('--reviews', help='Path to the reviews dataset') 8 | parser.add_argument('--output', help='Directory to save the output') 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession.builder \ 12 | .appName("Most popular listings") \ 13 | .getOrCreate() 14 | 15 | listings = spark.read.csv(args.listings, 16 | header=True, 17 | inferSchema=True, 18 | sep=",", 
19 | quote='"', 20 | escape='"', 21 | multiLine=True, 22 | mode="PERMISSIVE" 23 | ) 24 | 25 | reviews = spark.read.csv(args.reviews, 26 | header=True, 27 | inferSchema=True, 28 | sep=",", 29 | quote='"', 30 | escape='"', 31 | multiLine=True, 32 | mode="PERMISSIVE" 33 | ) 34 | 35 | listings_reviews = listings.join( 36 | reviews, listings.id == reviews.listing_id, how='inner' 37 | ) 38 | 39 | reviews_per_listing = listings_reviews \ 40 | .groupBy(listings.id, listings.name) \ 41 | .agg( 42 | F.count(reviews.id).alias('num_reviews') 43 | ) \ 44 | .orderBy('num_reviews', ascending=False) \ 45 | 46 | reviews_per_listing \ 47 | .write \ 48 | .csv(args.output) -------------------------------------------------------------------------------- /03-data-lake/reviews-per-listing.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | import pyspark.sql.functions as F 4 | 5 | parser = argparse.ArgumentParser(description='Most popular listings parameters') 6 | parser.add_argument('--listings', help='Path to the listings dataset') 7 | parser.add_argument('--reviews', help='Path to the reviews dataset') 8 | parser.add_argument('--output', help='Directory to save the output') 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession.builder \ 12 | .appName("Most popular listings") \ 13 | .getOrCreate() 14 | 15 | listings = spark.read.csv(args.listings, 16 | header=True, 17 | inferSchema=True, 18 | sep=",", 19 | quote='"', 20 | escape='"', 21 | multiLine=True, 22 | mode="PERMISSIVE" 23 | ) 24 | 25 | reviews = spark.read.csv(args.reviews, 26 | header=True, 27 | inferSchema=True, 28 | sep=",", 29 | quote='"', 30 | escape='"', 31 | multiLine=True, 32 | mode="PERMISSIVE" 33 | ) 34 | 35 | listings_reviews = listings.join( 36 | reviews, listings.id == reviews.listing_id, how='inner' 37 | ) 38 | 39 | reviews_per_listing = listings_reviews \ 40 | .groupBy(listings.id, listings.name) \ 41 | .agg( 42 | F.count(reviews.id).alias('num_reviews') 43 | ) \ 44 | .orderBy('num_reviews', ascending=False) \ 45 | 46 | reviews_per_listing \ 47 | .write \ 48 | .csv( 49 | args.output, 50 | header=True, 51 | ) -------------------------------------------------------------------------------- /07-kafka-streaming/docker-compose-schema-registry.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | kafka: 4 | image: confluentinc/cp-kafka:7.8.0 5 | hostname: kafka 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 16 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 17 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092,INTERNAL://kafka:19092 18 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 19 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 20 | KAFKA_LOG_RETENTION_HOURS: 168 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 22 | 23 | schema-registry: 24 | image: confluentinc/cp-schema-registry:7.5.0 25 | container_name: schema-registry 26 | depends_on: 27 | - kafka 28 | ports: 29 | - "8081:8081" 30 | environment: 31 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: PLAINTEXT://kafka:19092 32 | 
SCHEMA_REGISTRY_HOST_NAME: schema-registry 33 | SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081 34 | SCHEMA_REGISTRY_AVRO_COMPATIBILY_LEVEL: full 35 | 36 | -------------------------------------------------------------------------------- /06-data-eng-with-llms/01-simple-transformers.py: -------------------------------------------------------------------------------- 1 | %pip install transformers 2 | 3 | from huggingface_hub import login 4 | 5 | login(token="hf_...") 6 | 7 | import torch 8 | from transformers import pipeline 9 | 10 | model_name = "mistralai/Mistral-7B-Instruct-v0.3" 11 | 12 | generator = pipeline( 13 | "text-generation", 14 | model=model_name, 15 | device_map="cuda", 16 | torch_dtype=torch.float16, 17 | max_new_tokens=20, 18 | return_full_text=False, 19 | ) 20 | 21 | import textwrap 22 | 23 | def classify_review(review): 24 | messages = [ 25 | { 26 | "role": "user", 27 | "content": textwrap.dedent(f""" 28 | You are a sentiment classifier. 29 | Is the following customer review positive or negative? 30 | Respond with exactly one of the two words: positive, negative. 31 | 32 | Review: 33 | ``` 34 | {review} 35 | ``` 36 | """) 37 | } 38 | ] 39 | 40 | print('----------------------------') 41 | print(f"{messages[0]['content']}") 42 | print('----------------------------') 43 | 44 | output = generator(messages) 45 | generated_text = output[0]["generated_text"] 46 | return generated_text.strip().lower() 47 | 48 | print(classify_review("This is absolutely delightful!")) 49 | 50 | print(classify_review("This was the worst hotel I've ever seen")) -------------------------------------------------------------------------------- /07-kafka-streaming/03-kafka-connect-consumer.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from decimal import Decimal 4 | 5 | from confluent_kafka import Consumer 6 | 7 | consumer_config = { 8 | "bootstrap.servers": "localhost:9092", 9 | "group.id": "postgres-price-consumer", 10 | "auto.offset.reset": "earliest", 11 | } 12 | 13 | 14 | def main(): 15 | consumer = Consumer(consumer_config) 16 | 17 | topic = "postgres-.public.orders" 18 | consumer.subscribe([topic]) 19 | 20 | try: 21 | print(f"Consuming messages from topic '{topic}'") 22 | while True: 23 | msg = consumer.poll(1.0) 24 | 25 | if msg is None: 26 | continue 27 | if msg.error(): 28 | raise KafkaException(msg.error()) 29 | 30 | process_message(msg) 31 | 32 | finally: 33 | consumer.close() 34 | 35 | 36 | def process_message(msg): 37 | value = msg.value() 38 | 39 | order = json.loads(value.decode("utf-8")) 40 | total_amount_bytes = ( 41 | order.get("payload", {}).get("after", {}).get("total_amount") 42 | ) 43 | 44 | total_amount = decode_decimal(total_amount_bytes) 45 | print(f"Received order with total amount={total_amount}") 46 | 47 | 48 | def decode_decimal(encoded_string, scale=2): 49 | value_bytes = base64.b64decode(encoded_string) 50 | unscaled_value = int.from_bytes(value_bytes, byteorder="big", signed=True) 51 | return Decimal(unscaled_value) / Decimal(10**scale) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/kafka-connect-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from confluent_kafka import Consumer, KafkaError, KafkaException 4 | 5 | conf = { 6 | "bootstrap.servers": "localhost:9092", 7 | "group.id": 
"postgres-price-consumer", 8 | "auto.offset.reset": "earliest", 9 | } 10 | 11 | 12 | def main(): 13 | consumer = Consumer(conf) 14 | 15 | topic = "postgres-.public.orders" 16 | consumer.subscribe([topic]) 17 | 18 | try: 19 | print(f"Consuming messages from topic '{topic}'") 20 | while True: 21 | msg = consumer.poll(1.0) 22 | 23 | if msg is None: 24 | continue 25 | if msg.error(): 26 | print(f"Error: {msg.error()}") 27 | continue 28 | 29 | process_message(msg) 30 | 31 | finally: 32 | consumer.close() 33 | 34 | 35 | def process_message(msg): 36 | value = msg.value() 37 | order = json.loads(value.decode("utf-8")) 38 | payload = order.get("payload", {}) 39 | 40 | before = payload.get("before", None) 41 | after = payload.get("after", None) 42 | 43 | if not before or not after: 44 | return 45 | 46 | before_status = before.get("status") 47 | after_status = after.get("status") 48 | 49 | if before_status == "processed" and after_status == "refunded": 50 | print( 51 | f"Status changed from 'processed' to 'refunded' for order: {order.get('payload', {}).get('after', {}).get('id')}" 52 | ) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/bookings_per_listing_spark.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import count 3 | import argparse 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--listings_file", required=True, help="Path to the monthly listings file") 8 | parser.add_argument("--bookings_file", required=True, help="Path to the hourly bookings file") 9 | parser.add_argument("--output_path", required=True, help="Output path for the aggregated results") 10 | args = parser.parse_args() 11 | 12 | print(f"Reading listings from {args.listings_file}") 13 | print(f"Reading bookings from {args.bookings_file}") 14 | spark = SparkSession.builder.appName("ListingsBookingsJoin").getOrCreate() 15 | 16 | listings = spark.read.csv(args.listings_file, 17 | header=True, 18 | inferSchema=True, 19 | sep=",", 20 | quote='"', 21 | escape='"', 22 | multiLine=True, 23 | mode="PERMISSIVE" 24 | ) 25 | 26 | bookings = spark.read.csv( 27 | args.bookings_file, 28 | header=True, 29 | inferSchema=True, 30 | ) 31 | 32 | aggregated = listings \ 33 | .join(bookings, listings["id"] == bookings["listing_id"], how="inner") \ 34 | .groupBy("listing_id", "name", "price") \ 35 | .agg( 36 | count("booking_id").alias("booking_count") 37 | ) 38 | 39 | aggregated.write.mode("overwrite").csv(args.output_path) 40 | 41 | print(f"Aggregated results written to {args.output_path}") 42 | spark.stop() 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/02-kafka-consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from confluent_kafka import Consumer 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser(description="Test Kafka consumer") 9 | parser.add_argument("--group-id", "-g", help="Consumer group ID") 10 | parser.add_argument("--topic-name", "-t", help="Topic name ") 11 | parser.add_argument("--name", "-n", help="Name of this consumer") 12 | 13 | args = parser.parse_args() 14 | 15 | group_id = args.group_id 16 | topic_name = args.topic_name 17 | consumer_name = args.name 18 | 19 | 
consumer_config = { 20 | "bootstrap.servers": "localhost:9092", 21 | "group.id": group_id, 22 | "auto.offset.reset": "earliest", 23 | } 24 | 25 | consumer = Consumer(consumer_config) 26 | consumer.subscribe([topic_name]) 27 | 28 | try: 29 | while True: 30 | msg = consumer.poll(timeout=1.0) 31 | if msg is None: 32 | # No new messages 33 | continue 34 | if msg.error(): 35 | # Error while reading messages 36 | print(f"[{consumer_name}]Error encountered: {msg.error()}") 37 | continue 38 | 39 | process_message(consumer_name, msg) 40 | 41 | finally: 42 | consumer.close() 43 | 44 | 45 | def process_message(consumer_name, msg): 46 | value = msg.value() 47 | 48 | order = json.loads(value.decode("utf-8")) 49 | price = order.get("total_price", 0) 50 | if price < 250: 51 | return 52 | 53 | print( 54 | f"[{consumer_name}] [partition={msg.partition()}] Received order price={price}" 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | amount = round(random.uniform(10.0, 1000.0), 2) 15 | payment_time = datetime.now().isoformat() 16 | 17 | payment_event = { 18 | "payment_id": payment_id, 19 | "user_id": user_id, 20 | "merchant_id": merchant_id, 21 | "amount": amount, 22 | "payment_time": payment_time 23 | } 24 | return payment_event 25 | 26 | 27 | def main(): 28 | 29 | config = { 30 | "bootstrap.servers": "localhost:9092" 31 | } 32 | 33 | producer = Producer(config) 34 | 35 | topic = "payments" 36 | 37 | def delivery_callback(err, msg): 38 | if err: 39 | print("ERROR: Message failed delivery: {}".format(err)) 40 | else: 41 | print( 42 | textwrap.dedent( 43 | f""" 44 | Produced event to topic {msg.topic()}: 45 | key = {msg.key().decode('utf-8')} 46 | value = {msg.value().decode('utf-8')} 47 | """) 48 | ) 49 | 50 | while True: 51 | payment = generate_payment() 52 | print(f"Sending payment: {payment}") 53 | 54 | producer.produce( 55 | topic, 56 | key=str(payment["user_id"]), 57 | value=json.dumps(payment), 58 | callback=delivery_callback, 59 | ) 60 | 61 | producer.poll(0) 62 | time.sleep(1) 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | amount = round(random.uniform(10.0, 1000.0), 2) 15 | payment_time = datetime.now().isoformat() 16 | 17 | payment_event = { 18 | "payment_id": payment_id, 19 | "user_id": user_id, 20 | "merchant_id": merchant_id, 21 | "amount": amount, 22 | "payment_time": payment_time 23 | } 24 | 
return payment_event 25 | 26 | 27 | def main(): 28 | 29 | config = { 30 | "bootstrap.servers": "localhost:9092" 31 | } 32 | 33 | producer = Producer(config) 34 | 35 | topic = "payments" 36 | 37 | def delivery_callback(err, msg): 38 | if err: 39 | print("ERROR: Message failed delivery: {}".format(err)) 40 | else: 41 | print( 42 | textwrap.dedent( 43 | f""" 44 | Produced event to topic {msg.topic()}: 45 | key = {msg.key().decode('utf-8')} 46 | value = {msg.value().decode('utf-8')} 47 | """) 48 | ) 49 | 50 | while True: 51 | payment = generate_payment() 52 | print(f"Sending payment: {payment}") 53 | 54 | producer.produce( 55 | topic, 56 | key=str(payment["user_id"]), 57 | value=json.dumps(payment), 58 | callback=delivery_callback, 59 | ) 60 | 61 | producer.poll(0) 62 | time.sleep(1) 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /07-kafka-streaming/01-kafka-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | def generate_order(): 10 | countries = [ 11 | "USA", 12 | "Canada", 13 | "UK", 14 | "Germany", 15 | "France", 16 | "Australia", 17 | "Japan", 18 | "Ireland", 19 | ] 20 | order = { 21 | "order_id": random.randint(1000, 9999), 22 | "customer_id": random.randint(1, 10), 23 | "total_price": round(random.uniform(20.0, 1000.0), 2), 24 | "customer_country": random.choice(countries), 25 | "merchant_country": random.choice(countries), 26 | "order_date": datetime.now().isoformat(), 27 | } 28 | return order 29 | 30 | def main(): 31 | 32 | config = { 33 | "bootstrap.servers": "localhost:9092" 34 | } 35 | 36 | producer = Producer(config) 37 | 38 | topic = "orders" 39 | 40 | def delivery_callback(err, msg): 41 | if err: 42 | print("ERROR: Message failed delivery: {}".format(err)) 43 | else: 44 | print( 45 | textwrap.dedent( 46 | f""" 47 | Produced event to topic {msg.topic()}: 48 | key = {msg.key().decode('utf-8')} 49 | value = {msg.value().decode('utf-8')} 50 | """) 51 | ) 52 | 53 | while True: 54 | order = generate_order() 55 | print(f"Sending order: {order}") 56 | 57 | producer.produce( 58 | topic, 59 | key=str(order["customer_id"]), 60 | value=json.dumps(order), 61 | callback=delivery_callback, 62 | ) 63 | 64 | producer.poll(0) 65 | 66 | time.sleep(1) 67 | 68 | if __name__ == "__main__": 69 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/orders-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | order_id = f"order-{random.randint(1000, 9999)}" 12 | customer_id = f"customer-{random.randint(1, 200)}" 13 | product_id = f"product-{random.randint(1, 10)}" 14 | quantity = random.randint(1, 5) 15 | price = round(random.uniform(5.0, 100.0), 2) 16 | order_time = datetime.now().isoformat() 17 | 18 | order_event = { 19 | "order_id": order_id, 20 | "customer_id": customer_id, 21 | "product_id": product_id, 22 | "quantity": quantity, 23 | "price": price, 24 | "order_time": order_time 25 | } 26 | return order_event 27 | 28 | 29 | def main(): 30 | 31 | config = { 32 | "bootstrap.servers": "localhost:9092" 33 | } 34 | 35 | producer = 
Producer(config) 36 | 37 | topic = "orders" 38 | 39 | def delivery_callback(err, msg): 40 | if err: 41 | print("ERROR: Message failed delivery: {}".format(err)) 42 | else: 43 | print( 44 | textwrap.dedent( 45 | f""" 46 | Produced event to topic {msg.topic()}: 47 | key = {msg.key().decode('utf-8')} 48 | value = {msg.value().decode('utf-8')} 49 | """) 50 | ) 51 | 52 | while True: 53 | order = generate_order() 54 | print(f"Sending order: {order}") 55 | 56 | producer.produce( 57 | topic, 58 | key=str(order["customer_id"]), 59 | value=json.dumps(order), 60 | callback=delivery_callback, 61 | ) 62 | 63 | producer.poll(0) 64 | 65 | time.sleep(1) 66 | 67 | if __name__ == "__main__": 68 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-exercise/customer_reviews_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 5 | from datetime import datetime, timedelta 6 | import os 7 | import csv 8 | 9 | 10 | @dag( 11 | "customer_reviews_dag", 12 | start_date=datetime(2025, 1, 1), 13 | schedule_interval="* * * * *", 14 | catchup=False, 15 | description="Review average score", 16 | ) 17 | def customer_reviews_dag(): 18 | 19 | @task 20 | def extract_reviews(): 21 | pg_hook = PostgresHook(postgres_conn_id="postgres_rental_site") 22 | 23 | context = get_current_context() 24 | execution_date = context["execution_date"] 25 | start_of_minute = execution_date.replace(second=0, microsecond=0) 26 | end_of_minute = start_of_minute + timedelta(hours=1) 27 | 28 | query = f""" 29 | SELECT review_id, listing_id, review_score, review_comment, review_date 30 | FROM customer_reviews 31 | WHERE review_date >= '{start_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 32 | AND review_date < '{end_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 33 | """ 34 | 35 | # TODO: Read data from Postgres, and write the results 36 | 37 | spark_etl = SparkSubmitOperator( 38 | task_id="spark_etl_reviews", 39 | application="dags/spark_etl_reviews.py", 40 | name="guest_reviews_etl", 41 | application_args=[ 42 | # TODO: Set input and output paths 43 | "--customer_reviews", "", 44 | "--output_path", "" 45 | ], 46 | conn_id='spark_rental_site', 47 | ) 48 | 49 | extract_task = extract_reviews() 50 | extract_task >> spark_etl 51 | 52 | dag_instance = customer_reviews_dag() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | 15 | if random.randint(1, 10) < 2: 16 | amount = round(random.uniform(10.0, 10000.0), 2) 17 | else: 18 | amount = round(random.uniform(10.0, 1000.0), 2) 19 | payment_time = datetime.now().isoformat() 20 | 21 | payment_event = { 22 | "payment_id": payment_id, 23 | "user_id": user_id, 24 | 
"merchant_id": merchant_id, 25 | "amount": amount, 26 | "payment_time": payment_time 27 | } 28 | return payment_event 29 | 30 | 31 | def main(): 32 | 33 | config = { 34 | "bootstrap.servers": "localhost:9092" 35 | } 36 | 37 | producer = Producer(config) 38 | 39 | topic = "payments" 40 | 41 | def delivery_callback(err, msg): 42 | if err: 43 | print("ERROR: Message failed delivery: {}".format(err)) 44 | else: 45 | print( 46 | textwrap.dedent( 47 | f""" 48 | Produced event to topic {msg.topic()}: 49 | key = {msg.key().decode('utf-8')} 50 | value = {msg.value().decode('utf-8')} 51 | """) 52 | ) 53 | 54 | while True: 55 | payment = generate_payment() 56 | print(f"Sending payment: {payment}") 57 | 58 | producer.produce( 59 | topic, 60 | key=str(payment["user_id"]), 61 | value=json.dumps(payment), 62 | callback=delivery_callback, 63 | ) 64 | 65 | producer.poll(0) 66 | time.sleep(1) 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | 15 | if random.randint(1, 10) < 2: 16 | amount = round(random.uniform(10.0, 10000.0), 2) 17 | else: 18 | amount = round(random.uniform(10.0, 1000.0), 2) 19 | payment_time = datetime.now().isoformat() 20 | 21 | payment_event = { 22 | "payment_id": payment_id, 23 | "user_id": user_id, 24 | "merchant_id": merchant_id, 25 | "amount": amount, 26 | "payment_time": payment_time 27 | } 28 | return payment_event 29 | 30 | 31 | def main(): 32 | 33 | config = { 34 | "bootstrap.servers": "localhost:9092" 35 | } 36 | 37 | producer = Producer(config) 38 | 39 | topic = "payments" 40 | 41 | def delivery_callback(err, msg): 42 | if err: 43 | print("ERROR: Message failed delivery: {}".format(err)) 44 | else: 45 | print( 46 | textwrap.dedent( 47 | f""" 48 | Produced event to topic {msg.topic()}: 49 | key = {msg.key().decode('utf-8')} 50 | value = {msg.value().decode('utf-8')} 51 | """) 52 | ) 53 | 54 | while True: 55 | payment = generate_payment() 56 | print(f"Sending payment: {payment}") 57 | 58 | producer.produce( 59 | topic, 60 | key=str(payment["user_id"]), 61 | value=json.dumps(payment), 62 | callback=delivery_callback, 63 | ) 64 | 65 | producer.poll(0) 66 | time.sleep(1) 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/kafka-connect-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from confluent_kafka import Consumer, KafkaError, KafkaException 4 | 5 | conf = { 6 | "bootstrap.servers": "localhost:9092", 7 | "group.id": "postgres-price-consumer", 8 | "auto.offset.reset": "earliest", 9 | } 10 | 11 | # TODO: Read the "README.md" for instructions on how to set up this exercise 12 | 13 | def main(): 14 | consumer = Consumer(conf) 15 | 16 | topic = "postgres-.public.orders" 17 | consumer.subscribe([topic]) 18 | 19 | try: 20 | print(f"Consuming messages from topic '{topic}'") 21 | 
while True: 22 | msg = consumer.poll(1.0) 23 | 24 | if msg is None: 25 | continue 26 | if msg.error(): 27 | print(f"Error: {msg.error()}") 28 | continue 29 | 30 | process_message(msg) 31 | 32 | finally: 33 | consumer.close() 34 | 35 | 36 | def process_message(msg): 37 | # TODO: Process incoming WAL record 38 | # Print a string message if two conditions are true: 39 | # * If a message is for an update operation 40 | # * If an order status has changed from "processed" to "refunded" 41 | # 42 | # Note: If you go though the steps in the README.md, 43 | # each record will contain the "payload" object two fields: 44 | # * `before` - a snapshot of a database record before it was updated 45 | # * `after` - a snapshot of a database record after it was updated 46 | # 47 | # You will need to extract the "status" column values from both records and compare their values 48 | # 49 | # To get those field you should do something like this: 50 | # 51 | # ```py 52 | # before = wal_record["payload"]["before"] 53 | # after = wal_record["payload"]["after"] 54 | # ``` 55 | # 56 | # But keep in mind that one or both of these fields can be None for some events. 57 | pass 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /07-kafka-streaming/docker-compose-kafka-connect.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092,INTERNAL://kafka:19092 19 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 20 | KAFKA_LOG_RETENTION_HOURS: 168 21 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 23 | 24 | kafka-connect: 25 | image: debezium/connect:3.0.0.Final 26 | container_name: kafka-connect 27 | depends_on: 28 | - kafka 29 | ports: 30 | - "8083:8083" 31 | environment: 32 | BOOTSTRAP_SERVERS: kafka:19092 33 | GROUP_ID: "kafka-connect-group" 34 | CONFIG_STORAGE_TOPIC: _connect-configs 35 | OFFSET_STORAGE_TOPIC: _connect-offsets 36 | STATUS_STORAGE_TOPIC: _connect-status 37 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 38 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 39 | INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 40 | INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 41 | 42 | postgres: 43 | image: postgres:15 44 | container_name: postgres 45 | environment: 46 | POSTGRES_USER: user 47 | POSTGRES_PASSWORD: password 48 | POSTGRES_DB: onlineshop 49 | command: 50 | - "postgres" 51 | - "-c" 52 | - "wal_level=logical" 53 | ports: 54 | - "5432:5432" 55 | 56 | -------------------------------------------------------------------------------- /08-flink-stream-processing/README.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | 3 | - Python 3.11 (Try to use this version if you have errors) 4 | - Java (required for running 
Flink) 5 | 6 | --- 7 | 8 | ## Create and Activate a Virtual Environment 9 | 10 | ```sh 11 | python3.11 -m venv venv 12 | source ./venv/bin/activate 13 | ``` 14 | 15 | --- 16 | 17 | ## Install Required Python Packages 18 | 19 | Install the Confluent Kafka client and Apache Flink with: 20 | 21 | ```sh 22 | pip install confluent-kafka 23 | pip install apache-flink 24 | ``` 25 | 26 | --- 27 | 28 | ## Configure `flink` CLI Command 29 | 30 | Determine the Flink installation path and update the `PATH` environment variable: 31 | 32 | ```sh 33 | FLINK_HOME=$(./venv/bin/find_flink_home.py) 34 | export PATH=$PATH:$FLINK_HOME/bin 35 | ``` 36 | 37 | Verify the installation: 38 | 39 | ```sh 40 | flink --version 41 | ``` 42 | 43 | --- 44 | 45 | ## Download the Kafka Connector 46 | 47 | Search for **`flink-sql-connector-kafka maven`** in your browser and download the latest available JAR file. Place it in the project directory. 48 | 49 | --- 50 | 51 | ## Sample Flink Application 52 | 53 | Create a Python file named `01-flink-hello-world.py` with the following content: 54 | 55 | ```python 56 | from pyflink.datastream import StreamExecutionEnvironment 57 | 58 | def main(): 59 | env = StreamExecutionEnvironment.get_execution_environment() 60 | data_stream = env.from_collection([1, 2, 3, 4, 5]) 61 | 62 | mapped_stream = data_stream.map(lambda x: x * 2) 63 | 64 | mapped_stream.print() 65 | 66 | env.execute("Flink Hello World") 67 | 68 | if __name__ == "__main__": 69 | main() 70 | ``` 71 | 72 | --- 73 | 74 | ## Run the Application 75 | 76 | Run the Flink application locally with the Kafka connector: 77 | 78 | ```sh 79 | flink run \ 80 | --python 01-flink-hello-world.py \ 81 | --target local \ 82 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 83 | ``` 84 | 85 | --- 86 | 87 | Flink is now ready and running locally. You can build on this setup in future demos using real-time Kafka streams. 
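---

## Preview: Reading from Kafka

The sketch below is a minimal preview of how the same setup can read from a Kafka topic instead of a static collection. It assumes the Kafka broker from this module's `docker-compose.yml` is running on `localhost:9092` and that one of the producers in this folder (for example `orders-producer.py`) is writing JSON events to the `orders` topic; the group ID and the suggested file name `kafka-preview.py` are just placeholders. Run it with the same `flink run` command as above, pointing `--python` at this file.

```python
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.watermark_strategy import WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.kafka import KafkaSource


def main():
    env = StreamExecutionEnvironment.get_execution_environment()

    # Read raw JSON strings from the "orders" topic on the local broker.
    source = KafkaSource.builder() \
        .set_bootstrap_servers("localhost:9092") \
        .set_topics("orders") \
        .set_group_id("flink-readme-preview") \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()

    stream = env.from_source(
        source,
        watermark_strategy=WatermarkStrategy.no_watermarks(),
        source_name="kafka_source",
    )

    # Print each record as it arrives; later demos parse and aggregate these events.
    stream.print()

    env.execute("Kafka source preview")


if __name__ == "__main__":
    main()
```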
-------------------------------------------------------------------------------- /08-flink-stream-processing/variable-spend-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | customer_id_num = random.randint(1, 50) 12 | order_id = f"order-{random.randint(1000, 9999)}" 13 | customer_id = f"customer-{customer_id_num}" 14 | product_id = f"product-{random.randint(1, 200)}" 15 | quantity = random.randint(1, 5) 16 | 17 | if customer_id_num < 40: 18 | price = round(random.uniform(5.0, 100.0), 2) 19 | else: 20 | price = round(random.uniform(200.0, 300.0), 2) 21 | order_time = datetime.now().isoformat() 22 | 23 | order_event = { 24 | "order_id": order_id, 25 | "customer_id": customer_id, 26 | "product_id": product_id, 27 | "quantity": quantity, 28 | "price": price, 29 | "order_time": order_time 30 | } 31 | return order_event 32 | 33 | 34 | def main(): 35 | 36 | config = { 37 | "bootstrap.servers": "localhost:9092" 38 | } 39 | 40 | producer = Producer(config) 41 | 42 | topic = "orders" 43 | 44 | def delivery_callback(err, msg): 45 | if err: 46 | print("ERROR: Message failed delivery: {}".format(err)) 47 | else: 48 | print( 49 | textwrap.dedent( 50 | f""" 51 | Produced event to topic {msg.topic()}: 52 | key = {msg.key().decode('utf-8')} 53 | value = {msg.value().decode('utf-8')} 54 | """) 55 | ) 56 | 57 | while True: 58 | order = generate_order() 59 | print(f"Sending order: {order}") 60 | 61 | producer.produce( 62 | topic, 63 | key=str(order["customer_id"]), 64 | value=json.dumps(order), 65 | callback=delivery_callback, 66 | ) 67 | 68 | producer.poll(0) 69 | 70 | time.sleep(1) 71 | 72 | if __name__ == "__main__": 73 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/wiki-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | from confluent_kafka import Producer 5 | from sseclient import SSEClient 6 | 7 | producer_conf = {"bootstrap.servers": "localhost:9092"} 8 | kafka_topic = "wikipedia-changes" 9 | 10 | 11 | def delivery_callback(err, msg): 12 | if err: 13 | print("ERROR: Message failed delivery: {}".format(err)) 14 | else: 15 | print( 16 | textwrap.dedent(f""" 17 | Produced event to topic {msg.topic()}: 18 | key = {msg.key().decode('utf-8')} 19 | value = {msg.value().decode('utf-8')} 20 | """) 21 | ) 22 | 23 | 24 | def main(): 25 | url = "https://stream.wikimedia.org/v2/stream/recentchange" 26 | 27 | print( 28 | f"Starting to consume Wikipedia recent changes from {url} and produce to Kafka topic '{kafka_topic}'..." 
29 | ) 30 | 31 | producer = Producer(producer_conf) 32 | messages = SSEClient(url) 33 | 34 | for event in messages: 35 | if event.event == "message" and event.data: 36 | try: 37 | data = json.loads(event.data) 38 | except json.JSONDecodeError: 39 | continue 40 | 41 | id = data.get("id") 42 | message = { 43 | "id": id, 44 | "type": data.get("type"), 45 | "title": data.get("title"), 46 | "user": data.get("user"), 47 | "bot": data.get("bot"), 48 | "timestamp": data.get("timestamp"), 49 | "comment": data.get("comment"), 50 | "minor": data.get("minor", False), 51 | } 52 | 53 | value = json.dumps(message) 54 | producer.produce( 55 | topic=kafka_topic, 56 | key=str(id), 57 | value=value, 58 | callback=delivery_callback, 59 | ) 60 | producer.poll(0) 61 | 62 | producer.flush() 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092,INTERNAL://kafka:19092 19 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 23 | 24 | kafka-connect: 25 | image: debezium/connect:3.0.0.Final 26 | container_name: kafka-connect 27 | depends_on: 28 | - kafka 29 | ports: 30 | - "8083:8083" 31 | environment: 32 | BOOTSTRAP_SERVERS: kafka:19092 33 | GROUP_ID: "kafka-connect-group" 34 | CONFIG_STORAGE_TOPIC: _connect-configs 35 | OFFSET_STORAGE_TOPIC: _connect-offsets 36 | STATUS_STORAGE_TOPIC: _connect-status 37 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 38 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 39 | INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 40 | INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 41 | 42 | postgres: 43 | image: postgres:15 44 | container_name: postgres 45 | environment: 46 | POSTGRES_USER: user 47 | POSTGRES_PASSWORD: password 48 | POSTGRES_DB: onlineshop 49 | command: 50 | - "postgres" 51 | - "-c" 52 | - "wal_level=logical" 53 | ports: 54 | - "5432:5432" 55 | 56 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 16 | 
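      # The listeners above and below split traffic: PLAINTEXT (9092) is advertised as
      # localhost:9092 for clients running on the host, CONTROLLER (9093) carries KRaft
      # controller traffic, and INTERNAL (19092) is advertised as kafka:19092 for other
      # containers such as kafka-connect.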
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092,INTERNAL://kafka:19092 19 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 23 | 24 | kafka-connect: 25 | image: debezium/connect:3.0.0.Final 26 | container_name: kafka-connect 27 | depends_on: 28 | - kafka 29 | ports: 30 | - "8083:8083" 31 | environment: 32 | BOOTSTRAP_SERVERS: kafka:19092 33 | GROUP_ID: "kafka-connect-group" 34 | CONFIG_STORAGE_TOPIC: _connect-configs 35 | OFFSET_STORAGE_TOPIC: _connect-offsets 36 | STATUS_STORAGE_TOPIC: _connect-status 37 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 38 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 39 | INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 40 | INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 41 | 42 | postgres: 43 | image: postgres:15 44 | container_name: postgres 45 | environment: 46 | POSTGRES_USER: user 47 | POSTGRES_PASSWORD: password 48 | POSTGRES_DB: onlineshop 49 | command: 50 | - "postgres" 51 | - "-c" 52 | - "wal_level=logical" 53 | ports: 54 | - "5432:5432" 55 | 56 | -------------------------------------------------------------------------------- /08-flink-stream-processing/popular-products-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime, timedelta 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | 12 | order_id = f"order-{random.randint(1000, 9999)}" 13 | customer_id = f"customer-{random.randint(1, 200)}" 14 | 15 | product_id_num = random.randint(1, 10) 16 | product_id = f"product-{product_id_num}" 17 | 18 | if product_id_num > 3: 19 | quantity = random.randint(1, 5) 20 | else: 21 | quantity = random.randint(1, 20) 22 | 23 | price = round(random.uniform(5.0, 100.0), 2) 24 | current_time = datetime.now() 25 | 26 | order_event = { 27 | "order_id": order_id, 28 | "customer_id": customer_id, 29 | "product_id": product_id, 30 | "quantity": quantity, 31 | "price": price, 32 | "order_time": current_time.isoformat() 33 | } 34 | return order_event 35 | 36 | 37 | def main(): 38 | 39 | config = { 40 | "bootstrap.servers": "localhost:9092" 41 | } 42 | 43 | producer = Producer(config) 44 | 45 | topic = "orders" 46 | 47 | def delivery_callback(err, msg): 48 | if err: 49 | print("ERROR: Message failed delivery: {}".format(err)) 50 | else: 51 | print( 52 | textwrap.dedent( 53 | f""" 54 | Produced event to topic {msg.topic()}: 55 | key = {msg.key().decode('utf-8')} 56 | value = {msg.value().decode('utf-8')} 57 | """) 58 | ) 59 | 60 | while True: 61 | order = generate_order() 62 | print(f"Sending order: {order}") 63 | 64 | producer.produce( 65 | topic, 66 | key=str(order["customer_id"]), 67 | value=json.dumps(order), 68 | callback=delivery_callback, 69 | ) 70 | 71 | producer.poll(0) 72 | 73 | time.sleep(1) 74 | 75 | if __name__ == "__main__": 76 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/late-events-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from 
datetime import datetime, timedelta 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | order_id = f"order-{random.randint(1000, 9999)}" 12 | customer_id = f"customer-{random.randint(1, 10)}" 13 | product_id = f"product-{random.randint(1, 200)}" 14 | quantity = random.randint(1, 5) 15 | price = round(random.uniform(5.0, 100.0), 2) 16 | current_time = datetime.now() 17 | 18 | if random.random() < 0.2: 19 | late_by = random.randint(60, 120) 20 | event_time = current_time - timedelta(seconds=late_by) 21 | else: 22 | event_time = current_time 23 | 24 | order_event = { 25 | "order_id": order_id, 26 | "customer_id": customer_id, 27 | "product_id": product_id, 28 | "quantity": quantity, 29 | "price": price, 30 | "order_time": event_time.isoformat() 31 | } 32 | return order_event 33 | 34 | 35 | def main(): 36 | 37 | config = { 38 | "bootstrap.servers": "localhost:9092" 39 | } 40 | 41 | producer = Producer(config) 42 | 43 | topic = "late-orders" 44 | 45 | def delivery_callback(err, msg): 46 | if err: 47 | print("ERROR: Message failed delivery: {}".format(err)) 48 | else: 49 | print( 50 | textwrap.dedent( 51 | f""" 52 | Produced event to topic {msg.topic()}: 53 | key = {msg.key().decode('utf-8')} 54 | value = {msg.value().decode('utf-8')} 55 | """) 56 | ) 57 | 58 | while True: 59 | order = generate_order() 60 | print(f"Sending order: {order}") 61 | 62 | producer.produce( 63 | topic, 64 | key=str(order["customer_id"]), 65 | value=json.dumps(order), 66 | callback=delivery_callback, 67 | ) 68 | 69 | producer.poll(0) 70 | 71 | time.sleep(1) 72 | 73 | if __name__ == "__main__": 74 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/01-average_page_visits.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | import json 4 | import random 5 | 6 | from airflow.decorators import dag, task 7 | from airflow.operators.python import get_current_context 8 | 9 | @dag( 10 | "average_page_visits", 11 | start_date=datetime(2025, 1, 1), 12 | schedule_interval="* * * * *", 13 | catchup=False, 14 | description="" 15 | ) 16 | def average_page_visits(): 17 | 18 | def get_data_path(): 19 | context = get_current_context() 20 | execution_date = context["execution_date"] 21 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 22 | return f"/tmp/page_visits/{file_date}.json" 23 | 24 | @task 25 | def produce_page_visits_data(): 26 | 27 | page_visits = [ 28 | {"id": 1, "name": "Cozy Apartment", "price": 120, "page_visits": random.randint(0, 50)}, 29 | {"id": 2, "name": "Luxury Condo", "price": 300, "page_visits": random.randint(0, 50)}, 30 | {"id": 3, "name": "Modern Studio", "price": 180, "page_visits": random.randint(0, 50)}, 31 | {"id": 4, "name": "Charming Loft", "price": 150, "page_visits": random.randint(0, 50)}, 32 | {"id": 5, "name": "Spacious Villa", "price": 400, "page_visits": random.randint(0, 50)}, 33 | ] 34 | file_path = get_data_path() 35 | 36 | directory = os.path.dirname(file_path) 37 | if not os.path.exists(directory): 38 | os.makedirs(directory) 39 | 40 | with open(file_path, "w") as f: 41 | json.dump(page_visits, f) 42 | 43 | print(f"Written to file: {file_path}") 44 | 45 | @task 46 | def process_page_visits_data(): 47 | file_path = get_data_path() 48 | 49 | with open(file_path, "r") as f: 50 | page_visits = json.load(f) 51 | 52 | average_price = sum(page_visit["page_visits"] for page_visit in page_visits) / 
len(page_visits) 53 | print(f"Average number of page visits {average_price}") 54 | 55 | produce_page_visits_data() >> process_page_visits_data() 56 | 57 | demo_dag = average_page_visits() -------------------------------------------------------------------------------- /07-kafka-streaming/order.py: -------------------------------------------------------------------------------- 1 | 2 | class Order: 3 | def __init__( 4 | self, 5 | order_id, 6 | customer_id, 7 | total_price, 8 | customer_country, 9 | merchant_country, 10 | order_datetime, 11 | ): 12 | self.order_id = order_id 13 | self.customer_id = customer_id 14 | self.total_price = total_price 15 | self.customer_country = customer_country 16 | self.merchant_country = merchant_country 17 | self.order_datetime = order_datetime 18 | 19 | 20 | @staticmethod 21 | def from_dict(obj): 22 | return Order( 23 | order_id=obj["order_id"], 24 | customer_id=obj["customer_id"], 25 | total_price=obj["total_price"], 26 | customer_country=obj["customer_country"], 27 | merchant_country=obj["merchant_country"], 28 | order_datetime=obj["order_datetime"], 29 | ) 30 | 31 | def to_dict(self): 32 | return { 33 | "order_id": self.order_id, 34 | "customer_id": self.customer_id, 35 | "total_price": self.total_price, 36 | "customer_country": self.customer_country, 37 | "merchant_country": self.merchant_country, 38 | "order_datetime": self.order_datetime, 39 | } 40 | 41 | def __str__(self): 42 | return (f"Order(" 43 | f"order_id={self.order_id}, " 44 | f"customer_id={self.customer_id}, " 45 | f"total_price={self.total_price}, " 46 | f"customer_country='{self.customer_country}', " 47 | f"merchant_country='{self.merchant_country}', " 48 | f"order_datetime='{self.order_datetime}')") 49 | 50 | 51 | ORDER_SCHEMA = { 52 | "type": "record", 53 | "name": "Order", 54 | "fields": [ 55 | {"name": "order_id", "type": "int"}, 56 | {"name": "customer_id", "type": "string"}, 57 | {"name": "total_price", "type": "float"}, 58 | {"name": "customer_country", "type": "string"}, 59 | {"name": "merchant_country", "type": "string"}, 60 | {"name": "order_datetime", "type": "string"}, 61 | ], 62 | } -------------------------------------------------------------------------------- /07-kafka-streaming/05-kafka-schema-registry-consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from order import Order, ORDER_SCHEMA 5 | 6 | from confluent_kafka import Consumer, KafkaError 7 | from confluent_kafka.schema_registry import SchemaRegistryClient 8 | from confluent_kafka.schema_registry.avro import AvroDeserializer 9 | from confluent_kafka.serialization import MessageField, SerializationContext 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(description="Test Kafka consumer") 13 | parser.add_argument("--group-id", "-g", help="Consumer group ID") 14 | parser.add_argument("--topic-name", "-t", help="Topic name ") 15 | 16 | args = parser.parse_args() 17 | 18 | group_id = args.group_id 19 | topic_name = args.topic_name 20 | 21 | schema_registry_conf = {"url": "http://localhost:8081"} 22 | schema_registry_client = SchemaRegistryClient(schema_registry_conf) 23 | 24 | avro_deserializer = AvroDeserializer( 25 | schema_registry_client, 26 | json.dumps(ORDER_SCHEMA), 27 | lambda obj, ctx: Order.from_dict(obj), 28 | ) 29 | 30 | config = { 31 | "bootstrap.servers": "localhost:9092", 32 | "group.id": group_id, 33 | "auto.offset.reset": "earliest", 34 | } 35 | 36 | consumer = Consumer(config) 37 | 
consumer.subscribe([topic_name]) 38 | 39 | print(f"Starting consumer with group ID '{group_id}'") 40 | 41 | try: 42 | while True: 43 | msg = consumer.poll(timeout=1.0) 44 | if msg is None: 45 | # No new messages 46 | continue 47 | if msg.error(): 48 | # Error while reading 49 | print(f"Error encountered: {msg.error()}") 50 | continue 51 | 52 | process_message(avro_deserializer, msg) 53 | 54 | finally: 55 | consumer.close() 56 | 57 | 58 | def process_message(avro_deserializer, msg): 59 | order = avro_deserializer( 60 | msg.value(), SerializationContext(msg.topic(), MessageField.VALUE) 61 | ) 62 | if order.total_price < 250: 63 | return 64 | 65 | print(f"Received order price={order.total_price}") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/02-average_page_visits_with_failures.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | import json 4 | import random 5 | 6 | from airflow.decorators import dag, task 7 | from airflow.operators.python import get_current_context 8 | 9 | @dag( 10 | "average_page_visits", 11 | start_date=datetime(2025, 1, 1), 12 | schedule_interval="* * * * *", 13 | catchup=False, 14 | description="" 15 | ) 16 | def average_page_visits(): 17 | 18 | def get_data_path(): 19 | context = get_current_context() 20 | execution_date = context["execution_date"] 21 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 22 | return f"/tmp/page_visits/{file_date}.json" 23 | 24 | @task 25 | def produce_page_visits_data(): 26 | 27 | if random.random() < 0.5: 28 | raise Exception("Job has failed") 29 | 30 | page_visits = [ 31 | {"id": 1, "name": "Cozy Apartment", "price": 120, "page_visits": random.randint(0, 50)}, 32 | {"id": 2, "name": "Luxury Condo", "price": 300, "page_visits": random.randint(0, 50)}, 33 | {"id": 3, "name": "Modern Studio", "price": 180, "page_visits": random.randint(0, 50)}, 34 | {"id": 4, "name": "Charming Loft", "price": 150, "page_visits": random.randint(0, 50)}, 35 | {"id": 5, "name": "Spacious Villa", "price": 400, "page_visits": random.randint(0, 50)}, 36 | ] 37 | file_path = get_data_path() 38 | 39 | directory = os.path.dirname(file_path) 40 | if not os.path.exists(directory): 41 | os.makedirs(directory) 42 | 43 | with open(file_path, "w") as f: 44 | json.dump(page_visits, f) 45 | 46 | print(f"Written to file: {file_path}") 47 | 48 | @task 49 | def process_page_visits_data(): 50 | file_path = get_data_path() 51 | 52 | with open(file_path, "r") as f: 53 | page_visits = json.load(f) 54 | 55 | average_price = sum(page_visit["page_visits"] for page_visit in page_visits) / len(page_visits) 56 | print(f"Average number of page visits {average_price}") 57 | 58 | produce_page_visits_data() >> process_page_visits_data() 59 | 60 | demo_dag = average_page_visits() -------------------------------------------------------------------------------- /06-data-eng-with-llms/03-spark-llms.py: -------------------------------------------------------------------------------- 1 | %pip install transformers 2 | %pip install outlines 3 | 4 | # ----------------------- 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | spark = SparkSession.builder \ 9 | .appName("ReviewsClassifier") \ 10 | .master("local[*]") \ 11 | .getOrCreate() 12 | 13 | spark 14 | 15 | 16 | # ------------------------ 17 | 18 | reviews = [ 19 | (1, "This is absolutely delightful!"), 20 | (2, "This was the 
worst hotel I've ever seen"), 21 | (3, "Great location but the rooms were dirty."), 22 | (4, "Staff were friendly and helpful."), 23 | (5, "Mediocre breakfast, but I'd stay again."), 24 | ] 25 | df = spark.createDataFrame( 26 | reviews, 27 | ["review_id", "review"] 28 | ) 29 | df.show() 30 | 31 | # ------------------------ 32 | 33 | import outlines 34 | import json 35 | import torch 36 | 37 | model_name = "mistralai/Mistral-7B-Instruct-v0.3" 38 | 39 | schema = json.dumps({ 40 | "type": "object", 41 | "properties": { 42 | "sentiment": { 43 | "type": "string", 44 | "enum": ["positive", "negative"] 45 | } 46 | }, 47 | "required": ["sentiment"] 48 | }) 49 | 50 | def classify(generate_json, review): 51 | prompt = ( 52 | "Classify the following customer review as positive or negative.\n\n" 53 | f"Review:\n{review}\n" 54 | ) 55 | output_json = generate_json(prompt, max_tokens=40) 56 | return output_json['sentiment'] 57 | 58 | # ------------------------------------------ 59 | 60 | from pyspark.sql.functions import udf 61 | from functools import cache 62 | from huggingface_hub import login 63 | 64 | @udf("string") 65 | def sentiment_udf(review): 66 | 67 | @cache 68 | def get_generate_json(): 69 | login(token="hf_...") 70 | generator = outlines.models.transformers( 71 | model_name, 72 | device="cuda", 73 | model_kwargs={"torch_dtype": torch.float16}, 74 | ) 75 | return outlines.generate.json(generator, schema) 76 | 77 | generate_json = get_generate_json() 78 | 79 | return classify(generate_json, review) 80 | 81 | 82 | result_df = df.withColumn("sentiment", sentiment_udf("review")) 83 | result_df.show(truncate=False) -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/wiki-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | from confluent_kafka import Producer 5 | from sseclient import SSEClient 6 | 7 | producer_conf = {"bootstrap.servers": "localhost:9092"} 8 | kafka_topic = "wikipedia-changes" 9 | 10 | # TODO: Before running a producer. 11 | # 1. Install Kafka CLI tools 12 | # 13 | # ``` 14 | # brew install kafka 15 | # ``` 16 | # 17 | # 2. Start Kafka 18 | # 19 | # ``` 20 | # docker-compose up 21 | # ``` 22 | # 23 | # 3. Create a virtual environment and install dependencies 24 | # 25 | # ``` 26 | # python3 -m venv venv 27 | # source venv/bin/activate 28 | # pip install -r requirements.txt 29 | # ``` 30 | 31 | # TODO: Read docs about Wikipedia edit stream: https://www.mediawiki.org/wiki/Manual:RCFeed 32 | 33 | def delivery_report(err, msg): 34 | if err: 35 | print("ERROR: Message failed delivery: {}".format(err)) 36 | else: 37 | print( 38 | textwrap.dedent(f""" 39 | Produced event to topic {msg.topic()}: 40 | key = {msg.key().decode('utf-8')} 41 | value = {msg.value().decode('utf-8')} 42 | """) 43 | ) 44 | 45 | 46 | def main(): 47 | url = "https://stream.wikimedia.org/v2/stream/recentchange" 48 | 49 | print( 50 | f"Starting to consume Wikipedia recent changes from {url} and produce to Kafka topic '{kafka_topic}'..." 
51 | ) 52 | 53 | producer = Producer(producer_conf) 54 | messages = SSEClient(url) 55 | 56 | for event in messages: 57 | if event.event == "message" and event.data: 58 | try: 59 | data = json.loads(event.data) 60 | except json.JSONDecodeError: 61 | continue 62 | 63 | print(data) 64 | 65 | # TODO: Produce a Kafka messages from a Wikistream update message 66 | # * Parse the input message 67 | # * Extract fields you need to write 68 | # * Create a JSON object for a new Kafka even 69 | # * Write a messages to a Kafka topic 70 | # 71 | # To test your producer, run the following command: 72 | # 73 | # ``` 74 | # kafka-console-consumer --bootstrap-server localhost:9092 --topic wikipedia-changes --from-beginning 75 | # ``` 76 | 77 | producer.flush() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /03-data-lake/README.md: -------------------------------------------------------------------------------- 1 | # AWS Setup for Data Lake 2 | 3 | This README describes how to configure your AWS account and credentials to enable programmatic access via the AWS CLI. This setup will be used for working with services like Amazon S3 in the data lake section. 4 | 5 | --- 6 | 7 | ## Prerequisites 8 | 9 | - An [AWS account](https://aws.amazon.com/) (sign-up requires a valid payment method) 10 | - [AWS CLI installed](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) 11 | 12 | --- 13 | 14 | ## Step 1: Sign In to AWS 15 | 16 | 1. Go to the [AWS Management Console](https://aws.amazon.com/console/). 17 | 2. Log in as the **root user** (only for initial setup). 18 | 3. Navigate to the **IAM (Identity and Access Management)** service using the search bar. 19 | 20 | --- 21 | 22 | ## Step 2: Create an IAM User 23 | 24 | 1. In the IAM dashboard, go to **Users** → **Create user**. 25 | 2. Enter a username, e.g., `rental-website-admin`. 26 | 3. Under **Permissions**, select **Attach existing policies directly** and choose **AdministratorAccess** for full access. 27 | - Alternatively, you can choose more restricted permissions (e.g., S3-only or read-only). 28 | 4. Continue through the wizard and create the user. 29 | 30 | --- 31 | 32 | ## Step 3: Generate Programmatic Credentials 33 | 34 | 1. After creating the user, go to the **Security credentials** tab. 35 | 2. Under **Access keys**, click **Create access key**. 36 | 3. Choose **Command Line Interface (CLI)** as the use case. 37 | 4. Click **Create access key**. 38 | 5. Copy both the **Access key ID** and **Secret access key**. 39 | 40 | > ⚠️ The secret key is shown only once. Store it securely. 41 | 42 | --- 43 | 44 | ## Step 4: Set Up Local AWS Credentials 45 | 46 | Create the AWS credentials file: 47 | 48 | ```sh 49 | vim ~/.aws/credentials 50 | ``` 51 | 52 | Paste the following into the file, replacing the placeholders with your actual keys: 53 | 54 | ```ini 55 | [default] 56 | aws_access_key_id = YOUR_ACCESS_KEY_ID 57 | aws_secret_access_key = YOUR_SECRET_ACCESS_KEY 58 | ``` 59 | 60 | Save and close the file. 61 | 62 | --- 63 | 64 | ## Step 5: Test Your Configuration 65 | 66 | Run the following command to verify that your credentials are working: 67 | 68 | ```sh 69 | aws s3 ls 70 | ``` 71 | 72 | You should see a list of accessible S3 buckets (or an empty list if you have none yet). 73 | 74 | --- 75 | 76 | You're now ready to use the AWS CLI to interact with AWS services. 
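To go one step further than listing buckets, you can create a bucket and copy a file into it with the same CLI. The names below are only placeholders (S3 bucket names are globally unique, and the file can be any local file you want to test with):

```sh
aws s3 mb s3://my-rental-data-lake
aws s3 cp ./example.json s3://my-rental-data-lake/raw/example.json
aws s3 ls s3://my-rental-data-lake/raw/
```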
-------------------------------------------------------------------------------- /02-data-processing-with-spark/02-reading-airbnb-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c219df52-cb8d-4431-831a-3751a69062f2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Read Inside Airbnb data\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "ec00f92d-c582-4970-9617-ff0a9852cc45", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\", \n", 28 | " quote='\"',\n", 29 | " escape='\"', \n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\" \n", 32 | ")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "173f066e-9e4c-40bf-9b30-befcf4b0e4ec", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "for field in listings.schema:\n", 43 | " print(field)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "1bf33965-8cb2-48b6-bea0-d1da7de3382f", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "neighbourhoods = listings.select(listings.neighbourhood_cleansed)\n", 54 | "neighbourhoods.show(20)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "d108827d-ad80-43dc-9648-ef8b21c08a49", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "neighbourhoods = listings.select(listings.neighbourhood_cleansed)\n", 65 | "neighbourhoods.show(20, truncate=False)" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.12.7" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /06-data-eng-with-llms/README.md: -------------------------------------------------------------------------------- 1 | # Databricks Workspace Setup 2 | 3 | This README describes how to create and configure a Databricks workspace on AWS, set up compute, and prepare the environment for running notebooks with GPU acceleration and Hugging Face models. 4 | 5 | --- 6 | 7 | ## Prerequisites 8 | 9 | - An AWS account with administrator access 10 | - A Hugging Face account with an access token 11 | 12 | --- 13 | 14 | ## Step 1: Create a Databricks Workspace 15 | 16 | 1. Go to [https://databricks.com](https://databricks.com) and sign in 17 | 2. Go to **Manage Account** → **Workspaces**. 18 | 3. Click **Create Workspace**. 19 | 4. You’ll be redirected to an AWS login page. Sign in using your AWS root or IAM credentials. 20 | 5. AWS will ask you to create a **CloudFormation stack**. A stack is a group of AWS resources (e.g., EC2, S3) that Databricks uses to manage compute. 21 | 6. Acknowledge the required permissions and click **Create stack**. 22 | 7. 
Wait for the stack creation to complete. The workspace will then appear in your Databricks account. 23 | 24 | --- 25 | 26 | ## Step 2: Create a Compute Cluster 27 | 28 | 1. Go to your Databricks workspace. 29 | 2. Navigate to **Compute** → **Create Compute**. 30 | 3. Choose a **GPU-accelerated instance**. 31 | 4. Set the number of Spark workers: 32 | - **Min workers:** 0 33 | - **Max workers:** 1 34 | (This ensures a worker is only created when Spark is actually used.) 35 | 5. Create the compute cluster. 36 | 37 | --- 38 | 39 | ## Step 3: Create a Notebook 40 | 41 | 1. In the workspace, go to **Workspace** → **Create** → **Notebook**. 42 | 2. Name it something like: `llm-classification`. 43 | 3. Select the compute cluster you just created as the execution environment. 44 | 45 | --- 46 | 47 | ## Step 4: Configure Hugging Face Token 48 | 49 | To download models from Hugging Face, you need an access token. 50 | 51 | 1. Go to [https://huggingface.co](https://huggingface.co). 52 | 2. Click on your **User icon** → **Access Tokens**. 53 | 3. Create a new token and copy it. 54 | 55 | In your Databricks notebook, run the following: 56 | 57 | ```python 58 | %pip install transformers 59 | 60 | from huggingface_hub import login 61 | 62 | login(token="hf_...") 63 | ``` 64 | 65 | > ⚠️ Do **not** hardcode your token in production environments. Use Databricks Secrets instead. This example uses a token directly for simplicity in a demo setting. 66 | 67 | --- 68 | 69 | You're now ready to run notebooks using Spark and Hugging Face models in Databricks. -------------------------------------------------------------------------------- /07-kafka-streaming/04-kafka-schema-registry-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | from order import Order, ORDER_SCHEMA 7 | 8 | from confluent_kafka import Producer 9 | from confluent_kafka.schema_registry import SchemaRegistryClient 10 | from confluent_kafka.schema_registry.avro import AvroSerializer 11 | from confluent_kafka.serialization import ( 12 | MessageField, 13 | SerializationContext, 14 | ) 15 | 16 | 17 | def generate_order(): 18 | countries = [ 19 | "USA", 20 | "Canada", 21 | "UK", 22 | "Germany", 23 | "France", 24 | "Australia", 25 | "Japan", 26 | "Ireland", 27 | ] 28 | return Order( 29 | order_id=random.randint(1000, 9999), 30 | customer_id='123', 31 | total_price=round(random.uniform(20.0, 1000.0), 2), 32 | customer_country=random.choice(countries), 33 | merchant_country=random.choice(countries), 34 | order_datetime=datetime.now().isoformat() 35 | ) 36 | 37 | 38 | def main(): 39 | schema_registry_config = { 40 | "url": "http://localhost:8081" 41 | } 42 | schema_registry_client = SchemaRegistryClient(schema_registry_config) 43 | 44 | avro_serializer = AvroSerializer( 45 | schema_registry_client, 46 | json.dumps(ORDER_SCHEMA), 47 | lambda obj, ctx: obj.to_dict() 48 | ) 49 | 50 | producer_config = { 51 | "bootstrap.servers": "localhost:9092", 52 | "acks": "all" 53 | } 54 | 55 | producer = Producer(producer_config) 56 | 57 | topic = "orders.avro" 58 | 59 | def delivery_callback(err, msg): 60 | if err: 61 | print("ERROR: Message failed delivery: {}".format(err)) 62 | else: 63 | print( 64 | textwrap.dedent( 65 | f""" 66 | Produced event to topic {msg.topic()}: 67 | key = {msg.key().decode('utf-8')} 68 | """) 69 | ) 70 | 71 | while True: 72 | order = generate_order() 73 | print(f"Sending order: {order}") 74 | 
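        # The Avro serializer below turns the Order (via its to_dict method) into Avro bytes
        # that match ORDER_SCHEMA; on first use it registers the schema with the Schema
        # Registry and embeds the returned schema ID in the payload, which is how the
        # consumer in 05-kafka-schema-registry-consumer.py knows how to decode it.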
75 | serialized_data = avro_serializer( 76 | order, SerializationContext(topic, MessageField.VALUE) 77 | ) 78 | producer.produce( 79 | topic, 80 | key=str(order.order_id).encode(), 81 | value=serialized_data, 82 | callback=delivery_callback, 83 | ) 84 | 85 | producer.poll(0) 86 | 87 | time.sleep(1) 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 0. Stop all Docker containers 4 | 5 | Before you start, stop any Docker containers from this bootcamp that are still 6 | running on your machine. 7 | 8 | ## 1. Start Kafka 9 | 10 | First, start Kafka using Docker Compose: 11 | 12 | ```sh 13 | docker-compose up 14 | ``` 15 | 16 | 17 | ## 2. Create a Python Virtual Environment 18 | 19 | Run the following commands to create a virtual environment and install dependencies: 20 | 21 | ```bash 22 | python3 -m venv venv 23 | source ./venv/bin/activate 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## 3. Download the Flink Kafka Connector 28 | 29 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 30 | 31 | ```bash 32 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 33 | ``` 34 | 35 | The downloaded `.jar` file should be placed in the same directory as the `flink-app.py` file. 36 | 37 | ## 4. Configure the `flink` Command in the Virtual Environment 38 | 39 | Ensure the `flink` CLI is available in the virtual environment: 40 | 41 | ```bash 42 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 43 | ``` 44 | 45 | Run `flink --version` to check that it was installed correctly: 46 | 47 | ```bash 48 | flink --version 49 | ``` 50 | 51 | NOTE: You need to set the `PATH` variable every time you activate the virtual environment. 52 | 53 | ## 5. Implement the Flink Application 54 | 55 | 56 | Implement the Flink application in `flink-app.py`, following the instructions in the `TODO` comments. 57 | 58 | 59 | ## 6. Start the Producer 60 | 61 | In the virtual environment, run the producer script to generate the payments data: 62 | 63 | ```bash 64 | python payments-producer.py 65 | ``` 66 | 67 | This will produce random payments data to the `payments` Kafka topic. 68 | 69 | ## 7. (Optional) Verify Producer Output 70 | 71 | To double-check that everything is working as expected, use the Kafka CLI tools to confirm that the producer is writing data to the `payments` topic: 72 | 73 | ```bash 74 | kafka-console-consumer --topic payments \ 75 | --bootstrap-server localhost:9092 \ 76 | --from-beginning 77 | ``` 78 | 79 | Make sure that you see payments data and that it is written every second. 80 | 81 | ## 8. Run the Flink Application 82 | 83 | Run the Flink application to process the payments data: 84 | 85 | ```bash 86 | flink run \ 87 | --python flink-app.py \ 88 | --target local \ 89 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 90 | ``` 91 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 0. 
Stop all Docker containers 4 | 5 | Before you start, stop any Docker containers from this bootcamp that are still 6 | running on your machine. 7 | 8 | ## 1. Start Kafka 9 | 10 | First, start Kafka using Docker Compose: 11 | 12 | ```sh 13 | docker-compose up 14 | ``` 15 | 16 | 17 | ## 2. Create a Python Virtual Environment 18 | 19 | Run the following commands to create a virtual environment and install dependencies: 20 | 21 | ```bash 22 | python3 -m venv venv 23 | source ./venv/bin/activate 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## 3. Download the Flink Kafka Connector 28 | 29 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 30 | 31 | ```bash 32 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 33 | ``` 34 | 35 | The downloaded `.jar` file should be placed in the same directory as the `flink-app.py` file. 36 | 37 | ## 4. Configure the `flink` Command in the Virtual Environment 38 | 39 | Ensure the `flink` CLI is available in the virtual environment: 40 | 41 | ```bash 42 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 43 | ``` 44 | 45 | Run `flink --version` to check that it was installed correctly: 46 | 47 | ```bash 48 | flink --version 49 | ``` 50 | 51 | NOTE: You need to set the `PATH` variable every time you activate the virtual environment. 52 | 53 | ## 5. Implement the Flink Application 54 | 55 | 56 | Implement the Flink application in `flink-app.py`, following the instructions in the `TODO` comments. 57 | 58 | 59 | ## 6. Start the Producer 60 | 61 | In the virtual environment, run the producer script to generate the payments data: 62 | 63 | ```bash 64 | python payments-producer.py 65 | ``` 66 | 67 | This will produce random payments data to the `payments` Kafka topic. 68 | 69 | ## 7. (Optional) Verify Producer Output 70 | 71 | To double-check that everything is working as expected, use the Kafka CLI tools to confirm that the producer is writing data to the `payments` topic: 72 | 73 | ```bash 74 | kafka-console-consumer --topic payments \ 75 | --bootstrap-server localhost:9092 \ 76 | --from-beginning 77 | ``` 78 | 79 | Make sure that you see payments data and that it is written every second. 80 | 81 | ## 8. 
Run the Flink Application 82 | 83 | Run the Flink application to process the payments data: 84 | 85 | ```bash 86 | flink run \ 87 | --python flink-app.py \ 88 | --target local \ 89 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 90 | ``` 91 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/README.md: -------------------------------------------------------------------------------- 1 | # Install Spark 2 | 3 | To install Spark locally on macOS, run the following command: 4 | 5 | ```sh 6 | brew install apache-spark 7 | ``` 8 | 9 | To check that it was installed correctly, you can run: 10 | 11 | ```sh 12 | pyspark --version 13 | ``` 14 | 15 | --- 16 | 17 | # Create a Virtual Environment 18 | 19 | Create a virtual environment in a directory named `venv`: 20 | 21 | ```sh 22 | python -m venv venv 23 | ``` 24 | 25 | Activate the virtual environment: 26 | 27 | ```sh 28 | source venv/bin/activate 29 | ``` 30 | 31 | --- 32 | 33 | # Install Jupyter Lab 34 | 35 | Run the following command inside the virtual environment to install Jupyter Lab: 36 | 37 | ```sh 38 | pip install jupyter 39 | ``` 40 | 41 | Then configure PySpark to use Jupyter Lab as the driver: 42 | 43 | ```sh 44 | export PYSPARK_DRIVER_PYTHON=jupyter 45 | export PYSPARK_DRIVER_PYTHON_OPTS='lab' 46 | ``` 47 | 48 | --- 49 | 50 | # Start Jupyter Notebooks with Local PySpark 51 | 52 | Now you can start Jupyter Lab with PySpark: 53 | 54 | ```sh 55 | pyspark 56 | ``` 57 | 58 | This will open a Jupyter Lab interface in your browser where you can interact with Spark using notebooks. 59 | 60 | --- 61 | 62 | # Spark "Hello World" 63 | 64 | You can try to run the following Spark code to verify everything is working: 65 | 66 | ```python 67 | from pyspark.sql import SparkSession 68 | 69 | spark = (SparkSession.builder 70 | .appName("First Spark application") 71 | .getOrCreate()) 72 | 73 | data = [ 74 | {"userId": 1, "paymentAmount": 100.0, "date": "2025-01-01"}, 75 | {"userId": 2, "paymentAmount": 150.5, "date": "2025-01-02"}, 76 | {"userId": 3, "paymentAmount": 200.75, "date": "2025-01-03"}, 77 | {"userId": 2, "paymentAmount": 50.25, "date": "2025-01-04"}, 78 | {"userId": 1, "paymentAmount": 80.0, "date": "2025-01-05"}, 79 | ] 80 | 81 | df = spark.createDataFrame(data) 82 | df.count() 83 | ``` 84 | 85 | You should see the number of rows in the DataFrame as the output. 86 | 87 | --- 88 | 89 | # Clean Up 90 | 91 | When you're done working, follow these steps to shut everything down: 92 | 93 | 1. **Stop the Spark session (in a notebook):** 94 | 95 | ```python 96 | spark.stop() 97 | ``` 98 | 99 | 2. **Stop Jupyter Lab (in terminal):** 100 | 101 | Press `Ctrl+C` in the terminal where `pyspark` was running. 102 | 103 | 3. **Deactivate the virtual environment:** 104 | 105 | ```sh 106 | deactivate 107 | ``` 108 | 109 | At this point, you're back to your global Python environment. You're now ready to continue developing Spark applications! 
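When you come back for the next session, a natural next step is to extend the hello-world example with a small aggregation. The sketch below assumes the `df` DataFrame from the example above and sums payments per user:

```python
from pyspark.sql.functions import sum as spark_sum

totals = (df
    .groupBy("userId")
    .agg(spark_sum("paymentAmount").alias("totalPayments"))
    .orderBy("userId"))

totals.show()
```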
110 | 111 | --- -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/customer_reviews_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 5 | from datetime import datetime, timedelta 6 | import os 7 | import csv 8 | 9 | 10 | @dag( 11 | "customer_reviews_dag", 12 | start_date=datetime(2025, 1, 1), 13 | schedule_interval="* * * * *", 14 | catchup=False, 15 | description="Review average score", 16 | ) 17 | def customer_reviews_dag(): 18 | 19 | @task 20 | def extract_reviews(): 21 | pg_hook = PostgresHook(postgres_conn_id="postgres_rental_site") 22 | 23 | context = get_current_context() 24 | execution_date = context["execution_date"] 25 | start_of_minute = execution_date.replace(second=0, microsecond=0) 26 | end_of_minute = start_of_minute + timedelta(hours=1) 27 | 28 | query = f""" 29 | SELECT review_id, listing_id, review_score, review_comment, review_date 30 | FROM customer_reviews 31 | WHERE review_date >= '{start_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 32 | AND review_date < '{end_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 33 | """ 34 | 35 | records = pg_hook.get_records(query) 36 | column_names = ["review_id", "listing_id", "review_score", "review_comment", "review_date"] 37 | 38 | file_date = execution_date.strftime('%Y%m%d_%H%M') 39 | file_path = f"/tmp/data/customer_reviews/{file_date}/customer_reviews.csv" 40 | 41 | directory = os.path.dirname(file_path) 42 | if not os.path.exists(directory): 43 | os.makedirs(directory) 44 | 45 | with open(file_path, "w", newline="") as csvfile: 46 | writer = csv.writer(csvfile) 47 | writer.writerow(column_names) 48 | writer.writerows(records) 49 | 50 | print(f"Customer reviews written to {file_path}") 51 | 52 | spark_etl = SparkSubmitOperator( 53 | task_id="spark_etl_reviews", 54 | application="dags/spark_etl_reviews.py", 55 | name="guest_reviews_etl", 56 | application_args=[ 57 | "--customer_reviews", "/tmp/data/customer_reviews/{{ execution_date.strftime('%Y%m%d_%H%M') }}/customer_reviews.csv", 58 | "--output_path", "/tmp/data/avg_review_score_by_listing/{{ execution_date.strftime('%Y%m%d_%H%M') }}" 59 | ], 60 | conn_id='spark_rental_site', 61 | ) 62 | 63 | extract_task = extract_reviews() 64 | extract_task >> spark_etl 65 | 66 | dag_instance = customer_reviews_dag() -------------------------------------------------------------------------------- /08-flink-stream-processing/02-orders-processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import StreamExecutionEnvironment 8 | from pyflink.datastream.connectors.kafka import KafkaSink, KafkaSource, KafkaRecordSerializationSchema 9 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 10 | 11 | 12 | @dataclass 13 | class Order: 14 | order_id: str 15 | customer_id: str 16 | product_id: str 17 | quantity: int 18 | price: float 19 | order_time: str 20 | 21 | 22 | def 
parse_order(json_str): 23 | data = json.loads(json_str) 24 | return Order( 25 | order_id=data.get("order_id", "unknown"), 26 | customer_id=data.get("customer_id", "unknown"), 27 | product_id=data.get("product_id", "unknown"), 28 | quantity=data.get("quantity", 0), 29 | price=float(data.get("price", 0.0)), 30 | order_time=data.get("order_time", "unknown") 31 | ) 32 | 33 | 34 | def filter_high_price(order): 35 | return order.price > 10 36 | 37 | 38 | def convert_order(order): 39 | 40 | simplified = { 41 | "order_id": order.order_id, 42 | "price": order.price, 43 | } 44 | 45 | return json.dumps(simplified) 46 | 47 | 48 | def main(): 49 | env = StreamExecutionEnvironment.get_execution_environment() 50 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 51 | 52 | kafka_source = KafkaSource.builder() \ 53 | .set_bootstrap_servers("localhost:9092") \ 54 | .set_topics("orders") \ 55 | .set_group_id("flink-consumer-group") \ 56 | .set_value_only_deserializer(SimpleStringSchema()) \ 57 | .build() 58 | 59 | orders_stream = env.from_source( 60 | kafka_source, 61 | watermark_strategy=WatermarkStrategy.no_watermarks(), 62 | source_name="kafka_source" 63 | ) 64 | 65 | filtered_stream = orders_stream \ 66 | .map(parse_order) \ 67 | .filter(filter_high_price) \ 68 | .map(convert_order, Types.STRING()) 69 | 70 | filtered_stream.print() 71 | 72 | kafka_sink = KafkaSink.builder() \ 73 | .set_bootstrap_servers("localhost:9092") \ 74 | .set_record_serializer( 75 | KafkaRecordSerializationSchema.builder() 76 | .set_topic("filtered-orders") 77 | .set_value_serialization_schema(SimpleStringSchema()) 78 | .build() 79 | ) \ 80 | .build() 81 | 82 | filtered_stream.sink_to(kafka_sink) 83 | 84 | env.execute("Orders stream processing") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/README.md: -------------------------------------------------------------------------------- 1 | ## Create and Activate a Virtual Environment 2 | 3 | Create a virtual environment using a supported Python version: 4 | 5 | ```sh 6 | python -m venv venv 7 | ``` 8 | 9 | Activate the virtual environment: 10 | 11 | ```sh 12 | source venv/bin/activate 13 | ``` 14 | 15 | Check the Python version to confirm you're using the right one: 16 | 17 | ```sh 18 | python --version 19 | ``` 20 | 21 | --- 22 | 23 | ## Install Apache Airflow 24 | 25 | Airflow must be installed with a constraints file to ensure compatible dependencies. 26 | 27 | ```sh 28 | pip install 'apache-airflow==2.10.4' \ 29 | --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.12.txt" 30 | ``` 31 | 32 | --- 33 | 34 | ## Initialize the Metadata Database 35 | 36 | Set up the Airflow database: 37 | 38 | ```sh 39 | airflow db migrate 40 | ``` 41 | 42 | --- 43 | 44 | ## Create an Admin User 45 | 46 | Create a user account for logging into the Airflow web UI: 47 | 48 | ```sh 49 | airflow users create \ 50 | --username admin \ 51 | --firstname John \ 52 | --lastname Doe \ 53 | --role Admin \ 54 | --email admin@example.com \ 55 | --password admin 56 | ``` 57 | 58 | --- 59 | 60 | ## Configure DAGs Folder and Disable Example DAGs 61 | 62 | Find the Airflow home directory: 63 | 64 | ```sh 65 | airflow info 66 | ``` 67 | 68 | Locate the `airflow_home` value in the output. 
The config file is at: 69 | 70 | ``` 71 | /airflow.cfg 72 | ``` 73 | 74 | Get your current working directory (where you will store DAGs): 75 | 76 | ```sh 77 | pwd 78 | ``` 79 | 80 | Edit the configuration file: 81 | 82 | ```sh 83 | vim /airflow.cfg 84 | ``` 85 | 86 | Update the following settings: 87 | 88 | ```ini 89 | dags_folder = /your/current/directory 90 | load_examples = False 91 | ``` 92 | 93 | Save and exit. 94 | 95 | --- 96 | 97 | ## Start Airflow Services 98 | 99 | Start the Airflow webserver on port 8080: 100 | 101 | ```sh 102 | airflow webserver --port 8080 103 | ``` 104 | 105 | In a new terminal with the virtual environment activated, start the scheduler: 106 | 107 | ```sh 108 | airflow scheduler 109 | ``` 110 | 111 | --- 112 | 113 | ## Access the Web UI 114 | 115 | Open a browser and go to: 116 | 117 | ``` 118 | http://localhost:8080 119 | ``` 120 | 121 | Login credentials: 122 | 123 | - **Username:** `admin` 124 | - **Password:** `admin` 125 | 126 | --- 127 | 128 | ## Clean Up 129 | 130 | To stop Airflow and exit the virtual environment: 131 | 132 | 1. Press `Ctrl+C` in both terminal windows to stop the webserver and scheduler. 133 | 2. Deactivate the virtual environment: 134 | 135 | ```sh 136 | deactivate 137 | ``` 138 | 139 | Airflow is now installed and ready to use. -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/flink-app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import StreamExecutionEnvironment 8 | from pyflink.datastream.connectors.kafka import KafkaSink, KafkaSource, KafkaRecordSerializationSchema 9 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 10 | 11 | 12 | @dataclass 13 | class Payment: 14 | payment_id: str 15 | user_id: str 16 | merchant_id: str 17 | amount: float 18 | currency: str 19 | payment_time: str 20 | 21 | 22 | def parse_payment(json_str): 23 | data = json.loads(json_str) 24 | return Payment( 25 | payment_id=data.get("payment_id", "unknown"), 26 | user_id=data.get("user_id", "unknown"), 27 | merchant_id=data.get("merchant_id", "unknown"), 28 | amount=float(data.get("amount", 0.0)), 29 | currency=data.get("currency", "unknown"), 30 | payment_time=data.get("payment_time", "unknown") 31 | ) 32 | 33 | 34 | def filter_high_amount(payment): 35 | return payment.amount > 500 36 | 37 | 38 | def convert_payment(payment): 39 | simplified = { 40 | "payment_id": payment.payment_id, 41 | "amount": payment.amount, 42 | } 43 | return json.dumps(simplified) 44 | 45 | 46 | def main(): 47 | env = StreamExecutionEnvironment.get_execution_environment() 48 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 49 | 50 | kafka_source = KafkaSource.builder() \ 51 | .set_bootstrap_servers("localhost:9092") \ 52 | .set_topics("payments") \ 53 | .set_group_id("flink-consumer-group") \ 54 | .set_value_only_deserializer(SimpleStringSchema()) \ 55 | .build() 56 | 57 | payments_stream = env.from_source( 58 | kafka_source, 59 | watermark_strategy=WatermarkStrategy.no_watermarks(), 60 | source_name="kafka_source" 61 | ) 62 | 63 | filtered_stream = payments_stream \ 64 | .map(parse_payment) \ 65 | .filter(filter_high_amount) \ 66 | .map(convert_payment, 
Types.STRING()) 67 | 68 | filtered_stream.print("FilteredStream") 69 | 70 | kafka_sink = KafkaSink.builder() \ 71 | .set_bootstrap_servers("localhost:9092") \ 72 | .set_record_serializer( 73 | KafkaRecordSerializationSchema.builder() 74 | .set_topic("filtered-payments") 75 | .set_value_serialization_schema(SimpleStringSchema()) 76 | .build() 77 | ) \ 78 | .build() 79 | 80 | filtered_stream.sink_to(kafka_sink) 81 | 82 | env.execute("Payments stream processing") 83 | 84 | 85 | if __name__ == "__main__": 86 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 0. Stop all Docker containers 4 | 5 | Before you start, you would need to stop Docker containers related to this bootcamp 6 | running on your machine. 7 | 8 | 9 | # 1. Start Kafka 10 | 11 | First, start Kafka using Docker Compose: 12 | 13 | ```sh 14 | docker-compose up 15 | ``` 16 | 17 | 18 | ## 2. Create a Python Virtual Environment 19 | 20 | Run the following commands to create a virtual environment and install dependencies: 21 | 22 | ```bash 23 | python -m venv venv 24 | source ./venv/bin/activate 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 29 | ## 3. Download the Flink Kafka Connector 30 | 31 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 32 | 33 | ```bash 34 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 35 | ``` 36 | 37 | The downloaded `.jar` file should be placed in the current directory with the `flink-app.py` file 38 | 39 | 40 | ## 4. Configure the `flink` Command in the Virtual Environment 41 | 42 | Ensure the `flink` CLI is available in the virtual environment. 43 | 44 | ```bash 45 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 46 | ``` 47 | 48 | Run `flink` without arguments to check if it was installed correctly 49 | 50 | ```bash 51 | flink --version 52 | ``` 53 | 54 | NOTE: You would need to set the `PATH` variable every time to activate a virtual environment. 55 | 56 | 57 | ## 5. Implement the Flink Application 58 | 59 | Implement the Flink application in the `flink-app.py` follow the instructions in the `TODO` comments. 60 | 61 | 62 | ## 6. Start the Producer 63 | 64 | In the virtual environment run the producer script to generate the payments data: 65 | 66 | ```bash 67 | python payments-producer.py 68 | ``` 69 | 70 | This will produce random payments data to the `payments` Kafka topic. 71 | 72 | 73 | ## 7. (Optional) Verify Producer Output 74 | 75 | To double-check that everything is working as expected, run Kafka CLI tools to check that the producer is writing data to the `payments` topic: 76 | 77 | ```bash 78 | kafka-console-consumer --topic payments \ 79 | --bootstrap-server localhost:9092 \ 80 | --from-beginning 81 | ``` 82 | 83 | Make sure that you see payments data and that it is written every second. 84 | 85 | 86 | ## 8. Run the Flink Application 87 | 88 | Run the Flink application to process the data. Make sure the Kafka connector JAR is included in the classpath. 89 | 90 | ```bash 91 | flink run \ 92 | --python flink-app.py \ 93 | --target local \ 94 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 95 | ``` 96 | 97 | 98 | ## 9. 
Verify Flink Output 99 | 100 | Run the Flink application to process the payments data: 101 | 102 | ```bash 103 | kafka-console-consumer --topic filtered-payments \ 104 | --bootstrap-server localhost:9092 \ 105 | --from-beginning 106 | ``` 107 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 0. Stop all Docker containers 4 | 5 | Before you start, you would need to stop Docker containers related to this bootcamp 6 | running on your machine. 7 | 8 | 9 | # 1. Start Kafka 10 | 11 | First, start Kafka using Docker Compose: 12 | 13 | ```sh 14 | docker-compose up 15 | ``` 16 | 17 | 18 | ## 2. Create a Python Virtual Environment 19 | 20 | Run the following commands to create a virtual environment and install dependencies: 21 | 22 | ```bash 23 | python3.11 -m venv venv 24 | source ./venv/bin/activate 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 29 | ## 3. Download the Flink Kafka Connector 30 | 31 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 32 | 33 | ```bash 34 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 35 | ``` 36 | 37 | The downloaded `.jar` file should be placed in the current directory with the `flink-app.py` file 38 | 39 | 40 | ## 4. Configure the `flink` Command in the Virtual Environment 41 | 42 | Ensure the `flink` CLI is available in the virtual environment. 43 | 44 | ```bash 45 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 46 | ``` 47 | 48 | Run `flink` without arguments to check if it was installed correctly 49 | 50 | ```bash 51 | flink --version 52 | ``` 53 | 54 | NOTE: You would need to set the `PATH` variable every time to activate a virtual environment. 55 | 56 | 57 | ## 5. Implement the Flink Application 58 | 59 | Implement the Flink application in the `flink-app.py` follow the instructions in the `TODO` comments. 60 | 61 | 62 | ## 6. Start the Producer 63 | 64 | In the virtual environment run the producer script to generate the payments data: 65 | 66 | ```bash 67 | python payments-producer.py 68 | ``` 69 | 70 | This will produce random payments data to the `payments` Kafka topic. 71 | 72 | 73 | ## 7. (Optional) Verify Producer Output 74 | 75 | To double-check that everything is working as expected, run Kafka CLI tools to check that the producer is writing data to the `payments` topic: 76 | 77 | ```bash 78 | kafka-console-consumer --topic payments \ 79 | --bootstrap-server localhost:9092 \ 80 | --from-beginning 81 | ``` 82 | 83 | Make sure that you see payments data and that it is written every second. 84 | 85 | 86 | ## 8. Run the Flink Application 87 | 88 | Run the Flink application to process the data. Make sure the Kafka connector JAR is included in the classpath. 89 | 90 | ```bash 91 | flink run \ 92 | --python flink-app.py \ 93 | --target local \ 94 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 95 | ``` 96 | 97 | 98 | ## 9. 
Verify Flink Output 99 | 100 | Run the Flink application to process the payments data: 101 | 102 | ```bash 103 | kafka-console-consumer --topic filtered-payments \ 104 | --bootstrap-server localhost:9092 \ 105 | --from-beginning 106 | ``` 107 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/03-bookings_per_listing.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 4 | from datetime import datetime 5 | import os 6 | import csv 7 | import random 8 | 9 | 10 | @dag( 11 | "bookings_spark_pipeline", 12 | start_date=datetime(2025, 1, 1), 13 | schedule_interval="* * * * *", 14 | catchup=False, 15 | description="", 16 | ) 17 | def bookings_spark_pipeline(): 18 | 19 | @task 20 | def generate_bookings(): 21 | context = get_current_context() 22 | execution_date = context["execution_date"] 23 | 24 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 25 | file_path = f"/tmp/data/bookings/{file_date}/bookings.csv" 26 | 27 | num_bookings = random.randint(30, 50) 28 | bookings = [] 29 | for i in range(num_bookings): 30 | booking = { 31 | "booking_id": random.randint(1000, 5000), 32 | "listing_id": random.choice([13913, 17402, 24328, 33332, 116268, 117203, 127652, 127860]), 33 | "user_id": random.randint(1000, 5000), 34 | "booking_time": execution_date.strftime("%Y-%m-%d %H:%M:%S"), 35 | "status": random.choice(["confirmed", "cancelled", "pending"]) 36 | } 37 | bookings.append(booking) 38 | 39 | directory = os.path.dirname(file_path) 40 | if not os.path.exists(directory): 41 | os.makedirs(directory) 42 | 43 | fieldnames = ["booking_id", "listing_id", "user_id", "booking_time", "status"] 44 | 45 | with open(file_path, "w", newline="") as csvfile: 46 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 47 | writer.writeheader() 48 | for booking in bookings: 49 | writer.writerow({ 50 | "booking_id": booking["booking_id"], 51 | "listing_id": booking["listing_id"], 52 | "user_id": booking["user_id"], 53 | "booking_time": booking["booking_time"], 54 | "status": booking["status"] 55 | }) 56 | 57 | print(f"Generated bookings data written to {file_path}") 58 | 59 | spark_job = SparkSubmitOperator( 60 | task_id="process_listings_and_bookings", 61 | application="bookings_per_listing_spark.py", 62 | name="listings_bookings_join", 63 | application_args=[ 64 | "--listings_file", "/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 65 | "--bookings_file", "/tmp/data/bookings/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}/bookings.csv", 66 | "--output_path", "/tmp/data/bookings_per_listing/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}" 67 | ], 68 | conn_id='spark_booking', 69 | ) 70 | 71 | bookings_file = generate_bookings() 72 | bookings_file >> spark_job 73 | 74 | dag_instance = bookings_spark_pipeline() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/README.md: -------------------------------------------------------------------------------- 1 | 2 | This is a README for how to set up your environment to work on exercises for this section 3 | 4 | # 1. 
Create a Python virtual environment 5 | 6 | To create a Python virtual environment, run the following command: 7 | 8 | ```sh 9 | python -m venv venv 10 | ``` 11 | 12 | Note that you might need to run it with a specific Python version to make sure it is compatible with the Airflow version you are using. 13 | 14 | ```sh 15 | python3.12 -m venv venv 16 | ``` 17 | 18 | # 2. Activate a Python virtual environment 19 | 20 | To activate the virtual environment, run the following command: 21 | 22 | ```sh 23 | source venv/bin/activate 24 | ``` 25 | 26 | # 3. Install Airflow 27 | 28 | Install Airflow using these commands: 29 | 30 | ```sh 31 | AIRFLOW_VERSION="2.10.4" 32 | # Set this variable to your Python version 33 | PYTHON_VERSION="3.12" 34 | 35 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" 36 | 37 | pip install "apache-airflow==${AIRFLOW_VERSION}" \ 38 | --constraint "${CONSTRAINT_URL}" 39 | ``` 40 | 41 | Not all Airflow versions are compatible with all Python versions. 42 | 43 | You can find a list of compatible versions on this page: https://pypi.org/project/apache-airflow/ 44 | 45 | # 4. Create an Airflow database 46 | 47 | Run this command to create the Airflow database: 48 | 49 | ```sh 50 | airflow db migrate 51 | ``` 52 | 53 | # 5. Create a folder for DAGs 54 | 55 | Create a folder for the DAGs you will implement: 56 | 57 | ```sh 58 | mkdir dags 59 | ``` 60 | 61 | # 6. Update Airflow configuration 62 | 63 | To update the Airflow configuration, first get the location of the Airflow home directory by running the following command: 64 | 65 | ```sh 66 | airflow info 67 | ``` 68 | 69 | Then edit the Airflow configuration file in that directory: 70 | 71 | ```sh 72 | vim <airflow_home>/airflow.cfg 73 | ``` 74 | 75 | In this configuration file you need to change two values: 76 | 77 | * `dags_folder` to the path to the `dags` folder you've just created 78 | * `load_examples` to `False` 79 | 80 | 81 | # 7. Create an Airflow user 82 | 83 | ```sh 84 | airflow users create \ 85 | --username admin \ 86 | --firstname <first-name> \ 87 | --lastname <last-name> \ 88 | --role Admin \ 89 | --email admin@example.com \ 90 | --password admin 91 | ``` 92 | 93 | 94 | # 8. Start a web server for Airflow 95 | 96 | To start a web server, run the following command: 97 | 98 | ```sh 99 | airflow webserver --port 8080 100 | ``` 101 | 102 | # 9. Start a scheduler 103 | 104 | In a different terminal session, activate the virtual environment in the same folder and start an Airflow scheduler: 105 | 106 | ```sh 107 | source venv/bin/activate 108 | airflow scheduler 109 | ``` 110 | 111 | # 10. Check if Airflow is working 112 | 113 | You should now be able to go to `localhost:8080` and log into the Airflow UI. Use the username and password you selected in step `7`. 
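If you want to verify the setup end to end, you could also drop a minimal DAG into the `dags` folder and check that it shows up in the UI and runs when triggered manually. This is only a sketch in the same TaskFlow style used elsewhere in this repository; the file name and DAG id (`hello_airflow`) are arbitrary examples:

```python
# dags/hello_airflow.py -- minimal sanity-check DAG (names here are arbitrary examples)
from datetime import datetime

from airflow.decorators import dag, task


@dag(
    "hello_airflow",
    start_date=datetime(2025, 1, 1),
    schedule_interval=None,  # trigger it manually from the UI
    catchup=False,
    description="Smoke test for the local Airflow setup",
)
def hello_airflow():

    @task
    def say_hello():
        print("Airflow is up and running!")

    say_hello()


dag_instance = hello_airflow()
```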
114 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-exercise/data_validation_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from datetime import datetime 4 | import os 5 | import json 6 | import random 7 | 8 | 9 | # TODO: Use the @dag decorator to create a DAG that: 10 | # * Runs every minute 11 | # * Does not use catchup 12 | def data_quality_pipeline(): 13 | 14 | CORRECT_PROB = 0.7 15 | 16 | def get_bookings_path(context): 17 | execution_date = context["execution_date"] 18 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 19 | return f"/tmp/data/bookings/{file_date}/bookings.json" 20 | 21 | def generate_booking_id(i): 22 | if random.random() < CORRECT_PROB: 23 | return i + 1 24 | 25 | return "" 26 | 27 | def generate_listing_id(): 28 | if random.random() < CORRECT_PROB: 29 | return random.choice([1, 2, 3, 4, 5]) 30 | 31 | return "" 32 | 33 | def generate_user_id(correct_prob=0.7): 34 | return random.randint(1000, 5000) if random.random() < correct_prob else "" 35 | 36 | def generate_booking_time(execution_date): 37 | if random.random() < CORRECT_PROB: 38 | return execution_date.strftime('%Y-%m-%d %H:%M:%S') 39 | 40 | return "" 41 | 42 | def generate_status(): 43 | if random.random() < CORRECT_PROB: 44 | return random.choice(["confirmed", "pending", "cancelled"]) 45 | 46 | return random.choice(["unknown", "", "error"]) 47 | 48 | @task 49 | def generate_bookings(): 50 | context = get_current_context() 51 | booking_path = get_bookings_path(context) 52 | 53 | num_bookings = random.randint(5, 15) 54 | bookings = [] 55 | 56 | for i in range(num_bookings): 57 | booking = { 58 | "booking_id": generate_booking_id(i), 59 | "listing_id": generate_listing_id(), 60 | "user_id": generate_user_id(), 61 | "booking_time": generate_booking_time(context["execution_date"]), 62 | "status": generate_status() 63 | } 64 | bookings.append(booking) 65 | 66 | directory = os.path.dirname(booking_path) 67 | if not os.path.exists(directory): 68 | os.makedirs(directory) 69 | 70 | with open(booking_path, "w") as f: 71 | json.dump(bookings, f, indent=4) 72 | 73 | print(f"Written to file: {booking_path}") 74 | 75 | # TODO: Create a data quality check task that reads bookings data and validates every record. 76 | # For every invalid record it should return a validation record that includes: 77 | # * A record position in an input file 78 | # * A list of identified violations 79 | # 80 | # Here is a list of validations it should perform: 81 | # * Check if each of the fields is missing 82 | # * Check if the "status" field has one of the valid values 83 | # 84 | # It should write all found anomalies into an input file. 
85 | 86 | # TODO: Define dependencies between tasks 87 | 88 | # TODO: Create an instance of the DAG 89 | -------------------------------------------------------------------------------- /08-flink-stream-processing/05-local-state.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import ( 8 | StreamExecutionEnvironment, KeyedProcessFunction, RuntimeContext 9 | ) 10 | from pyflink.datastream.connectors.kafka import KafkaSource 11 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 12 | from pyflink.datastream.state import ValueState, ValueStateDescriptor 13 | 14 | 15 | TIER_1_THRESHOLD = 300.0 16 | TIER_2_THRESHOLD = 1000.0 17 | 18 | 19 | @dataclass 20 | class Order: 21 | order_id: str 22 | customer_id: str 23 | product_id: str 24 | quantity: int 25 | price: float 26 | order_time: str 27 | 28 | 29 | def parse_order(json_str) -> Order: 30 | data = json.loads(json_str) 31 | return Order( 32 | order_id=data.get("order_id", "unknown"), 33 | customer_id=data.get("customer_id", "unknown"), 34 | product_id=data.get("product_id", "unknown"), 35 | quantity=int(data.get("quantity", 0)), 36 | price=float(data.get("price", 0.0)), 37 | order_time=data.get("order_time", "unknown") 38 | ) 39 | 40 | 41 | class LoyaltyTierFunction(KeyedProcessFunction): 42 | 43 | def open(self, runtime_context): 44 | spend_desc = ValueStateDescriptor("total_spend", Types.DOUBLE()) 45 | self.total_spend_state = runtime_context.get_state(spend_desc) 46 | 47 | def process_element(self, order, ctx): 48 | current_spend = self.total_spend_state.value() or 0 49 | 50 | order_total = order.price * order.quantity 51 | new_total_spend = current_spend + order_total 52 | self.total_spend_state.update(new_total_spend) 53 | 54 | if new_total_spend >= TIER_1_THRESHOLD: 55 | yield json.dumps({ 56 | "customer_id": order.customer_id, 57 | "total_spend": new_total_spend, 58 | "tier": 1 59 | }) 60 | 61 | if new_total_spend >= TIER_2_THRESHOLD: 62 | yield json.dumps({ 63 | "customer_id": order.customer_id, 64 | "total_spend": new_total_spend, 65 | "tier": 2 66 | }) 67 | 68 | 69 | def main(): 70 | env = StreamExecutionEnvironment.get_execution_environment() 71 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 72 | 73 | kafka_source = KafkaSource.builder() \ 74 | .set_bootstrap_servers("localhost:9092") \ 75 | .set_topics("orders") \ 76 | .set_group_id("customers-loyalty-tiers") \ 77 | .set_value_only_deserializer(SimpleStringSchema()) \ 78 | .build() 79 | 80 | orders_stream = env.from_source( 81 | source=kafka_source, 82 | watermark_strategy=WatermarkStrategy.no_watermarks(), 83 | source_name="kafka_source" 84 | ) 85 | 86 | loyalty_stream = orders_stream \ 87 | .map(parse_order) \ 88 | .key_by(lambda o: o.customer_id) \ 89 | .process(LoyaltyTierFunction(), Types.STRING()) 90 | 91 | loyalty_stream.print("LoyaltyTierEvent") 92 | 93 | env.execute("Loyalty Tier Tracking") 94 | 95 | if __name__ == "__main__": 96 | main() -------------------------------------------------------------------------------- /02-data-processing-with-spark/01-test-pyspark-app.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": 
"c219df52-cb8d-4431-831a-3751a69062f2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"First Spark application\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "b0b346e6-5cc9-41a7-8008-2a640bece180", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "ec00f92d-c582-4970-9617-ff0a9852cc45", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "data = [\n", 33 | " {\"userId\": 1, \"paymentAmount\": 100.0, \"date\": \"2025-01-01\"},\n", 34 | " {\"userId\": 2, \"paymentAmount\": 150.5, \"date\": \"2025-01-02\"},\n", 35 | " {\"userId\": 3, \"paymentAmount\": 200.75, \"date\": \"2025-01-03\"},\n", 36 | " {\"userId\": 2, \"paymentAmount\": 50.25, \"date\": \"2025-01-04\"},\n", 37 | " {\"userId\": 1, \"paymentAmount\": 80.0, \"date\": \"2025-01-05\"}\n", 38 | "]\n", 39 | "\n", 40 | "df = spark.createDataFrame(data)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "070d0281-927f-430c-a522-14e93ae1d399", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "df.show()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "5ed8ce14-7f1e-4a27-a5f8-e0f8f2aba424", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df.count()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "3acb5930-966d-4c86-96b1-0d36e52c499c", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "data = data_values = [\n", 71 | " (1, 100.0, \"2025-01-01\"),\n", 72 | " (2, 150.5, \"2025-01-02\"),\n", 73 | " (3, 200.75, \"2025-01-03\"),\n", 74 | " (2, 50.25, \"2025-01-04\"),\n", 75 | " (1, 80.0, \"2025-01-05\")\n", 76 | "]\n", 77 | "\n", 78 | "df = spark.createDataFrame(data, [\"userId\", \"amount\", \"date\"])\n", 79 | "df.show(3)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "888929b6-e789-456d-a638-a93c88d85814", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "spark.stop()" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3 (ipykernel)", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.12.7" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 5 114 | } 115 | -------------------------------------------------------------------------------- /08-flink-stream-processing/03-windows-aggregation.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | 5 | from pyflink.common import Time 6 | from pyflink.common.typeinfo import Types 7 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 8 | from pyflink.datastream.window import TumblingProcessingTimeWindows 9 | 10 | from pyflink.common.serialization import SimpleStringSchema 11 | from pyflink.common.typeinfo import Types 12 | from pyflink.datastream.connectors.kafka import KafkaSource 13 | from 
pyflink.datastream.execution_mode import RuntimeExecutionMode 14 | from pyflink.common.watermark_strategy import WatermarkStrategy 15 | from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction 16 | 17 | 18 | @dataclass 19 | class Order: 20 | order_id: str 21 | customer_id: str 22 | product_id: str 23 | quantity: int 24 | price: float 25 | order_time: str 26 | 27 | 28 | def parse_order(json_str) -> Order: 29 | data = json.loads(json_str) 30 | return Order( 31 | order_id=data.get("order_id", "unknown"), 32 | customer_id=data.get("customer_id", "unknown"), 33 | product_id=data.get("product_id", "unknown"), 34 | quantity=data.get("quantity", 0), 35 | price=float(data.get("price", 0.0)), 36 | order_time=data.get("order_time", "unknown") 37 | ) 38 | 39 | 40 | class AggregateWindowFunction(ProcessWindowFunction): 41 | def process(self, 42 | key, 43 | context, 44 | elements): 45 | 46 | total_quantity = 0 47 | total_sum = 0 48 | 49 | for input in elements: 50 | total_quantity += input.quantity 51 | total_sum += input.quantity * input.price 52 | 53 | result = { 54 | "product_id": key, 55 | "total_quantity": total_quantity, 56 | "total_spent": round(total_sum, 2), 57 | "window_start": datetime.utcfromtimestamp( 58 | context.window().start / 1000 59 | ).isoformat(), 60 | "window_end": datetime.utcfromtimestamp( 61 | context.window().end / 1000 62 | ).isoformat(), 63 | } 64 | return [json.dumps(result)] 65 | 66 | 67 | def main(): 68 | env = StreamExecutionEnvironment.get_execution_environment() 69 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 70 | 71 | kafka_source = KafkaSource.builder() \ 72 | .set_bootstrap_servers("localhost:9092") \ 73 | .set_topics("orders") \ 74 | .set_group_id("flink-window-aggregation-group") \ 75 | .set_value_only_deserializer(SimpleStringSchema()) \ 76 | .build() 77 | 78 | 79 | orders_stream = env.from_source( 80 | kafka_source, 81 | watermark_strategy=WatermarkStrategy.no_watermarks(), 82 | source_name="kafka_source" 83 | ) 84 | 85 | windowed_stream = orders_stream \ 86 | .map(parse_order) \ 87 | .key_by(lambda x: x.product_id) \ 88 | .window(TumblingProcessingTimeWindows.of(Time.seconds(30))) \ 89 | .process(AggregateWindowFunction(), 90 | Types.STRING()) 91 | 92 | windowed_stream.print() 93 | 94 | env.execute("Window-based aggregation") 95 | 96 | 97 | if __name__ == "__main__": 98 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/04-bookings_per_listing_with_sensor.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 4 | from airflow.sensors.filesystem import FileSensor 5 | from datetime import datetime 6 | import os 7 | import csv 8 | import random 9 | 10 | 11 | @dag( 12 | "bookings_spark_pipeline", 13 | start_date=datetime(2025, 1, 1), 14 | schedule_interval="* * * * *", 15 | catchup=False, 16 | description="", 17 | ) 18 | def bookings_spark_pipeline(): 19 | 20 | @task 21 | def generate_bookings(): 22 | context = get_current_context() 23 | execution_date = context["execution_date"] 24 | 25 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 26 | file_path = f"/tmp/data/bookings/{file_date}/bookings.csv" 27 | 28 | num_bookings = random.randint(30, 50) 29 | bookings = [] 30 | for i in range(num_bookings): 31 | booking = { 32 | 
"booking_id": random.randint(1000, 5000), 33 | "listing_id": random.choice([13913, 17402, 24328, 33332, 116268, 117203, 127652, 127860]), 34 | "user_id": random.randint(1000, 5000), 35 | "booking_time": execution_date.strftime("%Y-%m-%d %H:%M:%S"), 36 | "status": random.choice(["confirmed", "cancelled", "pending"]) 37 | } 38 | bookings.append(booking) 39 | 40 | directory = os.path.dirname(file_path) 41 | if not os.path.exists(directory): 42 | os.makedirs(directory) 43 | 44 | fieldnames = ["booking_id", "listing_id", "user_id", "booking_time", "status"] 45 | 46 | with open(file_path, "w", newline="") as csvfile: 47 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 48 | writer.writeheader() 49 | for booking in bookings: 50 | writer.writerow({ 51 | "booking_id": booking["booking_id"], 52 | "listing_id": booking["listing_id"], 53 | "user_id": booking["user_id"], 54 | "booking_time": booking["booking_time"], 55 | "status": booking["status"] 56 | }) 57 | 58 | print(f"Generated bookings data written to {file_path}") 59 | 60 | wait_for_listings_file = FileSensor( 61 | task_id="wait_for_listings_file", 62 | fs_conn_id="local_fs", 63 | filepath="/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 64 | poke_interval=30, 65 | timeout=600, 66 | ) 67 | 68 | spark_job = SparkSubmitOperator( 69 | task_id="process_listings_and_bookings", 70 | application="bookings_per_listing_spark.py", 71 | name="listings_bookings_join", 72 | application_args=[ 73 | "--listings_file", "/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 74 | "--bookings_file", "/tmp/data/bookings/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}/bookings.csv", 75 | "--output_path", "/tmp/data/bookings_per_listing/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}" 76 | ], 77 | conn_id='spark_booking', 78 | ) 79 | 80 | bookings_file = generate_bookings() 81 | bookings_file >> spark_job 82 | wait_for_listings_file >> spark_job 83 | 84 | dag_instance = bookings_spark_pipeline() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/flink-app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.datastream import StreamExecutionEnvironment 7 | from pyflink.datastream.connectors.kafka import KafkaSource 8 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 9 | from pyflink.common.watermark_strategy import WatermarkStrategy 10 | from pyflink.common import Time 11 | from pyflink.datastream.window import TumblingProcessingTimeWindows 12 | from pyflink.datastream.state import ValueStateDescriptor 13 | from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction 14 | 15 | 16 | @dataclass 17 | class Payment: 18 | payment_id: str 19 | user_id: str 20 | merchant_id: str 21 | amount: float 22 | payment_time: str 23 | 24 | 25 | def parse_payment(json_str: str) -> Payment: 26 | data = json.loads(json_str) 27 | return Payment( 28 | payment_id=data.get("payment_id", "unknown"), 29 | user_id=data.get("user_id", "unknown"), 30 | merchant_id=data.get("merchant_id", "unknown"), 31 | amount=float(data.get("amount", 0.0)), 32 | payment_time=data.get("payment_time", "unknown") 33 | ) 34 | 35 | 36 | class PaymentsAnomaliesDetector(ProcessWindowFunction): 37 | 38 | def 
open(self, runtime_context): 39 | # TODO: Define state for the total number of payments from a merchant 40 | self.total_count = None 41 | # TODO: Define state for the sum of all payment amounts from a merchant 42 | self.total_amount = None 43 | 44 | def process(self, 45 | key, 46 | context, 47 | elements): 48 | current_total_count = self.total_count.value() or 0 49 | current_total_amount = self.total_amount.value() or 0 50 | 51 | window_total = 0 52 | window_count = 0 53 | 54 | for input in elements: 55 | # TODO: Compute window_total and window_count using elements in the window 56 | 57 | if current_total_count > 0: 58 | # TODO: Compute average payment amount using values from the local state 59 | current_average = None 60 | # TODO: Compute average payment amount for the current window 61 | window_average = None 62 | 63 | if window_average > 1.5 * current_average: 64 | # TODO: Emit a record about a detected anomaly 65 | 66 | # TODO: Update local state using data from the current window 67 | 68 | 69 | def main(): 70 | env = StreamExecutionEnvironment.get_execution_environment() 71 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 72 | 73 | kafka_source = KafkaSource.builder() \ 74 | .set_bootstrap_servers("localhost:9092") \ 75 | .set_topics("payments") \ 76 | .set_group_id("flink-consumer-group") \ 77 | .set_value_only_deserializer(SimpleStringSchema()) \ 78 | .build() 79 | 80 | payments_stream = env.from_source( 81 | kafka_source, 82 | watermark_strategy=WatermarkStrategy.no_watermarks(), 83 | source_name="kafka_source" 84 | ).map(parse_payment) 85 | 86 | 87 | anomalies_stream = payments_stream 88 | # TODO: Add stream processing steps for anomaly detection. 89 | # For each merchant, use PaymentsAnomaliesDetector on 90 | # 10 seconds tumbling windows 91 | 92 | anomalies_stream.print("DetectedAnomalies") 93 | 94 | env.execute("Payment anomalies detection") 95 | 96 | 97 | if __name__ == "__main__": 98 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/05-bookings_per_listing_with_postgres.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 4 | from airflow.sensors.filesystem import FileSensor 5 | from airflow.providers.postgres.hooks.postgres import PostgresHook 6 | from datetime import datetime, timedelta 7 | import os 8 | import csv 9 | 10 | 11 | @dag( 12 | "bookings_spark_pipeline", 13 | start_date=datetime(2025, 1, 1), 14 | schedule_interval="* * * * *", 15 | catchup=False, 16 | description="", 17 | ) 18 | def bookings_per_listing_with_sensor(): 19 | 20 | @task 21 | def read_bookings_from_postgres(): 22 | context = get_current_context() 23 | execution_date = context["execution_date"] 24 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 25 | 26 | file_path = f"/tmp/data/bookings/{file_date}/bookings.csv" 27 | 28 | start_of_minute = execution_date.replace(second=0, microsecond=0) 29 | end_of_minute = start_of_minute + timedelta(minutes=1) 30 | 31 | pg_hook = PostgresHook(postgres_conn_id="postgres_default") 32 | query = f""" 33 | SELECT booking_id, listing_id, user_id, booking_time, status 34 | FROM bookings 35 | WHERE booking_time >= '{start_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 36 | AND booking_time < '{end_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 37 | """ 38 | records = 
pg_hook.get_records(query) 39 | 40 | bookings = [] 41 | 42 | print(f"Read {len(records)} from Postgres") 43 | for record in records: 44 | booking = { 45 | "booking_id": record[0], 46 | "listing_id": record[1], 47 | "user_id": record[2], 48 | "booking_time": record[3].strftime('%Y-%m-%d %H:%M:%S'), 49 | "status": record[4] 50 | } 51 | bookings.append(booking) 52 | 53 | directory = os.path.dirname(file_path) 54 | if not os.path.exists(directory): 55 | os.makedirs(directory) 56 | 57 | fieldnames = ["booking_id", "listing_id", "user_id", "booking_time", "status"] 58 | 59 | with open(file_path, "w", newline="") as csvfile: 60 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 61 | writer.writeheader() 62 | for booking in bookings: 63 | writer.writerow({ 64 | "booking_id": booking["booking_id"], 65 | "listing_id": booking["listing_id"], 66 | "user_id": booking["user_id"], 67 | "booking_time": booking["booking_time"], 68 | "status": booking["status"] 69 | }) 70 | 71 | print(f"Generated bookings data written to {file_path}") 72 | 73 | wait_for_listings_file = FileSensor( 74 | task_id="wait_for_listings_file", 75 | fs_conn_id="local_fs", 76 | filepath="/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 77 | poke_interval=30, 78 | timeout=600, 79 | ) 80 | 81 | spark_job = SparkSubmitOperator( 82 | task_id="process_listings_and_bookings", 83 | application="bookings_per_listing_spark.py", 84 | name="listings_bookings_join", 85 | application_args=[ 86 | "--listings_file", "/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 87 | "--bookings_file", "/tmp/data/bookings/{{ execution_date.strftime('%Y-%m-%d_%H-%M') }}/bookings.csv", 88 | "--output_path", "/tmp/data/bookings_per_listing/{{ execution_date.strftime('%Y-%m-%d_%H-%M') }}" 89 | ], 90 | conn_id='spark_default', 91 | ) 92 | 93 | bookings_file = read_bookings_from_postgres() 94 | bookings_file >> spark_job 95 | wait_for_listings_file >> spark_job 96 | 97 | dag_instance = bookings_per_listing_with_sensor() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/flink-app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.datastream import StreamExecutionEnvironment 7 | from pyflink.datastream.connectors.kafka import KafkaSource 8 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 9 | from pyflink.common.watermark_strategy import WatermarkStrategy 10 | from pyflink.common import Time 11 | from pyflink.datastream.window import TumblingProcessingTimeWindows 12 | from pyflink.datastream.state import ValueStateDescriptor 13 | from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction 14 | 15 | 16 | @dataclass 17 | class Payment: 18 | payment_id: str 19 | user_id: str 20 | merchant_id: str 21 | amount: float 22 | payment_time: str 23 | 24 | 25 | def parse_payment(json_str: str) -> Payment: 26 | data = json.loads(json_str) 27 | return Payment( 28 | payment_id=data.get("payment_id", "unknown"), 29 | user_id=data.get("user_id", "unknown"), 30 | merchant_id=data.get("merchant_id", "unknown"), 31 | amount=float(data.get("amount", 0.0)), 32 | payment_time=data.get("payment_time", "unknown") 33 | ) 34 | 35 | 36 | class 
PaymentsAnomaliesDetector(ProcessWindowFunction): 37 | 38 | def open(self, runtime_context): 39 | self.total_count = runtime_context.get_state( 40 | ValueStateDescriptor("total_count", Types.LONG()) 41 | ) 42 | self.total_amount = runtime_context.get_state( 43 | ValueStateDescriptor("total_amount", Types.DOUBLE()) 44 | ) 45 | 46 | def process(self, 47 | key, 48 | context, 49 | elements): 50 | current_total_count = self.total_count.value() or 0 51 | current_total_amount = self.total_amount.value() or 0 52 | 53 | window_total = 0 54 | window_count = 0 55 | 56 | for input in elements: 57 | window_count += 1 58 | window_total += input.amount 59 | 60 | if current_total_count > 0: 61 | current_average = current_total_amount / current_total_count 62 | window_average = window_total / window_count 63 | 64 | if window_average > 1.5 * current_average: 65 | yield json.dumps({ 66 | "merchant_id": key, 67 | "running_average": current_average, 68 | "window_average": window_average, 69 | }) 70 | 71 | new_total_count = current_total_count + window_count 72 | new_total_amount = current_total_amount + window_total 73 | 74 | self.total_count.update(new_total_count) 75 | self.total_amount.update(new_total_amount) 76 | 77 | 78 | def main(): 79 | env = StreamExecutionEnvironment.get_execution_environment() 80 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 81 | 82 | kafka_source = KafkaSource.builder() \ 83 | .set_bootstrap_servers("localhost:9092") \ 84 | .set_topics("payments") \ 85 | .set_group_id("flink-consumer-group") \ 86 | .set_value_only_deserializer(SimpleStringSchema()) \ 87 | .build() 88 | 89 | payments_stream = env.from_source( 90 | kafka_source, 91 | watermark_strategy=WatermarkStrategy.no_watermarks(), 92 | source_name="kafka_source" 93 | ).map(parse_payment) 94 | 95 | anomalies_stream = payments_stream \ 96 | .key_by(lambda payment: payment.merchant_id) \ 97 | .window(TumblingProcessingTimeWindows.of(Time.seconds(10))) \ 98 | .process(PaymentsAnomaliesDetector(), output_type=Types.STRING()) 99 | 100 | anomalies_stream.print("DetectedAnomalies") 101 | 102 | env.execute("Payment anomalies detection") 103 | 104 | 105 | if __name__ == "__main__": 106 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 0. Stop all Docker containers 3 | 4 | Before you start, you would need to stop Docker containers related to this bootcamp 5 | running on your machine. 6 | 7 | # 1. Start Kafka 8 | 9 | First, start Kafka, Kafka Connect, and Postrges using Docker Compose: 10 | 11 | ```sh 12 | docker-compose up 13 | ``` 14 | 15 | 16 | # 2. Create a virtual environment and install dependencies 17 | 18 | Run the following commands to create a virtual environment and install dependencies: 19 | 20 | ```sh 21 | python3 -m venv venv 22 | source venv/bin/activate 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | # 3. Create a Debezium connector 27 | 28 | Once you have Kafka Connect running, you need to create a connector to read a stream of updates from Postgres. Run this command to create it: 29 | 30 | ```sh 31 | curl -X POST -H "Content-Type: application/json" -d @config_debezium.json http://localhost:8083/connectors 32 | ``` 33 | 34 | # 4. Connect to a database 35 | 36 | You will need to execute several SQL operations. 
To connect to a database, use the following arguments: 37 | 38 | * *URL* - `127.0.0.1:5432` 39 | * *Username* - `user` 40 | * *Password* - `password` 41 | * *Database* - `onlineshop` 42 | 43 | # 5. Create "orders" table 44 | 45 | First, you need to create the `orders` table using the following SQL statement: 46 | 47 | ```sql 48 | CREATE TABLE orders ( 49 | id SERIAL PRIMARY KEY, 50 | customer_id VARCHAR(50) NOT NULL, 51 | customer_name VARCHAR(100), 52 | customer_email VARCHAR(255), 53 | product_id VARCHAR(50), 54 | total_amount NUMERIC(10, 2), 55 | order_date TIMESTAMPTZ, 56 | status VARCHAR(50), 57 | payment_method VARCHAR(50) 58 | ); 59 | ``` 60 | 61 | 62 | # 6. Alter table 63 | 64 | By default, WAL records produced by Postgres will only contain a table's data after the update. To include a snapshot of the data before the update, we need to run the following SQL command: 65 | 66 | ```sql 67 | ALTER TABLE orders REPLICA IDENTITY FULL; 68 | ``` 69 | 70 | After this command, for every `UPDATE` or `DELETE` operation on the `orders` table, Postgres will log the entire row's data before and after the change in the Write-Ahead Log. 71 | 72 | # 7. Create a new order 73 | 74 | Once you have the table, you can insert a new order into the `orders` table using this SQL statement: 75 | 76 | ```sql 77 | INSERT INTO orders ( 78 | customer_id, 79 | customer_name, 80 | customer_email, 81 | product_id, 82 | total_amount, 83 | order_date, 84 | status, 85 | payment_method 86 | ) 87 | VALUES ( 88 | 'CUST-1234', 89 | 'John Smith', 90 | 'john.smith@example.com', 91 | 'PROD-XYZ789', 92 | 59.95, 93 | '2024-12-09T10:45:00Z', 94 | 'processed', 95 | 'paypal' 96 | ) 97 | RETURNING id; 98 | ``` 99 | 100 | This should return the `id` of the newly created record that you can use to perform an update operation. 101 | 102 | # 8. Update an order status 103 | 104 | Now, we can update the created record. You can do it using this command: 105 | 106 | ```sql 107 | UPDATE orders 108 | SET status = 'refunded' 109 | WHERE id = 1 110 | ``` 111 | 112 | Since it changes the `status` value from `processed` to `refunded`, it should produce a new change event that Kafka Connect will capture. 113 | 114 | # 9. Check if Kafka Connect writes records to Kafka 115 | 116 | Run the following command to test if Kafka Connect writes records to Kafka: 117 | 118 | ```sh 119 | kafka-console-consumer --bootstrap-server localhost:9092 --topic postgres-.public.orders --from-beginning 120 | ``` 121 | 122 | You should see two records: one for the `INSERT` operation and another one for the `UPDATE` operation. 123 | 124 | # 10. Implement and run your consumer and see if it works 125 | 126 | You should now implement and run your Python consumer. 127 | It should print a single message for the executed update operation. 128 | 129 | # 11. (Optional) Create more test records 130 | 131 | If you need more test records, you can repeat steps **7** and **8** again for a new record, but you would need to change the `id` comparison value in the `UPDATE` statement. 
Install Airflow provider 8 | 9 | Since your DAG will read data from Postgres and use Spark, you need to first install Postgres and Spark providers: 10 | 11 | ```sh 12 | pip install apache-airflow-providers-postgres 13 | pip install apache-airflow-providers-apache-spark 14 | ``` 15 | 16 | You need to run this command in the virtual environment you've created for exercises in this section. 17 | 18 | # 2. Start the Postgres database 19 | 20 | Start the Postgres instance from which your DAG will ingest data using the Docker Compose command: 21 | 22 | ```sh 23 | docker-compose up 24 | ``` 25 | 26 | # 3. Create a table in the Postgres database 27 | 28 | Having a database running we can create a table from which Airflow will ingest data. 29 | 30 | To connect to a database use the following parameters: 31 | 32 | * *Host* - `localhost` 33 | * *Database* - `rental_site` 34 | * *Login* - `user` 35 | * *Password* - `password` 36 | * *Port* - `5432` 37 | 38 | Then, execute this statement to create a database for this exercise: 39 | 40 | ```sql 41 | CREATE TABLE customer_reviews ( 42 | review_id SERIAL PRIMARY KEY, 43 | listing_id INT NOT NULL, 44 | review_score INT NOT NULL, 45 | review_comment TEXT, 46 | review_date TIMESTAMP NOT NULL DEFAULT NOW() 47 | ); 48 | ``` 49 | 50 | # 4. Create a Postgres connection in the Airflow UI 51 | 52 | Create a Postgres connection, so your DAG could use a Postgres hook. 53 | 54 | To do it run the following command from your virtual environment: 55 | 56 | ```sh 57 | airflow connections add 'postgres_rental_site' \ 58 | --conn-type 'postgres' \ 59 | --conn-host 'localhost' \ 60 | --conn-login 'user' \ 61 | --conn-password 'password' \ 62 | --conn-port '5432' \ 63 | --conn-schema 'rental_site' 64 | ``` 65 | 66 | # 5. Create a Spark connection in the Airflow UI 67 | 68 | Create a Spark connection, so your DAG could run Spark applications. 69 | 70 | To do it run the following command from your virtual environment: 71 | 72 | ```sh 73 | airflow connections add 'spark_rental_site' \ 74 | --conn-type 'spark' \ 75 | --conn-host 'local' \ 76 | --conn-extra '{"deploy_mode": "client"}' 77 | ``` 78 | 79 | 80 | # 6. Copy the DAG and the Spark code 81 | 82 | Copy the following files to the `dags` folder you've created while setting up Airflow locally: 83 | 84 | * `customer_reviews_dag.py` - Airflow DAGs implementing customer reviews processing 85 | * `spark_etl_reviews.py` - Spark job for processing customer reviews 86 | 87 | # 7. Restart the scheduler 88 | 89 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 90 | 91 | After this, start it again using the following command: 92 | 93 | ```sh 94 | airflow scheduler 95 | ``` 96 | 97 | # 8. Implement the TODOs in the code 98 | 99 | Now implement the TODO comments in the starter code. 100 | 101 | 102 | # 9. Start the DAG 103 | 104 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. 105 | 106 | # 10. Add some test reviews to test the created pipeline 107 | 108 | Now you can test your pipeline. 
102 | # 9. Start the DAG 103 | 104 | Once the DAG is implemented, you can start it by clicking its toggle in the Airflow UI. 105 | 106 | # 10. Add some test reviews to test the created pipeline 107 | 108 | Now you can test your pipeline. Add these reviews to the `customer_reviews` table: 109 | 110 | ```sql 111 | INSERT INTO customer_reviews (listing_id, review_score, review_comment, review_date) 112 | VALUES 113 | (101, 5, 'Excellent stay, highly recommend!', NOW()), 114 | (101, 5, 'Great location!', NOW()), 115 | (102, 4, 'Good location but a bit noisy.', NOW()), 116 | (102, 3, 'Poor room service.', NOW()), 117 | (103, 3, 'Could have been worse.', NOW()); 118 | ``` 119 | 120 | On its next run, your pipeline will read these reviews and compute an average score per listing ID. 121 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/README.md: -------------------------------------------------------------------------------- 1 | This is a README for the second exercise in this section. 2 | 3 | # 0. Follow the instructions in the README.md file in the "exercises" folder 4 | 5 | Before following steps in this README, follow the steps in the `README.md` file in the `exercises` folder to set up your local Airflow. 6 | 7 | # 1. Install the Airflow providers 8 | 9 | Since your DAG will read data from Postgres and use Spark, you first need to install the Postgres and Spark providers: 10 | 11 | ```sh 12 | pip install apache-airflow-providers-postgres 13 | pip install apache-airflow-providers-apache-spark 14 | ``` 15 | 16 | You need to run these commands in the virtual environment you've created for exercises in this section. 17 | 18 | # 2. Start the Postgres database 19 | 20 | Start the Postgres instance from which your DAG will ingest data using the Docker Compose command: 21 | 22 | ```sh 23 | docker-compose up 24 | ``` 25 | 26 | # 3. Create a table in the Postgres database 27 | 28 | With the database running, we can create the table from which Airflow will ingest data. 29 | 30 | To connect to the database, use the following parameters: 31 | 32 | * *Host* - `localhost` 33 | * *Database* - `rental_site` 34 | * *Login* - `user` 35 | * *Password* - `password` 36 | * *Port* - `5432` 37 | 38 | Then, execute this statement to create the table for this exercise: 39 | 40 | ```sql 41 | CREATE TABLE customer_reviews ( 42 | review_id SERIAL PRIMARY KEY, 43 | listing_id INT NOT NULL, 44 | review_score INT NOT NULL, 45 | review_comment TEXT, 46 | review_date TIMESTAMP NOT NULL DEFAULT NOW() 47 | ); 48 | ``` 49 | 50 | # 4. Create a Postgres connection in Airflow 51 | 52 | Create a Postgres connection so that your DAG can use a Postgres hook. 53 | 54 | To do this, run the following command from your virtual environment: 55 | 56 | ```sh 57 | airflow connections add 'postgres_rental_site' \ 58 | --conn-type 'postgres' \ 59 | --conn-host 'localhost' \ 60 | --conn-login 'user' \ 61 | --conn-password 'password' \ 62 | --conn-port '5432' \ 63 | --conn-schema 'rental_site' 64 | ``` 65 | 66 | # 5. Create a Spark connection in Airflow 67 | 68 | Create a Spark connection so that your DAG can run Spark applications. 69 | 70 | To do this, run the following command from your virtual environment: 71 | 72 | ```sh 73 | airflow connections add 'spark_rental_site' \ 74 | --conn-type 'spark' \ 75 | --conn-host 'local' \ 76 | --conn-extra '{"deploy_mode": "client"}' 77 | ``` 78 | 79 | 80 | # 6. 
Copy the DAG and the Spark code 81 | 82 | Copy the following files to the `dags` folder you've created while setting up Airflow locally: 83 | 84 | * `customer_reviews_dag.py` - Airflow DAGs implementing customer reviews processing 85 | * `spark_etl_reviews.py` - Spark job for processing customer reviews 86 | 87 | # 7. Restart the scheduler 88 | 89 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 90 | 91 | After this, start it again using the following command: 92 | 93 | ```sh 94 | airflow scheduler 95 | ``` 96 | 97 | # 8. Implement the TODOs in the code 98 | 99 | Now implement the TODO comments in the starter code. 100 | 101 | 102 | # 9. Start the DAG 103 | 104 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. 105 | 106 | # 10. Add some test reviews to test the created pipeline 107 | 108 | Now you can test your pipeline. Add these reviews to the `customer_reviews` table: 109 | 110 | ```sql 111 | INSERT INTO customer_reviews (listing_id, review_score, review_comment, review_date) 112 | VALUES 113 | (101, 5, 'Excellent stay, highly recommend!', NOW()), 114 | (101, 5, 'Great location!', NOW()), 115 | (102, 4, 'Good location but a bit noisy.', NOW()), 116 | (102, 3, 'Poor room service.', NOW()), 117 | (103, 3, 'Could have been worse.', NOW()); 118 | ``` 119 | 120 | At the next run your pipeline will read these reviews and compute an average score per listing ID. 121 | -------------------------------------------------------------------------------- /08-flink-stream-processing/04-late-events-processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | 5 | from pyflink.common import Time 6 | from pyflink.common.serialization import SimpleStringSchema 7 | from pyflink.common.watermark_strategy import WatermarkStrategy, TimestampAssigner 8 | from pyflink.common.time import Duration 9 | from pyflink.common.typeinfo import Types 10 | from pyflink.datastream import OutputTag, StreamExecutionEnvironment, ProcessWindowFunction, TimeCharacteristic 11 | from pyflink.datastream.connectors.kafka import KafkaSource 12 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 13 | from pyflink.datastream.window import TumblingEventTimeWindows 14 | 15 | @dataclass 16 | class Order: 17 | order_id: str 18 | customer_id: str 19 | product_id: str 20 | quantity: int 21 | price: float 22 | order_time: str 23 | 24 | 25 | def parse_order(json_str) -> Order: 26 | data = json.loads(json_str) 27 | order_time_seconds = datetime.fromisoformat(data["order_time"]) 28 | return Order( 29 | order_id=data.get("order_id", "unknown"), 30 | customer_id=data.get("customer_id", "unknown"), 31 | product_id=data.get("product_id", "unknown"), 32 | quantity=data.get("quantity", 0), 33 | price=float(data.get("price", 0.0)), 34 | order_time=data.get("order_time", "unknown") 35 | ) 36 | 37 | 38 | class OrderTimestampAssigner(TimestampAssigner): 39 | def extract_timestamp(self, value, record_timestamp) -> int: 40 | dt = datetime.fromisoformat(value.order_time) 41 | return int(dt.timestamp() * 1000) 42 | 43 | 44 | class AggregateWindowFunction(ProcessWindowFunction): 45 | def process(self, 46 | key, 47 | context, 48 | elements): 49 | 50 | total_quantity = 0 51 | total_sum = 0 52 | 53 | for input in elements: 54 | total_quantity += input.quantity 55 | total_sum += 
input.quantity * input.price 56 | 57 | result = { 58 | "product_id": key, 59 | "total_quantity": total_quantity, 60 | "total_spent": round(total_sum, 2), 61 | "window_start": datetime.utcfromtimestamp( 62 | context.window().start / 1000 63 | ).isoformat(), 64 | "window_end": datetime.utcfromtimestamp( 65 | context.window().end / 1000 66 | ).isoformat(), 67 | } 68 | return [json.dumps(result)] 69 | 70 | 71 | def main(): 72 | env = StreamExecutionEnvironment.get_execution_environment() 73 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 74 | env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 75 | 76 | kafka_source = KafkaSource.builder() \ 77 | .set_bootstrap_servers("localhost:9092") \ 78 | .set_topics("late-orders") \ 79 | .set_group_id("eventtime-demo") \ 80 | .set_value_only_deserializer(SimpleStringSchema()) \ 81 | .build() 82 | 83 | stream = env.from_source( 84 | kafka_source, 85 | watermark_strategy=WatermarkStrategy.no_watermarks(), 86 | source_name="kafka_source" 87 | ) 88 | 89 | watermark_strategy = WatermarkStrategy \ 90 | .for_bounded_out_of_orderness(Duration.of_seconds(10)) \ 91 | .with_timestamp_assigner(OrderTimestampAssigner()) 92 | 93 | late_tag = OutputTag("late-events", Types.PICKLED_BYTE_ARRAY()) 94 | 95 | windowed_stream = stream \ 96 | .map(parse_order) \ 97 | .assign_timestamps_and_watermarks(watermark_strategy) \ 98 | .key_by(lambda x: x.product_id) \ 99 | .window(TumblingEventTimeWindows.of(Time.seconds(30))) \ 100 | .side_output_late_data(late_tag) \ 101 | .process(AggregateWindowFunction(), Types.STRING()) 102 | 103 | windowed_stream.print("Aggregated") 104 | 105 | late_stream = windowed_stream.get_side_output(late_tag) 106 | late_stream.print("LateEvents") 107 | 108 | env.execute("Advanced Event-Time Window Demo") 109 | 110 | 111 | if __name__ == "__main__": 112 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/06-connecting-streams.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from pyflink.common.typeinfo import Types 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.watermark_strategy import WatermarkStrategy 6 | 7 | from pyflink.datastream import ( 8 | StreamExecutionEnvironment, 9 | RuntimeContext 10 | ) 11 | from pyflink.datastream.functions import CoProcessFunction 12 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 13 | from pyflink.datastream.state import ValueStateDescriptor 14 | from pyflink.datastream.connectors.kafka import KafkaSource 15 | 16 | 17 | @dataclass 18 | class Order: 19 | order_id: str 20 | customer_id: str 21 | product_id: str 22 | quantity: int 23 | price: float 24 | order_time: str 25 | 26 | @dataclass 27 | class Product: 28 | product_id: str 29 | name: str 30 | category: str 31 | 32 | 33 | def parse_order(line: str): 34 | data = json.loads(line) 35 | return Order( 36 | order_id=data.get("order_id", ""), 37 | customer_id=data.get("customer_id", ""), 38 | product_id=data.get("product_id", ""), 39 | quantity=int(data.get("quantity", 0)), 40 | price=float(data.get("price", 0.0)), 41 | order_time=data.get("order_time", "") 42 | ) 43 | 44 | def parse_product(line: str): 45 | data = json.loads(line) 46 | return Product( 47 | product_id=data.get("product_id", ""), 48 | name=data.get("name", "Unknown"), 49 | category=data.get("category", "Unknown") 50 | ) 51 | 52 | 53 | class 
OrdersProductsCoProcess(CoProcessFunction): 54 | 55 | def open(self, runtime_context: RuntimeContext): 56 | product_desc = ValueStateDescriptor("product_info", Types.PICKLED_BYTE_ARRAY()) 57 | self.product_state = runtime_context.get_state(product_desc) 58 | 59 | def process_element1(self, value, ctx): 60 | product = self.product_state.value() 61 | 62 | if product: 63 | enriched = { 64 | "order_id": value.order_id, 65 | "customer_id": value.customer_id, 66 | "product_id": value.product_id, 67 | "quantity": value.quantity, 68 | "price": value.price, 69 | "product_name": product.name, 70 | "product_category": product.category, 71 | } 72 | else: 73 | enriched = { 74 | "order_id": value.order_id, 75 | "customer_id": value.customer_id, 76 | "product_id": value.product_id, 77 | "quantity": value.quantity, 78 | "price": value.price, 79 | "product_name": "Unknown", 80 | "product_category": "Unknown", 81 | } 82 | 83 | yield json.dumps(enriched) 84 | 85 | def process_element2(self, value, ctx): 86 | self.product_state.update(value) 87 | 88 | 89 | def main(): 90 | env = StreamExecutionEnvironment.get_execution_environment() 91 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 92 | 93 | orders_source = KafkaSource.builder() \ 94 | .set_bootstrap_servers("localhost:9092") \ 95 | .set_topics("orders") \ 96 | .set_group_id("streams_join_consumer") \ 97 | .set_value_only_deserializer(SimpleStringSchema()) \ 98 | .build() 99 | 100 | products_source = KafkaSource.builder() \ 101 | .set_bootstrap_servers("localhost:9092") \ 102 | .set_topics("products") \ 103 | .set_group_id("streams_join_consumer") \ 104 | .set_value_only_deserializer(SimpleStringSchema()) \ 105 | .build() 106 | 107 | orders_stream = env.from_source( 108 | source=orders_source, 109 | watermark_strategy=WatermarkStrategy.no_watermarks(), 110 | source_name="orders_source" 111 | ).map(parse_order) 112 | 113 | products_stream = env.from_source( 114 | source=products_source, 115 | watermark_strategy=WatermarkStrategy.no_watermarks(), 116 | source_name="products_source" 117 | ).map(parse_product) 118 | 119 | products_stream.print("ProductsStream") 120 | 121 | keyed_orders = orders_stream.key_by(lambda o: o.product_id) 122 | keyed_products = products_stream.key_by(lambda c: c.product_id) 123 | 124 | connected = keyed_orders.connect(keyed_products) 125 | 126 | enriched_stream = connected.process( 127 | OrdersProductsCoProcess(), 128 | output_type=Types.STRING() 129 | ) 130 | 131 | enriched_stream.print("EnrichedOrder") 132 | 133 | env.execute("Connecting streams") 134 | 135 | 136 | if __name__ == "__main__": 137 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-solution/data_validation_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from datetime import datetime 4 | import os 5 | import json 6 | import random 7 | 8 | 9 | @dag( 10 | "data_quality_pipeline", 11 | start_date=datetime(2025, 1, 1), 12 | schedule_interval='* * * * *', 13 | catchup=False, 14 | description="Data Quality Check DAG", 15 | ) 16 | def data_quality_pipeline(): 17 | 18 | CORRECT_PROB = 0.7 19 | 20 | def get_bookings_path(context): 21 | execution_date = context["execution_date"] 22 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 23 | return f"/tmp/data/bookings/{file_date}/bookings.json" 24 | 25 | def 
generate_booking_id(i): 26 | if random.random() < CORRECT_PROB: 27 | return i + 1 28 | 29 | return "" 30 | 31 | def generate_listing_id(): 32 | if random.random() < CORRECT_PROB: 33 | return random.choice([1, 2, 3, 4, 5]) 34 | 35 | return "" 36 | 37 | def generate_user_id(correct_prob=0.7): 38 | return random.randint(1000, 5000) if random.random() < correct_prob else "" 39 | 40 | def generate_booking_time(execution_date): 41 | if random.random() < CORRECT_PROB: 42 | return execution_date.strftime('%Y-%m-%d %H:%M:%S') 43 | 44 | return "" 45 | 46 | def generate_status(): 47 | if random.random() < CORRECT_PROB: 48 | return random.choice(["confirmed", "pending", "cancelled"]) 49 | 50 | return random.choice(["unknown", "", "error"]) 51 | 52 | @task 53 | def generate_bookings(): 54 | context = get_current_context() 55 | booking_path = get_bookings_path(context) 56 | 57 | num_bookings = random.randint(5, 15) 58 | bookings = [] 59 | 60 | for i in range(num_bookings): 61 | booking = { 62 | "booking_id": generate_booking_id(i), 63 | "listing_id": generate_listing_id(), 64 | "user_id": generate_user_id(), 65 | "booking_time": generate_booking_time(context["execution_date"]), 66 | "status": generate_status() 67 | } 68 | bookings.append(booking) 69 | 70 | directory = os.path.dirname(booking_path) 71 | if not os.path.exists(directory): 72 | os.makedirs(directory) 73 | 74 | with open(booking_path, "w") as f: 75 | json.dump(bookings, f, indent=4) 76 | 77 | print(f"Written to file: {booking_path}") 78 | 79 | def get_anomalies_path(context): 80 | execution_date = context["execution_date"] 81 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 82 | return f"/tmp/data/anomalies/{file_date}/anomalies.json" 83 | 84 | @task 85 | def quality_check(): 86 | context = get_current_context() 87 | booking_path = get_bookings_path(context) 88 | 89 | anomalies = [] 90 | valid_statuses = {"confirmed", "pending", "cancelled"} 91 | 92 | with open(booking_path, "r") as f: 93 | bookings = json.load(f) 94 | 95 | for index, row in enumerate(bookings): 96 | row_anomalies = [] 97 | if not row["booking_id"]: 98 | row_anomalies.append("Missing booking_id") 99 | if not row["listing_id"]: 100 | row_anomalies.append("Missing listing_id") 101 | if not row["user_id"]: 102 | row_anomalies.append("Missing user_id") 103 | if not row["booking_time"]: 104 | row_anomalies.append("Missing booking_time") 105 | if not row["status"]: 106 | row_anomalies.append("Missing status") 107 | 108 | 109 | if row["status"] and row["status"] not in valid_statuses: 110 | row_anomalies.append(f"Invalid status: {row['status']}") 111 | 112 | if row_anomalies: 113 | anomalies.append({ 114 | "booking_id": index, 115 | "anomalies": row_anomalies, 116 | }) 117 | 118 | anomalies_file = get_anomalies_path(context) 119 | directory = os.path.dirname(anomalies_file) 120 | if not os.path.exists(directory): 121 | os.makedirs(directory) 122 | 123 | with open(anomalies_file, "w") as f: 124 | json.dump(anomalies, f, indent=4) 125 | 126 | print(f"Completed validation for {booking_path}. 
Anomalies found: {len(anomalies)}") 127 | print(f"Result written to {anomalies_file}") 128 | 129 | generate_bookings() >> quality_check() 130 | 131 | dag_instance = data_quality_pipeline() 132 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/exercises/02-aggregation-functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2321b05f-9964-42dd-a049-6226310a5e08", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Spark aggregation functions\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "b409f1b4-0cb7-4544-ae6e-5ecf760cba7d", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"../data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\", \n", 28 | " quote='\"',\n", 29 | " escape='\"', \n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\" \n", 32 | ")\n", 33 | "listings.printSchema()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "ed5808e2-8296-4e55-8b9f-8ced9e7cae0b", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "reviews = spark.read.csv(\"../data/reviews.csv.gz\", \n", 44 | " header=True,\n", 45 | " inferSchema=True,\n", 46 | " sep=\",\",\n", 47 | " quote='\"',\n", 48 | " escape='\"',\n", 49 | " multiLine=True,\n", 50 | " mode=\"PERMISSIVE\"\n", 51 | ")\n", 52 | "reviews.printSchema()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "fde34e78-3feb-4fd3-a675-3831adf3bc73", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# 1. Count the number of reviews per listing using the \"reviews\" dataset\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "5bc370f9-0375-4b2c-a8cf-7fd7437e1978", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# 2. Compute the total number of listings and average review score per host\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "84d9e7bd-341e-4468-881c-ef4491df08b4", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# 3: Find the top ten listings with the highest number of reviews\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "ada1d343-15b9-4245-9801-7dd468ffd9cd", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# 4. Find the top five neighborhoods with the most listings\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "a45177ab-5191-4cbf-8f07-5244d78b4f58", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# 5. Get a data frame with the following four columns:\n", 103 | "# * Listing's ID\n", 104 | "# * Listing's name\n", 105 | "# * Reviewer's name\n", 106 | "# * Review's comment\n", 107 | "# Use \"join\" to combine data from two datasets\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "c70008c5-cb99-4079-8309-e812449bd8d7", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# 6.Get top five listings with the highest average review comment length. 
Only return listings with at least 5 reviews\n", 118 | "# Use the \"length\" function from the \"pyspark.sql.functions\" to get a lenght of a review\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "f2d55601-9eb6-496c-b214-6ad56d2aec53", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# 7. Using the \"join\" operator find listings without reviews.\n", 129 | "# Hint: Use \"left_join\" or \"left_anti\" join type when implementing this\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "888ce8f0-6e49-41ba-b521-64147316cf22", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 3 (ipykernel)", 144 | "language": "python", 145 | "name": "python3" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.13.2" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 5 162 | } 163 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/03-processing-airbnb-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c219df52-cb8d-4431-831a-3751a69062f2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Inside Airbnb data processing\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "ec00f92d-c582-4970-9617-ff0a9852cc45", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\",\n", 28 | " quote='\"',\n", 29 | " escape='\"',\n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\"\n", 32 | ")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "608ed71d-7a78-46df-9190-4fb49c6b62ce", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "review_locations = listings.select(listings.review_scores_location)\n", 43 | "review_locations.show()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "5ae431b3-c1c2-47e8-bae4-fb10f87fb5b3", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "listings \\\n", 54 | " .select(listings.review_scores_location) \\\n", 55 | " .show()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "7739ebc9-8b6b-428e-af57-7e2616f39b14", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "high_score_listings = listings \\\n", 66 | " .filter(listings.review_scores_location > 4.5) \\\n", 67 | " .select('id', 'price', 'name', 'review_scores_location')\n", 68 | "\n", 69 | "high_score_listings.show(20, truncate=False)\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "5306bebf-6f83-430e-affd-6618494210b9", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "high_score_listings.dropna().show(20, truncate=False)" 80 | ] 81 | }, 82 | { 83 | 
"cell_type": "code", 84 | "execution_count": null, 85 | "id": "de1b6945-d850-4640-b985-99c1bd8e228b", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "high_score_listings.schema['price']" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "d42041b9-e078-430a-ab27-7ba97ebb8d75", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from pyspark.sql.functions import regexp_replace\n", 100 | "\n", 101 | "price_num_df = listings \\\n", 102 | " .withColumn('price_num', regexp_replace('price', '[$,]', '').cast('float')) \\\n", 103 | "\n", 104 | "price_num_df.schema['price_num']" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "2611fddb-c125-4c8b-9cbc-387ed12529e0", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "price_num_df \\\n", 115 | " .select('price_num', 'name') \\\n", 116 | " .show(20, truncate=False)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "f3f00f4c-e0ce-4d06-b5fe-0b2cd8539f46", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "price_num_df.filter( (price_num_df.price_num < 100) & (price_num_df.review_scores_location > 4.5)) \\\n", 127 | " .select('name', 'price', 'review_scores_location') \\\n", 128 | " .show(truncate=False)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "ca2e5d7c-ee58-4b09-8eda-cb7b5d1ab898", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "price_num_df.filter('price_num < 100 AND review_scores_location > 4.5') \\\n", 139 | " .select('name', 'price', 'review_scores_location') \\\n", 140 | " .show(truncate=False)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "cfb893b6-003b-40ec-82d3-b5bba6de90c0", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "listings \\\n", 151 | " .select(listings.property_type) \\\n", 152 | " .distinct() \\\n", 153 | " .show(truncate=False)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "67666c54-ea5f-4589-ad3a-5ca82434d1e8", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "listings \\\n", 164 | " .select(listings.property_type, listings.room_type) \\\n", 165 | " .distinct() \\\n", 166 | " .show(truncate=False)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "2686c317-8726-4276-ae8a-d4ce474cd487", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "listings \\\n", 177 | " .select(listings.property_type) \\\n", 178 | " .distinct() \\\n", 179 | " .write \\\n", 180 | " .csv('data/property_types')" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3 (ipykernel)", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.12.7" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 5 205 | } 206 | -------------------------------------------------------------------------------- /05-ml-with-spark/04-pyspark-pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"code", 5 | "execution_count": 8, 6 | "id": "53eb4614-6b2e-4c33-8822-c230a9dba13b", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession \\\n", 13 | " .builder \\\n", 14 | " .appName(\"PySpark pipeline\") \\\n", 15 | " .getOrCreate()" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 9, 21 | "id": "af281fdf-a02b-4602-9c29-c0d22a726ecd", 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "root\n", 29 | " |-- longitude: double (nullable = true)\n", 30 | " |-- latitude: double (nullable = true)\n", 31 | " |-- housing_median_age: double (nullable = true)\n", 32 | " |-- total_rooms: double (nullable = true)\n", 33 | " |-- total_bedrooms: double (nullable = true)\n", 34 | " |-- population: double (nullable = true)\n", 35 | " |-- households: double (nullable = true)\n", 36 | " |-- median_income: double (nullable = true)\n", 37 | " |-- median_house_value: double (nullable = true)\n", 38 | " |-- ocean_proximity: string (nullable = true)\n", 39 | "\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "data = spark.read.csv(\"data/housing.csv\", header=True, inferSchema=True)\n", 45 | "\n", 46 | "data.printSchema()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 10, 52 | "id": "6c567350-0cdf-4d1f-82f5-77871e985665", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "20433" 59 | ] 60 | }, 61 | "execution_count": 10, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "filtered_data = data.na.drop(subset=['total_bedrooms'])\n", 68 | "filtered_data.count()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 11, 74 | "id": "0c891b8b-d1f9-481f-a45f-e9915bab6ea0", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Train size: 16395\n", 82 | "Test size: 4038\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "train_data, test_data = filtered_data.randomSplit([0.8, 0.2], seed=42)\n", 88 | "print(\"Train size: \", train_data.count())\n", 89 | "print(\"Test size: \", test_data.count())" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 12, 95 | "id": "f58c0b33-a7b6-4d68-9bb0-de49309b86a7", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from pyspark.ml.regression import LinearRegression\n", 100 | "from pyspark.ml import Pipeline\n", 101 | "from pyspark.ml.feature import VectorAssembler\n", 102 | "from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler\n", 103 | "\n", 104 | "indexer = StringIndexer(inputCol='ocean_proximity', outputCol='ocean_proximity_index')\n", 105 | "encoder = OneHotEncoder(inputCol='ocean_proximity_index', outputCol='ocean_proximity_vec', dropLast=False)\n", 106 | "\n", 107 | "\n", 108 | "feature_cols = ['housing_median_age', 'total_rooms', 'total_bedrooms',\n", 109 | " 'population', 'households', 'median_income', 'ocean_proximity_vec']\n", 110 | "assembler = VectorAssembler(inputCols=feature_cols, outputCol='unscaled_features')\n", 111 | "scaler = StandardScaler(inputCol='unscaled_features', outputCol='features', withMean=True, withStd=True)\n", 112 | "\n", 113 | "lr = LinearRegression(featuresCol='features', labelCol='median_house_value', regParam=0.001)\n", 114 | "\n", 115 | "pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler, lr])\n", 116 | 
"\n", 117 | "pipeline_model = pipeline.fit(train_data)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 13, 123 | "id": "1ccc3c07-f82e-49c1-b085-7aa4724d3266", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "test_predictions = pipeline_model.transform(test_data)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 14, 133 | "id": "b1b5796c-9e3b-4272-aa35-edaac2541f30", 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "Mean Absolute Error (MAE): 50597.33640580943\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 146 | "\n", 147 | "evaluator_mae = RegressionEvaluator(labelCol='median_house_value', predictionCol='prediction', metricName='mae')\n", 148 | "mae = evaluator_mae.evaluate(test_predictions)\n", 149 | "print(f\"Mean Absolute Error (MAE): {mae}\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "2ada0719-b3d2-42e2-8840-6c0388a11a85", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "c0a1b109-a13a-41f9-b8a2-7d4e86669c8b", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.12.7" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } 191 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/exercises/03-advanced-spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "b0ce2631-b840-4a7f-8183-fc50cb1977ad", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Spark aggregation functions\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "078a3193-f1a5-4e85-b50a-f80cf5908b18", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"../data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\", \n", 28 | " quote='\"',\n", 29 | " escape='\"', \n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\" \n", 32 | ")\n", 33 | "listings.printSchema()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "32095599-a1da-408e-b315-3e0481e8bb22", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "reviews = spark.read.csv(\"../data/reviews.csv.gz\", \n", 44 | " header=True,\n", 45 | " inferSchema=True,\n", 46 | " sep=\",\",\n", 47 | " quote='\"',\n", 48 | " escape='\"',\n", 49 | " multiLine=True,\n", 50 | " mode=\"PERMISSIVE\"\n", 51 | ")\n", 52 | "reviews.printSchema()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": 
"ba5fc7ba-8be1-4680-aaf1-6724d1399e1d", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# 1. For each listing compute string category depending on its price, and add it as a new column.\n", 63 | "# A category is defined in the following way:\n", 64 | "#\n", 65 | "# * price < 50 -> \"Budget\"\n", 66 | "# * 50 <= price < 150 -> \"Mid-range\"\n", 67 | "# * price >= 150 -> \"Luxury\"\n", 68 | "# \n", 69 | "# Only include listings where the price is not null.\n", 70 | "# Count the number of listings in each category\n", 71 | "\n", 72 | "from pyspark.sql.functions import regexp_replace\n", 73 | "\n", 74 | "listings = listings.withColumn('price_numeric', regexp_replace('price', '[$,]', '').cast('float'))\n", 75 | "\n", 76 | "# TODO: Implement a UDF\n", 77 | "# TODO: Apply the UDF to create a new DataFrame" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "69b6d82e-6255-40bb-be8f-837c0cef6571", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# 2. In this task you will need to compute a santiment score per review, and then an average sentiment score per listing.\n", 88 | "# A santiment score indicates how \"positive\" or \"negative\" a review is. The higher the score the more positive it is, and vice-versa.\n", 89 | "#\n", 90 | "# To compute a sentiment score per review compute the number of positive words in a review and subtract the number of negative\n", 91 | "# words in the same review (the list of words is already provided)\n", 92 | "#\n", 93 | "# To complete this task, compute a DataFrame that contains the following fields:\n", 94 | "# * name - the name of a listing\n", 95 | "# * average_sentiment - average sentiment of reviews computed using the algorithm described above\n", 96 | "from pyspark.sql.types import FloatType\n", 97 | "\n", 98 | "# Lists of positive and negative words\n", 99 | "positive_words = {'good', 'great', 'excellent', 'amazing', 'fantastic', 'wonderful', 'pleasant', 'lovely', 'nice', 'enjoyed'}\n", 100 | "negative_words = {'bad', 'terrible', 'awful', 'horrible', 'disappointing', 'poor', 'hate', 'unpleasant', 'dirty', 'noisy'}\n", 101 | "\n", 102 | "# TODO: Implement the UDF\n", 103 | "def sentiment_score(comment):\n", 104 | " pass\n", 105 | "\n", 106 | "sentiment_score_udf = udf(sentiment_score, FloatType())\n", 107 | "\n", 108 | "reviews_with_sentiment = reviews \\\n", 109 | " .withColumn(\n", 110 | " 'sentiment_score',\n", 111 | " sentiment_score_udf(reviews.comments)\n", 112 | " )\n", 113 | "\n", 114 | "# TODO: Create a final DataFrame" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "637b15b2-66df-4e9b-9bc1-8ba328e14aee", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# 3. 
Rewrite the following code from the previous exercise using SparkSQL:\n", 125 | "#\n", 126 | "# ```\n", 127 | "# from pyspark.sql.functions import length, avg, count\n", 128 | "# \n", 129 | "# reviews_with_comment_length = reviews.withColumn('comment_length', length('comments'))\n", 130 | "# reviews_with_comment_length \\\n", 131 | "# .join(listings, reviews_with_comment_length.listing_id == listings.id, 'inner') \\\n", 132 | "# .groupBy('listing_id').agg(\n", 133 | "# avg(reviews_with_comment_length.comment_length).alias('average_comment_length'),\n", 134 | "# count(reviews_with_comment_length.id).alias('reviews_count')\n", 135 | "# ) \\\n", 136 | "# .filter('reviews_count >= 5') \\\n", 137 | "# .orderBy('average_comment_length', ascending=False) \\\n", 138 | "# .show()\n", 139 | "# ```\n", 140 | "# This was a solution for the task:\n", 141 | "#\n", 142 | "# \"Get top five listings with the highest average review comment length. Only return listings with at least 5 reviews\"\n", 143 | "\n", 144 | "reviews.createOrReplaceTempView(\"reviews\")\n", 145 | "listings.createOrReplaceTempView(\"listings\")\n", 146 | "\n", 147 | "# Write the SQL query\n", 148 | "sql_query = \"\"\"\n", 149 | "...\n", 150 | "\"\"\"\n", 151 | "\n", 152 | "spark \\\n", 153 | " .sql(sql_query) \\\n", 154 | " .show()\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "9cd68c71-0ce8-4a21-be62-a822fce18522", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# 4. [Optional][Challenge]\n", 165 | "# Calculate the average time passed since the first review for each host in the listings dataset. \n", 166 | "# To implement a custom aggregation function you would need to use the \"pandas_udf\" function.\n", 167 | "#\n", 168 | "# Documentation about \"pandas_udf\": https://spark.apache.org/docs/3.4.2/api/python/reference/pyspark.sql/api/pyspark.sql.functions.pandas_udf.html \n", 169 | "#\n", 170 | "# To use \"pandas_udf\" you would need to install two additional dependencies in the virtual environment you use for PySpark.\n", 171 | "# Run these commands:\n", 172 | "# ```\n", 173 | "# pip install pandas\n", 174 | "# pip install pyarrow\n", 175 | "# ```\n", 176 | "\n", 177 | "from pyspark.sql.functions import col, pandas_udf\n", 178 | "from pyspark.sql.types import DoubleType\n", 179 | "from pyspark.sql.functions import PandasUDFType\n", 180 | "import pandas as pd\n", 181 | "\n", 182 | "@pandas_udf(DoubleType(), functionType=PandasUDFType.GROUPED_AGG)\n", 183 | "def average_days_since_first_review_udf(first_review_series) -> float:\n", 184 | " # TODO: Implement the UDF\n", 185 | " pass\n", 186 | "\n", 187 | "listings \\\n", 188 | " .filter(\n", 189 | " listings.first_review.isNotNull()\n", 190 | " ) \\\n", 191 | " .groupBy('host_id') \\\n", 192 | " .agg(\n", 193 | " average_days_since_first_review_udf(listings.first_review).alias('average_days_since_first_review_days')\n", 194 | " ) \\\n", 195 | " .show()" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3 (ipykernel)", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.12.7" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 5 
220 | } 221 | --------------------------------------------------------------------------------