├── 03-data-lake ├── .gitignore ├── configuration-overrides.json ├── payload-2024-11.json ├── reviews-per-listing.py └── README.md ├── 06-data-eng-with-llms ├── .gitignore ├── 02-structured-output.py ├── 01-simple-transformers.py ├── 03-spark-llms.py └── README.md ├── .gitignore ├── 04-orchestration-with-airflow ├── .gitignore ├── exercises │ ├── .gitignore │ ├── 02-process-customer-reviews-exercise │ │ ├── docker-compose.yaml │ │ ├── spark_etl_reviews.py │ │ ├── customer_reviews_dag.py │ │ └── README.md │ ├── 02-process-customer-reviews-solution │ │ ├── docker-compose.yaml │ │ ├── spark_etl_reviews.py │ │ ├── customer_reviews_dag.py │ │ └── README.md │ ├── 01-data-validation-solution │ │ ├── README.md │ │ └── data_validation_dag.py │ ├── 01-data-validation-exercise │ │ ├── README.md │ │ └── data_validation_dag.py │ └── README.md ├── docker-compose.yaml ├── dags │ ├── bookings_per_listing_spark.py │ ├── 01-average_page_visits.py │ ├── 02-average_page_visits_with_failures.py │ ├── 03-bookings_per_listing.py │ ├── 04-bookings_per_listing_with_sensor.py │ └── 05-bookings_per_listing_with_postgres.py └── README.md ├── 05-ml-with-spark ├── .gitignore ├── README.md └── 04-pyspark-pipeline.ipynb ├── 07-kafka-streaming ├── exercises │ ├── 01-wikipedia-stream-exercise │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── docker-compose.yml │ │ ├── wiki-consumer.py │ │ └── wiki-producer.py │ ├── 01-wikipedia-stream-solution │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── docker-compose.yml │ │ ├── wiki-consumer.py │ │ └── wiki-producer.py │ ├── 02-kafka-connect-exercise │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── config_debezium.json │ │ ├── kafka-connect-consumer.py │ │ ├── docker-compose.yaml │ │ └── README.md │ └── 02-kafka-connect-solution │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── config_debezium.json │ │ ├── kafka-connect-consumer.py │ │ └── docker-compose.yaml ├── config_debezium.json ├── docker-compose.yml ├── README.md ├── docker-compose-schema-registry.yaml ├── 03-kafka-connect-consumer.py ├── 02-kafka-consumer.py ├── 01-kafka-producer.py ├── docker-compose-kafka-connect.yaml ├── order.py ├── 05-kafka-schema-registry-consumer.py └── 04-kafka-schema-registry-producer.py ├── 08-flink-stream-processing ├── .gitignore ├── exercises │ ├── 01-payments-data-exercise │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── flink-app.py │ │ ├── payments-producer.py │ │ └── README.md │ ├── 01-payments-data-solution │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── payments-producer.py │ │ ├── flink-app.py │ │ └── README.md │ ├── 02-anomalities-detector-exercise │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── payments-producer.py │ │ ├── README.md │ │ └── flink-app.py │ └── 02-anomalities-detector-solution │ │ ├── .gitignore │ │ ├── docker-compose.yml │ │ ├── requirements.txt │ │ ├── payments-producer.py │ │ ├── README.md │ │ └── flink-app.py ├── 01-flink-hello-world.py ├── docker-compose.yml ├── products.json ├── products-producer.py ├── orders-producer.py ├── README.md ├── variable-spend-producer.py ├── popular-products-producer.py ├── late-events-producer.py ├── 02-orders-processing.py ├── 05-local-state.py ├── 03-windows-aggregation.py ├── 04-late-events-processing.py └── 06-connecting-streams.py ├── 02-data-processing-with-spark ├── .gitignore ├── data │ └── download_data.sh ├── reviews-per-listing.py ├── 02-reading-airbnb-data.ipynb ├── README.md ├── 01-test-pyspark-app.ipynb ├── exercises │ ├── 
02-aggregation-functions.ipynb │ └── 03-advanced-spark.ipynb └── 03-processing-airbnb-data.ipynb ├── README.md └── 01-introduction └── docker-compose.yaml /03-data-lake/.gitignore: -------------------------------------------------------------------------------- 1 | data/ -------------------------------------------------------------------------------- /06-data-eng-with-llms/.gitignore: -------------------------------------------------------------------------------- 1 | ./venv 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | **/.ipynb_checkpoints/ 3 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | __pycache__/ -------------------------------------------------------------------------------- /05-ml-with-spark/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | venv 3 | .ipynb_checkpoints -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | dags/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | ex-1-venv/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/.gitignore: -------------------------------------------------------------------------------- 1 | ex-1-venv/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | ex-2-venv/ -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/.gitignore: -------------------------------------------------------------------------------- 1 | ex-2-venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /02-data-processing-with-spark/.gitignore: -------------------------------------------------------------------------------- 1 | artifacts 2 | data 3 | venv 4 | .ipynb_checkpoints -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | confluent-kafka==2.6.1 2 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | confluent-kafka==2.6.1 2 | -------------------------------------------------------------------------------- 
/08-flink-stream-processing/exercises/01-payments-data-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/.gitignore: -------------------------------------------------------------------------------- 1 | flink-sql-connector-kafka-3.4.0-1.20.jar 2 | venv/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZTM - Data Engineering Bootcamp 2 | 3 | Source code for the [Data Engineering Bootcamp](https://academy.zerotomastery.io/a/aff_nbp8km09/external?affcode=441520_ehcbjdb9) on ZTM. 4 | -------------------------------------------------------------------------------- /03-data-lake/configuration-overrides.json: -------------------------------------------------------------------------------- 1 | { 2 | "monitoringConfiguration": { 3 | "s3MonitoringConfiguration": { 4 | "logUri": "s3://ztm-data-engineering-bootcamp/logs" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2024.8.30 2 | charset-normalizer==3.4.0 3 | confluent-kafka==2.6.1 4 | idna==3.10 5 | requests==2.32.3 6 | six==1.17.0 7 | sseclient==0.0.27 8 | urllib3==2.2.3 9 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2024.8.30 2 | charset-normalizer==3.4.0 3 | confluent-kafka==2.6.1 4 | idna==3.10 5 | requests==2.32.3 6 | six==1.17.0 7 | sseclient==0.0.27 8 | urllib3==2.2.3 9 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | postgres: 4 | image: postgres:15 5 | container_name: postgres 6 | environment: 7 | POSTGRES_USER: user 8 | POSTGRES_PASSWORD: password 9 | POSTGRES_DB: rental_site 10 | ports: 11 | - "5432:5432" -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-exercise/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | postgres: 4 | image: postgres:15 5 | container_name: postgres 6 | environment: 7 | POSTGRES_USER: user 8 | POSTGRES_PASSWORD: password 9 | POSTGRES_DB: 
rental_site 10 | ports: 11 | - "5432:5432" -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | postgres: 4 | image: postgres:15 5 | container_name: postgres 6 | environment: 7 | POSTGRES_USER: user 8 | POSTGRES_PASSWORD: password 9 | POSTGRES_DB: rental_site 10 | ports: 11 | - "5432:5432" -------------------------------------------------------------------------------- /08-flink-stream-processing/01-flink-hello-world.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | 3 | def main(): 4 | env = StreamExecutionEnvironment.get_execution_environment() 5 | data_stream = env.from_collection([1, 2, 3, 4, 5]) 6 | 7 | mapped_stream = data_stream.map(lambda x: x * 2) 8 | 9 | mapped_stream.print() 10 | 11 | env.execute("Flink Hello World") 12 | 13 | if __name__ == "__main__": 14 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/config_debezium.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "postgres-debezium-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "plugin.name": "pgoutput", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "user", 9 | "database.password": "password", 10 | "database.dbname": "onlineshop", 11 | "topic.prefix": "postgres-" 12 | } 13 | } -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/config_debezium.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "postgres-debezium-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "plugin.name": "pgoutput", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "user", 9 | "database.password": "password", 10 | "database.dbname": "onlineshop", 11 | "topic.prefix": "postgres-" 12 | } 13 | } -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/config_debezium.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "postgres-debezium-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "plugin.name": "pgoutput", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "user", 9 | "database.password": "password", 10 | "database.dbname": "onlineshop", 11 | "topic.prefix": "postgres-" 12 | } 13 | } -------------------------------------------------------------------------------- /03-data-lake/payload-2024-11.json: -------------------------------------------------------------------------------- 1 | { 2 | "sparkSubmit": { 3 | "entryPoint": "s3://ztm-data-engineering-bootcamp/apps/reviews-per-listing.py", 4 | "entryPointArguments": [ 5 | "--listings", 6 | "s3://ztm-data-engineering-bootcamp/listings/date=2024-11/listings.csv.gz", 7 | "--reviews", 8 | "s3://ztm-data-engineering-bootcamp/reviews/date=2024-11/reviews.csv.gz", 9 | "--output", 10 | 
"s3://ztm-data-engineering-bootcamp/reviews_per_listing/date=2024-11" 11 | ] 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/data/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f "$0" ]; then script=$0; else script=$(command -v -- "$0"); fi 4 | dir=$(dirname -- "$script") 5 | 6 | echo "Writing data to $dir" 7 | 8 | wget -O "$dir/reviews.csv.gz" https://data.insideairbnb.com/united-kingdom/england/london/2024-09-06/data/reviews.csv.gz 9 | wget -O "$dir/calendar.csv.gz" https://data.insideairbnb.com/united-kingdom/england/london/2024-09-06/data/calendar.csv.gz 10 | wget -O "$dir/listings.csv.gz" https://data.insideairbnb.com/united-kingdom/england/london/2024-09-06/data/listings.csv.gz 11 | -------------------------------------------------------------------------------- /01-introduction/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | mongo: 3 | image: mongo:latest 4 | environment: 5 | MONGO_INITDB_ROOT_USERNAME: admin 6 | MONGO_INITDB_ROOT_PASSWORD: secret 7 | ports: 8 | - "27017:27017" 9 | volumes: 10 | - mongo-data:/data/db 11 | 12 | mongo-express: 13 | image: mongo-express:latest 14 | depends_on: 15 | - mongo 16 | environment: 17 | ME_CONFIG_MONGODB_ADMINUSERNAME: admin 18 | ME_CONFIG_MONGODB_ADMINPASSWORD: secret 19 | ME_CONFIG_MONGODB_SERVER: mongo 20 | ME_CONFIG_BASICAUTH_ENABLED: 'false' 21 | ports: 22 | - "8081:8081" 23 | 24 | volumes: 25 | mongo-data: {} 26 | 27 | -------------------------------------------------------------------------------- /07-kafka-streaming/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 
-------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 
1@kafka:9093 14 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT 16 | KAFKA_INTER_BROKER_LISTENER_NAME: EXTERNAL 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092 19 | KAFKA_LOG_RETENTION_HOURS: 168 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT 17 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 18 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 19 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT 17 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 18 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 19 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" -------------------------------------------------------------------------------- /08-flink-stream-processing/products.json: -------------------------------------------------------------------------------- 1 | {"product_id":"product-1","name":"Cotton T-Shirt","category":"Clothing"} 2 | {"product_id":"product-2","name":"Wireless Earbuds","category":"Electronics"} 3 | {"product_id":"product-3","name":"Ceramic Coffee Mug","category":"Home & Kitchen"} 4 | {"product_id":"product-4","name":"Stainless Steel Water Bottle","category":"Outdoor & Travel"} 5 | {"product_id":"product-5","name":"Yoga Mat","category":"Sports & Fitness"} 6 | {"product_id":"product-6","name":"Leather Wallet","category":"Accessories"} 7 | {"product_id":"product-7","name":"Laptop Stand","category":"Office Supplies"} 8 | {"product_id":"product-8","name":"Running Shoes","category":"Footwear"} 9 | {"product_id":"product-9","name":"Bluetooth Speaker","category":"Electronics"} 10 | {"product_id":"product-10","name":"Scented Candle","category":"Home & Kitchen"} 11 | 12 | 13 | 
{"product_id":"product-6","name":"Cotton Wallet","category":"Accessories"} -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-exercise/spark_etl_reviews.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import avg 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--customer_reviews", required=True, help="Input CSV file path") 8 | parser.add_argument("--output_path", required=True, help="Output CSV file path") 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession \ 12 | .builder \ 13 | .appName("CustomerReviews") \ 14 | .getOrCreate() 15 | 16 | # TODO: Read input data 17 | customer_reviews = None 18 | 19 | customer_reviews = customer_reviews \ 20 | .withColumn("review_score", customer_reviews["review_score"].cast("float")) 21 | 22 | # TODO: Calculate an average review score per listing ID 23 | 24 | # TODO: Write the result to an output path 25 | 26 | if __name__ == "__main__": 27 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | 
-------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.48.0 2 | apache-flink==1.20.0 3 | apache-flink-libraries==1.20.0 4 | avro-python3==1.10.2 5 | certifi==2024.12.14 6 | charset-normalizer==3.4.1 7 | cloudpickle==2.2.1 8 | confluent-kafka==2.8.0 9 | crcmod==1.7 10 | dill==0.3.1.1 11 | dnspython==2.7.0 12 | docopt==0.6.2 13 | fastavro==1.10.0 14 | fasteners==0.19 15 | find_libpython==0.4.0 16 | grpcio==1.69.0 17 | hdfs==2.7.3 18 | httplib2==0.22.0 19 | idna==3.10 20 | numpy==1.24.4 21 | objsize==0.6.1 22 | orjson==3.10.15 23 | pandas==2.2.3 24 | pemja==0.4.1 25 | proto-plus==1.25.0 26 | protobuf==4.23.4 27 | py4j==0.10.9.7 28 | pyarrow==11.0.0 29 | pydot==1.4.2 30 | pymongo==4.10.1 31 | pyparsing==3.2.1 32 | python-dateutil==2.9.0.post0 33 | pytz==2024.2 34 | regex==2024.11.6 35 | requests==2.32.3 36 | ruamel.yaml==0.18.10 37 | ruamel.yaml.clib==0.2.12 38 | six==1.17.0 39 | typing_extensions==4.12.2 40 | tzdata==2024.2 41 | urllib3==2.3.0 42 | zstandard==0.23.0 43 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-solution/README.md: -------------------------------------------------------------------------------- 1 | 2 | This is a README for the first exercise in this section. 3 | 4 | # 0. Follow the instruction in the README.md file in the "exercises" folder 5 | 6 | Before following steps in this README, follow the steps in the `README.md` file in the `exercises` folder to set up your local Airflow. 7 | 8 | # 1. Copy the DAG's starter code 9 | 10 | Copy the starter code for the DAG to the `dags` folder you've created while setting up Airflow locally 11 | 12 | # 2. Restart the scheduler 13 | 14 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 15 | 16 | After this, start it again using the following command: 17 | 18 | ```sh 19 | airflow scheduler 20 | ``` 21 | 22 | # 3. Implement the TODOs in the 23 | 24 | Now implement the TODO comments in the starter code. 25 | 26 | 27 | # 4. 
Start the DAG 28 | 29 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | This is a README for the first exercise in this section. 3 | 4 | # 0. Follow the instruction in the README.md file in the "exercises" folder 5 | 6 | Before following steps in this README, follow the steps in the `README.md` file in the `exercises` folder to set up your local Airflow. 7 | 8 | # 1. Copy the DAG's starter code 9 | 10 | Copy the starter code for the DAG to the `dags` folder you've created while setting up Airflow locally 11 | 12 | # 2. Restart the scheduler 13 | 14 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 15 | 16 | After this, start it again using the following command: 17 | 18 | ```sh 19 | airflow scheduler 20 | ``` 21 | 22 | # 3. Implement the TODOs in the code 23 | 24 | Now implement the TODO comments in the starter code. 25 | 26 | 27 | # 4. Start the DAG 28 | 29 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. -------------------------------------------------------------------------------- /05-ml-with-spark/README.md: -------------------------------------------------------------------------------- 1 | # Install Spark 2 | 3 | To install Spark locally on macOS, run the following command: 4 | 5 | ```sh 6 | brew install apache-spark 7 | ``` 8 | 9 | To check that it was installed correctly, you can run: 10 | 11 | ```sh 12 | pyspark --version 13 | ``` 14 | 15 | --- 16 | 17 | ## Set Up Virtual Environment 18 | 19 | Create and activate a virtual environment: 20 | 21 | ```sh 22 | python3 -m venv . 
23 | source bin/activate 24 | ``` 25 | 26 | --- 27 | 28 | ## Install Required Packages 29 | 30 | Install Jupyter and NumPy (used for numerical operations in machine learning): 31 | 32 | ```sh 33 | pip install jupyter numpy 34 | ``` 35 | 36 | --- 37 | 38 | ## Configure PySpark to Use Jupyter Notebooks 39 | 40 | Set environment variables so PySpark launches in Jupyter Lab: 41 | 42 | ```sh 43 | export PYSPARK_DRIVER_PYTHON=jupyter 44 | export PYSPARK_DRIVER_PYTHON_OPTS='lab' 45 | ``` 46 | 47 | --- 48 | 49 | ## Launch PySpark with Jupyter 50 | 51 | Start the interactive Spark environment in Jupyter Lab: 52 | 53 | ```sh 54 | pyspark 55 | ``` 56 | 57 | --- -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/spark_etl_reviews.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import avg 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--customer_reviews", required=True, help="Input CSV file path") 8 | parser.add_argument("--output_path", required=True, help="Output CSV file path") 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession \ 12 | .builder \ 13 | .appName("CustomerReviews") \ 14 | .getOrCreate() 15 | 16 | customer_reviews = spark.read.csv( 17 | args.customer_reviews, 18 | header=True, 19 | ) 20 | 21 | customer_reviews = customer_reviews \ 22 | .withColumn("review_score", customer_reviews["review_score"].cast("float")) 23 | 24 | result = customer_reviews \ 25 | .groupBy("listing_id") \ 26 | .agg( 27 | avg("review_score").alias("avg_review_score") 28 | ) 29 | 30 | result.write.mode("overwrite").csv(args.output_path) 31 | 32 | spark.stop() 33 | 34 | if __name__ == "__main__": 35 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/products-producer.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer 2 | import sys 3 | import textwrap 4 | 5 | def delivery_callback(err, msg): 6 | if err: 7 | print("ERROR: Message failed delivery: {}".format(err)) 8 | else: 9 | print( 10 | textwrap.dedent( 11 | f""" 12 | Produced event to topic {msg.topic()}: 13 | value = {msg.value().decode('utf-8')} 14 | """) 15 | ) 16 | 17 | def main(): 18 | producer_config = { 19 | 'bootstrap.servers': 'localhost:9092', 20 | } 21 | producer = Producer(producer_config) 22 | 23 | print("Enter products's data") 24 | try: 25 | while True: 26 | json_line = input("> ").strip() 27 | if json_line: 28 | producer.produce( 29 | "products", 30 | key=None, 31 | value=json_line, 32 | callback=delivery_callback 33 | ) 34 | producer.poll(1) 35 | finally: 36 | producer.flush() 37 | producer.close() 38 | 39 | if __name__ == "__main__": 40 | main() -------------------------------------------------------------------------------- /06-data-eng-with-llms/02-structured-output.py: -------------------------------------------------------------------------------- 1 | %pip install outlines 2 | 3 | from huggingface_hub import login 4 | 5 | login(token="hf_...") 6 | 7 | import torch 8 | import json 9 | import outlines 10 | 11 | model_name = "mistralai/Mistral-7B-Instruct-v0.3" 12 | 13 | generator = outlines.models.transformers( 14 | model_name, 15 | device="cuda", 16 | model_kwargs={ 17 | "torch_dtype": torch.float16, 18 | } 19 | ) 20 | 21 | 
schema = json.dumps({ 22 | "type": "object", 23 | "properties": { 24 | "sentiment": { 25 | "type": "string", 26 | "enum": ["positive", "negative"] 27 | } 28 | }, 29 | "required": ["sentiment"] 30 | }) 31 | 32 | generate_json = outlines.generate.json(generator, schema) 33 | 34 | def classify_review(review): 35 | prompt = ( 36 | "Classify the following customer review as positive or negative.\n\n" 37 | f"Review:\n{review}\n" 38 | ) 39 | 40 | output_json = generate_json(prompt, max_tokens=40) 41 | 42 | return output_json 43 | 44 | 45 | print(classify_review("This is absolutely delightful!")) 46 | 47 | print(classify_review("This was the worst hotel I've ever seen")) -------------------------------------------------------------------------------- /07-kafka-streaming/README.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | 3 | - [Docker](https://www.docker.com/) installed and running 4 | - Docker Compose (included with Docker Desktop) 5 | 6 | --- 7 | 8 | ## Start Kafka Broker 9 | 10 | Run the following command to start Kafka using the provided `docker-compose.yml` file: 11 | 12 | ```sh 13 | docker-compose up 14 | ``` 15 | 16 | --- 17 | 18 | ## Install Kafka CLI Tools 19 | 20 | To interact with Kafka from the command line, install the Kafka tools: 21 | 22 | ```sh 23 | brew install kafka 24 | ``` 25 | 26 | --- 27 | 28 | ## List Kafka Topics 29 | 30 | Use the following command to list all topics in the Kafka cluster: 31 | 32 | ```sh 33 | kafka-topics --list --bootstrap-server localhost:9092 34 | ``` 35 | 36 | --- 37 | 38 | ## Create a Kafka Topic 39 | 40 | Create a topic named `orders` with 4 partitions and a replication factor of 1: 41 | 42 | ```sh 43 | kafka-topics --create \ 44 | --bootstrap-server localhost:9092 \ 45 | --topic orders \ 46 | --replication-factor 1 \ 47 | --partitions 4 48 | ``` 49 | 50 | --- 51 | 52 | ## Verify Topic Creation 53 | 54 | List topics again to verify the new topic was created: 55 | 56 | ```sh 57 | kafka-topics --list --bootstrap-server localhost:9092 58 | ``` -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/wiki-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from confluent_kafka import Consumer 5 | 6 | consumer_conf = { 7 | "bootstrap.servers": "localhost:9092", 8 | "group.id": "wiki-consumer-group", 9 | "auto.offset.reset": "earliest", 10 | } 11 | kafka_topic = "wikipedia-changes" 12 | 13 | 14 | def main(): 15 | consumer = Consumer(consumer_conf) 16 | consumer.subscribe([kafka_topic]) 17 | 18 | print(f"Consuming messages from topic '{kafka_topic}'") 19 | 20 | try: 21 | while True: 22 | msg = consumer.poll(timeout=1.0) 23 | if msg is None: 24 | continue 25 | 26 | if msg.error(): 27 | print(f"Error: {msg.error()}", file=sys.stderr) 28 | continue 29 | 30 | # TODO: Print a message about a Wikipedia edit if two conditions are true: 31 | # * If a change was made by a bot 32 | # * If a change is not minor 33 | # 34 | # The printed messages should include the name of an author making a change and 35 | # the title of a changed page 36 | 37 | finally: 38 | consumer.close() 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/flink-app.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import StreamExecutionEnvironment 8 | from pyflink.datastream.connectors.kafka import KafkaSink, KafkaSource, KafkaRecordSerializationSchema 9 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 10 | 11 | # TODO: In this exercise you will need to 12 | # 13 | # * Read data written by the "payments-producer.py" 14 | # * Filter payments with amount greater than 500 15 | # * Output new records with only two fields: "payment_id" and "amount" 16 | # * Write output to another Kafka topic 17 | 18 | # TODO: Implement any functions and types that you need 19 | 20 | 21 | def main(): 22 | env = StreamExecutionEnvironment.get_execution_environment() 23 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 24 | 25 | # TODO: Create a Kafka source 26 | # TODO: Create a payments stream 27 | 28 | # TODO: Implement stream processing logic 29 | 30 | # TODO: Write resulting data to Kafka 31 | 32 | env.execute("Payments stream processing") 33 | 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/wiki-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from confluent_kafka import Consumer 5 | 6 | consumer_conf = { 7 | "bootstrap.servers": "localhost:9092", 8 | "group.id": "wiki-consumer-group", 9 | "auto.offset.reset": "earliest", 10 | } 11 | kafka_topic = "wikipedia-changes" 12 | 13 | 14 | def main(): 15 | consumer = Consumer(consumer_conf) 16 | consumer.subscribe([kafka_topic]) 17 | 18 | print(f"Consuming messages from topic '{kafka_topic}'") 19 | 20 | try: 21 | while True: 22 | msg = consumer.poll(timeout=1.0) 23 | if msg is None: 24 | continue 25 | 26 | if msg.error(): 27 | print(f"Error: {msg.error()}", file=sys.stderr) 28 | continue 29 | 30 | message_value = msg.value().decode("utf-8") 31 | 32 | event = json.loads(message_value) 33 | 34 | bot = event.get("bot", False) 35 | minor = event.get("minor", True) 36 | title = event.get("title", "Unknown") 37 | user = event.get("user", "Unknown") 38 | 39 | if bot and not minor: 40 | print(f"Major bot edit detected: User '{user}' edited '{title}'") 41 | 42 | finally: 43 | consumer.close() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/reviews-per-listing.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | import pyspark.sql.functions as F 4 | 5 | parser = argparse.ArgumentParser(description='Most popular listings parameters') 6 | parser.add_argument('--listings', help='Path to the listings dataset') 7 | parser.add_argument('--reviews', help='Path to the reviews dataset') 8 | parser.add_argument('--output', help='Directory to save the output') 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession.builder \ 12 | .appName("Most popular listings") \ 13 | .getOrCreate() 14 | 15 | listings = spark.read.csv(args.listings, 16 | header=True, 17 | inferSchema=True, 18 | sep=",", 
19 | quote='"', 20 | escape='"', 21 | multiLine=True, 22 | mode="PERMISSIVE" 23 | ) 24 | 25 | reviews = spark.read.csv(args.reviews, 26 | header=True, 27 | inferSchema=True, 28 | sep=",", 29 | quote='"', 30 | escape='"', 31 | multiLine=True, 32 | mode="PERMISSIVE" 33 | ) 34 | 35 | listings_reviews = listings.join( 36 | reviews, listings.id == reviews.listing_id, how='inner' 37 | ) 38 | 39 | reviews_per_listing = listings_reviews \ 40 | .groupBy(listings.id, listings.name) \ 41 | .agg( 42 | F.count(reviews.id).alias('num_reviews') 43 | ) \ 44 | .orderBy('num_reviews', ascending=False) \ 45 | 46 | reviews_per_listing \ 47 | .write \ 48 | .csv(args.output) -------------------------------------------------------------------------------- /03-data-lake/reviews-per-listing.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pyspark.sql import SparkSession 3 | import pyspark.sql.functions as F 4 | 5 | parser = argparse.ArgumentParser(description='Most popular listings parameters') 6 | parser.add_argument('--listings', help='Path to the listings dataset') 7 | parser.add_argument('--reviews', help='Path to the reviews dataset') 8 | parser.add_argument('--output', help='Directory to save the output') 9 | args = parser.parse_args() 10 | 11 | spark = SparkSession.builder \ 12 | .appName("Most popular listings") \ 13 | .getOrCreate() 14 | 15 | listings = spark.read.csv(args.listings, 16 | header=True, 17 | inferSchema=True, 18 | sep=",", 19 | quote='"', 20 | escape='"', 21 | multiLine=True, 22 | mode="PERMISSIVE" 23 | ) 24 | 25 | reviews = spark.read.csv(args.reviews, 26 | header=True, 27 | inferSchema=True, 28 | sep=",", 29 | quote='"', 30 | escape='"', 31 | multiLine=True, 32 | mode="PERMISSIVE" 33 | ) 34 | 35 | listings_reviews = listings.join( 36 | reviews, listings.id == reviews.listing_id, how='inner' 37 | ) 38 | 39 | reviews_per_listing = listings_reviews \ 40 | .groupBy(listings.id, listings.name) \ 41 | .agg( 42 | F.count(reviews.id).alias('num_reviews') 43 | ) \ 44 | .orderBy('num_reviews', ascending=False) \ 45 | 46 | reviews_per_listing \ 47 | .write \ 48 | .csv( 49 | args.output, 50 | header=True, 51 | ) -------------------------------------------------------------------------------- /07-kafka-streaming/docker-compose-schema-registry.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | kafka: 4 | image: confluentinc/cp-kafka:7.8.0 5 | hostname: kafka 6 | container_name: kafka 7 | ports: 8 | - "9092:9092" 9 | environment: 10 | KAFKA_KRAFT_MODE: "true" 11 | KAFKA_PROCESS_ROLES: broker,controller 12 | KAFKA_NODE_ID: 1 13 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 14 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 15 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 16 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 17 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092,INTERNAL://kafka:19092 18 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 19 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 20 | KAFKA_LOG_RETENTION_HOURS: 168 21 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 22 | 23 | schema-registry: 24 | image: confluentinc/cp-schema-registry:7.5.0 25 | container_name: schema-registry 26 | depends_on: 27 | - kafka 28 | ports: 29 | - "8081:8081" 30 | environment: 31 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: PLAINTEXT://kafka:19092 32 | 
SCHEMA_REGISTRY_HOST_NAME: schema-registry 33 | SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081 34 | SCHEMA_REGISTRY_AVRO_COMPATIBILY_LEVEL: full 35 | 36 | -------------------------------------------------------------------------------- /06-data-eng-with-llms/01-simple-transformers.py: -------------------------------------------------------------------------------- 1 | %pip install transformers 2 | 3 | from huggingface_hub import login 4 | 5 | login(token="hf_...") 6 | 7 | import torch 8 | from transformers import pipeline 9 | 10 | model_name = "mistralai/Mistral-7B-Instruct-v0.3" 11 | 12 | generator = pipeline( 13 | "text-generation", 14 | model=model_name, 15 | device_map="cuda", 16 | torch_dtype=torch.float16, 17 | max_new_tokens=20, 18 | return_full_text=False, 19 | ) 20 | 21 | import textwrap 22 | 23 | def classify_review(review): 24 | messages = [ 25 | { 26 | "role": "user", 27 | "content": textwrap.dedent(f""" 28 | You are a sentiment classifier. 29 | Is the following customer review positive or negative? 30 | Respond with exactly one of the two words: positive, negative. 31 | 32 | Review: 33 | ``` 34 | {review} 35 | ``` 36 | """) 37 | } 38 | ] 39 | 40 | print('----------------------------') 41 | print(f"{messages[0]['content']}") 42 | print('----------------------------') 43 | 44 | output = generator(messages) 45 | generated_text = output[0]["generated_text"] 46 | return generated_text.strip().lower() 47 | 48 | print(classify_review("This is absolutely delightful!")) 49 | 50 | print(classify_review("This was the worst hotel I've ever seen")) -------------------------------------------------------------------------------- /07-kafka-streaming/03-kafka-connect-consumer.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from decimal import Decimal 4 | 5 | from confluent_kafka import Consumer 6 | 7 | consumer_config = { 8 | "bootstrap.servers": "localhost:9092", 9 | "group.id": "postgres-price-consumer", 10 | "auto.offset.reset": "earliest", 11 | } 12 | 13 | 14 | def main(): 15 | consumer = Consumer(consumer_config) 16 | 17 | topic = "postgres-.public.orders" 18 | consumer.subscribe([topic]) 19 | 20 | try: 21 | print(f"Consuming messages from topic '{topic}'") 22 | while True: 23 | msg = consumer.poll(1.0) 24 | 25 | if msg is None: 26 | continue 27 | if msg.error(): 28 | raise KafkaException(msg.error()) 29 | 30 | process_message(msg) 31 | 32 | finally: 33 | consumer.close() 34 | 35 | 36 | def process_message(msg): 37 | value = msg.value() 38 | 39 | order = json.loads(value.decode("utf-8")) 40 | total_amount_bytes = ( 41 | order.get("payload", {}).get("after", {}).get("total_amount") 42 | ) 43 | 44 | total_amount = decode_decimal(total_amount_bytes) 45 | print(f"Received order with total amount={total_amount}") 46 | 47 | 48 | def decode_decimal(encoded_string, scale=2): 49 | value_bytes = base64.b64decode(encoded_string) 50 | unscaled_value = int.from_bytes(value_bytes, byteorder="big", signed=True) 51 | return Decimal(unscaled_value) / Decimal(10**scale) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/kafka-connect-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from confluent_kafka import Consumer, KafkaError, KafkaException 4 | 5 | conf = { 6 | "bootstrap.servers": "localhost:9092", 7 | "group.id": 
"postgres-price-consumer", 8 | "auto.offset.reset": "earliest", 9 | } 10 | 11 | 12 | def main(): 13 | consumer = Consumer(conf) 14 | 15 | topic = "postgres-.public.orders" 16 | consumer.subscribe([topic]) 17 | 18 | try: 19 | print(f"Consuming messages from topic '{topic}'") 20 | while True: 21 | msg = consumer.poll(1.0) 22 | 23 | if msg is None: 24 | continue 25 | if msg.error(): 26 | print(f"Error: {msg.error()}") 27 | continue 28 | 29 | process_message(msg) 30 | 31 | finally: 32 | consumer.close() 33 | 34 | 35 | def process_message(msg): 36 | value = msg.value() 37 | order = json.loads(value.decode("utf-8")) 38 | payload = order.get("payload", {}) 39 | 40 | before = payload.get("before", None) 41 | after = payload.get("after", None) 42 | 43 | if not before or not after: 44 | return 45 | 46 | before_status = before.get("status") 47 | after_status = after.get("status") 48 | 49 | if before_status == "processed" and after_status == "refunded": 50 | print( 51 | f"Status changed from 'processed' to 'refunded' for order: {order.get('payload', {}).get('after', {}).get('id')}" 52 | ) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/bookings_per_listing_spark.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import count 3 | import argparse 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--listings_file", required=True, help="Path to the monthly listings file") 8 | parser.add_argument("--bookings_file", required=True, help="Path to the hourly bookings file") 9 | parser.add_argument("--output_path", required=True, help="Output path for the aggregated results") 10 | args = parser.parse_args() 11 | 12 | print(f"Reading listings from {args.listings_file}") 13 | print(f"Reading bookings from {args.bookings_file}") 14 | spark = SparkSession.builder.appName("ListingsBookingsJoin").getOrCreate() 15 | 16 | listings = spark.read.csv(args.listings_file, 17 | header=True, 18 | inferSchema=True, 19 | sep=",", 20 | quote='"', 21 | escape='"', 22 | multiLine=True, 23 | mode="PERMISSIVE" 24 | ) 25 | 26 | bookings = spark.read.csv( 27 | args.bookings_file, 28 | header=True, 29 | inferSchema=True, 30 | ) 31 | 32 | aggregated = listings \ 33 | .join(bookings, listings["id"] == bookings["listing_id"], how="inner") \ 34 | .groupBy("listing_id", "name", "price") \ 35 | .agg( 36 | count("booking_id").alias("booking_count") 37 | ) 38 | 39 | aggregated.write.mode("overwrite").csv(args.output_path) 40 | 41 | print(f"Aggregated results written to {args.output_path}") 42 | spark.stop() 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/02-kafka-consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from confluent_kafka import Consumer 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser(description="Test Kafka consumer") 9 | parser.add_argument("--group-id", "-g", help="Consumer group ID") 10 | parser.add_argument("--topic-name", "-t", help="Topic name ") 11 | parser.add_argument("--name", "-n", help="Name of this consumer") 12 | 13 | args = parser.parse_args() 14 | 15 | group_id = args.group_id 16 | topic_name = args.topic_name 17 | consumer_name = args.name 18 | 19 | 
consumer_config = { 20 | "bootstrap.servers": "localhost:9092", 21 | "group.id": group_id, 22 | "auto.offset.reset": "earliest", 23 | } 24 | 25 | consumer = Consumer(consumer_config) 26 | consumer.subscribe([topic_name]) 27 | 28 | try: 29 | while True: 30 | msg = consumer.poll(timeout=1.0) 31 | if msg is None: 32 | # No new messages 33 | continue 34 | if msg.error(): 35 | # Error while reading messages 36 | print(f"[{consumer_name}]Error encountered: {msg.error()}") 37 | continue 38 | 39 | process_message(consumer_name, msg) 40 | 41 | finally: 42 | consumer.close() 43 | 44 | 45 | def process_message(consumer_name, msg): 46 | value = msg.value() 47 | 48 | order = json.loads(value.decode("utf-8")) 49 | price = order.get("total_price", 0) 50 | if price < 250: 51 | return 52 | 53 | print( 54 | f"[{consumer_name}] [partition={msg.partition()}] Received order price={price}" 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | amount = round(random.uniform(10.0, 1000.0), 2) 15 | payment_time = datetime.now().isoformat() 16 | 17 | payment_event = { 18 | "payment_id": payment_id, 19 | "user_id": user_id, 20 | "merchant_id": merchant_id, 21 | "amount": amount, 22 | "payment_time": payment_time 23 | } 24 | return payment_event 25 | 26 | 27 | def main(): 28 | 29 | config = { 30 | "bootstrap.servers": "localhost:9092" 31 | } 32 | 33 | producer = Producer(config) 34 | 35 | topic = "payments" 36 | 37 | def delivery_callback(err, msg): 38 | if err: 39 | print("ERROR: Message failed delivery: {}".format(err)) 40 | else: 41 | print( 42 | textwrap.dedent( 43 | f""" 44 | Produced event to topic {msg.topic()}: 45 | key = {msg.key().decode('utf-8')} 46 | value = {msg.value().decode('utf-8')} 47 | """) 48 | ) 49 | 50 | while True: 51 | payment = generate_payment() 52 | print(f"Sending payment: {payment}") 53 | 54 | producer.produce( 55 | topic, 56 | key=str(payment["user_id"]), 57 | value=json.dumps(payment), 58 | callback=delivery_callback, 59 | ) 60 | 61 | producer.poll(0) 62 | time.sleep(1) 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | amount = round(random.uniform(10.0, 1000.0), 2) 15 | payment_time = datetime.now().isoformat() 16 | 17 | payment_event = { 18 | "payment_id": payment_id, 19 | "user_id": user_id, 20 | "merchant_id": merchant_id, 21 | "amount": amount, 22 | "payment_time": payment_time 23 | } 24 | 
return payment_event 25 | 26 | 27 | def main(): 28 | 29 | config = { 30 | "bootstrap.servers": "localhost:9092" 31 | } 32 | 33 | producer = Producer(config) 34 | 35 | topic = "payments" 36 | 37 | def delivery_callback(err, msg): 38 | if err: 39 | print("ERROR: Message failed delivery: {}".format(err)) 40 | else: 41 | print( 42 | textwrap.dedent( 43 | f""" 44 | Produced event to topic {msg.topic()}: 45 | key = {msg.key().decode('utf-8')} 46 | value = {msg.value().decode('utf-8')} 47 | """) 48 | ) 49 | 50 | while True: 51 | payment = generate_payment() 52 | print(f"Sending payment: {payment}") 53 | 54 | producer.produce( 55 | topic, 56 | key=str(payment["user_id"]), 57 | value=json.dumps(payment), 58 | callback=delivery_callback, 59 | ) 60 | 61 | producer.poll(0) 62 | time.sleep(1) 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /07-kafka-streaming/01-kafka-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | def generate_order(): 10 | countries = [ 11 | "USA", 12 | "Canada", 13 | "UK", 14 | "Germany", 15 | "France", 16 | "Australia", 17 | "Japan", 18 | "Ireland", 19 | ] 20 | order = { 21 | "order_id": random.randint(1000, 9999), 22 | "customer_id": random.randint(1, 10), 23 | "total_price": round(random.uniform(20.0, 1000.0), 2), 24 | "customer_country": random.choice(countries), 25 | "merchant_country": random.choice(countries), 26 | "order_date": datetime.now().isoformat(), 27 | } 28 | return order 29 | 30 | def main(): 31 | 32 | config = { 33 | "bootstrap.servers": "localhost:9092" 34 | } 35 | 36 | producer = Producer(config) 37 | 38 | topic = "orders" 39 | 40 | def delivery_callback(err, msg): 41 | if err: 42 | print("ERROR: Message failed delivery: {}".format(err)) 43 | else: 44 | print( 45 | textwrap.dedent( 46 | f""" 47 | Produced event to topic {msg.topic()}: 48 | key = {msg.key().decode('utf-8')} 49 | value = {msg.value().decode('utf-8')} 50 | """) 51 | ) 52 | 53 | while True: 54 | order = generate_order() 55 | print(f"Sending order: {order}") 56 | 57 | producer.produce( 58 | topic, 59 | key=str(order["customer_id"]), 60 | value=json.dumps(order), 61 | callback=delivery_callback, 62 | ) 63 | 64 | producer.poll(0) 65 | 66 | time.sleep(1) 67 | 68 | if __name__ == "__main__": 69 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/orders-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | order_id = f"order-{random.randint(1000, 9999)}" 12 | customer_id = f"customer-{random.randint(1, 200)}" 13 | product_id = f"product-{random.randint(1, 10)}" 14 | quantity = random.randint(1, 5) 15 | price = round(random.uniform(5.0, 100.0), 2) 16 | order_time = datetime.now().isoformat() 17 | 18 | order_event = { 19 | "order_id": order_id, 20 | "customer_id": customer_id, 21 | "product_id": product_id, 22 | "quantity": quantity, 23 | "price": price, 24 | "order_time": order_time 25 | } 26 | return order_event 27 | 28 | 29 | def main(): 30 | 31 | config = { 32 | "bootstrap.servers": "localhost:9092" 33 | } 34 | 35 | producer = 
Producer(config) 36 | 37 | topic = "orders" 38 | 39 | def delivery_callback(err, msg): 40 | if err: 41 | print("ERROR: Message failed delivery: {}".format(err)) 42 | else: 43 | print( 44 | textwrap.dedent( 45 | f""" 46 | Produced event to topic {msg.topic()}: 47 | key = {msg.key().decode('utf-8')} 48 | value = {msg.value().decode('utf-8')} 49 | """) 50 | ) 51 | 52 | while True: 53 | order = generate_order() 54 | print(f"Sending order: {order}") 55 | 56 | producer.produce( 57 | topic, 58 | key=str(order["customer_id"]), 59 | value=json.dumps(order), 60 | callback=delivery_callback, 61 | ) 62 | 63 | producer.poll(0) 64 | 65 | time.sleep(1) 66 | 67 | if __name__ == "__main__": 68 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-exercise/customer_reviews_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 5 | from datetime import datetime, timedelta 6 | import os 7 | import csv 8 | 9 | 10 | @dag( 11 | "customer_reviews_dag", 12 | start_date=datetime(2025, 1, 1), 13 | schedule_interval="* * * * *", 14 | catchup=False, 15 | description="Review average score", 16 | ) 17 | def customer_reviews_dag(): 18 | 19 | @task 20 | def extract_reviews(): 21 | pg_hook = PostgresHook(postgres_conn_id="postgres_rental_site") 22 | 23 | context = get_current_context() 24 | execution_date = context["execution_date"] 25 | start_of_minute = execution_date.replace(second=0, microsecond=0) 26 | end_of_minute = start_of_minute + timedelta(hours=1) 27 | 28 | query = f""" 29 | SELECT review_id, listing_id, review_score, review_comment, review_date 30 | FROM customer_reviews 31 | WHERE review_date >= '{start_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 32 | AND review_date < '{end_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 33 | """ 34 | 35 | # TODO: Read data from Postgres, and write the results 36 | 37 | spark_etl = SparkSubmitOperator( 38 | task_id="spark_etl_reviews", 39 | application="dags/spark_etl_reviews.py", 40 | name="guest_reviews_etl", 41 | application_args=[ 42 | # TODO: Set input and output paths 43 | "--customer_reviews", "", 44 | "--output_path", "" 45 | ], 46 | conn_id='spark_rental_site', 47 | ) 48 | 49 | extract_task = extract_reviews() 50 | extract_task >> spark_etl 51 | 52 | dag_instance = customer_reviews_dag() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | 15 | if random.randint(1, 10) < 2: 16 | amount = round(random.uniform(10.0, 10000.0), 2) 17 | else: 18 | amount = round(random.uniform(10.0, 1000.0), 2) 19 | payment_time = datetime.now().isoformat() 20 | 21 | payment_event = { 22 | "payment_id": payment_id, 23 | "user_id": user_id, 24 | 
"merchant_id": merchant_id, 25 | "amount": amount, 26 | "payment_time": payment_time 27 | } 28 | return payment_event 29 | 30 | 31 | def main(): 32 | 33 | config = { 34 | "bootstrap.servers": "localhost:9092" 35 | } 36 | 37 | producer = Producer(config) 38 | 39 | topic = "payments" 40 | 41 | def delivery_callback(err, msg): 42 | if err: 43 | print("ERROR: Message failed delivery: {}".format(err)) 44 | else: 45 | print( 46 | textwrap.dedent( 47 | f""" 48 | Produced event to topic {msg.topic()}: 49 | key = {msg.key().decode('utf-8')} 50 | value = {msg.value().decode('utf-8')} 51 | """) 52 | ) 53 | 54 | while True: 55 | payment = generate_payment() 56 | print(f"Sending payment: {payment}") 57 | 58 | producer.produce( 59 | topic, 60 | key=str(payment["user_id"]), 61 | value=json.dumps(payment), 62 | callback=delivery_callback, 63 | ) 64 | 65 | producer.poll(0) 66 | time.sleep(1) 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/payments-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_payment(): 11 | payment_id = f"payment-{random.randint(1000, 9999)}" 12 | user_id = f"user-{random.randint(1, 50)}" 13 | merchant_id = f"merchant-{random.randint(1, 20)}" 14 | 15 | if random.randint(1, 10) < 2: 16 | amount = round(random.uniform(10.0, 10000.0), 2) 17 | else: 18 | amount = round(random.uniform(10.0, 1000.0), 2) 19 | payment_time = datetime.now().isoformat() 20 | 21 | payment_event = { 22 | "payment_id": payment_id, 23 | "user_id": user_id, 24 | "merchant_id": merchant_id, 25 | "amount": amount, 26 | "payment_time": payment_time 27 | } 28 | return payment_event 29 | 30 | 31 | def main(): 32 | 33 | config = { 34 | "bootstrap.servers": "localhost:9092" 35 | } 36 | 37 | producer = Producer(config) 38 | 39 | topic = "payments" 40 | 41 | def delivery_callback(err, msg): 42 | if err: 43 | print("ERROR: Message failed delivery: {}".format(err)) 44 | else: 45 | print( 46 | textwrap.dedent( 47 | f""" 48 | Produced event to topic {msg.topic()}: 49 | key = {msg.key().decode('utf-8')} 50 | value = {msg.value().decode('utf-8')} 51 | """) 52 | ) 53 | 54 | while True: 55 | payment = generate_payment() 56 | print(f"Sending payment: {payment}") 57 | 58 | producer.produce( 59 | topic, 60 | key=str(payment["user_id"]), 61 | value=json.dumps(payment), 62 | callback=delivery_callback, 63 | ) 64 | 65 | producer.poll(0) 66 | time.sleep(1) 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/kafka-connect-consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from confluent_kafka import Consumer, KafkaError, KafkaException 4 | 5 | conf = { 6 | "bootstrap.servers": "localhost:9092", 7 | "group.id": "postgres-price-consumer", 8 | "auto.offset.reset": "earliest", 9 | } 10 | 11 | # TODO: Read the "README.md" for instructions on how to set up this exercise 12 | 13 | def main(): 14 | consumer = Consumer(conf) 15 | 16 | topic = "postgres-.public.orders" 17 | consumer.subscribe([topic]) 18 | 19 | try: 20 | print(f"Consuming messages from topic '{topic}'") 21 | 
while True: 22 | msg = consumer.poll(1.0) 23 | 24 | if msg is None: 25 | continue 26 | if msg.error(): 27 | print(f"Error: {msg.error()}") 28 | continue 29 | 30 | process_message(msg) 31 | 32 | finally: 33 | consumer.close() 34 | 35 | 36 | def process_message(msg): 37 | # TODO: Process incoming WAL record 38 | # Print a string message if two conditions are true: 39 | # * If a message is for an update operation 40 | # * If an order status has changed from "processed" to "refunded" 41 | # 42 | # Note: If you go though the steps in the README.md, 43 | # each record will contain the "payload" object two fields: 44 | # * `before` - a snapshot of a database record before it was updated 45 | # * `after` - a snapshot of a database record after it was updated 46 | # 47 | # You will need to extract the "status" column values from both records and compare their values 48 | # 49 | # To get those field you should do something like this: 50 | # 51 | # ```py 52 | # before = wal_record["payload"]["before"] 53 | # after = wal_record["payload"]["after"] 54 | # ``` 55 | # 56 | # But keep in mind that one or both of these fields can be None for some events. 57 | pass 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /07-kafka-streaming/docker-compose-kafka-connect.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: EXTERNAL://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092,INTERNAL://kafka:19092 19 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 20 | KAFKA_LOG_RETENTION_HOURS: 168 21 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 23 | 24 | kafka-connect: 25 | image: debezium/connect:3.0.0.Final 26 | container_name: kafka-connect 27 | depends_on: 28 | - kafka 29 | ports: 30 | - "8083:8083" 31 | environment: 32 | BOOTSTRAP_SERVERS: kafka:19092 33 | GROUP_ID: "kafka-connect-group" 34 | CONFIG_STORAGE_TOPIC: _connect-configs 35 | OFFSET_STORAGE_TOPIC: _connect-offsets 36 | STATUS_STORAGE_TOPIC: _connect-status 37 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 38 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 39 | INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 40 | INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 41 | 42 | postgres: 43 | image: postgres:15 44 | container_name: postgres 45 | environment: 46 | POSTGRES_USER: user 47 | POSTGRES_PASSWORD: password 48 | POSTGRES_DB: onlineshop 49 | command: 50 | - "postgres" 51 | - "-c" 52 | - "wal_level=logical" 53 | ports: 54 | - "5432:5432" 55 | 56 | -------------------------------------------------------------------------------- /08-flink-stream-processing/README.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | 3 | - Python 3.11 (Try to use this version if you have errors) 4 | - Java (required for running 
Flink) 5 | 6 | --- 7 | 8 | ## Create and Activate a Virtual Environment 9 | 10 | ```sh 11 | python3.11 -m venv venv 12 | source ./venv/bin/activate 13 | ``` 14 | 15 | --- 16 | 17 | ## Install Required Python Packages 18 | 19 | Install the Confluent Kafka client and Apache Flink with: 20 | 21 | ```sh 22 | pip install confluent-kafka 23 | pip install apache-flink 24 | ``` 25 | 26 | --- 27 | 28 | ## Configure `flink` CLI Command 29 | 30 | Determine the Flink installation path and update the `PATH` environment variable: 31 | 32 | ```sh 33 | FLINK_HOME=$(./venv/bin/find_flink_home.py) 34 | export PATH=$PATH:$FLINK_HOME/bin 35 | ``` 36 | 37 | Verify the installation: 38 | 39 | ```sh 40 | flink --version 41 | ``` 42 | 43 | --- 44 | 45 | ## Download the Kafka Connector 46 | 47 | Search for **`flink-sql-connector-kafka maven`** in your browser and download the latest available JAR file. Place it in the project directory. 48 | 49 | --- 50 | 51 | ## Sample Flink Application 52 | 53 | Create a Python file named `01-flink-hello-world.py` with the following content: 54 | 55 | ```python 56 | from pyflink.datastream import StreamExecutionEnvironment 57 | 58 | def main(): 59 | env = StreamExecutionEnvironment.get_execution_environment() 60 | data_stream = env.from_collection([1, 2, 3, 4, 5]) 61 | 62 | mapped_stream = data_stream.map(lambda x: x * 2) 63 | 64 | mapped_stream.print() 65 | 66 | env.execute("Flink Hello World") 67 | 68 | if __name__ == "__main__": 69 | main() 70 | ``` 71 | 72 | --- 73 | 74 | ## Run the Application 75 | 76 | Run the Flink application locally with the Kafka connector: 77 | 78 | ```sh 79 | flink run \ 80 | --python 01-flink-hello-world.py \ 81 | --target local \ 82 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 83 | ``` 84 | 85 | --- 86 | 87 | Flink is now ready and running locally. You can build on this setup in future demos using real-time Kafka streams. 
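---

## Preview: Reading from Kafka

The sketch below is a minimal preview of how the same setup can read from a Kafka topic instead of a static collection. It assumes the Kafka broker from this module's `docker-compose.yml` is running on `localhost:9092` and that one of the producers in this folder (for example `orders-producer.py`) is writing JSON events to the `orders` topic; the group ID and the suggested file name `kafka-preview.py` are just placeholders. Run it with the same `flink run` command as above, pointing `--python` at this file.

```python
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.watermark_strategy import WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.kafka import KafkaSource


def main():
    env = StreamExecutionEnvironment.get_execution_environment()

    # Read raw JSON strings from the "orders" topic on the local broker.
    source = KafkaSource.builder() \
        .set_bootstrap_servers("localhost:9092") \
        .set_topics("orders") \
        .set_group_id("flink-readme-preview") \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()

    stream = env.from_source(
        source,
        watermark_strategy=WatermarkStrategy.no_watermarks(),
        source_name="kafka_source",
    )

    # Print each record as it arrives; later demos parse and aggregate these events.
    stream.print()

    env.execute("Kafka source preview")


if __name__ == "__main__":
    main()
```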
-------------------------------------------------------------------------------- /08-flink-stream-processing/variable-spend-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | customer_id_num = random.randint(1, 50) 12 | order_id = f"order-{random.randint(1000, 9999)}" 13 | customer_id = f"customer-{customer_id_num}" 14 | product_id = f"product-{random.randint(1, 200)}" 15 | quantity = random.randint(1, 5) 16 | 17 | if customer_id_num < 40: 18 | price = round(random.uniform(5.0, 100.0), 2) 19 | else: 20 | price = round(random.uniform(200.0, 300.0), 2) 21 | order_time = datetime.now().isoformat() 22 | 23 | order_event = { 24 | "order_id": order_id, 25 | "customer_id": customer_id, 26 | "product_id": product_id, 27 | "quantity": quantity, 28 | "price": price, 29 | "order_time": order_time 30 | } 31 | return order_event 32 | 33 | 34 | def main(): 35 | 36 | config = { 37 | "bootstrap.servers": "localhost:9092" 38 | } 39 | 40 | producer = Producer(config) 41 | 42 | topic = "orders" 43 | 44 | def delivery_callback(err, msg): 45 | if err: 46 | print("ERROR: Message failed delivery: {}".format(err)) 47 | else: 48 | print( 49 | textwrap.dedent( 50 | f""" 51 | Produced event to topic {msg.topic()}: 52 | key = {msg.key().decode('utf-8')} 53 | value = {msg.value().decode('utf-8')} 54 | """) 55 | ) 56 | 57 | while True: 58 | order = generate_order() 59 | print(f"Sending order: {order}") 60 | 61 | producer.produce( 62 | topic, 63 | key=str(order["customer_id"]), 64 | value=json.dumps(order), 65 | callback=delivery_callback, 66 | ) 67 | 68 | producer.poll(0) 69 | 70 | time.sleep(1) 71 | 72 | if __name__ == "__main__": 73 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-solution/wiki-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | from confluent_kafka import Producer 5 | from sseclient import SSEClient 6 | 7 | producer_conf = {"bootstrap.servers": "localhost:9092"} 8 | kafka_topic = "wikipedia-changes" 9 | 10 | 11 | def delivery_callback(err, msg): 12 | if err: 13 | print("ERROR: Message failed delivery: {}".format(err)) 14 | else: 15 | print( 16 | textwrap.dedent(f""" 17 | Produced event to topic {msg.topic()}: 18 | key = {msg.key().decode('utf-8')} 19 | value = {msg.value().decode('utf-8')} 20 | """) 21 | ) 22 | 23 | 24 | def main(): 25 | url = "https://stream.wikimedia.org/v2/stream/recentchange" 26 | 27 | print( 28 | f"Starting to consume Wikipedia recent changes from {url} and produce to Kafka topic '{kafka_topic}'..." 
29 | ) 30 | 31 | producer = Producer(producer_conf) 32 | messages = SSEClient(url) 33 | 34 | for event in messages: 35 | if event.event == "message" and event.data: 36 | try: 37 | data = json.loads(event.data) 38 | except json.JSONDecodeError: 39 | continue 40 | 41 | id = data.get("id") 42 | message = { 43 | "id": id, 44 | "type": data.get("type"), 45 | "title": data.get("title"), 46 | "user": data.get("user"), 47 | "bot": data.get("bot"), 48 | "timestamp": data.get("timestamp"), 49 | "comment": data.get("comment"), 50 | "minor": data.get("minor", False), 51 | } 52 | 53 | value = json.dumps(message) 54 | producer.produce( 55 | topic=kafka_topic, 56 | key=str(id), 57 | value=value, 58 | callback=delivery_callback, 59 | ) 60 | producer.poll(0) 61 | 62 | producer.flush() 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 16 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092,INTERNAL://kafka:19092 19 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 23 | 24 | kafka-connect: 25 | image: debezium/connect:3.0.0.Final 26 | container_name: kafka-connect 27 | depends_on: 28 | - kafka 29 | ports: 30 | - "8083:8083" 31 | environment: 32 | BOOTSTRAP_SERVERS: kafka:19092 33 | GROUP_ID: "kafka-connect-group" 34 | CONFIG_STORAGE_TOPIC: _connect-configs 35 | OFFSET_STORAGE_TOPIC: _connect-offsets 36 | STATUS_STORAGE_TOPIC: _connect-status 37 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 38 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 39 | INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 40 | INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 41 | 42 | postgres: 43 | image: postgres:15 44 | container_name: postgres 45 | environment: 46 | POSTGRES_USER: user 47 | POSTGRES_PASSWORD: password 48 | POSTGRES_DB: onlineshop 49 | command: 50 | - "postgres" 51 | - "-c" 52 | - "wal_level=logical" 53 | ports: 54 | - "5432:5432" 55 | 56 | -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-solution/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | 4 | kafka: 5 | image: confluentinc/cp-kafka:7.8.0 6 | hostname: kafka 7 | container_name: kafka 8 | ports: 9 | - "9092:9092" 10 | environment: 11 | KAFKA_KRAFT_MODE: "true" 12 | KAFKA_PROCESS_ROLES: broker,controller 13 | KAFKA_NODE_ID: 1 14 | KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 15 | KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093,INTERNAL://0.0.0.0:19092 16 | 
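      # The listeners above and below split traffic: PLAINTEXT (9092) is advertised as
      # localhost:9092 for clients running on the host, CONTROLLER (9093) carries KRaft
      # controller traffic, and INTERNAL (19092) is advertised as kafka:19092 for other
      # containers such as kafka-connect.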
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT,INTERNAL:PLAINTEXT 17 | KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER 18 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092,INTERNAL://kafka:19092 19 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 20 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 21 | KAFKA_LOG_RETENTION_HOURS: 168 22 | CLUSTER_ID: "d33bd245-a018-40a3-91b1-b82cf89d9e1c" 23 | 24 | kafka-connect: 25 | image: debezium/connect:3.0.0.Final 26 | container_name: kafka-connect 27 | depends_on: 28 | - kafka 29 | ports: 30 | - "8083:8083" 31 | environment: 32 | BOOTSTRAP_SERVERS: kafka:19092 33 | GROUP_ID: "kafka-connect-group" 34 | CONFIG_STORAGE_TOPIC: _connect-configs 35 | OFFSET_STORAGE_TOPIC: _connect-offsets 36 | STATUS_STORAGE_TOPIC: _connect-status 37 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 38 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 39 | INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 40 | INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 41 | 42 | postgres: 43 | image: postgres:15 44 | container_name: postgres 45 | environment: 46 | POSTGRES_USER: user 47 | POSTGRES_PASSWORD: password 48 | POSTGRES_DB: onlineshop 49 | command: 50 | - "postgres" 51 | - "-c" 52 | - "wal_level=logical" 53 | ports: 54 | - "5432:5432" 55 | 56 | -------------------------------------------------------------------------------- /08-flink-stream-processing/popular-products-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime, timedelta 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | 12 | order_id = f"order-{random.randint(1000, 9999)}" 13 | customer_id = f"customer-{random.randint(1, 200)}" 14 | 15 | product_id_num = random.randint(1, 10) 16 | product_id = f"product-{product_id_num}" 17 | 18 | if product_id_num > 3: 19 | quantity = random.randint(1, 5) 20 | else: 21 | quantity = random.randint(1, 20) 22 | 23 | price = round(random.uniform(5.0, 100.0), 2) 24 | current_time = datetime.now() 25 | 26 | order_event = { 27 | "order_id": order_id, 28 | "customer_id": customer_id, 29 | "product_id": product_id, 30 | "quantity": quantity, 31 | "price": price, 32 | "order_time": current_time.isoformat() 33 | } 34 | return order_event 35 | 36 | 37 | def main(): 38 | 39 | config = { 40 | "bootstrap.servers": "localhost:9092" 41 | } 42 | 43 | producer = Producer(config) 44 | 45 | topic = "orders" 46 | 47 | def delivery_callback(err, msg): 48 | if err: 49 | print("ERROR: Message failed delivery: {}".format(err)) 50 | else: 51 | print( 52 | textwrap.dedent( 53 | f""" 54 | Produced event to topic {msg.topic()}: 55 | key = {msg.key().decode('utf-8')} 56 | value = {msg.value().decode('utf-8')} 57 | """) 58 | ) 59 | 60 | while True: 61 | order = generate_order() 62 | print(f"Sending order: {order}") 63 | 64 | producer.produce( 65 | topic, 66 | key=str(order["customer_id"]), 67 | value=json.dumps(order), 68 | callback=delivery_callback, 69 | ) 70 | 71 | producer.poll(0) 72 | 73 | time.sleep(1) 74 | 75 | if __name__ == "__main__": 76 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/late-events-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from 
datetime import datetime, timedelta 6 | 7 | from confluent_kafka import Producer 8 | 9 | 10 | def generate_order(): 11 | order_id = f"order-{random.randint(1000, 9999)}" 12 | customer_id = f"customer-{random.randint(1, 10)}" 13 | product_id = f"product-{random.randint(1, 200)}" 14 | quantity = random.randint(1, 5) 15 | price = round(random.uniform(5.0, 100.0), 2) 16 | current_time = datetime.now() 17 | 18 | if random.random() < 0.2: 19 | late_by = random.randint(60, 120) 20 | event_time = current_time - timedelta(seconds=late_by) 21 | else: 22 | event_time = current_time 23 | 24 | order_event = { 25 | "order_id": order_id, 26 | "customer_id": customer_id, 27 | "product_id": product_id, 28 | "quantity": quantity, 29 | "price": price, 30 | "order_time": event_time.isoformat() 31 | } 32 | return order_event 33 | 34 | 35 | def main(): 36 | 37 | config = { 38 | "bootstrap.servers": "localhost:9092" 39 | } 40 | 41 | producer = Producer(config) 42 | 43 | topic = "late-orders" 44 | 45 | def delivery_callback(err, msg): 46 | if err: 47 | print("ERROR: Message failed delivery: {}".format(err)) 48 | else: 49 | print( 50 | textwrap.dedent( 51 | f""" 52 | Produced event to topic {msg.topic()}: 53 | key = {msg.key().decode('utf-8')} 54 | value = {msg.value().decode('utf-8')} 55 | """) 56 | ) 57 | 58 | while True: 59 | order = generate_order() 60 | print(f"Sending order: {order}") 61 | 62 | producer.produce( 63 | topic, 64 | key=str(order["customer_id"]), 65 | value=json.dumps(order), 66 | callback=delivery_callback, 67 | ) 68 | 69 | producer.poll(0) 70 | 71 | time.sleep(1) 72 | 73 | if __name__ == "__main__": 74 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/01-average_page_visits.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | import json 4 | import random 5 | 6 | from airflow.decorators import dag, task 7 | from airflow.operators.python import get_current_context 8 | 9 | @dag( 10 | "average_page_visits", 11 | start_date=datetime(2025, 1, 1), 12 | schedule_interval="* * * * *", 13 | catchup=False, 14 | description="" 15 | ) 16 | def average_page_visits(): 17 | 18 | def get_data_path(): 19 | context = get_current_context() 20 | execution_date = context["execution_date"] 21 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 22 | return f"/tmp/page_visits/{file_date}.json" 23 | 24 | @task 25 | def produce_page_visits_data(): 26 | 27 | page_visits = [ 28 | {"id": 1, "name": "Cozy Apartment", "price": 120, "page_visits": random.randint(0, 50)}, 29 | {"id": 2, "name": "Luxury Condo", "price": 300, "page_visits": random.randint(0, 50)}, 30 | {"id": 3, "name": "Modern Studio", "price": 180, "page_visits": random.randint(0, 50)}, 31 | {"id": 4, "name": "Charming Loft", "price": 150, "page_visits": random.randint(0, 50)}, 32 | {"id": 5, "name": "Spacious Villa", "price": 400, "page_visits": random.randint(0, 50)}, 33 | ] 34 | file_path = get_data_path() 35 | 36 | directory = os.path.dirname(file_path) 37 | if not os.path.exists(directory): 38 | os.makedirs(directory) 39 | 40 | with open(file_path, "w") as f: 41 | json.dump(page_visits, f) 42 | 43 | print(f"Written to file: {file_path}") 44 | 45 | @task 46 | def process_page_visits_data(): 47 | file_path = get_data_path() 48 | 49 | with open(file_path, "r") as f: 50 | page_visits = json.load(f) 51 | 52 | average_price = sum(page_visit["page_visits"] for page_visit in page_visits) / 
len(page_visits) 53 | print(f"Average number of page visits {average_price}") 54 | 55 | produce_page_visits_data() >> process_page_visits_data() 56 | 57 | demo_dag = average_page_visits() -------------------------------------------------------------------------------- /07-kafka-streaming/order.py: -------------------------------------------------------------------------------- 1 | 2 | class Order: 3 | def __init__( 4 | self, 5 | order_id, 6 | customer_id, 7 | total_price, 8 | customer_country, 9 | merchant_country, 10 | order_datetime, 11 | ): 12 | self.order_id = order_id 13 | self.customer_id = customer_id 14 | self.total_price = total_price 15 | self.customer_country = customer_country 16 | self.merchant_country = merchant_country 17 | self.order_datetime = order_datetime 18 | 19 | 20 | @staticmethod 21 | def from_dict(obj): 22 | return Order( 23 | order_id=obj["order_id"], 24 | customer_id=obj["customer_id"], 25 | total_price=obj["total_price"], 26 | customer_country=obj["customer_country"], 27 | merchant_country=obj["merchant_country"], 28 | order_datetime=obj["order_datetime"], 29 | ) 30 | 31 | def to_dict(self): 32 | return { 33 | "order_id": self.order_id, 34 | "customer_id": self.customer_id, 35 | "total_price": self.total_price, 36 | "customer_country": self.customer_country, 37 | "merchant_country": self.merchant_country, 38 | "order_datetime": self.order_datetime, 39 | } 40 | 41 | def __str__(self): 42 | return (f"Order(" 43 | f"order_id={self.order_id}, " 44 | f"customer_id={self.customer_id}, " 45 | f"total_price={self.total_price}, " 46 | f"customer_country='{self.customer_country}', " 47 | f"merchant_country='{self.merchant_country}', " 48 | f"order_datetime='{self.order_datetime}')") 49 | 50 | 51 | ORDER_SCHEMA = { 52 | "type": "record", 53 | "name": "Order", 54 | "fields": [ 55 | {"name": "order_id", "type": "int"}, 56 | {"name": "customer_id", "type": "string"}, 57 | {"name": "total_price", "type": "float"}, 58 | {"name": "customer_country", "type": "string"}, 59 | {"name": "merchant_country", "type": "string"}, 60 | {"name": "order_datetime", "type": "string"}, 61 | ], 62 | } -------------------------------------------------------------------------------- /07-kafka-streaming/05-kafka-schema-registry-consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from order import Order, ORDER_SCHEMA 5 | 6 | from confluent_kafka import Consumer, KafkaError 7 | from confluent_kafka.schema_registry import SchemaRegistryClient 8 | from confluent_kafka.schema_registry.avro import AvroDeserializer 9 | from confluent_kafka.serialization import MessageField, SerializationContext 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(description="Test Kafka consumer") 13 | parser.add_argument("--group-id", "-g", help="Consumer group ID") 14 | parser.add_argument("--topic-name", "-t", help="Topic name ") 15 | 16 | args = parser.parse_args() 17 | 18 | group_id = args.group_id 19 | topic_name = args.topic_name 20 | 21 | schema_registry_conf = {"url": "http://localhost:8081"} 22 | schema_registry_client = SchemaRegistryClient(schema_registry_conf) 23 | 24 | avro_deserializer = AvroDeserializer( 25 | schema_registry_client, 26 | json.dumps(ORDER_SCHEMA), 27 | lambda obj, ctx: Order.from_dict(obj), 28 | ) 29 | 30 | config = { 31 | "bootstrap.servers": "localhost:9092", 32 | "group.id": group_id, 33 | "auto.offset.reset": "earliest", 34 | } 35 | 36 | consumer = Consumer(config) 37 | 
consumer.subscribe([topic_name]) 38 | 39 | print(f"Starting consumer with group ID '{group_id}'") 40 | 41 | try: 42 | while True: 43 | msg = consumer.poll(timeout=1.0) 44 | if msg is None: 45 | # No new messages 46 | continue 47 | if msg.error(): 48 | # Error while reading 49 | print(f"Error encountered: {msg.error()}") 50 | continue 51 | 52 | process_message(avro_deserializer, msg) 53 | 54 | finally: 55 | consumer.close() 56 | 57 | 58 | def process_message(avro_deserializer, msg): 59 | order = avro_deserializer( 60 | msg.value(), SerializationContext(msg.topic(), MessageField.VALUE) 61 | ) 62 | if order.total_price < 250: 63 | return 64 | 65 | print(f"Received order price={order.total_price}") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/02-average_page_visits_with_failures.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | import json 4 | import random 5 | 6 | from airflow.decorators import dag, task 7 | from airflow.operators.python import get_current_context 8 | 9 | @dag( 10 | "average_page_visits", 11 | start_date=datetime(2025, 1, 1), 12 | schedule_interval="* * * * *", 13 | catchup=False, 14 | description="" 15 | ) 16 | def average_page_visits(): 17 | 18 | def get_data_path(): 19 | context = get_current_context() 20 | execution_date = context["execution_date"] 21 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 22 | return f"/tmp/page_visits/{file_date}.json" 23 | 24 | @task 25 | def produce_page_visits_data(): 26 | 27 | if random.random() < 0.5: 28 | raise Exception("Job has failed") 29 | 30 | page_visits = [ 31 | {"id": 1, "name": "Cozy Apartment", "price": 120, "page_visits": random.randint(0, 50)}, 32 | {"id": 2, "name": "Luxury Condo", "price": 300, "page_visits": random.randint(0, 50)}, 33 | {"id": 3, "name": "Modern Studio", "price": 180, "page_visits": random.randint(0, 50)}, 34 | {"id": 4, "name": "Charming Loft", "price": 150, "page_visits": random.randint(0, 50)}, 35 | {"id": 5, "name": "Spacious Villa", "price": 400, "page_visits": random.randint(0, 50)}, 36 | ] 37 | file_path = get_data_path() 38 | 39 | directory = os.path.dirname(file_path) 40 | if not os.path.exists(directory): 41 | os.makedirs(directory) 42 | 43 | with open(file_path, "w") as f: 44 | json.dump(page_visits, f) 45 | 46 | print(f"Written to file: {file_path}") 47 | 48 | @task 49 | def process_page_visits_data(): 50 | file_path = get_data_path() 51 | 52 | with open(file_path, "r") as f: 53 | page_visits = json.load(f) 54 | 55 | average_price = sum(page_visit["page_visits"] for page_visit in page_visits) / len(page_visits) 56 | print(f"Average number of page visits {average_price}") 57 | 58 | produce_page_visits_data() >> process_page_visits_data() 59 | 60 | demo_dag = average_page_visits() -------------------------------------------------------------------------------- /06-data-eng-with-llms/03-spark-llms.py: -------------------------------------------------------------------------------- 1 | %pip install transformers 2 | %pip install outlines 3 | 4 | # ----------------------- 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | spark = SparkSession.builder \ 9 | .appName("ReviewsClassifier") \ 10 | .master("local[*]") \ 11 | .getOrCreate() 12 | 13 | spark 14 | 15 | 16 | # ------------------------ 17 | 18 | reviews = [ 19 | (1, "This is absolutely delightful!"), 20 | (2, "This was the 
worst hotel I've ever seen"), 21 | (3, "Great location but the rooms were dirty."), 22 | (4, "Staff were friendly and helpful."), 23 | (5, "Mediocre breakfast, but I'd stay again."), 24 | ] 25 | df = spark.createDataFrame( 26 | reviews, 27 | ["review_id", "review"] 28 | ) 29 | df.show() 30 | 31 | # ------------------------ 32 | 33 | import outlines 34 | import json 35 | import torch 36 | 37 | model_name = "mistralai/Mistral-7B-Instruct-v0.3" 38 | 39 | schema = json.dumps({ 40 | "type": "object", 41 | "properties": { 42 | "sentiment": { 43 | "type": "string", 44 | "enum": ["positive", "negative"] 45 | } 46 | }, 47 | "required": ["sentiment"] 48 | }) 49 | 50 | def classify(generate_json, review): 51 | prompt = ( 52 | "Classify the following customer review as positive or negative.\n\n" 53 | f"Review:\n{review}\n" 54 | ) 55 | output_json = generate_json(prompt, max_tokens=40) 56 | return output_json['sentiment'] 57 | 58 | # ------------------------------------------ 59 | 60 | from pyspark.sql.functions import udf 61 | from functools import cache 62 | from huggingface_hub import login 63 | 64 | @udf("string") 65 | def sentiment_udf(review): 66 | 67 | @cache 68 | def get_generate_json(): 69 | login(token="hf_...") 70 | generator = outlines.models.transformers( 71 | model_name, 72 | device="cuda", 73 | model_kwargs={"torch_dtype": torch.float16}, 74 | ) 75 | return outlines.generate.json(generator, schema) 76 | 77 | generate_json = get_generate_json() 78 | 79 | return classify(generate_json, review) 80 | 81 | 82 | result_df = df.withColumn("sentiment", sentiment_udf("review")) 83 | result_df.show(truncate=False) -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/01-wikipedia-stream-exercise/wiki-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | from confluent_kafka import Producer 5 | from sseclient import SSEClient 6 | 7 | producer_conf = {"bootstrap.servers": "localhost:9092"} 8 | kafka_topic = "wikipedia-changes" 9 | 10 | # TODO: Before running a producer. 11 | # 1. Install Kafka CLI tools 12 | # 13 | # ``` 14 | # brew install kafka 15 | # ``` 16 | # 17 | # 2. Start Kafka 18 | # 19 | # ``` 20 | # docker-compose up 21 | # ``` 22 | # 23 | # 3. Create a virtual environment and install dependencies 24 | # 25 | # ``` 26 | # python3 -m venv venv 27 | # source venv/bin/activate 28 | # pip install -r requirements.txt 29 | # ``` 30 | 31 | # TODO: Read docs about Wikipedia edit stream: https://www.mediawiki.org/wiki/Manual:RCFeed 32 | 33 | def delivery_report(err, msg): 34 | if err: 35 | print("ERROR: Message failed delivery: {}".format(err)) 36 | else: 37 | print( 38 | textwrap.dedent(f""" 39 | Produced event to topic {msg.topic()}: 40 | key = {msg.key().decode('utf-8')} 41 | value = {msg.value().decode('utf-8')} 42 | """) 43 | ) 44 | 45 | 46 | def main(): 47 | url = "https://stream.wikimedia.org/v2/stream/recentchange" 48 | 49 | print( 50 | f"Starting to consume Wikipedia recent changes from {url} and produce to Kafka topic '{kafka_topic}'..." 
51 | ) 52 | 53 | producer = Producer(producer_conf) 54 | messages = SSEClient(url) 55 | 56 | for event in messages: 57 | if event.event == "message" and event.data: 58 | try: 59 | data = json.loads(event.data) 60 | except json.JSONDecodeError: 61 | continue 62 | 63 | print(data) 64 | 65 | # TODO: Produce a Kafka messages from a Wikistream update message 66 | # * Parse the input message 67 | # * Extract fields you need to write 68 | # * Create a JSON object for a new Kafka even 69 | # * Write a messages to a Kafka topic 70 | # 71 | # To test your producer, run the following command: 72 | # 73 | # ``` 74 | # kafka-console-consumer --bootstrap-server localhost:9092 --topic wikipedia-changes --from-beginning 75 | # ``` 76 | 77 | producer.flush() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /03-data-lake/README.md: -------------------------------------------------------------------------------- 1 | # AWS Setup for Data Lake 2 | 3 | This README describes how to configure your AWS account and credentials to enable programmatic access via the AWS CLI. This setup will be used for working with services like Amazon S3 in the data lake section. 4 | 5 | --- 6 | 7 | ## Prerequisites 8 | 9 | - An [AWS account](https://aws.amazon.com/) (sign-up requires a valid payment method) 10 | - [AWS CLI installed](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) 11 | 12 | --- 13 | 14 | ## Step 1: Sign In to AWS 15 | 16 | 1. Go to the [AWS Management Console](https://aws.amazon.com/console/). 17 | 2. Log in as the **root user** (only for initial setup). 18 | 3. Navigate to the **IAM (Identity and Access Management)** service using the search bar. 19 | 20 | --- 21 | 22 | ## Step 2: Create an IAM User 23 | 24 | 1. In the IAM dashboard, go to **Users** → **Create user**. 25 | 2. Enter a username, e.g., `rental-website-admin`. 26 | 3. Under **Permissions**, select **Attach existing policies directly** and choose **AdministratorAccess** for full access. 27 | - Alternatively, you can choose more restricted permissions (e.g., S3-only or read-only). 28 | 4. Continue through the wizard and create the user. 29 | 30 | --- 31 | 32 | ## Step 3: Generate Programmatic Credentials 33 | 34 | 1. After creating the user, go to the **Security credentials** tab. 35 | 2. Under **Access keys**, click **Create access key**. 36 | 3. Choose **Command Line Interface (CLI)** as the use case. 37 | 4. Click **Create access key**. 38 | 5. Copy both the **Access key ID** and **Secret access key**. 39 | 40 | > ⚠️ The secret key is shown only once. Store it securely. 41 | 42 | --- 43 | 44 | ## Step 4: Set Up Local AWS Credentials 45 | 46 | Create the AWS credentials file: 47 | 48 | ```sh 49 | vim ~/.aws/credentials 50 | ``` 51 | 52 | Paste the following into the file, replacing the placeholders with your actual keys: 53 | 54 | ```ini 55 | [default] 56 | aws_access_key_id = YOUR_ACCESS_KEY_ID 57 | aws_secret_access_key = YOUR_SECRET_ACCESS_KEY 58 | ``` 59 | 60 | Save and close the file. 61 | 62 | --- 63 | 64 | ## Step 5: Test Your Configuration 65 | 66 | Run the following command to verify that your credentials are working: 67 | 68 | ```sh 69 | aws s3 ls 70 | ``` 71 | 72 | You should see a list of accessible S3 buckets (or an empty list if you have none yet). 73 | 74 | --- 75 | 76 | You're now ready to use the AWS CLI to interact with AWS services. 
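To go one step further than listing buckets, you can create a bucket and copy a file into it with the same CLI. The names below are only placeholders (S3 bucket names are globally unique, and the file can be any local file you want to test with):

```sh
aws s3 mb s3://my-rental-data-lake
aws s3 cp ./example.json s3://my-rental-data-lake/raw/example.json
aws s3 ls s3://my-rental-data-lake/raw/
```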
-------------------------------------------------------------------------------- /02-data-processing-with-spark/02-reading-airbnb-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c219df52-cb8d-4431-831a-3751a69062f2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Read Inside Airbnb data\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "ec00f92d-c582-4970-9617-ff0a9852cc45", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\", \n", 28 | " quote='\"',\n", 29 | " escape='\"', \n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\" \n", 32 | ")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "173f066e-9e4c-40bf-9b30-befcf4b0e4ec", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "for field in listings.schema:\n", 43 | " print(field)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "1bf33965-8cb2-48b6-bea0-d1da7de3382f", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "neighbourhoods = listings.select(listings.neighbourhood_cleansed)\n", 54 | "neighbourhoods.show(20)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "d108827d-ad80-43dc-9648-ef8b21c08a49", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "neighbourhoods = listings.select(listings.neighbourhood_cleansed)\n", 65 | "neighbourhoods.show(20, truncate=False)" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.12.7" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 5 90 | } 91 | -------------------------------------------------------------------------------- /06-data-eng-with-llms/README.md: -------------------------------------------------------------------------------- 1 | # Databricks Workspace Setup 2 | 3 | This README describes how to create and configure a Databricks workspace on AWS, set up compute, and prepare the environment for running notebooks with GPU acceleration and Hugging Face models. 4 | 5 | --- 6 | 7 | ## Prerequisites 8 | 9 | - An AWS account with administrator access 10 | - A Hugging Face account with an access token 11 | 12 | --- 13 | 14 | ## Step 1: Create a Databricks Workspace 15 | 16 | 1. Go to [https://databricks.com](https://databricks.com) and sign in 17 | 2. Go to **Manage Account** → **Workspaces**. 18 | 3. Click **Create Workspace**. 19 | 4. You’ll be redirected to an AWS login page. Sign in using your AWS root or IAM credentials. 20 | 5. AWS will ask you to create a **CloudFormation stack**. A stack is a group of AWS resources (e.g., EC2, S3) that Databricks uses to manage compute. 21 | 6. Acknowledge the required permissions and click **Create stack**. 22 | 7. 
Wait for the stack creation to complete. The workspace will then appear in your Databricks account. 23 | 24 | --- 25 | 26 | ## Step 2: Create a Compute Cluster 27 | 28 | 1. Go to your Databricks workspace. 29 | 2. Navigate to **Compute** → **Create Compute**. 30 | 3. Choose a **GPU-accelerated instance**. 31 | 4. Set the number of Spark workers: 32 | - **Min workers:** 0 33 | - **Max workers:** 1 34 | (This ensures a worker is only created when Spark is actually used.) 35 | 5. Create the compute cluster. 36 | 37 | --- 38 | 39 | ## Step 3: Create a Notebook 40 | 41 | 1. In the workspace, go to **Workspace** → **Create** → **Notebook**. 42 | 2. Name it something like: `llm-classification`. 43 | 3. Select the compute cluster you just created as the execution environment. 44 | 45 | --- 46 | 47 | ## Step 4: Configure Hugging Face Token 48 | 49 | To download models from Hugging Face, you need an access token. 50 | 51 | 1. Go to [https://huggingface.co](https://huggingface.co). 52 | 2. Click on your **User icon** → **Access Tokens**. 53 | 3. Create a new token and copy it. 54 | 55 | In your Databricks notebook, run the following: 56 | 57 | ```python 58 | %pip install transformers 59 | 60 | from huggingface_hub import login 61 | 62 | login(token="hf_...") 63 | ``` 64 | 65 | > ⚠️ Do **not** hardcode your token in production environments. Use Databricks Secrets instead. This example uses a token directly for simplicity in a demo setting. 66 | 67 | --- 68 | 69 | You're now ready to run notebooks using Spark and Hugging Face models in Databricks. -------------------------------------------------------------------------------- /07-kafka-streaming/04-kafka-schema-registry-producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import textwrap 4 | import time 5 | from datetime import datetime 6 | from order import Order, ORDER_SCHEMA 7 | 8 | from confluent_kafka import Producer 9 | from confluent_kafka.schema_registry import SchemaRegistryClient 10 | from confluent_kafka.schema_registry.avro import AvroSerializer 11 | from confluent_kafka.serialization import ( 12 | MessageField, 13 | SerializationContext, 14 | ) 15 | 16 | 17 | def generate_order(): 18 | countries = [ 19 | "USA", 20 | "Canada", 21 | "UK", 22 | "Germany", 23 | "France", 24 | "Australia", 25 | "Japan", 26 | "Ireland", 27 | ] 28 | return Order( 29 | order_id=random.randint(1000, 9999), 30 | customer_id='123', 31 | total_price=round(random.uniform(20.0, 1000.0), 2), 32 | customer_country=random.choice(countries), 33 | merchant_country=random.choice(countries), 34 | order_datetime=datetime.now().isoformat() 35 | ) 36 | 37 | 38 | def main(): 39 | schema_registry_config = { 40 | "url": "http://localhost:8081" 41 | } 42 | schema_registry_client = SchemaRegistryClient(schema_registry_config) 43 | 44 | avro_serializer = AvroSerializer( 45 | schema_registry_client, 46 | json.dumps(ORDER_SCHEMA), 47 | lambda obj, ctx: obj.to_dict() 48 | ) 49 | 50 | producer_config = { 51 | "bootstrap.servers": "localhost:9092", 52 | "acks": "all" 53 | } 54 | 55 | producer = Producer(producer_config) 56 | 57 | topic = "orders.avro" 58 | 59 | def delivery_callback(err, msg): 60 | if err: 61 | print("ERROR: Message failed delivery: {}".format(err)) 62 | else: 63 | print( 64 | textwrap.dedent( 65 | f""" 66 | Produced event to topic {msg.topic()}: 67 | key = {msg.key().decode('utf-8')} 68 | """) 69 | ) 70 | 71 | while True: 72 | order = generate_order() 73 | print(f"Sending order: {order}") 74 | 
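        # The Avro serializer below turns the Order (via its to_dict method) into Avro bytes
        # that match ORDER_SCHEMA; on first use it registers the schema with the Schema
        # Registry and embeds the returned schema ID in the payload, which is how the
        # consumer in 05-kafka-schema-registry-consumer.py knows how to decode it.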
75 | serialized_data = avro_serializer( 76 | order, SerializationContext(topic, MessageField.VALUE) 77 | ) 78 | producer.produce( 79 | topic, 80 | key=str(order.order_id).encode(), 81 | value=serialized_data, 82 | callback=delivery_callback, 83 | ) 84 | 85 | producer.poll(0) 86 | 87 | time.sleep(1) 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 0. Stop all Docker containers 4 | 5 | Before you start, stop any Docker containers from this bootcamp that are still 6 | running on your machine. 7 | 8 | ## 1. Start Kafka 9 | 10 | First, start Kafka using Docker Compose: 11 | 12 | ```sh 13 | docker-compose up 14 | ``` 15 | 16 | 17 | ## 2. Create a Python Virtual Environment 18 | 19 | Run the following commands to create a virtual environment and install dependencies: 20 | 21 | ```bash 22 | python3 -m venv venv 23 | source ./venv/bin/activate 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## 3. Download the Flink Kafka Connector 28 | 29 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 30 | 31 | ```bash 32 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 33 | ``` 34 | 35 | The downloaded `.jar` file should be placed in the same directory as the `flink-app.py` file. 36 | 37 | ## 4. Configure the `flink` Command in the Virtual Environment 38 | 39 | Ensure the `flink` CLI is available in the virtual environment: 40 | 41 | ```bash 42 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 43 | ``` 44 | 45 | Run `flink --version` to check that it was installed correctly: 46 | 47 | ```bash 48 | flink --version 49 | ``` 50 | 51 | NOTE: You need to set the `PATH` variable every time you activate the virtual environment. 52 | 53 | ## 5. Implement the Flink Application 54 | 55 | 56 | Implement the Flink application in `flink-app.py`, following the instructions in the `TODO` comments. 57 | 58 | 59 | ## 6. Start the Producer 60 | 61 | In the virtual environment, run the producer script to generate the payments data: 62 | 63 | ```bash 64 | python payments-producer.py 65 | ``` 66 | 67 | This will produce random payments data to the `payments` Kafka topic. 68 | 69 | ## 7. (Optional) Verify Producer Output 70 | 71 | To double-check that everything is working as expected, use the Kafka CLI tools to confirm that the producer is writing data to the `payments` topic: 72 | 73 | ```bash 74 | kafka-console-consumer --topic payments \ 75 | --bootstrap-server localhost:9092 \ 76 | --from-beginning 77 | ``` 78 | 79 | Make sure that you see payments data and that it is written every second. 80 | 81 | ## 8. Run the Flink Application 82 | 83 | Run the Flink application to process the payments data: 84 | 85 | ```bash 86 | flink run \ 87 | --python flink-app.py \ 88 | --target local \ 89 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 90 | ``` 91 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## 0. 
Stop all Docker containers 4 | 5 | Before you start, stop any Docker containers from this bootcamp that are still 6 | running on your machine. 7 | 8 | ## 1. Start Kafka 9 | 10 | First, start Kafka using Docker Compose: 11 | 12 | ```sh 13 | docker-compose up 14 | ``` 15 | 16 | 17 | ## 2. Create a Python Virtual Environment 18 | 19 | Run the following commands to create a virtual environment and install dependencies: 20 | 21 | ```bash 22 | python3 -m venv venv 23 | source ./venv/bin/activate 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## 3. Download the Flink Kafka Connector 28 | 29 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 30 | 31 | ```bash 32 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 33 | ``` 34 | 35 | The downloaded `.jar` file should be placed in the same directory as the `flink-app.py` file. 36 | 37 | ## 4. Configure the `flink` Command in the Virtual Environment 38 | 39 | Ensure the `flink` CLI is available in the virtual environment: 40 | 41 | ```bash 42 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 43 | ``` 44 | 45 | Run `flink --version` to check that it was installed correctly: 46 | 47 | ```bash 48 | flink --version 49 | ``` 50 | 51 | NOTE: You need to set the `PATH` variable every time you activate the virtual environment. 52 | 53 | ## 5. Implement the Flink Application 54 | 55 | 56 | Implement the Flink application in `flink-app.py`, following the instructions in the `TODO` comments. 57 | 58 | 59 | ## 6. Start the Producer 60 | 61 | In the virtual environment, run the producer script to generate the payments data: 62 | 63 | ```bash 64 | python payments-producer.py 65 | ``` 66 | 67 | This will produce random payments data to the `payments` Kafka topic. 68 | 69 | ## 7. (Optional) Verify Producer Output 70 | 71 | To double-check that everything is working as expected, use the Kafka CLI tools to confirm that the producer is writing data to the `payments` topic: 72 | 73 | ```bash 74 | kafka-console-consumer --topic payments \ 75 | --bootstrap-server localhost:9092 \ 76 | --from-beginning 77 | ``` 78 | 79 | Make sure that you see payments data and that it is written every second. 80 | 81 | ## 8. 
Run the Flink Application 82 | 83 | Run the Flink application to process the payments data: 84 | 85 | ```bash 86 | flink run \ 87 | --python flink-app.py \ 88 | --target local \ 89 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 90 | ``` 91 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/README.md: -------------------------------------------------------------------------------- 1 | # Install Spark 2 | 3 | To install Spark locally on macOS, run the following command: 4 | 5 | ```sh 6 | brew install apache-spark 7 | ``` 8 | 9 | To check that it was installed correctly, you can run: 10 | 11 | ```sh 12 | pyspark --version 13 | ``` 14 | 15 | --- 16 | 17 | # Create a Virtual Environment 18 | 19 | Create a virtual environment in a directory named `venv`: 20 | 21 | ```sh 22 | python -m venv venv 23 | ``` 24 | 25 | Activate the virtual environment: 26 | 27 | ```sh 28 | source venv/bin/activate 29 | ``` 30 | 31 | --- 32 | 33 | # Install Jupyter Lab 34 | 35 | Run the following command inside the virtual environment to install Jupyter Lab: 36 | 37 | ```sh 38 | pip install jupyter 39 | ``` 40 | 41 | Then configure PySpark to use Jupyter Lab as the driver: 42 | 43 | ```sh 44 | export PYSPARK_DRIVER_PYTHON=jupyter 45 | export PYSPARK_DRIVER_PYTHON_OPTS='lab' 46 | ``` 47 | 48 | --- 49 | 50 | # Start Jupyter Notebooks with Local PySpark 51 | 52 | Now you can start Jupyter Lab with PySpark: 53 | 54 | ```sh 55 | pyspark 56 | ``` 57 | 58 | This will open a Jupyter Lab interface in your browser where you can interact with Spark using notebooks. 59 | 60 | --- 61 | 62 | # Spark "Hello World" 63 | 64 | You can try to run the following Spark code to verify everything is working: 65 | 66 | ```python 67 | from pyspark.sql import SparkSession 68 | 69 | spark = (SparkSession.builder 70 | .appName("First Spark application") 71 | .getOrCreate()) 72 | 73 | data = [ 74 | {"userId": 1, "paymentAmount": 100.0, "date": "2025-01-01"}, 75 | {"userId": 2, "paymentAmount": 150.5, "date": "2025-01-02"}, 76 | {"userId": 3, "paymentAmount": 200.75, "date": "2025-01-03"}, 77 | {"userId": 2, "paymentAmount": 50.25, "date": "2025-01-04"}, 78 | {"userId": 1, "paymentAmount": 80.0, "date": "2025-01-05"}, 79 | ] 80 | 81 | df = spark.createDataFrame(data) 82 | df.count() 83 | ``` 84 | 85 | You should see the number of rows in the DataFrame as the output. 86 | 87 | --- 88 | 89 | # Clean Up 90 | 91 | When you're done working, follow these steps to shut everything down: 92 | 93 | 1. **Stop the Spark session (in a notebook):** 94 | 95 | ```python 96 | spark.stop() 97 | ``` 98 | 99 | 2. **Stop Jupyter Lab (in terminal):** 100 | 101 | Press `Ctrl+C` in the terminal where `pyspark` was running. 102 | 103 | 3. **Deactivate the virtual environment:** 104 | 105 | ```sh 106 | deactivate 107 | ``` 108 | 109 | At this point, you're back to your global Python environment. You're now ready to continue developing Spark applications! 
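When you come back for the next session, a natural next step is to extend the hello-world example with a small aggregation. The sketch below assumes the `df` DataFrame from the example above and sums payments per user:

```python
from pyspark.sql.functions import sum as spark_sum

totals = (df
    .groupBy("userId")
    .agg(spark_sum("paymentAmount").alias("totalPayments"))
    .orderBy("userId"))

totals.show()
```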
110 | 111 | --- -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/customer_reviews_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 5 | from datetime import datetime, timedelta 6 | import os 7 | import csv 8 | 9 | 10 | @dag( 11 | "customer_reviews_dag", 12 | start_date=datetime(2025, 1, 1), 13 | schedule_interval="* * * * *", 14 | catchup=False, 15 | description="Review average score", 16 | ) 17 | def customer_reviews_dag(): 18 | 19 | @task 20 | def extract_reviews(): 21 | pg_hook = PostgresHook(postgres_conn_id="postgres_rental_site") 22 | 23 | context = get_current_context() 24 | execution_date = context["execution_date"] 25 | start_of_minute = execution_date.replace(second=0, microsecond=0) 26 | end_of_minute = start_of_minute + timedelta(hours=1) 27 | 28 | query = f""" 29 | SELECT review_id, listing_id, review_score, review_comment, review_date 30 | FROM customer_reviews 31 | WHERE review_date >= '{start_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 32 | AND review_date < '{end_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 33 | """ 34 | 35 | records = pg_hook.get_records(query) 36 | column_names = ["review_id", "listing_id", "review_score", "review_comment", "review_date"] 37 | 38 | file_date = execution_date.strftime('%Y%m%d_%H%M') 39 | file_path = f"/tmp/data/customer_reviews/{file_date}/customer_reviews.csv" 40 | 41 | directory = os.path.dirname(file_path) 42 | if not os.path.exists(directory): 43 | os.makedirs(directory) 44 | 45 | with open(file_path, "w", newline="") as csvfile: 46 | writer = csv.writer(csvfile) 47 | writer.writerow(column_names) 48 | writer.writerows(records) 49 | 50 | print(f"Customer reviews written to {file_path}") 51 | 52 | spark_etl = SparkSubmitOperator( 53 | task_id="spark_etl_reviews", 54 | application="dags/spark_etl_reviews.py", 55 | name="guest_reviews_etl", 56 | application_args=[ 57 | "--customer_reviews", "/tmp/data/customer_reviews/{{ execution_date.strftime('%Y%m%d_%H%M') }}/customer_reviews.csv", 58 | "--output_path", "/tmp/data/avg_review_score_by_listing/{{ execution_date.strftime('%Y%m%d_%H%M') }}" 59 | ], 60 | conn_id='spark_rental_site', 61 | ) 62 | 63 | extract_task = extract_reviews() 64 | extract_task >> spark_etl 65 | 66 | dag_instance = customer_reviews_dag() -------------------------------------------------------------------------------- /08-flink-stream-processing/02-orders-processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import StreamExecutionEnvironment 8 | from pyflink.datastream.connectors.kafka import KafkaSink, KafkaSource, KafkaRecordSerializationSchema 9 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 10 | 11 | 12 | @dataclass 13 | class Order: 14 | order_id: str 15 | customer_id: str 16 | product_id: str 17 | quantity: int 18 | price: float 19 | order_time: str 20 | 21 | 22 | def 
parse_order(json_str): 23 | data = json.loads(json_str) 24 | return Order( 25 | order_id=data.get("order_id", "unknown"), 26 | customer_id=data.get("customer_id", "unknown"), 27 | product_id=data.get("product_id", "unknown"), 28 | quantity=data.get("quantity", 0), 29 | price=float(data.get("price", 0.0)), 30 | order_time=data.get("order_time", "unknown") 31 | ) 32 | 33 | 34 | def filter_high_price(order): 35 | return order.price > 10 36 | 37 | 38 | def convert_order(order): 39 | 40 | simplified = { 41 | "order_id": order.order_id, 42 | "price": order.price, 43 | } 44 | 45 | return json.dumps(simplified) 46 | 47 | 48 | def main(): 49 | env = StreamExecutionEnvironment.get_execution_environment() 50 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 51 | 52 | kafka_source = KafkaSource.builder() \ 53 | .set_bootstrap_servers("localhost:9092") \ 54 | .set_topics("orders") \ 55 | .set_group_id("flink-consumer-group") \ 56 | .set_value_only_deserializer(SimpleStringSchema()) \ 57 | .build() 58 | 59 | orders_stream = env.from_source( 60 | kafka_source, 61 | watermark_strategy=WatermarkStrategy.no_watermarks(), 62 | source_name="kafka_source" 63 | ) 64 | 65 | filtered_stream = orders_stream \ 66 | .map(parse_order) \ 67 | .filter(filter_high_price) \ 68 | .map(convert_order, Types.STRING()) 69 | 70 | filtered_stream.print() 71 | 72 | kafka_sink = KafkaSink.builder() \ 73 | .set_bootstrap_servers("localhost:9092") \ 74 | .set_record_serializer( 75 | KafkaRecordSerializationSchema.builder() 76 | .set_topic("filtered-orders") 77 | .set_value_serialization_schema(SimpleStringSchema()) 78 | .build() 79 | ) \ 80 | .build() 81 | 82 | filtered_stream.sink_to(kafka_sink) 83 | 84 | env.execute("Orders stream processing") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/README.md: -------------------------------------------------------------------------------- 1 | ## Create and Activate a Virtual Environment 2 | 3 | Create a virtual environment using a supported Python version: 4 | 5 | ```sh 6 | python -m venv venv 7 | ``` 8 | 9 | Activate the virtual environment: 10 | 11 | ```sh 12 | source venv/bin/activate 13 | ``` 14 | 15 | Check the Python version to confirm you're using the right one: 16 | 17 | ```sh 18 | python --version 19 | ``` 20 | 21 | --- 22 | 23 | ## Install Apache Airflow 24 | 25 | Airflow must be installed with a constraints file to ensure compatible dependencies. 26 | 27 | ```sh 28 | pip install 'apache-airflow==2.10.4' \ 29 | --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.12.txt" 30 | ``` 31 | 32 | --- 33 | 34 | ## Initialize the Metadata Database 35 | 36 | Set up the Airflow database: 37 | 38 | ```sh 39 | airflow db migrate 40 | ``` 41 | 42 | --- 43 | 44 | ## Create an Admin User 45 | 46 | Create a user account for logging into the Airflow web UI: 47 | 48 | ```sh 49 | airflow users create \ 50 | --username admin \ 51 | --firstname John \ 52 | --lastname Doe \ 53 | --role Admin \ 54 | --email admin@example.com \ 55 | --password admin 56 | ``` 57 | 58 | --- 59 | 60 | ## Configure DAGs Folder and Disable Example DAGs 61 | 62 | Find the Airflow home directory: 63 | 64 | ```sh 65 | airflow info 66 | ``` 67 | 68 | Locate the `airflow_home` value in the output. 
The config file is at: 69 | 70 | ``` 71 | /airflow.cfg 72 | ``` 73 | 74 | Get your current working directory (where you will store DAGs): 75 | 76 | ```sh 77 | pwd 78 | ``` 79 | 80 | Edit the configuration file: 81 | 82 | ```sh 83 | vim /airflow.cfg 84 | ``` 85 | 86 | Update the following settings: 87 | 88 | ```ini 89 | dags_folder = /your/current/directory 90 | load_examples = False 91 | ``` 92 | 93 | Save and exit. 94 | 95 | --- 96 | 97 | ## Start Airflow Services 98 | 99 | Start the Airflow webserver on port 8080: 100 | 101 | ```sh 102 | airflow webserver --port 8080 103 | ``` 104 | 105 | In a new terminal with the virtual environment activated, start the scheduler: 106 | 107 | ```sh 108 | airflow scheduler 109 | ``` 110 | 111 | --- 112 | 113 | ## Access the Web UI 114 | 115 | Open a browser and go to: 116 | 117 | ``` 118 | http://localhost:8080 119 | ``` 120 | 121 | Login credentials: 122 | 123 | - **Username:** `admin` 124 | - **Password:** `admin` 125 | 126 | --- 127 | 128 | ## Clean Up 129 | 130 | To stop Airflow and exit the virtual environment: 131 | 132 | 1. Press `Ctrl+C` in both terminal windows to stop the webserver and scheduler. 133 | 2. Deactivate the virtual environment: 134 | 135 | ```sh 136 | deactivate 137 | ``` 138 | 139 | Airflow is now installed and ready to use. -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/flink-app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import StreamExecutionEnvironment 8 | from pyflink.datastream.connectors.kafka import KafkaSink, KafkaSource, KafkaRecordSerializationSchema 9 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 10 | 11 | 12 | @dataclass 13 | class Payment: 14 | payment_id: str 15 | user_id: str 16 | merchant_id: str 17 | amount: float 18 | currency: str 19 | payment_time: str 20 | 21 | 22 | def parse_payment(json_str): 23 | data = json.loads(json_str) 24 | return Payment( 25 | payment_id=data.get("payment_id", "unknown"), 26 | user_id=data.get("user_id", "unknown"), 27 | merchant_id=data.get("merchant_id", "unknown"), 28 | amount=float(data.get("amount", 0.0)), 29 | currency=data.get("currency", "unknown"), 30 | payment_time=data.get("payment_time", "unknown") 31 | ) 32 | 33 | 34 | def filter_high_amount(payment): 35 | return payment.amount > 500 36 | 37 | 38 | def convert_payment(payment): 39 | simplified = { 40 | "payment_id": payment.payment_id, 41 | "amount": payment.amount, 42 | } 43 | return json.dumps(simplified) 44 | 45 | 46 | def main(): 47 | env = StreamExecutionEnvironment.get_execution_environment() 48 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 49 | 50 | kafka_source = KafkaSource.builder() \ 51 | .set_bootstrap_servers("localhost:9092") \ 52 | .set_topics("payments") \ 53 | .set_group_id("flink-consumer-group") \ 54 | .set_value_only_deserializer(SimpleStringSchema()) \ 55 | .build() 56 | 57 | payments_stream = env.from_source( 58 | kafka_source, 59 | watermark_strategy=WatermarkStrategy.no_watermarks(), 60 | source_name="kafka_source" 61 | ) 62 | 63 | filtered_stream = payments_stream \ 64 | .map(parse_payment) \ 65 | .filter(filter_high_amount) \ 66 | .map(convert_payment, 
Types.STRING()) 67 | 68 | filtered_stream.print("FilteredStream") 69 | 70 | kafka_sink = KafkaSink.builder() \ 71 | .set_bootstrap_servers("localhost:9092") \ 72 | .set_record_serializer( 73 | KafkaRecordSerializationSchema.builder() 74 | .set_topic("filtered-payments") 75 | .set_value_serialization_schema(SimpleStringSchema()) 76 | .build() 77 | ) \ 78 | .build() 79 | 80 | filtered_stream.sink_to(kafka_sink) 81 | 82 | env.execute("Payments stream processing") 83 | 84 | 85 | if __name__ == "__main__": 86 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-solution/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 0. Stop all Docker containers 4 | 5 | Before you start, you would need to stop Docker containers related to this bootcamp 6 | running on your machine. 7 | 8 | 9 | # 1. Start Kafka 10 | 11 | First, start Kafka using Docker Compose: 12 | 13 | ```sh 14 | docker-compose up 15 | ``` 16 | 17 | 18 | ## 2. Create a Python Virtual Environment 19 | 20 | Run the following commands to create a virtual environment and install dependencies: 21 | 22 | ```bash 23 | python -m venv venv 24 | source ./venv/bin/activate 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 29 | ## 3. Download the Flink Kafka Connector 30 | 31 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 32 | 33 | ```bash 34 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 35 | ``` 36 | 37 | The downloaded `.jar` file should be placed in the current directory with the `flink-app.py` file 38 | 39 | 40 | ## 4. Configure the `flink` Command in the Virtual Environment 41 | 42 | Ensure the `flink` CLI is available in the virtual environment. 43 | 44 | ```bash 45 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 46 | ``` 47 | 48 | Run `flink` without arguments to check if it was installed correctly 49 | 50 | ```bash 51 | flink --version 52 | ``` 53 | 54 | NOTE: You would need to set the `PATH` variable every time to activate a virtual environment. 55 | 56 | 57 | ## 5. Implement the Flink Application 58 | 59 | Implement the Flink application in the `flink-app.py` follow the instructions in the `TODO` comments. 60 | 61 | 62 | ## 6. Start the Producer 63 | 64 | In the virtual environment run the producer script to generate the payments data: 65 | 66 | ```bash 67 | python payments-producer.py 68 | ``` 69 | 70 | This will produce random payments data to the `payments` Kafka topic. 71 | 72 | 73 | ## 7. (Optional) Verify Producer Output 74 | 75 | To double-check that everything is working as expected, run Kafka CLI tools to check that the producer is writing data to the `payments` topic: 76 | 77 | ```bash 78 | kafka-console-consumer --topic payments \ 79 | --bootstrap-server localhost:9092 \ 80 | --from-beginning 81 | ``` 82 | 83 | Make sure that you see payments data and that it is written every second. 84 | 85 | 86 | ## 8. Run the Flink Application 87 | 88 | Run the Flink application to process the data. Make sure the Kafka connector JAR is included in the classpath. 89 | 90 | ```bash 91 | flink run \ 92 | --python flink-app.py \ 93 | --target local \ 94 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 95 | ``` 96 | 97 | 98 | ## 9. 
Verify Flink Output 99 | 100 | Run the Flink application to process the payments data: 101 | 102 | ```bash 103 | kafka-console-consumer --topic filtered-payments \ 104 | --bootstrap-server localhost:9092 \ 105 | --from-beginning 106 | ``` 107 | -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/01-payments-data-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 0. Stop all Docker containers 4 | 5 | Before you start, you would need to stop Docker containers related to this bootcamp 6 | running on your machine. 7 | 8 | 9 | # 1. Start Kafka 10 | 11 | First, start Kafka using Docker Compose: 12 | 13 | ```sh 14 | docker-compose up 15 | ``` 16 | 17 | 18 | ## 2. Create a Python Virtual Environment 19 | 20 | Run the following commands to create a virtual environment and install dependencies: 21 | 22 | ```bash 23 | python3.11 -m venv venv 24 | source ./venv/bin/activate 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 29 | ## 3. Download the Flink Kafka Connector 30 | 31 | We need to download the Kafka connector for Flink. Run the following command to download it from Maven: 32 | 33 | ```bash 34 | wget https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.4.0-1.20/flink-sql-connector-kafka-3.4.0-1.20.jar 35 | ``` 36 | 37 | The downloaded `.jar` file should be placed in the current directory with the `flink-app.py` file 38 | 39 | 40 | ## 4. Configure the `flink` Command in the Virtual Environment 41 | 42 | Ensure the `flink` CLI is available in the virtual environment. 43 | 44 | ```bash 45 | export PATH=$PATH:"$(./venv/bin/find_flink_home.py)/bin" 46 | ``` 47 | 48 | Run `flink` without arguments to check if it was installed correctly 49 | 50 | ```bash 51 | flink --version 52 | ``` 53 | 54 | NOTE: You would need to set the `PATH` variable every time to activate a virtual environment. 55 | 56 | 57 | ## 5. Implement the Flink Application 58 | 59 | Implement the Flink application in the `flink-app.py` follow the instructions in the `TODO` comments. 60 | 61 | 62 | ## 6. Start the Producer 63 | 64 | In the virtual environment run the producer script to generate the payments data: 65 | 66 | ```bash 67 | python payments-producer.py 68 | ``` 69 | 70 | This will produce random payments data to the `payments` Kafka topic. 71 | 72 | 73 | ## 7. (Optional) Verify Producer Output 74 | 75 | To double-check that everything is working as expected, run Kafka CLI tools to check that the producer is writing data to the `payments` topic: 76 | 77 | ```bash 78 | kafka-console-consumer --topic payments \ 79 | --bootstrap-server localhost:9092 \ 80 | --from-beginning 81 | ``` 82 | 83 | Make sure that you see payments data and that it is written every second. 84 | 85 | 86 | ## 8. Run the Flink Application 87 | 88 | Run the Flink application to process the data. Make sure the Kafka connector JAR is included in the classpath. 89 | 90 | ```bash 91 | flink run \ 92 | --python flink-app.py \ 93 | --target local \ 94 | --jarfile flink-sql-connector-kafka-3.4.0-1.20.jar 95 | ``` 96 | 97 | 98 | ## 9. 
Verify Flink Output 99 | 100 | Run the Flink application to process the payments data: 101 | 102 | ```bash 103 | kafka-console-consumer --topic filtered-payments \ 104 | --bootstrap-server localhost:9092 \ 105 | --from-beginning 106 | ``` 107 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/03-bookings_per_listing.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 4 | from datetime import datetime 5 | import os 6 | import csv 7 | import random 8 | 9 | 10 | @dag( 11 | "bookings_spark_pipeline", 12 | start_date=datetime(2025, 1, 1), 13 | schedule_interval="* * * * *", 14 | catchup=False, 15 | description="", 16 | ) 17 | def bookings_spark_pipeline(): 18 | 19 | @task 20 | def generate_bookings(): 21 | context = get_current_context() 22 | execution_date = context["execution_date"] 23 | 24 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 25 | file_path = f"/tmp/data/bookings/{file_date}/bookings.csv" 26 | 27 | num_bookings = random.randint(30, 50) 28 | bookings = [] 29 | for i in range(num_bookings): 30 | booking = { 31 | "booking_id": random.randint(1000, 5000), 32 | "listing_id": random.choice([13913, 17402, 24328, 33332, 116268, 117203, 127652, 127860]), 33 | "user_id": random.randint(1000, 5000), 34 | "booking_time": execution_date.strftime("%Y-%m-%d %H:%M:%S"), 35 | "status": random.choice(["confirmed", "cancelled", "pending"]) 36 | } 37 | bookings.append(booking) 38 | 39 | directory = os.path.dirname(file_path) 40 | if not os.path.exists(directory): 41 | os.makedirs(directory) 42 | 43 | fieldnames = ["booking_id", "listing_id", "user_id", "booking_time", "status"] 44 | 45 | with open(file_path, "w", newline="") as csvfile: 46 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 47 | writer.writeheader() 48 | for booking in bookings: 49 | writer.writerow({ 50 | "booking_id": booking["booking_id"], 51 | "listing_id": booking["listing_id"], 52 | "user_id": booking["user_id"], 53 | "booking_time": booking["booking_time"], 54 | "status": booking["status"] 55 | }) 56 | 57 | print(f"Generated bookings data written to {file_path}") 58 | 59 | spark_job = SparkSubmitOperator( 60 | task_id="process_listings_and_bookings", 61 | application="bookings_per_listing_spark.py", 62 | name="listings_bookings_join", 63 | application_args=[ 64 | "--listings_file", "/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 65 | "--bookings_file", "/tmp/data/bookings/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}/bookings.csv", 66 | "--output_path", "/tmp/data/bookings_per_listing/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}" 67 | ], 68 | conn_id='spark_booking', 69 | ) 70 | 71 | bookings_file = generate_bookings() 72 | bookings_file >> spark_job 73 | 74 | dag_instance = bookings_spark_pipeline() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/README.md: -------------------------------------------------------------------------------- 1 | 2 | This is a README for how to set up your environment to work on exercises for this section 3 | 4 | # 1. 
Create a Python virtual environment 5 | 6 | To create a Python virtual environment, run the following command: 7 | 8 | ```sh 9 | python -m venv venv 10 | ``` 11 | 12 | Note that you might need to run it with a specific Python version to make sure it is compatible with the Airflow version you are using. 13 | 14 | ```sh 15 | python3.12 -m venv venv 16 | ``` 17 | 18 | # 2. Activate a Python virtual environment 19 | 20 | To activate the virtual environment, run the following command: 21 | 22 | ```sh 23 | source venv/bin/activate 24 | ``` 25 | 26 | # 3. Install Airflow 27 | 28 | Install Airflow using these commands: 29 | 30 | ```sh 31 | AIRFLOW_VERSION="2.10.4" 32 | # Set this variable to your Python version 33 | PYTHON_VERSION="3.12" 34 | 35 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" 36 | 37 | pip install "apache-airflow==${AIRFLOW_VERSION}" \ 38 | --constraint "${CONSTRAINT_URL}" 39 | ``` 40 | 41 | Not all Airflow versions are compatible with all Python versions. 42 | 43 | You can find a list of compatible versions on this page: https://pypi.org/project/apache-airflow/ 44 | 45 | # 4. Create an Airflow database 46 | 47 | Run this command to create the Airflow database: 48 | 49 | ```sh 50 | airflow db migrate 51 | ``` 52 | 53 | # 5. Create a folder for DAGs 54 | 55 | Create a folder for the DAGs you will implement: 56 | 57 | ```sh 58 | mkdir dags 59 | ``` 60 | 61 | # 6. Update Airflow configuration 62 | 63 | To update the Airflow configuration, first get the location of the Airflow home directory by running the following command: 64 | 65 | ```sh 66 | airflow info 67 | ``` 68 | 69 | Then edit the Airflow configuration file in that directory: 70 | 71 | ```sh 72 | vim <airflow_home>/airflow.cfg 73 | ``` 74 | 75 | In this configuration file you need to change two values: 76 | 77 | * `dags_folder` to the path to the `dags` folder you've just created 78 | * `load_examples` to `False` 79 | 80 | 81 | # 7. Create an Airflow user 82 | 83 | ```sh 84 | airflow users create \ 85 | --username admin \ 86 | --firstname <first-name> \ 87 | --lastname <last-name> \ 88 | --role Admin \ 89 | --email admin@example.com \ 90 | --password admin 91 | ``` 92 | 93 | 94 | # 8. Start a web server for Airflow 95 | 96 | To start a web server, run the following command: 97 | 98 | ```sh 99 | airflow webserver --port 8080 100 | ``` 101 | 102 | # 9. Start a scheduler 103 | 104 | In a different terminal session, activate the virtual environment in the same folder and start an Airflow scheduler: 105 | 106 | ```sh 107 | source venv/bin/activate 108 | airflow scheduler 109 | ``` 110 | 111 | # 10. Check if Airflow is working 112 | 113 | You should now be able to go to `localhost:8080` and log into the Airflow UI. Use the username and password you selected in step `7`. 
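If you want to verify the setup end to end, you could also drop a minimal DAG into the `dags` folder and check that it shows up in the UI and runs when triggered manually. This is only a sketch in the same TaskFlow style used elsewhere in this repository; the file name and DAG id (`hello_airflow`) are arbitrary examples:

```python
# dags/hello_airflow.py -- minimal sanity-check DAG (names here are arbitrary examples)
from datetime import datetime

from airflow.decorators import dag, task


@dag(
    "hello_airflow",
    start_date=datetime(2025, 1, 1),
    schedule_interval=None,  # trigger it manually from the UI
    catchup=False,
    description="Smoke test for the local Airflow setup",
)
def hello_airflow():

    @task
    def say_hello():
        print("Airflow is up and running!")

    say_hello()


dag_instance = hello_airflow()
```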
114 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-exercise/data_validation_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from datetime import datetime 4 | import os 5 | import json 6 | import random 7 | 8 | 9 | # TODO: Use the @dag decorator to create a DAG that: 10 | # * Runs every minute 11 | # * Does not use catchup 12 | def data_quality_pipeline(): 13 | 14 | CORRECT_PROB = 0.7 15 | 16 | def get_bookings_path(context): 17 | execution_date = context["execution_date"] 18 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 19 | return f"/tmp/data/bookings/{file_date}/bookings.json" 20 | 21 | def generate_booking_id(i): 22 | if random.random() < CORRECT_PROB: 23 | return i + 1 24 | 25 | return "" 26 | 27 | def generate_listing_id(): 28 | if random.random() < CORRECT_PROB: 29 | return random.choice([1, 2, 3, 4, 5]) 30 | 31 | return "" 32 | 33 | def generate_user_id(correct_prob=0.7): 34 | return random.randint(1000, 5000) if random.random() < correct_prob else "" 35 | 36 | def generate_booking_time(execution_date): 37 | if random.random() < CORRECT_PROB: 38 | return execution_date.strftime('%Y-%m-%d %H:%M:%S') 39 | 40 | return "" 41 | 42 | def generate_status(): 43 | if random.random() < CORRECT_PROB: 44 | return random.choice(["confirmed", "pending", "cancelled"]) 45 | 46 | return random.choice(["unknown", "", "error"]) 47 | 48 | @task 49 | def generate_bookings(): 50 | context = get_current_context() 51 | booking_path = get_bookings_path(context) 52 | 53 | num_bookings = random.randint(5, 15) 54 | bookings = [] 55 | 56 | for i in range(num_bookings): 57 | booking = { 58 | "booking_id": generate_booking_id(i), 59 | "listing_id": generate_listing_id(), 60 | "user_id": generate_user_id(), 61 | "booking_time": generate_booking_time(context["execution_date"]), 62 | "status": generate_status() 63 | } 64 | bookings.append(booking) 65 | 66 | directory = os.path.dirname(booking_path) 67 | if not os.path.exists(directory): 68 | os.makedirs(directory) 69 | 70 | with open(booking_path, "w") as f: 71 | json.dump(bookings, f, indent=4) 72 | 73 | print(f"Written to file: {booking_path}") 74 | 75 | # TODO: Create a data quality check task that reads bookings data and validates every record. 76 | # For every invalid record it should return a validation record that includes: 77 | # * A record position in an input file 78 | # * A list of identified violations 79 | # 80 | # Here is a list of validations it should perform: 81 | # * Check if each of the fields is missing 82 | # * Check if the "status" field has one of the valid values 83 | # 84 | # It should write all found anomalies into an input file. 
85 | 86 | # TODO: Define dependencies between tasks 87 | 88 | # TODO: Create an instance of the DAG 89 | -------------------------------------------------------------------------------- /08-flink-stream-processing/05-local-state.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.common.watermark_strategy import WatermarkStrategy 7 | from pyflink.datastream import ( 8 | StreamExecutionEnvironment, KeyedProcessFunction, RuntimeContext 9 | ) 10 | from pyflink.datastream.connectors.kafka import KafkaSource 11 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 12 | from pyflink.datastream.state import ValueState, ValueStateDescriptor 13 | 14 | 15 | TIER_1_THRESHOLD = 300.0 16 | TIER_2_THRESHOLD = 1000.0 17 | 18 | 19 | @dataclass 20 | class Order: 21 | order_id: str 22 | customer_id: str 23 | product_id: str 24 | quantity: int 25 | price: float 26 | order_time: str 27 | 28 | 29 | def parse_order(json_str) -> Order: 30 | data = json.loads(json_str) 31 | return Order( 32 | order_id=data.get("order_id", "unknown"), 33 | customer_id=data.get("customer_id", "unknown"), 34 | product_id=data.get("product_id", "unknown"), 35 | quantity=int(data.get("quantity", 0)), 36 | price=float(data.get("price", 0.0)), 37 | order_time=data.get("order_time", "unknown") 38 | ) 39 | 40 | 41 | class LoyaltyTierFunction(KeyedProcessFunction): 42 | 43 | def open(self, runtime_context): 44 | spend_desc = ValueStateDescriptor("total_spend", Types.DOUBLE()) 45 | self.total_spend_state = runtime_context.get_state(spend_desc) 46 | 47 | def process_element(self, order, ctx): 48 | current_spend = self.total_spend_state.value() or 0 49 | 50 | order_total = order.price * order.quantity 51 | new_total_spend = current_spend + order_total 52 | self.total_spend_state.update(new_total_spend) 53 | 54 | if new_total_spend >= TIER_1_THRESHOLD: 55 | yield json.dumps({ 56 | "customer_id": order.customer_id, 57 | "total_spend": new_total_spend, 58 | "tier": 1 59 | }) 60 | 61 | if new_total_spend >= TIER_2_THRESHOLD: 62 | yield json.dumps({ 63 | "customer_id": order.customer_id, 64 | "total_spend": new_total_spend, 65 | "tier": 2 66 | }) 67 | 68 | 69 | def main(): 70 | env = StreamExecutionEnvironment.get_execution_environment() 71 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 72 | 73 | kafka_source = KafkaSource.builder() \ 74 | .set_bootstrap_servers("localhost:9092") \ 75 | .set_topics("orders") \ 76 | .set_group_id("customers-loyalty-tiers") \ 77 | .set_value_only_deserializer(SimpleStringSchema()) \ 78 | .build() 79 | 80 | orders_stream = env.from_source( 81 | source=kafka_source, 82 | watermark_strategy=WatermarkStrategy.no_watermarks(), 83 | source_name="kafka_source" 84 | ) 85 | 86 | loyalty_stream = orders_stream \ 87 | .map(parse_order) \ 88 | .key_by(lambda o: o.customer_id) \ 89 | .process(LoyaltyTierFunction(), Types.STRING()) 90 | 91 | loyalty_stream.print("LoyaltyTierEvent") 92 | 93 | env.execute("Loyalty Tier Tracking") 94 | 95 | if __name__ == "__main__": 96 | main() -------------------------------------------------------------------------------- /02-data-processing-with-spark/01-test-pyspark-app.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": 
"c219df52-cb8d-4431-831a-3751a69062f2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"First Spark application\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "b0b346e6-5cc9-41a7-8008-2a640bece180", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "ec00f92d-c582-4970-9617-ff0a9852cc45", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "data = [\n", 33 | " {\"userId\": 1, \"paymentAmount\": 100.0, \"date\": \"2025-01-01\"},\n", 34 | " {\"userId\": 2, \"paymentAmount\": 150.5, \"date\": \"2025-01-02\"},\n", 35 | " {\"userId\": 3, \"paymentAmount\": 200.75, \"date\": \"2025-01-03\"},\n", 36 | " {\"userId\": 2, \"paymentAmount\": 50.25, \"date\": \"2025-01-04\"},\n", 37 | " {\"userId\": 1, \"paymentAmount\": 80.0, \"date\": \"2025-01-05\"}\n", 38 | "]\n", 39 | "\n", 40 | "df = spark.createDataFrame(data)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "070d0281-927f-430c-a522-14e93ae1d399", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "df.show()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "5ed8ce14-7f1e-4a27-a5f8-e0f8f2aba424", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df.count()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "3acb5930-966d-4c86-96b1-0d36e52c499c", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "data = data_values = [\n", 71 | " (1, 100.0, \"2025-01-01\"),\n", 72 | " (2, 150.5, \"2025-01-02\"),\n", 73 | " (3, 200.75, \"2025-01-03\"),\n", 74 | " (2, 50.25, \"2025-01-04\"),\n", 75 | " (1, 80.0, \"2025-01-05\")\n", 76 | "]\n", 77 | "\n", 78 | "df = spark.createDataFrame(data, [\"userId\", \"amount\", \"date\"])\n", 79 | "df.show(3)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "888929b6-e789-456d-a638-a93c88d85814", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "spark.stop()" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3 (ipykernel)", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.12.7" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 5 114 | } 115 | -------------------------------------------------------------------------------- /08-flink-stream-processing/03-windows-aggregation.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | 5 | from pyflink.common import Time 6 | from pyflink.common.typeinfo import Types 7 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 8 | from pyflink.datastream.window import TumblingProcessingTimeWindows 9 | 10 | from pyflink.common.serialization import SimpleStringSchema 11 | from pyflink.common.typeinfo import Types 12 | from pyflink.datastream.connectors.kafka import KafkaSource 13 | from 
pyflink.datastream.execution_mode import RuntimeExecutionMode 14 | from pyflink.common.watermark_strategy import WatermarkStrategy 15 | from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction 16 | 17 | 18 | @dataclass 19 | class Order: 20 | order_id: str 21 | customer_id: str 22 | product_id: str 23 | quantity: int 24 | price: float 25 | order_time: str 26 | 27 | 28 | def parse_order(json_str) -> Order: 29 | data = json.loads(json_str) 30 | return Order( 31 | order_id=data.get("order_id", "unknown"), 32 | customer_id=data.get("customer_id", "unknown"), 33 | product_id=data.get("product_id", "unknown"), 34 | quantity=data.get("quantity", 0), 35 | price=float(data.get("price", 0.0)), 36 | order_time=data.get("order_time", "unknown") 37 | ) 38 | 39 | 40 | class AggregateWindowFunction(ProcessWindowFunction): 41 | def process(self, 42 | key, 43 | context, 44 | elements): 45 | 46 | total_quantity = 0 47 | total_sum = 0 48 | 49 | for input in elements: 50 | total_quantity += input.quantity 51 | total_sum += input.quantity * input.price 52 | 53 | result = { 54 | "product_id": key, 55 | "total_quantity": total_quantity, 56 | "total_spent": round(total_sum, 2), 57 | "window_start": datetime.utcfromtimestamp( 58 | context.window().start / 1000 59 | ).isoformat(), 60 | "window_end": datetime.utcfromtimestamp( 61 | context.window().end / 1000 62 | ).isoformat(), 63 | } 64 | return [json.dumps(result)] 65 | 66 | 67 | def main(): 68 | env = StreamExecutionEnvironment.get_execution_environment() 69 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 70 | 71 | kafka_source = KafkaSource.builder() \ 72 | .set_bootstrap_servers("localhost:9092") \ 73 | .set_topics("orders") \ 74 | .set_group_id("flink-window-aggregation-group") \ 75 | .set_value_only_deserializer(SimpleStringSchema()) \ 76 | .build() 77 | 78 | 79 | orders_stream = env.from_source( 80 | kafka_source, 81 | watermark_strategy=WatermarkStrategy.no_watermarks(), 82 | source_name="kafka_source" 83 | ) 84 | 85 | windowed_stream = orders_stream \ 86 | .map(parse_order) \ 87 | .key_by(lambda x: x.product_id) \ 88 | .window(TumblingProcessingTimeWindows.of(Time.seconds(30))) \ 89 | .process(AggregateWindowFunction(), 90 | Types.STRING()) 91 | 92 | windowed_stream.print() 93 | 94 | env.execute("Window-based aggregation") 95 | 96 | 97 | if __name__ == "__main__": 98 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/04-bookings_per_listing_with_sensor.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 4 | from airflow.sensors.filesystem import FileSensor 5 | from datetime import datetime 6 | import os 7 | import csv 8 | import random 9 | 10 | 11 | @dag( 12 | "bookings_spark_pipeline", 13 | start_date=datetime(2025, 1, 1), 14 | schedule_interval="* * * * *", 15 | catchup=False, 16 | description="", 17 | ) 18 | def bookings_spark_pipeline(): 19 | 20 | @task 21 | def generate_bookings(): 22 | context = get_current_context() 23 | execution_date = context["execution_date"] 24 | 25 | file_date = execution_date.strftime("%Y-%m-%d_%H%M") 26 | file_path = f"/tmp/data/bookings/{file_date}/bookings.csv" 27 | 28 | num_bookings = random.randint(30, 50) 29 | bookings = [] 30 | for i in range(num_bookings): 31 | booking = { 32 | 
"booking_id": random.randint(1000, 5000), 33 | "listing_id": random.choice([13913, 17402, 24328, 33332, 116268, 117203, 127652, 127860]), 34 | "user_id": random.randint(1000, 5000), 35 | "booking_time": execution_date.strftime("%Y-%m-%d %H:%M:%S"), 36 | "status": random.choice(["confirmed", "cancelled", "pending"]) 37 | } 38 | bookings.append(booking) 39 | 40 | directory = os.path.dirname(file_path) 41 | if not os.path.exists(directory): 42 | os.makedirs(directory) 43 | 44 | fieldnames = ["booking_id", "listing_id", "user_id", "booking_time", "status"] 45 | 46 | with open(file_path, "w", newline="") as csvfile: 47 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 48 | writer.writeheader() 49 | for booking in bookings: 50 | writer.writerow({ 51 | "booking_id": booking["booking_id"], 52 | "listing_id": booking["listing_id"], 53 | "user_id": booking["user_id"], 54 | "booking_time": booking["booking_time"], 55 | "status": booking["status"] 56 | }) 57 | 58 | print(f"Generated bookings data written to {file_path}") 59 | 60 | wait_for_listings_file = FileSensor( 61 | task_id="wait_for_listings_file", 62 | fs_conn_id="local_fs", 63 | filepath="/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 64 | poke_interval=30, 65 | timeout=600, 66 | ) 67 | 68 | spark_job = SparkSubmitOperator( 69 | task_id="process_listings_and_bookings", 70 | application="bookings_per_listing_spark.py", 71 | name="listings_bookings_join", 72 | application_args=[ 73 | "--listings_file", "/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 74 | "--bookings_file", "/tmp/data/bookings/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}/bookings.csv", 75 | "--output_path", "/tmp/data/bookings_per_listing/{{ execution_date.strftime('%Y-%m-%d_%H%M') }}" 76 | ], 77 | conn_id='spark_booking', 78 | ) 79 | 80 | bookings_file = generate_bookings() 81 | bookings_file >> spark_job 82 | wait_for_listings_file >> spark_job 83 | 84 | dag_instance = bookings_spark_pipeline() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-exercise/flink-app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.datastream import StreamExecutionEnvironment 7 | from pyflink.datastream.connectors.kafka import KafkaSource 8 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 9 | from pyflink.common.watermark_strategy import WatermarkStrategy 10 | from pyflink.common import Time 11 | from pyflink.datastream.window import TumblingProcessingTimeWindows 12 | from pyflink.datastream.state import ValueStateDescriptor 13 | from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction 14 | 15 | 16 | @dataclass 17 | class Payment: 18 | payment_id: str 19 | user_id: str 20 | merchant_id: str 21 | amount: float 22 | payment_time: str 23 | 24 | 25 | def parse_payment(json_str: str) -> Payment: 26 | data = json.loads(json_str) 27 | return Payment( 28 | payment_id=data.get("payment_id", "unknown"), 29 | user_id=data.get("user_id", "unknown"), 30 | merchant_id=data.get("merchant_id", "unknown"), 31 | amount=float(data.get("amount", 0.0)), 32 | payment_time=data.get("payment_time", "unknown") 33 | ) 34 | 35 | 36 | class PaymentsAnomaliesDetector(ProcessWindowFunction): 37 | 38 | def 
open(self, runtime_context): 39 | # TODO: Define state for the total number of payments from a merchant 40 | self.total_count = None 41 | # TODO: Define state for the sum of all payment amounts from a merchant 42 | self.total_amount = None 43 | 44 | def process(self, 45 | key, 46 | context, 47 | elements): 48 | current_total_count = self.total_count.value() or 0 49 | current_total_amount = self.total_amount.value() or 0 50 | 51 | window_total = 0 52 | window_count = 0 53 | 54 | for input in elements: 55 | # TODO: Compute window_total and window_count using elements in the window 56 | 57 | if current_total_count > 0: 58 | # TODO: Compute average payment amount using values from the local state 59 | current_average = None 60 | # TODO: Compute average payment amount for the current window 61 | window_average = None 62 | 63 | if window_average > 1.5 * current_average: 64 | # TODO: Emit a record about a detected anomaly 65 | 66 | # TODO: Update local state using data from the current window 67 | 68 | 69 | def main(): 70 | env = StreamExecutionEnvironment.get_execution_environment() 71 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 72 | 73 | kafka_source = KafkaSource.builder() \ 74 | .set_bootstrap_servers("localhost:9092") \ 75 | .set_topics("payments") \ 76 | .set_group_id("flink-consumer-group") \ 77 | .set_value_only_deserializer(SimpleStringSchema()) \ 78 | .build() 79 | 80 | payments_stream = env.from_source( 81 | kafka_source, 82 | watermark_strategy=WatermarkStrategy.no_watermarks(), 83 | source_name="kafka_source" 84 | ).map(parse_payment) 85 | 86 | 87 | anomalies_stream = payments_stream 88 | # TODO: Add stream processing steps for anomaly detection. 89 | # For each merchant, use PaymentsAnomaliesDetector on 90 | # 10 seconds tumbling windows 91 | 92 | anomalies_stream.print("DetectedAnomalies") 93 | 94 | env.execute("Payment anomalies detection") 95 | 96 | 97 | if __name__ == "__main__": 98 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/dags/05-bookings_per_listing_with_postgres.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator 4 | from airflow.sensors.filesystem import FileSensor 5 | from airflow.providers.postgres.hooks.postgres import PostgresHook 6 | from datetime import datetime, timedelta 7 | import os 8 | import csv 9 | 10 | 11 | @dag( 12 | "bookings_spark_pipeline", 13 | start_date=datetime(2025, 1, 1), 14 | schedule_interval="* * * * *", 15 | catchup=False, 16 | description="", 17 | ) 18 | def bookings_per_listing_with_sensor(): 19 | 20 | @task 21 | def read_bookings_from_postgres(): 22 | context = get_current_context() 23 | execution_date = context["execution_date"] 24 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 25 | 26 | file_path = f"/tmp/data/bookings/{file_date}/bookings.csv" 27 | 28 | start_of_minute = execution_date.replace(second=0, microsecond=0) 29 | end_of_minute = start_of_minute + timedelta(minutes=1) 30 | 31 | pg_hook = PostgresHook(postgres_conn_id="postgres_default") 32 | query = f""" 33 | SELECT booking_id, listing_id, user_id, booking_time, status 34 | FROM bookings 35 | WHERE booking_time >= '{start_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 36 | AND booking_time < '{end_of_minute.strftime('%Y-%m-%d %H:%M:%S')}' 37 | """ 38 | records = 
pg_hook.get_records(query) 39 | 40 | bookings = [] 41 | 42 | print(f"Read {len(records)} from Postgres") 43 | for record in records: 44 | booking = { 45 | "booking_id": record[0], 46 | "listing_id": record[1], 47 | "user_id": record[2], 48 | "booking_time": record[3].strftime('%Y-%m-%d %H:%M:%S'), 49 | "status": record[4] 50 | } 51 | bookings.append(booking) 52 | 53 | directory = os.path.dirname(file_path) 54 | if not os.path.exists(directory): 55 | os.makedirs(directory) 56 | 57 | fieldnames = ["booking_id", "listing_id", "user_id", "booking_time", "status"] 58 | 59 | with open(file_path, "w", newline="") as csvfile: 60 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 61 | writer.writeheader() 62 | for booking in bookings: 63 | writer.writerow({ 64 | "booking_id": booking["booking_id"], 65 | "listing_id": booking["listing_id"], 66 | "user_id": booking["user_id"], 67 | "booking_time": booking["booking_time"], 68 | "status": booking["status"] 69 | }) 70 | 71 | print(f"Generated bookings data written to {file_path}") 72 | 73 | wait_for_listings_file = FileSensor( 74 | task_id="wait_for_listings_file", 75 | fs_conn_id="local_fs", 76 | filepath="/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 77 | poke_interval=30, 78 | timeout=600, 79 | ) 80 | 81 | spark_job = SparkSubmitOperator( 82 | task_id="process_listings_and_bookings", 83 | application="bookings_per_listing_spark.py", 84 | name="listings_bookings_join", 85 | application_args=[ 86 | "--listings_file", "/tmp/data/listings/{{ execution_date.strftime('%Y-%m') }}/listings.csv.gz", 87 | "--bookings_file", "/tmp/data/bookings/{{ execution_date.strftime('%Y-%m-%d_%H-%M') }}/bookings.csv", 88 | "--output_path", "/tmp/data/bookings_per_listing/{{ execution_date.strftime('%Y-%m-%d_%H-%M') }}" 89 | ], 90 | conn_id='spark_default', 91 | ) 92 | 93 | bookings_file = read_bookings_from_postgres() 94 | bookings_file >> spark_job 95 | wait_for_listings_file >> spark_job 96 | 97 | dag_instance = bookings_per_listing_with_sensor() -------------------------------------------------------------------------------- /08-flink-stream-processing/exercises/02-anomalities-detector-solution/flink-app.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.typeinfo import Types 6 | from pyflink.datastream import StreamExecutionEnvironment 7 | from pyflink.datastream.connectors.kafka import KafkaSource 8 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 9 | from pyflink.common.watermark_strategy import WatermarkStrategy 10 | from pyflink.common import Time 11 | from pyflink.datastream.window import TumblingProcessingTimeWindows 12 | from pyflink.datastream.state import ValueStateDescriptor 13 | from pyflink.datastream import StreamExecutionEnvironment, ProcessWindowFunction 14 | 15 | 16 | @dataclass 17 | class Payment: 18 | payment_id: str 19 | user_id: str 20 | merchant_id: str 21 | amount: float 22 | payment_time: str 23 | 24 | 25 | def parse_payment(json_str: str) -> Payment: 26 | data = json.loads(json_str) 27 | return Payment( 28 | payment_id=data.get("payment_id", "unknown"), 29 | user_id=data.get("user_id", "unknown"), 30 | merchant_id=data.get("merchant_id", "unknown"), 31 | amount=float(data.get("amount", 0.0)), 32 | payment_time=data.get("payment_time", "unknown") 33 | ) 34 | 35 | 36 | class 
PaymentsAnomaliesDetector(ProcessWindowFunction): 37 | 38 | def open(self, runtime_context): 39 | self.total_count = runtime_context.get_state( 40 | ValueStateDescriptor("total_count", Types.LONG()) 41 | ) 42 | self.total_amount = runtime_context.get_state( 43 | ValueStateDescriptor("total_amount", Types.DOUBLE()) 44 | ) 45 | 46 | def process(self, 47 | key, 48 | context, 49 | elements): 50 | current_total_count = self.total_count.value() or 0 51 | current_total_amount = self.total_amount.value() or 0 52 | 53 | window_total = 0 54 | window_count = 0 55 | 56 | for input in elements: 57 | window_count += 1 58 | window_total += input.amount 59 | 60 | if current_total_count > 0: 61 | current_average = current_total_amount / current_total_count 62 | window_average = window_total / window_count 63 | 64 | if window_average > 1.5 * current_average: 65 | yield json.dumps({ 66 | "merchant_id": key, 67 | "running_average": current_average, 68 | "window_average": window_average, 69 | }) 70 | 71 | new_total_count = current_total_count + window_count 72 | new_total_amount = current_total_amount + window_total 73 | 74 | self.total_count.update(new_total_count) 75 | self.total_amount.update(new_total_amount) 76 | 77 | 78 | def main(): 79 | env = StreamExecutionEnvironment.get_execution_environment() 80 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 81 | 82 | kafka_source = KafkaSource.builder() \ 83 | .set_bootstrap_servers("localhost:9092") \ 84 | .set_topics("payments") \ 85 | .set_group_id("flink-consumer-group") \ 86 | .set_value_only_deserializer(SimpleStringSchema()) \ 87 | .build() 88 | 89 | payments_stream = env.from_source( 90 | kafka_source, 91 | watermark_strategy=WatermarkStrategy.no_watermarks(), 92 | source_name="kafka_source" 93 | ).map(parse_payment) 94 | 95 | anomalies_stream = payments_stream \ 96 | .key_by(lambda payment: payment.merchant_id) \ 97 | .window(TumblingProcessingTimeWindows.of(Time.seconds(10))) \ 98 | .process(PaymentsAnomaliesDetector(), output_type=Types.STRING()) 99 | 100 | anomalies_stream.print("DetectedAnomalies") 101 | 102 | env.execute("Payment anomalies detection") 103 | 104 | 105 | if __name__ == "__main__": 106 | main() -------------------------------------------------------------------------------- /07-kafka-streaming/exercises/02-kafka-connect-exercise/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 0. Stop all Docker containers 3 | 4 | Before you start, you would need to stop Docker containers related to this bootcamp 5 | running on your machine. 6 | 7 | # 1. Start Kafka 8 | 9 | First, start Kafka, Kafka Connect, and Postrges using Docker Compose: 10 | 11 | ```sh 12 | docker-compose up 13 | ``` 14 | 15 | 16 | # 2. Create a virtual environment and install dependencies 17 | 18 | Run the following commands to create a virtual environment and install dependencies: 19 | 20 | ```sh 21 | python3 -m venv venv 22 | source venv/bin/activate 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | # 3. Create a Debezium connector 27 | 28 | Once you have Kafka Connect running, you need to create a connector to read a stream of updates from Postgres. Run this command to create it: 29 | 30 | ```sh 31 | curl -X POST -H "Content-Type: application/json" -d @config_debezium.json http://localhost:8083/connectors 32 | ``` 33 | 34 | # 4. Connect to a database 35 | 36 | You will need to execute several SQL operations. 
To connect to a database, use the following arguments: 37 | 38 | * *URL* - `127.0.0.1:5432` 39 | * *Username* - `user` 40 | * *Password* - `password` 41 | * *Database* - `onlineshop` 42 | 43 | # 5. Create "orders" table 44 | 45 | First, you need to create the `orders` table using the following SQL statement: 46 | 47 | ```sql 48 | CREATE TABLE orders ( 49 | id SERIAL PRIMARY KEY, 50 | customer_id VARCHAR(50) NOT NULL, 51 | customer_name VARCHAR(100), 52 | customer_email VARCHAR(255), 53 | product_id VARCHAR(50), 54 | total_amount NUMERIC(10, 2), 55 | order_date TIMESTAMPTZ, 56 | status VARCHAR(50), 57 | payment_method VARCHAR(50) 58 | ); 59 | ``` 60 | 61 | 62 | # 6. Alter table 63 | 64 | By default, WAL records produced by Postgres will only contain a table's data after the update. To include a snapshot of the data before the update, we need to run the following SQL command: 65 | 66 | ```sql 67 | ALTER TABLE orders REPLICA IDENTITY FULL; 68 | ``` 69 | 70 | After this command, for every `UPDATE` or `DELETE` operation on the `orders` table, Postgres will log the entire row's data before and after the change in the Write-Ahead Log. 71 | 72 | # 7. Create a new order 73 | 74 | Once you have the table, you can insert a new order into the `orders` table using this SQL statement: 75 | 76 | ```sql 77 | INSERT INTO orders ( 78 | customer_id, 79 | customer_name, 80 | customer_email, 81 | product_id, 82 | total_amount, 83 | order_date, 84 | status, 85 | payment_method 86 | ) 87 | VALUES ( 88 | 'CUST-1234', 89 | 'John Smith', 90 | 'john.smith@example.com', 91 | 'PROD-XYZ789', 92 | 59.95, 93 | '2024-12-09T10:45:00Z', 94 | 'processed', 95 | 'paypal' 96 | ) 97 | RETURNING id; 98 | ``` 99 | 100 | This should return the `id` of the newly created record that you can use to perform an update operation. 101 | 102 | # 8. Update an order status 103 | 104 | Now, we can update the created record. You can do it using this command: 105 | 106 | ```sql 107 | UPDATE orders 108 | SET status = 'refunded' 109 | WHERE id = 1 110 | ``` 111 | 112 | Since it changes the `status` value from `processed` to `refunded`, it should produce a new change event that Kafka Connect will capture. 113 | 114 | # 9. Check if Kafka Connect writes records to Kafka 115 | 116 | Run the following command to test if Kafka Connect writes records to Kafka: 117 | 118 | ```sh 119 | kafka-console-consumer --bootstrap-server localhost:9092 --topic postgres-.public.orders --from-beginning 120 | ``` 121 | 122 | You should see two records: one for the `INSERT` operation and another one for the `UPDATE` operation. 123 | 124 | # 10. Implement and run your consumer and see if it works 125 | 126 | You should now implement and run your Python consumer. 127 | It should print a single message for the executed update operation. 128 | 129 | # 11. (Optional) Create more test records 130 | 131 | If you need more test records, you can repeat steps **7** and **8** again for a new record, but you would need to change the `id` comparison value in the `UPDATE` statement. 
Install Airflow provider 8 | 9 | Since your DAG will read data from Postgres and use Spark, you need to first install Postgres and Spark providers: 10 | 11 | ```sh 12 | pip install apache-airflow-providers-postgres 13 | pip install apache-airflow-providers-apache-spark 14 | ``` 15 | 16 | You need to run this command in the virtual environment you've created for exercises in this section. 17 | 18 | # 2. Start the Postgres database 19 | 20 | Start the Postgres instance from which your DAG will ingest data using the Docker Compose command: 21 | 22 | ```sh 23 | docker-compose up 24 | ``` 25 | 26 | # 3. Create a table in the Postgres database 27 | 28 | Having a database running we can create a table from which Airflow will ingest data. 29 | 30 | To connect to a database use the following parameters: 31 | 32 | * *Host* - `localhost` 33 | * *Database* - `rental_site` 34 | * *Login* - `user` 35 | * *Password* - `password` 36 | * *Port* - `5432` 37 | 38 | Then, execute this statement to create a database for this exercise: 39 | 40 | ```sql 41 | CREATE TABLE customer_reviews ( 42 | review_id SERIAL PRIMARY KEY, 43 | listing_id INT NOT NULL, 44 | review_score INT NOT NULL, 45 | review_comment TEXT, 46 | review_date TIMESTAMP NOT NULL DEFAULT NOW() 47 | ); 48 | ``` 49 | 50 | # 4. Create a Postgres connection in the Airflow UI 51 | 52 | Create a Postgres connection, so your DAG could use a Postgres hook. 53 | 54 | To do it run the following command from your virtual environment: 55 | 56 | ```sh 57 | airflow connections add 'postgres_rental_site' \ 58 | --conn-type 'postgres' \ 59 | --conn-host 'localhost' \ 60 | --conn-login 'user' \ 61 | --conn-password 'password' \ 62 | --conn-port '5432' \ 63 | --conn-schema 'rental_site' 64 | ``` 65 | 66 | # 5. Create a Spark connection in the Airflow UI 67 | 68 | Create a Spark connection, so your DAG could run Spark applications. 69 | 70 | To do it run the following command from your virtual environment: 71 | 72 | ```sh 73 | airflow connections add 'spark_rental_site' \ 74 | --conn-type 'spark' \ 75 | --conn-host 'local' \ 76 | --conn-extra '{"deploy_mode": "client"}' 77 | ``` 78 | 79 | 80 | # 6. Copy the DAG and the Spark code 81 | 82 | Copy the following files to the `dags` folder you've created while setting up Airflow locally: 83 | 84 | * `customer_reviews_dag.py` - Airflow DAGs implementing customer reviews processing 85 | * `spark_etl_reviews.py` - Spark job for processing customer reviews 86 | 87 | # 7. Restart the scheduler 88 | 89 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 90 | 91 | After this, start it again using the following command: 92 | 93 | ```sh 94 | airflow scheduler 95 | ``` 96 | 97 | # 8. Implement the TODOs in the code 98 | 99 | Now implement the TODO comments in the starter code. 100 | 101 | 102 | # 9. Start the DAG 103 | 104 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. 105 | 106 | # 10. Add some test reviews to test the created pipeline 107 | 108 | Now you can test your pipeline. 
102 | # 9. Start the DAG 103 | 104 | Once the DAG is implemented, you can start it by clicking its toggle in the Airflow UI. 105 | 106 | # 10. Add some test reviews to test the created pipeline 107 | 108 | Now you can test your pipeline. Add these reviews to the `customer_reviews` table: 109 | 110 | ```sql 111 | INSERT INTO customer_reviews (listing_id, review_score, review_comment, review_date) 112 | VALUES 113 | (101, 5, 'Excellent stay, highly recommend!', NOW()), 114 | (101, 5, 'Great location!', NOW()), 115 | (102, 4, 'Good location but a bit noisy.', NOW()), 116 | (102, 3, 'Poor room service.', NOW()), 117 | (103, 3, 'Could have been worse.', NOW()); 118 | ``` 119 | 120 | On its next run, your pipeline will read these reviews and compute an average score per listing ID. 121 | -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/02-process-customer-reviews-solution/README.md: -------------------------------------------------------------------------------- 1 | This is a README for the second exercise in this section. 2 | 3 | # 0. Follow the instructions in the README.md file in the "exercises" folder 4 | 5 | Before following steps in this README, follow the steps in the `README.md` file in the `exercises` folder to set up your local Airflow. 6 | 7 | # 1. Install the Airflow providers 8 | 9 | Since your DAG will read data from Postgres and use Spark, you first need to install the Postgres and Spark providers: 10 | 11 | ```sh 12 | pip install apache-airflow-providers-postgres 13 | pip install apache-airflow-providers-apache-spark 14 | ``` 15 | 16 | You need to run these commands in the virtual environment you've created for exercises in this section. 17 | 18 | # 2. Start the Postgres database 19 | 20 | Start the Postgres instance from which your DAG will ingest data using the Docker Compose command: 21 | 22 | ```sh 23 | docker-compose up 24 | ``` 25 | 26 | # 3. Create a table in the Postgres database 27 | 28 | With the database running, we can create the table from which Airflow will ingest data. 29 | 30 | To connect to the database, use the following parameters: 31 | 32 | * *Host* - `localhost` 33 | * *Database* - `rental_site` 34 | * *Login* - `user` 35 | * *Password* - `password` 36 | * *Port* - `5432` 37 | 38 | Then, execute this statement to create the table for this exercise: 39 | 40 | ```sql 41 | CREATE TABLE customer_reviews ( 42 | review_id SERIAL PRIMARY KEY, 43 | listing_id INT NOT NULL, 44 | review_score INT NOT NULL, 45 | review_comment TEXT, 46 | review_date TIMESTAMP NOT NULL DEFAULT NOW() 47 | ); 48 | ``` 49 | 50 | # 4. Create a Postgres connection in Airflow 51 | 52 | Create a Postgres connection so that your DAG can use a Postgres hook. 53 | 54 | To do this, run the following command from your virtual environment: 55 | 56 | ```sh 57 | airflow connections add 'postgres_rental_site' \ 58 | --conn-type 'postgres' \ 59 | --conn-host 'localhost' \ 60 | --conn-login 'user' \ 61 | --conn-password 'password' \ 62 | --conn-port '5432' \ 63 | --conn-schema 'rental_site' 64 | ``` 65 | 66 | # 5. Create a Spark connection in Airflow 67 | 68 | Create a Spark connection so that your DAG can run Spark applications. 69 | 70 | To do this, run the following command from your virtual environment: 71 | 72 | ```sh 73 | airflow connections add 'spark_rental_site' \ 74 | --conn-type 'spark' \ 75 | --conn-host 'local' \ 76 | --conn-extra '{"deploy_mode": "client"}' 77 | ``` 78 | 79 | 80 | # 6. 
Copy the DAG and the Spark code 81 | 82 | Copy the following files to the `dags` folder you've created while setting up Airflow locally: 83 | 84 | * `customer_reviews_dag.py` - Airflow DAGs implementing customer reviews processing 85 | * `spark_etl_reviews.py` - Spark job for processing customer reviews 86 | 87 | # 7. Restart the scheduler 88 | 89 | To restart a scheduler process open the terminal with the running scheduler process, and stop it using the `Ctrl+C`. 90 | 91 | After this, start it again using the following command: 92 | 93 | ```sh 94 | airflow scheduler 95 | ``` 96 | 97 | # 8. Implement the TODOs in the code 98 | 99 | Now implement the TODO comments in the starter code. 100 | 101 | 102 | # 9. Start the DAG 103 | 104 | Once the DAG is implemented you can start it by clicking on the toggle in the Airflow UI for the DAG you've implemented. 105 | 106 | # 10. Add some test reviews to test the created pipeline 107 | 108 | Now you can test your pipeline. Add these reviews to the `customer_reviews` table: 109 | 110 | ```sql 111 | INSERT INTO customer_reviews (listing_id, review_score, review_comment, review_date) 112 | VALUES 113 | (101, 5, 'Excellent stay, highly recommend!', NOW()), 114 | (101, 5, 'Great location!', NOW()), 115 | (102, 4, 'Good location but a bit noisy.', NOW()), 116 | (102, 3, 'Poor room service.', NOW()), 117 | (103, 3, 'Could have been worse.', NOW()); 118 | ``` 119 | 120 | At the next run your pipeline will read these reviews and compute an average score per listing ID. 121 | -------------------------------------------------------------------------------- /08-flink-stream-processing/04-late-events-processing.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | 5 | from pyflink.common import Time 6 | from pyflink.common.serialization import SimpleStringSchema 7 | from pyflink.common.watermark_strategy import WatermarkStrategy, TimestampAssigner 8 | from pyflink.common.time import Duration 9 | from pyflink.common.typeinfo import Types 10 | from pyflink.datastream import OutputTag, StreamExecutionEnvironment, ProcessWindowFunction, TimeCharacteristic 11 | from pyflink.datastream.connectors.kafka import KafkaSource 12 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 13 | from pyflink.datastream.window import TumblingEventTimeWindows 14 | 15 | @dataclass 16 | class Order: 17 | order_id: str 18 | customer_id: str 19 | product_id: str 20 | quantity: int 21 | price: float 22 | order_time: str 23 | 24 | 25 | def parse_order(json_str) -> Order: 26 | data = json.loads(json_str) 27 | order_time_seconds = datetime.fromisoformat(data["order_time"]) 28 | return Order( 29 | order_id=data.get("order_id", "unknown"), 30 | customer_id=data.get("customer_id", "unknown"), 31 | product_id=data.get("product_id", "unknown"), 32 | quantity=data.get("quantity", 0), 33 | price=float(data.get("price", 0.0)), 34 | order_time=data.get("order_time", "unknown") 35 | ) 36 | 37 | 38 | class OrderTimestampAssigner(TimestampAssigner): 39 | def extract_timestamp(self, value, record_timestamp) -> int: 40 | dt = datetime.fromisoformat(value.order_time) 41 | return int(dt.timestamp() * 1000) 42 | 43 | 44 | class AggregateWindowFunction(ProcessWindowFunction): 45 | def process(self, 46 | key, 47 | context, 48 | elements): 49 | 50 | total_quantity = 0 51 | total_sum = 0 52 | 53 | for input in elements: 54 | total_quantity += input.quantity 55 | total_sum += 
input.quantity * input.price 56 | 57 | result = { 58 | "product_id": key, 59 | "total_quantity": total_quantity, 60 | "total_spent": round(total_sum, 2), 61 | "window_start": datetime.utcfromtimestamp( 62 | context.window().start / 1000 63 | ).isoformat(), 64 | "window_end": datetime.utcfromtimestamp( 65 | context.window().end / 1000 66 | ).isoformat(), 67 | } 68 | return [json.dumps(result)] 69 | 70 | 71 | def main(): 72 | env = StreamExecutionEnvironment.get_execution_environment() 73 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 74 | env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 75 | 76 | kafka_source = KafkaSource.builder() \ 77 | .set_bootstrap_servers("localhost:9092") \ 78 | .set_topics("late-orders") \ 79 | .set_group_id("eventtime-demo") \ 80 | .set_value_only_deserializer(SimpleStringSchema()) \ 81 | .build() 82 | 83 | stream = env.from_source( 84 | kafka_source, 85 | watermark_strategy=WatermarkStrategy.no_watermarks(), 86 | source_name="kafka_source" 87 | ) 88 | 89 | watermark_strategy = WatermarkStrategy \ 90 | .for_bounded_out_of_orderness(Duration.of_seconds(10)) \ 91 | .with_timestamp_assigner(OrderTimestampAssigner()) 92 | 93 | late_tag = OutputTag("late-events", Types.PICKLED_BYTE_ARRAY()) 94 | 95 | windowed_stream = stream \ 96 | .map(parse_order) \ 97 | .assign_timestamps_and_watermarks(watermark_strategy) \ 98 | .key_by(lambda x: x.product_id) \ 99 | .window(TumblingEventTimeWindows.of(Time.seconds(30))) \ 100 | .side_output_late_data(late_tag) \ 101 | .process(AggregateWindowFunction(), Types.STRING()) 102 | 103 | windowed_stream.print("Aggregated") 104 | 105 | late_stream = windowed_stream.get_side_output(late_tag) 106 | late_stream.print("LateEvents") 107 | 108 | env.execute("Advanced Event-Time Window Demo") 109 | 110 | 111 | if __name__ == "__main__": 112 | main() -------------------------------------------------------------------------------- /08-flink-stream-processing/06-connecting-streams.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | from pyflink.common.typeinfo import Types 4 | from pyflink.common.serialization import SimpleStringSchema 5 | from pyflink.common.watermark_strategy import WatermarkStrategy 6 | 7 | from pyflink.datastream import ( 8 | StreamExecutionEnvironment, 9 | RuntimeContext 10 | ) 11 | from pyflink.datastream.functions import CoProcessFunction 12 | from pyflink.datastream.execution_mode import RuntimeExecutionMode 13 | from pyflink.datastream.state import ValueStateDescriptor 14 | from pyflink.datastream.connectors.kafka import KafkaSource 15 | 16 | 17 | @dataclass 18 | class Order: 19 | order_id: str 20 | customer_id: str 21 | product_id: str 22 | quantity: int 23 | price: float 24 | order_time: str 25 | 26 | @dataclass 27 | class Product: 28 | product_id: str 29 | name: str 30 | category: str 31 | 32 | 33 | def parse_order(line: str): 34 | data = json.loads(line) 35 | return Order( 36 | order_id=data.get("order_id", ""), 37 | customer_id=data.get("customer_id", ""), 38 | product_id=data.get("product_id", ""), 39 | quantity=int(data.get("quantity", 0)), 40 | price=float(data.get("price", 0.0)), 41 | order_time=data.get("order_time", "") 42 | ) 43 | 44 | def parse_product(line: str): 45 | data = json.loads(line) 46 | return Product( 47 | product_id=data.get("product_id", ""), 48 | name=data.get("name", "Unknown"), 49 | category=data.get("category", "Unknown") 50 | ) 51 | 52 | 53 | class 
OrdersProductsCoProcess(CoProcessFunction): 54 | 55 | def open(self, runtime_context: RuntimeContext): 56 | product_desc = ValueStateDescriptor("product_info", Types.PICKLED_BYTE_ARRAY()) 57 | self.product_state = runtime_context.get_state(product_desc) 58 | 59 | def process_element1(self, value, ctx): 60 | product = self.product_state.value() 61 | 62 | if product: 63 | enriched = { 64 | "order_id": value.order_id, 65 | "customer_id": value.customer_id, 66 | "product_id": value.product_id, 67 | "quantity": value.quantity, 68 | "price": value.price, 69 | "product_name": product.name, 70 | "product_category": product.category, 71 | } 72 | else: 73 | enriched = { 74 | "order_id": value.order_id, 75 | "customer_id": value.customer_id, 76 | "product_id": value.product_id, 77 | "quantity": value.quantity, 78 | "price": value.price, 79 | "product_name": "Unknown", 80 | "product_category": "Unknown", 81 | } 82 | 83 | yield json.dumps(enriched) 84 | 85 | def process_element2(self, value, ctx): 86 | self.product_state.update(value) 87 | 88 | 89 | def main(): 90 | env = StreamExecutionEnvironment.get_execution_environment() 91 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 92 | 93 | orders_source = KafkaSource.builder() \ 94 | .set_bootstrap_servers("localhost:9092") \ 95 | .set_topics("orders") \ 96 | .set_group_id("streams_join_consumer") \ 97 | .set_value_only_deserializer(SimpleStringSchema()) \ 98 | .build() 99 | 100 | products_source = KafkaSource.builder() \ 101 | .set_bootstrap_servers("localhost:9092") \ 102 | .set_topics("products") \ 103 | .set_group_id("streams_join_consumer") \ 104 | .set_value_only_deserializer(SimpleStringSchema()) \ 105 | .build() 106 | 107 | orders_stream = env.from_source( 108 | source=orders_source, 109 | watermark_strategy=WatermarkStrategy.no_watermarks(), 110 | source_name="orders_source" 111 | ).map(parse_order) 112 | 113 | products_stream = env.from_source( 114 | source=products_source, 115 | watermark_strategy=WatermarkStrategy.no_watermarks(), 116 | source_name="products_source" 117 | ).map(parse_product) 118 | 119 | products_stream.print("ProductsStream") 120 | 121 | keyed_orders = orders_stream.key_by(lambda o: o.product_id) 122 | keyed_products = products_stream.key_by(lambda c: c.product_id) 123 | 124 | connected = keyed_orders.connect(keyed_products) 125 | 126 | enriched_stream = connected.process( 127 | OrdersProductsCoProcess(), 128 | output_type=Types.STRING() 129 | ) 130 | 131 | enriched_stream.print("EnrichedOrder") 132 | 133 | env.execute("Connecting streams") 134 | 135 | 136 | if __name__ == "__main__": 137 | main() -------------------------------------------------------------------------------- /04-orchestration-with-airflow/exercises/01-data-validation-solution/data_validation_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import dag, task 2 | from airflow.operators.python import get_current_context 3 | from datetime import datetime 4 | import os 5 | import json 6 | import random 7 | 8 | 9 | @dag( 10 | "data_quality_pipeline", 11 | start_date=datetime(2025, 1, 1), 12 | schedule_interval='* * * * *', 13 | catchup=False, 14 | description="Data Quality Check DAG", 15 | ) 16 | def data_quality_pipeline(): 17 | 18 | CORRECT_PROB = 0.7 19 | 20 | def get_bookings_path(context): 21 | execution_date = context["execution_date"] 22 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 23 | return f"/tmp/data/bookings/{file_date}/bookings.json" 24 | 25 | def 
generate_booking_id(i): 26 | if random.random() < CORRECT_PROB: 27 | return i + 1 28 | 29 | return "" 30 | 31 | def generate_listing_id(): 32 | if random.random() < CORRECT_PROB: 33 | return random.choice([1, 2, 3, 4, 5]) 34 | 35 | return "" 36 | 37 | def generate_user_id(correct_prob=0.7): 38 | return random.randint(1000, 5000) if random.random() < correct_prob else "" 39 | 40 | def generate_booking_time(execution_date): 41 | if random.random() < CORRECT_PROB: 42 | return execution_date.strftime('%Y-%m-%d %H:%M:%S') 43 | 44 | return "" 45 | 46 | def generate_status(): 47 | if random.random() < CORRECT_PROB: 48 | return random.choice(["confirmed", "pending", "cancelled"]) 49 | 50 | return random.choice(["unknown", "", "error"]) 51 | 52 | @task 53 | def generate_bookings(): 54 | context = get_current_context() 55 | booking_path = get_bookings_path(context) 56 | 57 | num_bookings = random.randint(5, 15) 58 | bookings = [] 59 | 60 | for i in range(num_bookings): 61 | booking = { 62 | "booking_id": generate_booking_id(i), 63 | "listing_id": generate_listing_id(), 64 | "user_id": generate_user_id(), 65 | "booking_time": generate_booking_time(context["execution_date"]), 66 | "status": generate_status() 67 | } 68 | bookings.append(booking) 69 | 70 | directory = os.path.dirname(booking_path) 71 | if not os.path.exists(directory): 72 | os.makedirs(directory) 73 | 74 | with open(booking_path, "w") as f: 75 | json.dump(bookings, f, indent=4) 76 | 77 | print(f"Written to file: {booking_path}") 78 | 79 | def get_anomalies_path(context): 80 | execution_date = context["execution_date"] 81 | file_date = execution_date.strftime("%Y-%m-%d_%H-%M") 82 | return f"/tmp/data/anomalies/{file_date}/anomalies.json" 83 | 84 | @task 85 | def quality_check(): 86 | context = get_current_context() 87 | booking_path = get_bookings_path(context) 88 | 89 | anomalies = [] 90 | valid_statuses = {"confirmed", "pending", "cancelled"} 91 | 92 | with open(booking_path, "r") as f: 93 | bookings = json.load(f) 94 | 95 | for index, row in enumerate(bookings): 96 | row_anomalies = [] 97 | if not row["booking_id"]: 98 | row_anomalies.append("Missing booking_id") 99 | if not row["listing_id"]: 100 | row_anomalies.append("Missing listing_id") 101 | if not row["user_id"]: 102 | row_anomalies.append("Missing user_id") 103 | if not row["booking_time"]: 104 | row_anomalies.append("Missing booking_time") 105 | if not row["status"]: 106 | row_anomalies.append("Missing status") 107 | 108 | 109 | if row["status"] and row["status"] not in valid_statuses: 110 | row_anomalies.append(f"Invalid status: {row['status']}") 111 | 112 | if row_anomalies: 113 | anomalies.append({ 114 | "booking_id": index, 115 | "anomalies": row_anomalies, 116 | }) 117 | 118 | anomalies_file = get_anomalies_path(context) 119 | directory = os.path.dirname(anomalies_file) 120 | if not os.path.exists(directory): 121 | os.makedirs(directory) 122 | 123 | with open(anomalies_file, "w") as f: 124 | json.dump(anomalies, f, indent=4) 125 | 126 | print(f"Completed validation for {booking_path}. 
Anomalies found: {len(anomalies)}") 127 | print(f"Result written to {anomalies_file}") 128 | 129 | generate_bookings() >> quality_check() 130 | 131 | dag_instance = data_quality_pipeline() 132 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/exercises/02-aggregation-functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2321b05f-9964-42dd-a049-6226310a5e08", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Spark aggregation functions\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "b409f1b4-0cb7-4544-ae6e-5ecf760cba7d", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"../data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\", \n", 28 | " quote='\"',\n", 29 | " escape='\"', \n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\" \n", 32 | ")\n", 33 | "listings.printSchema()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "ed5808e2-8296-4e55-8b9f-8ced9e7cae0b", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "reviews = spark.read.csv(\"../data/reviews.csv.gz\", \n", 44 | " header=True,\n", 45 | " inferSchema=True,\n", 46 | " sep=\",\",\n", 47 | " quote='\"',\n", 48 | " escape='\"',\n", 49 | " multiLine=True,\n", 50 | " mode=\"PERMISSIVE\"\n", 51 | ")\n", 52 | "reviews.printSchema()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "fde34e78-3feb-4fd3-a675-3831adf3bc73", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# 1. Count the number of reviews per listing using the \"reviews\" dataset\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "5bc370f9-0375-4b2c-a8cf-7fd7437e1978", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# 2. Compute the total number of listings and average review score per host\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "84d9e7bd-341e-4468-881c-ef4491df08b4", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# 3: Find the top ten listings with the highest number of reviews\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "ada1d343-15b9-4245-9801-7dd468ffd9cd", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# 4. Find the top five neighborhoods with the most listings\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "a45177ab-5191-4cbf-8f07-5244d78b4f58", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# 5. Get a data frame with the following four columns:\n", 103 | "# * Listing's ID\n", 104 | "# * Listing's name\n", 105 | "# * Reviewer's name\n", 106 | "# * Review's comment\n", 107 | "# Use \"join\" to combine data from two datasets\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "c70008c5-cb99-4079-8309-e812449bd8d7", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# 6.Get top five listings with the highest average review comment length. 
Only return listings with at least 5 reviews\n", 118 | "# Use the \"length\" function from the \"pyspark.sql.functions\" to get a lenght of a review\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "f2d55601-9eb6-496c-b214-6ad56d2aec53", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# 7. Using the \"join\" operator find listings without reviews.\n", 129 | "# Hint: Use \"left_join\" or \"left_anti\" join type when implementing this\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "888ce8f0-6e49-41ba-b521-64147316cf22", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 3 (ipykernel)", 144 | "language": "python", 145 | "name": "python3" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.13.2" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 5 162 | } 163 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/03-processing-airbnb-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c219df52-cb8d-4431-831a-3751a69062f2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Inside Airbnb data processing\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "ec00f92d-c582-4970-9617-ff0a9852cc45", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\",\n", 28 | " quote='\"',\n", 29 | " escape='\"',\n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\"\n", 32 | ")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "608ed71d-7a78-46df-9190-4fb49c6b62ce", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "review_locations = listings.select(listings.review_scores_location)\n", 43 | "review_locations.show()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "5ae431b3-c1c2-47e8-bae4-fb10f87fb5b3", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "listings \\\n", 54 | " .select(listings.review_scores_location) \\\n", 55 | " .show()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "7739ebc9-8b6b-428e-af57-7e2616f39b14", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "high_score_listings = listings \\\n", 66 | " .filter(listings.review_scores_location > 4.5) \\\n", 67 | " .select('id', 'price', 'name', 'review_scores_location')\n", 68 | "\n", 69 | "high_score_listings.show(20, truncate=False)\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "5306bebf-6f83-430e-affd-6618494210b9", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "high_score_listings.dropna().show(20, truncate=False)" 80 | ] 81 | }, 82 | { 83 | 
"cell_type": "code", 84 | "execution_count": null, 85 | "id": "de1b6945-d850-4640-b985-99c1bd8e228b", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "high_score_listings.schema['price']" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "d42041b9-e078-430a-ab27-7ba97ebb8d75", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from pyspark.sql.functions import regexp_replace\n", 100 | "\n", 101 | "price_num_df = listings \\\n", 102 | " .withColumn('price_num', regexp_replace('price', '[$,]', '').cast('float')) \\\n", 103 | "\n", 104 | "price_num_df.schema['price_num']" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "2611fddb-c125-4c8b-9cbc-387ed12529e0", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "price_num_df \\\n", 115 | " .select('price_num', 'name') \\\n", 116 | " .show(20, truncate=False)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "f3f00f4c-e0ce-4d06-b5fe-0b2cd8539f46", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "price_num_df.filter( (price_num_df.price_num < 100) & (price_num_df.review_scores_location > 4.5)) \\\n", 127 | " .select('name', 'price', 'review_scores_location') \\\n", 128 | " .show(truncate=False)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "ca2e5d7c-ee58-4b09-8eda-cb7b5d1ab898", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "price_num_df.filter('price_num < 100 AND review_scores_location > 4.5') \\\n", 139 | " .select('name', 'price', 'review_scores_location') \\\n", 140 | " .show(truncate=False)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "cfb893b6-003b-40ec-82d3-b5bba6de90c0", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "listings \\\n", 151 | " .select(listings.property_type) \\\n", 152 | " .distinct() \\\n", 153 | " .show(truncate=False)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "67666c54-ea5f-4589-ad3a-5ca82434d1e8", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "listings \\\n", 164 | " .select(listings.property_type, listings.room_type) \\\n", 165 | " .distinct() \\\n", 166 | " .show(truncate=False)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "2686c317-8726-4276-ae8a-d4ce474cd487", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "listings \\\n", 177 | " .select(listings.property_type) \\\n", 178 | " .distinct() \\\n", 179 | " .write \\\n", 180 | " .csv('data/property_types')" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3 (ipykernel)", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.12.7" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 5 205 | } 206 | -------------------------------------------------------------------------------- /05-ml-with-spark/04-pyspark-pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"code", 5 | "execution_count": 8, 6 | "id": "53eb4614-6b2e-4c33-8822-c230a9dba13b", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession \\\n", 13 | " .builder \\\n", 14 | " .appName(\"PySpark pipeline\") \\\n", 15 | " .getOrCreate()" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 9, 21 | "id": "af281fdf-a02b-4602-9c29-c0d22a726ecd", 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "root\n", 29 | " |-- longitude: double (nullable = true)\n", 30 | " |-- latitude: double (nullable = true)\n", 31 | " |-- housing_median_age: double (nullable = true)\n", 32 | " |-- total_rooms: double (nullable = true)\n", 33 | " |-- total_bedrooms: double (nullable = true)\n", 34 | " |-- population: double (nullable = true)\n", 35 | " |-- households: double (nullable = true)\n", 36 | " |-- median_income: double (nullable = true)\n", 37 | " |-- median_house_value: double (nullable = true)\n", 38 | " |-- ocean_proximity: string (nullable = true)\n", 39 | "\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "data = spark.read.csv(\"data/housing.csv\", header=True, inferSchema=True)\n", 45 | "\n", 46 | "data.printSchema()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 10, 52 | "id": "6c567350-0cdf-4d1f-82f5-77871e985665", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "20433" 59 | ] 60 | }, 61 | "execution_count": 10, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "filtered_data = data.na.drop(subset=['total_bedrooms'])\n", 68 | "filtered_data.count()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 11, 74 | "id": "0c891b8b-d1f9-481f-a45f-e9915bab6ea0", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Train size: 16395\n", 82 | "Test size: 4038\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "train_data, test_data = filtered_data.randomSplit([0.8, 0.2], seed=42)\n", 88 | "print(\"Train size: \", train_data.count())\n", 89 | "print(\"Test size: \", test_data.count())" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 12, 95 | "id": "f58c0b33-a7b6-4d68-9bb0-de49309b86a7", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from pyspark.ml.regression import LinearRegression\n", 100 | "from pyspark.ml import Pipeline\n", 101 | "from pyspark.ml.feature import VectorAssembler\n", 102 | "from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler\n", 103 | "\n", 104 | "indexer = StringIndexer(inputCol='ocean_proximity', outputCol='ocean_proximity_index')\n", 105 | "encoder = OneHotEncoder(inputCol='ocean_proximity_index', outputCol='ocean_proximity_vec', dropLast=False)\n", 106 | "\n", 107 | "\n", 108 | "feature_cols = ['housing_median_age', 'total_rooms', 'total_bedrooms',\n", 109 | " 'population', 'households', 'median_income', 'ocean_proximity_vec']\n", 110 | "assembler = VectorAssembler(inputCols=feature_cols, outputCol='unscaled_features')\n", 111 | "scaler = StandardScaler(inputCol='unscaled_features', outputCol='features', withMean=True, withStd=True)\n", 112 | "\n", 113 | "lr = LinearRegression(featuresCol='features', labelCol='median_house_value', regParam=0.001)\n", 114 | "\n", 115 | "pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler, lr])\n", 116 | 
"\n", 117 | "pipeline_model = pipeline.fit(train_data)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 13, 123 | "id": "1ccc3c07-f82e-49c1-b085-7aa4724d3266", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "test_predictions = pipeline_model.transform(test_data)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 14, 133 | "id": "b1b5796c-9e3b-4272-aa35-edaac2541f30", 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "Mean Absolute Error (MAE): 50597.33640580943\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 146 | "\n", 147 | "evaluator_mae = RegressionEvaluator(labelCol='median_house_value', predictionCol='prediction', metricName='mae')\n", 148 | "mae = evaluator_mae.evaluate(test_predictions)\n", 149 | "print(f\"Mean Absolute Error (MAE): {mae}\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "2ada0719-b3d2-42e2-8840-6c0388a11a85", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "c0a1b109-a13a-41f9-b8a2-7d4e86669c8b", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.12.7" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } 191 | -------------------------------------------------------------------------------- /02-data-processing-with-spark/exercises/03-advanced-spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "b0ce2631-b840-4a7f-8183-fc50cb1977ad", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession\n", 11 | "\n", 12 | "spark = SparkSession.builder \\\n", 13 | " .appName(\"Spark aggregation functions\") \\\n", 14 | " .getOrCreate()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "078a3193-f1a5-4e85-b50a-f80cf5908b18", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "listings = spark.read.csv(\"../data/listings.csv.gz\", \n", 25 | " header=True,\n", 26 | " inferSchema=True,\n", 27 | " sep=\",\", \n", 28 | " quote='\"',\n", 29 | " escape='\"', \n", 30 | " multiLine=True,\n", 31 | " mode=\"PERMISSIVE\" \n", 32 | ")\n", 33 | "listings.printSchema()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "32095599-a1da-408e-b315-3e0481e8bb22", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "reviews = spark.read.csv(\"../data/reviews.csv.gz\", \n", 44 | " header=True,\n", 45 | " inferSchema=True,\n", 46 | " sep=\",\",\n", 47 | " quote='\"',\n", 48 | " escape='\"',\n", 49 | " multiLine=True,\n", 50 | " mode=\"PERMISSIVE\"\n", 51 | ")\n", 52 | "reviews.printSchema()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": 
"ba5fc7ba-8be1-4680-aaf1-6724d1399e1d", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# 1. For each listing compute string category depending on its price, and add it as a new column.\n", 63 | "# A category is defined in the following way:\n", 64 | "#\n", 65 | "# * price < 50 -> \"Budget\"\n", 66 | "# * 50 <= price < 150 -> \"Mid-range\"\n", 67 | "# * price >= 150 -> \"Luxury\"\n", 68 | "# \n", 69 | "# Only include listings where the price is not null.\n", 70 | "# Count the number of listings in each category\n", 71 | "\n", 72 | "from pyspark.sql.functions import regexp_replace\n", 73 | "\n", 74 | "listings = listings.withColumn('price_numeric', regexp_replace('price', '[$,]', '').cast('float'))\n", 75 | "\n", 76 | "# TODO: Implement a UDF\n", 77 | "# TODO: Apply the UDF to create a new DataFrame" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "69b6d82e-6255-40bb-be8f-837c0cef6571", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# 2. In this task you will need to compute a santiment score per review, and then an average sentiment score per listing.\n", 88 | "# A santiment score indicates how \"positive\" or \"negative\" a review is. The higher the score the more positive it is, and vice-versa.\n", 89 | "#\n", 90 | "# To compute a sentiment score per review compute the number of positive words in a review and subtract the number of negative\n", 91 | "# words in the same review (the list of words is already provided)\n", 92 | "#\n", 93 | "# To complete this task, compute a DataFrame that contains the following fields:\n", 94 | "# * name - the name of a listing\n", 95 | "# * average_sentiment - average sentiment of reviews computed using the algorithm described above\n", 96 | "from pyspark.sql.types import FloatType\n", 97 | "\n", 98 | "# Lists of positive and negative words\n", 99 | "positive_words = {'good', 'great', 'excellent', 'amazing', 'fantastic', 'wonderful', 'pleasant', 'lovely', 'nice', 'enjoyed'}\n", 100 | "negative_words = {'bad', 'terrible', 'awful', 'horrible', 'disappointing', 'poor', 'hate', 'unpleasant', 'dirty', 'noisy'}\n", 101 | "\n", 102 | "# TODO: Implement the UDF\n", 103 | "def sentiment_score(comment):\n", 104 | " pass\n", 105 | "\n", 106 | "sentiment_score_udf = udf(sentiment_score, FloatType())\n", 107 | "\n", 108 | "reviews_with_sentiment = reviews \\\n", 109 | " .withColumn(\n", 110 | " 'sentiment_score',\n", 111 | " sentiment_score_udf(reviews.comments)\n", 112 | " )\n", 113 | "\n", 114 | "# TODO: Create a final DataFrame" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "637b15b2-66df-4e9b-9bc1-8ba328e14aee", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# 3. 
Rewrite the following code from the previous exercise using SparkSQL:\n", 125 | "#\n", 126 | "# ```\n", 127 | "# from pyspark.sql.functions import length, avg, count\n", 128 | "# \n", 129 | "# reviews_with_comment_length = reviews.withColumn('comment_length', length('comments'))\n", 130 | "# reviews_with_comment_length \\\n", 131 | "# .join(listings, reviews_with_comment_length.listing_id == listings.id, 'inner') \\\n", 132 | "# .groupBy('listing_id').agg(\n", 133 | "# avg(reviews_with_comment_length.comment_length).alias('average_comment_length'),\n", 134 | "# count(reviews_with_comment_length.id).alias('reviews_count')\n", 135 | "# ) \\\n", 136 | "# .filter('reviews_count >= 5') \\\n", 137 | "# .orderBy('average_comment_length', ascending=False) \\\n", 138 | "# .show()\n", 139 | "# ```\n", 140 | "# This was a solution for the task:\n", 141 | "#\n", 142 | "# \"Get top five listings with the highest average review comment length. Only return listings with at least 5 reviews\"\n", 143 | "\n", 144 | "reviews.createOrReplaceTempView(\"reviews\")\n", 145 | "listings.createOrReplaceTempView(\"listings\")\n", 146 | "\n", 147 | "# Write the SQL query\n", 148 | "sql_query = \"\"\"\n", 149 | "...\n", 150 | "\"\"\"\n", 151 | "\n", 152 | "spark \\\n", 153 | " .sql(sql_query) \\\n", 154 | " .show()\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "9cd68c71-0ce8-4a21-be62-a822fce18522", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# 4. [Optional][Challenge]\n", 165 | "# Calculate the average time passed since the first review for each host in the listings dataset. \n", 166 | "# To implement a custom aggregation function you would need to use the \"pandas_udf\" function.\n", 167 | "#\n", 168 | "# Documentation about \"pandas_udf\": https://spark.apache.org/docs/3.4.2/api/python/reference/pyspark.sql/api/pyspark.sql.functions.pandas_udf.html \n", 169 | "#\n", 170 | "# To use \"pandas_udf\" you would need to install two additional dependencies in the virtual environment you use for PySpark.\n", 171 | "# Run these commands:\n", 172 | "# ```\n", 173 | "# pip install pandas\n", 174 | "# pip install pyarrow\n", 175 | "# ```\n", 176 | "\n", 177 | "from pyspark.sql.functions import col, pandas_udf\n", 178 | "from pyspark.sql.types import DoubleType\n", 179 | "from pyspark.sql.functions import PandasUDFType\n", 180 | "import pandas as pd\n", 181 | "\n", 182 | "@pandas_udf(DoubleType(), functionType=PandasUDFType.GROUPED_AGG)\n", 183 | "def average_days_since_first_review_udf(first_review_series) -> float:\n", 184 | " # TODO: Implement the UDF\n", 185 | " pass\n", 186 | "\n", 187 | "listings \\\n", 188 | " .filter(\n", 189 | " listings.first_review.isNotNull()\n", 190 | " ) \\\n", 191 | " .groupBy('host_id') \\\n", 192 | " .agg(\n", 193 | " average_days_since_first_review_udf(listings.first_review).alias('average_days_since_first_review_days')\n", 194 | " ) \\\n", 195 | " .show()" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3 (ipykernel)", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.12.7" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 5 
220 | } 221 | --------------------------------------------------------------------------------