├── api
│   ├── requirements.txt
│   ├── Dockerfile
│   ├── request.py
│   ├── api.py
│   └── template
│       └── customers.html
├── img
│   ├── red.png
│   └── download.png
├── src
│   ├── requirements.txt
│   ├── Dockerfile
│   └── setup_connectors.sh
├── generate_data
│   ├── requirements.txt
│   ├── Dockerfile
│   └── generate_data.py
├── kafka
│   └── Dockerfile
├── .env
├── README.md
├── spark
│   ├── Dockerfile
│   └── spark-defaults.conf
├── notebooks
│   ├── spark-defaults.conf
│   ├── Dockerfile
│   └── pyspark.ipynb
└── docker-compose.yml
/api/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==2.1.1
2 | mysql-connector-python==8.0.28
3 | pandas==1.3.5
4 |
--------------------------------------------------------------------------------
/img/red.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Stefen-Taime/stream-ingestion-redpanda-minio/HEAD/img/red.png
--------------------------------------------------------------------------------
/src/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.5.2
2 | SQLAlchemy==1.4.46
3 | PyMySQL==1.0.2
4 | kafka-python==2.0.2
--------------------------------------------------------------------------------
/img/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Stefen-Taime/stream-ingestion-redpanda-minio/HEAD/img/download.png
--------------------------------------------------------------------------------
/generate_data/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==2.1.1
2 | mysql-connector-python==8.0.28
3 | pandas==1.3.5
4 | Faker==13.1.0
5 |
--------------------------------------------------------------------------------
/src/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 |
3 | WORKDIR /opt/src
4 | COPY requirements.txt /opt/src
5 | RUN pip install --upgrade pip && pip install -r requirements.txt
6 |
7 | ENTRYPOINT ["tail", "-f", "/dev/null"]
--------------------------------------------------------------------------------
/generate_data/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt .
6 | RUN pip install --no-cache-dir -r requirements.txt
7 |
8 | COPY . .
9 |
10 | CMD ["python", "generate_data.py"]
11 |
--------------------------------------------------------------------------------
/api/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt .
6 | RUN pip install --no-cache-dir -r requirements.txt
7 |
8 | COPY . .
9 |
10 | EXPOSE 8000
11 |
12 | CMD ["python", "api.py"]
13 |
--------------------------------------------------------------------------------
/kafka/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debezium/connect
2 |
3 | RUN curl -O https://d1i4a15mxbxib1.cloudfront.net/api/plugins/confluentinc/kafka-connect-s3/versions/10.3.1/confluentinc-kafka-connect-s3-10.3.1.zip \
4 | && unzip confluentinc-kafka-connect-s3-10.3.1.zip \
5 | && mv confluentinc-kafka-connect-s3-10.3.1 /kafka/connect/ \
6 | && rm confluentinc-kafka-connect-s3-10.3.1.zip
--------------------------------------------------------------------------------
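
Once the kafka-connect image built from this Dockerfile is running, the Kafka Connect REST API can confirm that the Confluent S3 sink plugin was actually unpacked into the plugin path. A minimal check (not part of the repository), assuming the worker's port 8083 is published on localhost as in docker-compose.yml:

    import requests

    # Ask the Connect worker which connector plugins it has loaded.
    plugins = requests.get("http://localhost:8083/connector-plugins").json()
    for plugin in plugins:
        print(plugin["class"])

    # Both io.debezium.connector.mysql.MySqlConnector and
    # io.confluent.connect.s3.S3SinkConnector should be listed.
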
/.env:
--------------------------------------------------------------------------------
1 | # redpanda-console
2 | KAFKA_BROKERS="redpanda:9092"
3 |
4 | # MinIO
5 | MINIO_ROOT_USER="minio"
6 | MINIO_ROOT_PASSWORD="minio123"
7 | MINIO_ACCESS_KEY="minio"
8 | MINIO_SECRET_KEY="minio123"
9 |
10 | # MySQL
11 | MYSQL_ROOT_PASSWORD="debezium"
12 | MYSQL_USER="admin"
13 | MYSQL_PASSWORD="admin123"
14 |
15 | # kafka connect
16 | BOOTSTRAP_SERVERS="redpanda:9092"
17 | GROUP_ID="1"
18 | CONFIG_STORAGE_TOPIC="debezium.configs"
19 | OFFSET_STORAGE_TOPIC="debezium.offset"
20 | STATUS_STORAGE_TOPIC="debezium.status"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-Time Data Processing and Analytics with Docker, MySQL, Redpanda, MinIO, and Apache Spark Using Delta Lake
2 |
3 |
4 |
5 | ## Architectural overview
6 |
7 | 
8 |
9 | In this article ([Medium](https://medium.com/@stefentaime_10958/real-time-data-processing-and-analytics-with-docker-mysql-redpanda-minio-and-apache-spark-eca83f210ef6)), you will learn how to set up a real-time data processing and analytics environment using Docker, MySQL, Redpanda, MinIO, and Apache Spark.
10 |
11 |
--------------------------------------------------------------------------------
/api/request.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | data = {
4 | "id": "5a5c562e-4386-44ad-bf6f-bab91081781e",
5 | "plate_number": "7695-OOO",
6 | "car_make": "Ford",
7 | "car_year": 2012,
8 | "owner_name": "Stefen",
9 | "owner_address": "92834 Kim Unions\nPort Harryport, MD 61729",
10 | "owner_phone_number": "+1505698632",
11 | "subscription_status": "active",
12 | "subscription_start": None,
13 | "subscription_end": None,
14 | "balance": 100.0,
15 | "timestamp": "2023-03-03T14:37:49",
16 | "rate": 9.99
17 | }
18 |
19 | response = requests.post("http://0.0.0.0:8000/send_data", json=data)
20 |
21 | print(response.status_code)
22 | print(response.json())
23 |
--------------------------------------------------------------------------------
/spark/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/bitnami/spark:3.3
2 |
3 | USER root
4 |
5 | # Install prerequisites
6 | RUN apt-get update && apt-get install -y curl
7 |
8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \
9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \
10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \
11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \
12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \
13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \
14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \
15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars
--------------------------------------------------------------------------------
/spark/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar
2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension
3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog
4 | spark.hadoop.fs.s3a.endpoint http://minio:9000
5 | spark.hadoop.fs.s3a.access.key minio
6 | spark.hadoop.fs.s3a.secret.key minio123
7 | spark.hadoop.fs.s3a.path.style.access true
8 | spark.hadoop.fs.s3a.connection.ssl.enabled false
9 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
--------------------------------------------------------------------------------
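
These defaults are mounted into the Spark containers by docker-compose.yml, so a SparkSession created on the cluster reaches MinIO through s3a without any per-job configuration. A minimal sketch (illustrative, not part of the repository), assuming the CDC sink has already written JSON files under the warehouse bucket:

    from pyspark.sql import SparkSession

    # The Delta extension, the s3a endpoint and the MinIO credentials all come
    # from spark-defaults.conf above.
    spark = SparkSession.builder.appName("minio-smoke-test").getOrCreate()

    # Any s3a:// path is resolved against http://minio:9000 with path-style access.
    df = spark.read.json("s3a://warehouse/inventory/dbserver1.inventory.customers/partition=0/*.json")
    df.printSchema()
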
/notebooks/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.jars /usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar
2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension
3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog
4 | spark.hadoop.fs.s3a.endpoint http://minio:9000
5 | spark.hadoop.fs.s3a.access.key minio
6 | spark.hadoop.fs.s3a.secret.key minio123
7 | spark.hadoop.fs.s3a.path.style.access true
8 | spark.hadoop.fs.s3a.connection.ssl.enabled false
9 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
--------------------------------------------------------------------------------
/notebooks/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/all-spark-notebook:python-3.8
2 |
3 | USER root
4 |
5 | RUN curl -O https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz \
6 | && tar zxvf spark-3.3.2-bin-hadoop3.tgz \
7 | && rm -rf spark-3.3.2-bin-hadoop3.tgz \
8 | && mv spark-3.3.2-bin-hadoop3/ /usr/local/ \
9 | && rm -rf /usr/local/spark \
10 | && rm -rf /usr/local/spark-3.3.0-bin-hadoop3 \
11 | && ln -s /usr/local/spark-3.3.2-bin-hadoop3 /usr/local/spark
12 |
13 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \
14 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \
15 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \
16 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \
17 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \
18 | && curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \
19 | && mv s3-2.18.41.jar /usr/local/spark/jars \
20 | && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \
21 | && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \
22 | && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \
23 | && mv delta-storage-2.2.0.jar /usr/local/spark/jars \
24 | && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars
--------------------------------------------------------------------------------
/src/setup_connectors.sh:
--------------------------------------------------------------------------------
1 | # create connector source for MySQL
2 | curl --request POST \
3 | --url http://localhost:8083/connectors \
4 | --header 'Content-Type: application/json' \
5 | --data '{
6 | "name": "src-mys",
7 | "config": {
8 | "connector.class": "io.debezium.connector.mysql.MySqlConnector",
9 | "tasks.max": "1",
10 | "database.hostname": "mysql",
11 | "database.port": "3306",
12 | "database.user": "debezium",
13 | "database.password": "dbz",
14 | "database.server.id": "184054",
15 | "database.include.list": "inventory",
16 | "decimal.handling.mode": "double",
17 | "topic.prefix": "dbserver1",
18 | "schema.history.internal.kafka.bootstrap.servers": "redpanda:9092",
19 | "schema.history.internal.kafka.topic": "schema-changes.inventory"
20 | }
21 | }'
22 |
23 | # create connector sink MySQL to S3
24 | curl --request POST \
25 | --url http://localhost:8083/connectors \
26 | --header 'Content-Type: application/json' \
27 | --data '{
28 | "name": "sink_aws-s3",
29 | "config": {
30 | "topics.regex": "dbserver1.inventory.*",
31 | "topics.dir": "inventory",
32 | "connector.class": "io.confluent.connect.s3.S3SinkConnector",
33 | "key.converter": "org.apache.kafka.connect.json.JsonConverter",
34 | "value.converter": "org.apache.kafka.connect.json.JsonConverter",
35 | "format.class": "io.confluent.connect.s3.format.json.JsonFormat",
36 | "flush.size": "1",
37 | "store.url": "http://minio:9000",
38 | "storage.class": "io.confluent.connect.s3.storage.S3Storage",
39 | "s3.region": "us-east-1",
40 | "s3.bucket.name": "warehouse",
41 | "aws.access.key.id": "minio",
42 | "aws.secret.access.key": "minio123"
43 | }
44 | }'
45 |
--------------------------------------------------------------------------------
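
After the two curl calls above, the Connect REST API reports whether the source and sink actually started. A small verification sketch (not part of the repository), again assuming the worker is reachable on localhost:8083:

    import requests

    CONNECT_URL = "http://localhost:8083"

    # Expect ["src-mys", "sink_aws-s3"] once both POSTs have succeeded.
    names = requests.get(f"{CONNECT_URL}/connectors").json()
    print(names)

    # Each connector and its tasks should report RUNNING.
    for name in names:
        status = requests.get(f"{CONNECT_URL}/connectors/{name}/status").json()
        print(name, status["connector"]["state"], [t["state"] for t in status["tasks"]])
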
/api/api.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, jsonify, render_template
2 | import mysql.connector
3 | import pandas as pd
4 |
5 | app = Flask(__name__, template_folder='template')
6 |
7 | db_config = {
8 | "host": "10.0.0.25",
9 | "user": "root",
10 | "password": "debezium",
11 | "database": "inventory"
12 | }
13 |
14 | @app.route('/send_data', methods=['POST'])
15 | def send_data():
16 | data = request.get_json()
17 |
18 |
19 | conn = mysql.connector.connect(**db_config)
20 |
21 | cursor = conn.cursor()
22 |
23 | insert_query = '''
24 | INSERT INTO customers (id, plate_number, car_make, car_year, owner_name, owner_address, owner_phone_number, subscription_status, subscription_start, subscription_end, balance, timestamp)
25 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
26 | '''
27 | cursor.execute(insert_query, (
28 | data['id'],
29 | data['plate_number'],
30 | data['car_make'],
31 | data['car_year'],
32 | data['owner_name'],
33 | data['owner_address'],
34 | data['owner_phone_number'],
35 | data['subscription_status'],
36 | data['subscription_start'],
37 | data['subscription_end'],
38 | data['balance'],
39 | data['timestamp']
40 | ))
41 |
42 | conn.commit()
43 |
44 | cursor.close()
45 | conn.close()
46 |
47 | return jsonify({"status": "success"}), 200
48 |
49 | @app.route('/customers', methods=['GET'])
50 | def customers():
51 | plate_number = request.args.get('plate_number', '')
52 | page = int(request.args.get('page', 1))
53 | items_per_page = 10
54 |
55 | conn = mysql.connector.connect(**db_config)
56 |
57 | # Create a cursor
58 | cursor = conn.cursor()
59 |
60 | # Fetch customers filtered by plate_number and apply pagination
61 | select_query = '''
62 | SELECT * FROM customers
63 | WHERE plate_number LIKE %s
64 | LIMIT %s OFFSET %s
65 | '''
66 | cursor.execute(select_query, (f"%{plate_number}%", items_per_page, (page - 1) * items_per_page))
67 | customers = cursor.fetchall()
68 |
69 | # Get the total number of customers
70 | cursor.execute("SELECT COUNT(*) FROM customers WHERE plate_number LIKE %s", (f"%{plate_number}%",))
71 | total_customers = cursor.fetchone()[0]
72 |
73 | # Close the cursor and connection
74 | cursor.close()
75 | conn.close()
76 |
77 | return render_template('customers.html', customers=customers, plate_number=plate_number, page=page, total_pages=max(1, (total_customers + items_per_page - 1) // items_per_page))
78 |
79 |
80 | if __name__ == '__main__':
81 | app.run(host='0.0.0.0', port=8000)
--------------------------------------------------------------------------------
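
The /customers endpoint can be exercised the same way request.py exercises /send_data. A hedged example (not part of the repository), assuming the API container publishes port 8000 on localhost as in docker-compose.yml:

    import requests

    # Fetch the first page of customers whose plate number contains "OOO".
    response = requests.get(
        "http://localhost:8000/customers",
        params={"plate_number": "OOO", "page": 1},
    )

    print(response.status_code)
    # The endpoint renders customers.html, so the body is HTML rather than JSON.
    print(response.text[:200])
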
/api/template/customers.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html>
<head>
    <title>Customers</title>
</head>
<body>
    <h1>Customers</h1>

    <!-- plate-number search form (markup not preserved in this extraction) -->

    <table>
        <thead>
            <tr>
                <th>Plate Number</th>
                <th>Car Make</th>
                <th>Car Year</th>
                <th>Owner Name</th>
                <th>Address</th>
                <th>Balance</th>
            </tr>
        </thead>
        <tbody>
            {% for customer in customers %}
            <tr>
                <td>{{ customer[1] }}</td>
                <td>{{ customer[2] }}</td>
                <td>{{ customer[3] }}</td>
                <td>{{ customer[4] }}</td>
                <td>{{ customer[5] }}</td>
                <td>{{ customer[10] }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>

    <!-- pagination controls using page and total_pages (markup not preserved in this extraction) -->
</body>
</html>
--------------------------------------------------------------------------------
/generate_data/generate_data.py:
--------------------------------------------------------------------------------
1 | import random
2 | import uuid
3 | from faker import Faker
4 | import pandas as pd
5 | import mysql.connector
6 | from datetime import datetime, timedelta
7 |
8 | # Initialize Faker
9 | fake = Faker()
10 |
11 | # Number of data points to generate
12 | num_records = 1000
13 |
14 | # Generate synthetic data
15 | data = []
16 |
17 | for _ in range(num_records):
18 | unique_id = str(uuid.uuid4())
19 | plate_number = f"{random.randint(1000, 9999)}-{fake.random_element(elements=('AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG', 'HHH', 'III', 'JJJ', 'KKK', 'LLL', 'MMM', 'NNN', 'OOO', 'PPP', 'QQQ', 'RRR', 'SSS', 'TTT', 'UUU', 'VVV', 'WWW', 'XXX', 'YYY', 'ZZZ'))}"
20 |
21 | car_info = {
22 | "make": fake.random_element(elements=("Toyota", "Honda", "Ford", "Chevrolet", "Nissan", "Volkswagen", "BMW", "Mercedes-Benz")),
23 | "year": random.randint(2000, 2023)
24 | }
25 |
26 | owner_info = {
27 | "name": fake.name(),
28 | "address": fake.address(),
29 | "phone_number": fake.phone_number().replace("x", " ext. ") # Modify phone number format
30 | }
31 |
32 | subscription_status = fake.random_element(elements=("active", "expired", "none"))
33 |
34 | if subscription_status != "none":
35 | subscription_start = fake.date_between(start_date='-3y', end_date='today')
36 | subscription_end = subscription_start + timedelta(days=365)
37 | else:
38 | subscription_start = None
39 | subscription_end = None
40 |
41 | balance = round(random.uniform(0, 500), 2)
42 |
43 | timestamp = fake.date_time_between(start_date='-30d', end_date='now').strftime('%Y-%m-%d %H:%M:%S')
44 |
45 |
46 | record = {
47 | "id": unique_id,
48 | "plate_number": plate_number,
49 | "car_make": car_info["make"],
50 | "car_year": car_info["year"],
51 | "owner_name": owner_info["name"],
52 | "owner_address": owner_info["address"],
53 | "owner_phone_number": owner_info["phone_number"],
54 | "subscription_status": subscription_status,
55 | "subscription_start": subscription_start,
56 | "subscription_end": subscription_end,
57 | "balance": balance,
58 | "timestamp": timestamp
59 | }
60 |
61 | data.append(record)
62 |
63 | # Convert data to a pandas DataFrame
64 | df = pd.DataFrame(data)
65 |
66 | # Connect to the MySQL database
67 | db_config = {
68 | "host": "mysql",
69 | "user": "root",
70 | "password": "debezium",
71 | "database": "inventory"
72 | }
73 | conn = mysql.connector.connect(**db_config)
74 |
75 | # Create a cursor
76 | cursor = conn.cursor()
77 |
78 | # Create the 'customers' table if it doesn't exist
79 | create_table_query = '''
80 | CREATE TABLE IF NOT EXISTS customers (
81 | id VARCHAR(255) NOT NULL,
82 | plate_number VARCHAR(255) NOT NULL,
83 | car_make VARCHAR(255) NOT NULL,
84 | car_year INT NOT NULL,
85 | owner_name VARCHAR(255) NOT NULL,
86 | owner_address TEXT NOT NULL,
87 | owner_phone_number VARCHAR(255) NOT NULL,
88 | subscription_status ENUM('active', 'expired', 'none') NOT NULL,
89 | subscription_start DATE,
90 | subscription_end DATE,
91 | balance DECIMAL(10, 2) NOT NULL,
92 | timestamp TIMESTAMP NOT NULL
93 | )
94 | '''
95 | cursor.execute(create_table_query)
96 |
97 | # Store the synthetic data in the 'customers' table
98 | for index, row in df.iterrows():
99 | insert_query = '''
100 | INSERT INTO customers (id, plate_number, car_make, car_year, owner_name, owner_address, owner_phone_number, subscription_status, subscription_start, subscription_end, balance, timestamp)
101 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
102 | '''
103 | cursor.execute(insert_query, (
104 | row['id'],
105 | row['plate_number'],
106 | row['car_make'],
107 | row['car_year'],
108 | row['owner_name'],
109 | row['owner_address'],
110 | row['owner_phone_number'],
111 | row['subscription_status'],
112 | row['subscription_start'],
113 | row['subscription_end'],
114 | row['balance'],
115 | row['timestamp']
116 | ))
117 |
118 | # Commit the changes and close the cursor
119 | conn.commit()
120 | cursor.close()
121 |
122 | # Close the database connection
123 | conn.close()
124 |
125 | print("Synthetic data stored in the 'customers' table in the MySQL database")
126 |
--------------------------------------------------------------------------------
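
A quick way to confirm the generator ran is to count the rows it inserted. A minimal check (not part of the repository), using the same credentials as above; it assumes it runs inside a container on spark_network, or use host="localhost" with the published 3306 port from the host machine:

    import mysql.connector

    # Same connection settings as generate_data.py.
    conn = mysql.connector.connect(host="mysql", user="root", password="debezium", database="inventory")
    cursor = conn.cursor()

    cursor.execute("SELECT COUNT(*) FROM customers")
    print("customers rows:", cursor.fetchone()[0])

    cursor.close()
    conn.close()
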
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3.7"
2 | services:
3 | redpanda:
4 | image: vectorized/redpanda
5 | container_name: redpanda
6 | ports:
7 | - "9092:9092"
8 | - "29092:29092"
9 | command:
10 | - redpanda
11 | - start
12 | - --overprovisioned
13 | - --smp
14 | - "1"
15 | - --memory
16 | - "1G"
17 | - --reserve-memory
18 | - "0M"
19 | - --node-id
20 | - "0"
21 | - --kafka-addr
22 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092
23 | - --advertise-kafka-addr
24 | - PLAINTEXT://redpanda:29092,OUTSIDE://redpanda:9092
25 | - --check=false
26 | networks:
27 | - spark_network
28 |
29 | redpanda-console:
30 | image: vectorized/console
31 | container_name: redpanda_console
32 | depends_on:
33 | - redpanda
34 | ports:
35 | - "5000:8080"
36 | env_file:
37 | - .env
38 | networks:
39 | - spark_network
40 |
41 | minio:
42 | hostname: minio
43 | image: "minio/minio"
44 | container_name: minio
45 | ports:
46 | - "9001:9001"
47 | - "9000:9000"
48 | command: [ "server", "/data", "--console-address", ":9001" ]
49 | volumes:
50 | - ./minio/data:/data
51 | env_file:
52 | - .env
53 | networks:
54 | - spark_network
55 |
56 | mc:
57 | image: minio/mc
58 | container_name: mc
59 | hostname: mc
60 | environment:
61 | - AWS_ACCESS_KEY_ID=minio
62 | - AWS_SECRET_ACCESS_KEY=minio123
63 | - AWS_REGION=us-east-1
64 | entrypoint: >
65 | /bin/sh -c " until (/usr/bin/mc config host add minio http://minio:9000 minio minio123) do echo '...waiting...' && sleep 1; done; /usr/bin/mc mb minio/warehouse; /usr/bin/mc policy set public minio/warehouse; exit 0; "
66 | depends_on:
67 | - minio
68 | networks:
69 | - spark_network
70 |
71 | mysql:
72 | image: debezium/example-mysql:1.6
73 | container_name: mysql
74 | volumes:
75 | - ./mysql/data:/var/lib/mysql
76 | ports:
77 | - "3306:3306"
78 | env_file:
79 | - .env
80 | networks:
81 | - spark_network
82 |
83 | kafka-connect:
84 | build:
85 | context: ./kafka
86 | dockerfile: ./Dockerfile
87 | container_name: kafka_connect
88 | depends_on:
89 | - redpanda
90 | ports:
91 | - "8083:8083"
92 | env_file:
93 | - .env
94 | networks:
95 | - spark_network
96 |
97 | adminer:
98 | image: adminer:latest
99 | ports:
100 | - 8085:8080/tcp
101 | deploy:
102 | restart_policy:
103 | condition: on-failure
104 | networks:
105 | - spark_network
106 |
107 | spark-master:
108 | build:
109 | context: ./spark
110 | dockerfile: ./Dockerfile
111 | container_name: "spark-master"
112 | environment:
113 | - SPARK_MODE=master
114 | - SPARK_LOCAL_IP=spark-master
115 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
116 | - SPARK_RPC_ENCRYPTION_ENABLED=no
117 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
118 | - SPARK_SSL_ENABLED=no
119 | ports:
120 | - "7077:7077"
121 | - "8080:8080"
122 | volumes:
123 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
124 | networks:
125 | - spark_network
126 |
127 | spark-worker-1:
128 | image: docker.io/bitnami/spark:3.3
129 | container_name: "spark-worker-1"
130 | environment:
131 | - SPARK_MODE=worker
132 | - SPARK_MASTER_URL=spark://spark-master:7077
133 | - SPARK_WORKER_MEMORY=4G
134 | - SPARK_WORKER_CORES=1
135 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
136 | - SPARK_RPC_ENCRYPTION_ENABLED=no
137 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
138 | - SPARK_SSL_ENABLED=no
139 | networks:
140 | - spark_network
141 |
142 | spark-worker-2:
143 | image: docker.io/bitnami/spark:3.3
144 | container_name: "spark-worker-2"
145 | environment:
146 | - SPARK_MODE=worker
147 | - SPARK_MASTER_URL=spark://spark-master:7077
148 | - SPARK_WORKER_MEMORY=4G
149 | - SPARK_WORKER_CORES=1
150 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
151 | - SPARK_RPC_ENCRYPTION_ENABLED=no
152 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
153 | - SPARK_SSL_ENABLED=no
154 | networks:
155 | - spark_network
156 |
157 | spark-notebook:
158 | build:
159 | context: ./notebooks
160 | dockerfile: ./Dockerfile
161 | container_name: "spark-notebook"
162 | user: root
163 | environment:
164 | - JUPYTER_ENABLE_LAB="yes"
165 | - GRANT_SUDO="yes"
166 | volumes:
167 | - ./notebooks:/home/jovyan/work
168 | - ./notebooks/spark-defaults.conf:/usr/local/spark/conf/spark-defaults.conf
169 | ports:
170 | - "8888:8888"
171 | - "4040:4040"
172 | networks:
173 | - spark_network
174 |
175 | generate_data:
176 | build: ./generate_data
177 | container_name: generate_data
178 | command: python generate_data.py
179 | depends_on:
180 | - mysql
181 | networks:
182 | - spark_network
183 |
184 | api:
185 | build: ./api
186 | ports:
187 | - "8000:8000"
188 | depends_on:
189 | - mysql
190 |
191 |
192 | networks:
193 | spark_network:
194 | driver: bridge
195 | name: spark_network
--------------------------------------------------------------------------------
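
Once the stack is up and the Debezium source connector is registered, Redpanda should hold one CDC topic per captured table. A small sketch using kafka-python (already pinned in src/requirements.txt), assuming it runs from a container attached to spark_network so the redpanda hostname from .env resolves:

    from kafka import KafkaConsumer

    # Connect to the broker address advertised by Redpanda.
    consumer = KafkaConsumer(bootstrap_servers="redpanda:9092")

    # Topics such as dbserver1.inventory.customers appear once the source connector is running.
    for topic in sorted(consumer.topics()):
        print(topic)

    consumer.close()
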
/notebooks/pyspark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "2ef67329-7f99-451a-8bdb-e91e369d034b",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Requirement already satisfied: pyspark in /usr/local/spark-3.3.2-bin-hadoop3/python (3.3.2)\n",
14 | "Collecting py4j==0.10.9.5\n",
15 | " Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)\n",
16 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
17 | "\u001b[?25hInstalling collected packages: py4j\n",
18 | "Successfully installed py4j-0.10.9.5\n",
19 | "+--------------------+------+---+--------------------+-----------+-------------+\n",
20 | "| after|before| op| source|transaction| ts_ms|\n",
21 | "+--------------------+------+---+--------------------+-----------+-------------+\n",
22 | "|{100.0, Ford, 201...| null| r|{mysql, inventory...| null|1679965843817|\n",
23 | "+--------------------+------+---+--------------------+-----------+-------------+\n",
24 | "\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "!pip install pyspark\n",
30 | "\n",
31 | "from pyspark.sql import SparkSession\n",
32 | "\n",
33 | "# Initialize Spark session\n",
34 | "spark = SparkSession.builder \\\n",
35 | " .appName(\"Inventory ETL\") \\\n",
36 | " .config(\"spark.sql.parquet.datetimeRebaseModeInWrite\", \"LEGACY\") \\\n",
37 | " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.2.0\") \\\n",
38 | " .getOrCreate()\n",
39 | "\n",
40 | "# Set the necessary AWS credentials\n",
41 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.access.key\", \"minio\")\n",
42 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.secret.key\", \"minio123\")\n",
43 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.endpoint\", \"minio:9000\")\n",
44 | "\n",
45 | "# Set the path to the JSON file\n",
46 | "get_users_file = \"s3a://warehouse/inventory/dbserver1.inventory.customers/partition=0/*.json\"\n",
47 | "\n",
48 | "# Read the JSON file\n",
49 | "raw_data = spark.read \\\n",
50 | " .format(\"json\") \\\n",
51 | " .option(\"inferSchema\", \"true\") \\\n",
52 | " .json(get_users_file)\n",
53 | "\n",
54 | "# Display raw data\n",
55 | "raw_data.show()\n"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 2,
61 | "id": "34027a94-39cd-4dde-9bc0-43e99417bd66",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# Process the data\n",
66 | "silver_data = raw_data.select(\n",
67 | " \"after.id\",\n",
68 | " \"after.plate_number\",\n",
69 | " \"after.car_make\",\n",
70 | " \"after.car_year\",\n",
71 | " \"after.owner_name\",\n",
72 | " \"after.owner_address\",\n",
73 | " \"after.owner_phone_number\",\n",
74 | " \"after.subscription_status\",\n",
75 | " \"after.subscription_start\",\n",
76 | " \"after.subscription_end\",\n",
77 | " \"after.balance\",\n",
78 | " \"after.timestamp\"\n",
79 | ")"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "id": "a4e36db0-4039-4792-ad0a-c81d8d8a3ca4",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "silver_data.write.parquet(\"s3a://warehouse/inventory/silver_data\", mode=\"overwrite\")"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 4,
95 | "id": "bc38035e-bccc-419c-9f5d-784b6d8bf5a9",
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "id": "10794ce4-7037-487d-a163-e3f344047589",
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n",
113 | "| id|plate_number|car_make|car_year|owner_name| owner_address|owner_phone_number|subscription_status|subscription_start|subscription_end|balance| timestamp|\n",
114 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n",
115 | "|5a5c562e-4386-44a...| 7695-OOO| Ford| 2012| Stefen|92834 Kim Unions\\...| +14385064453| active| null| null| 100.0|2023-03-03T14:37:49Z|\n",
116 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n",
117 | "\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "silver_data.show()"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 6,
128 | "id": "3747067d-5514-4222-af58-4e5ec5d1d1dc",
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "name": "stdout",
133 | "output_type": "stream",
134 | "text": [
135 | "Collecting twilio\n",
136 | " Downloading twilio-7.17.0-py2.py3-none-any.whl (1.4 MB)\n",
137 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
138 | "\u001b[?25hRequirement already satisfied: requests>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from twilio) (2.28.1)\n",
139 | "Requirement already satisfied: PyJWT<3.0.0,>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from twilio) (2.5.0)\n",
140 | "Requirement already satisfied: pytz in /opt/conda/lib/python3.8/site-packages (from twilio) (2022.4)\n",
141 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (1.26.11)\n",
142 | "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (2022.9.24)\n",
143 | "Requirement already satisfied: charset-normalizer<3,>=2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (2.1.1)\n",
144 | "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (3.4)\n",
145 | "Installing collected packages: twilio\n",
146 | "Successfully installed twilio-7.17.0\n",
147 | "Collecting mysql-connector-python\n",
148 | " Downloading mysql_connector_python-8.0.32-cp38-cp38-manylinux1_x86_64.whl (23.5 MB)\n",
149 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.5/23.5 MB\u001b[0m \u001b[31m13.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
150 | "\u001b[?25hCollecting protobuf<=3.20.3,>=3.11.0\n",
151 | " Downloading protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n",
152 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
153 | "\u001b[?25hInstalling collected packages: protobuf, mysql-connector-python\n",
154 | " Attempting uninstall: protobuf\n",
155 | " Found existing installation: protobuf 4.21.7\n",
156 | " Uninstalling protobuf-4.21.7:\n",
157 | " Successfully uninstalled protobuf-4.21.7\n",
158 | "Successfully installed mysql-connector-python-8.0.32 protobuf-3.20.3\n"
159 | ]
160 | }
161 | ],
162 | "source": [
163 | "!pip install twilio\n",
164 | "!pip install mysql-connector-python"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 7,
170 | "id": "7cdf3ad9-c375-4d99-b528-633a65d026a9",
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "from datetime import datetime as dt, timedelta, timezone\n",
175 | "import pytz\n",
176 | "from twilio.rest import Client\n",
177 | "from pyspark.sql import Row\n",
178 | "from datetime import datetime, timezone\n",
179 | "from pyspark.sql import SparkSession\n",
180 | "from pyspark.sql.functions import col, udf\n",
181 | "from pyspark.sql.types import BooleanType\n",
182 | "import datetime\n",
183 | "import mysql.connector\n",
184 | "from typing import Optional\n",
185 | "\n",
186 | "# Additional imports\n",
187 | "from mysql.connector import Error\n",
188 | "\n",
189 | "TWILIO_ACCOUNT_SID = '",
190 | "TWILIO_AUTH_TOKEN = '" ,
191 | "TWILIO_PHONE_NUMBER = '",
192 | "\n",
193 | "client = Client(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)\n",
194 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")\n",
195 | "\n",
196 | "def get_rate_for_customer(timestamp, subscription_status):\n",
197 | " if subscription_status == 'active':\n",
198 | " if 0 <= timestamp.hour < 6 or 11 <= timestamp.hour < 16:\n",
199 | " return 2.99\n",
200 | " elif 6 <= timestamp.hour < 11 or 16 <= timestamp.hour < 23:\n",
201 | " return 3.99\n",
202 | " else:\n",
203 | " return 9.99\n",
204 | "\n",
205 | " # Add a default rate value to avoid NoneType issues\n",
206 | " return 0.0\n",
207 | "\n",
208 | "\n",
209 | "def is_subscription_active(subscription_start: dt, subscription_end: dt, current_time: dt) -> bool:\n",
210 | " return subscription_start <= current_time <= subscription_end\n",
211 | "\n",
212 | "def get_subscription_status(subscription_end: dt, current_time: dt) -> bool:\n",
213 | " grace_period = timedelta(days=7)\n",
214 | " return current_time <= subscription_end + grace_period\n",
215 | "\n",
216 | "\n",
217 | "def send_sms(phone_number, message):\n",
218 | " try:\n",
219 | " client.messages.create(\n",
220 | " body=message,\n",
221 | " from_=TWILIO_PHONE_NUMBER,\n",
222 | " to=phone_number\n",
223 | " )\n",
224 | " print(f\"SMS sent to {phone_number}: {message}\")\n",
225 | " except Exception as e:\n",
226 | " print(f\"Error sending SMS: {e}\")\n",
227 | "\n",
228 | "from pyspark.sql.functions import col\n",
229 | "\n",
230 | "def is_valid_balance(value):\n",
231 | " try:\n",
232 | " float(value)\n",
233 | " return True\n",
234 | " except ValueError:\n",
235 | " return False\n",
236 | "\n",
237 | "valid_balance_udf = udf(is_valid_balance, BooleanType())\n",
238 | "\n",
239 | "silver_data = silver_data.filter(valid_balance_udf(col(\"balance\")))\n",
240 | "\n",
241 | "# Database configuration\n",
242 | "db_config = {\n",
243 | " \"host\": \"mysql\",\n",
244 | " \"user\": \"root\",\n",
245 | " \"password\": \"debezium\",\n",
246 | " \"database\": \"inventory\"\n",
247 | "}\n",
248 | "\n",
249 | "def update_customer_balance(customer_id, new_balance):\n",
250 | " try:\n",
251 | " connection = mysql.connector.connect(**db_config)\n",
252 | " cursor = connection.cursor()\n",
253 | " update_query = \"UPDATE customers SET balance = %s WHERE id = %s\"\n",
254 | " cursor.execute(update_query, (new_balance, customer_id))\n",
255 | " connection.commit()\n",
256 | " print(f\"Updated balance for customer {customer_id}: {new_balance}\")\n",
257 | " except Error as e:\n",
258 | " print(f\"Error updating balance: {e}\")\n",
259 | " finally:\n",
260 | " if connection.is_connected():\n",
261 | " cursor.close()\n",
262 | " connection.close() \n",
263 | "\n",
264 | "from datetime import datetime, timezone\n",
265 | "\n",
266 | "def safe_date_conversion(date_string: Optional[str]) -> dt:\n",
267 | " if date_string is None or not isinstance(date_string, str):\n",
268 | " return dt(1970, 1, 1, tzinfo=timezone.utc)\n",
269 | " try:\n",
270 | " return dt.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)\n",
271 | " except ValueError:\n",
272 | " return dt(1970, 1, 1, tzinfo=timezone.utc)\n",
273 | "\n",
274 | "def process_plate(row: Row) -> None:\n",
275 | " print(f\"Processing plate: {row.plate_number}\")\n",
276 | " current_time = dt.now(timezone.utc)\n",
277 | " try:\n",
278 | " plate_timestamp = dt.fromisoformat(row.timestamp[:-1]).replace(tzinfo=timezone.utc)\n",
279 | " except ValueError:\n",
280 | " plate_timestamp = dt.fromtimestamp(0, timezone.utc)\n",
281 | "\n",
282 | " subscription_start = safe_date_conversion(row.subscription_start)\n",
283 | " subscription_end = safe_date_conversion(row.subscription_end)\n",
284 | "\n",
285 | " is_active = is_subscription_active(subscription_start, subscription_end, current_time)\n",
286 | " rate = get_rate_for_customer(plate_timestamp, row.subscription_status)\n",
287 | "\n",
288 | " balance = float(row.balance)\n",
289 | " new_balance = balance - rate\n",
290 | "\n",
291 | " if row.subscription_status == 'none':\n",
292 | " message = f\"Dear {row.owner_name}, your car with plate number {row.plate_number} is not registered. The rate of ${rate} has been charged for your recent passage. Your new balance is ${new_balance:.2f}.\"\n",
293 | " send_sms(row.owner_phone_number, message)\n",
294 | " elif is_active: # Changed from row.subscription_status == 'active'\n",
295 | " message = f\"Dear {row.owner_name}, your subscription is active. The rate of ${rate} has been charged for your recent passage. Your new balance is ${new_balance:.2f}.\"\n",
296 | " send_sms(row.owner_phone_number, message)\n",
297 | " elif not get_subscription_status(subscription_end, current_time):\n",
298 | " message = f\"Dear {row.owner_name}, your subscription has expired. The rate of ${rate} has been charged for your recent passage. Your new balance is ${new_balance:.2f}.\"\n",
299 | " send_sms(row.owner_phone_number, message)\n",
300 | "\n",
301 | " update_customer_balance(row.id, new_balance)\n",
302 | "\n",
303 | "silver_data.foreach(process_plate)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 8,
309 | "id": "8beefe0b-5fae-43d7-a903-d82a8cab1eae",
310 | "metadata": {},
311 | "outputs": [
312 | {
313 | "data": {
314 | "text/plain": [
315 | "\"\\nsample_data = Row(\\n id='5a5c562e-4386-44ad-bf6f-bab91081781e',\\n plate_number='7695-OOO',\\n car_make='Ford',\\n car_year=2012,\\n owner_name='Becky Smith',\\n owner_address='92834 Kim Unions\\nPort Harryport, MD 61729',\\n owner_phone_number='+14385064453',\\n subscription_status='none',\\n subscription_start=None,\\n subscription_end=None,\\n balance=100.0, # Replace 'Exc=' with a valid float value\\n timestamp='2023-03-03T14:37:49Z',\\n rate=9.99\\n)\\n\\nprocess_plate(sample_data)\\n\""
316 | ]
317 | },
318 | "execution_count": 8,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "\"\"\"\n",
325 | "sample_data = Row(\n",
326 | " id='5a5c562e-4386-44ad-bf6f-bab91081781e',\n",
327 | " plate_number='7695-OOO',\n",
328 | " car_make='Ford',\n",
329 | " car_year=2012,\n",
330 | " owner_name='Becky Smith',\n",
331 | " owner_address='92834 Kim Unions\\nPort Harryport, MD 61729',\n",
332 | " owner_phone_number='+14354123654',\n",
333 | " subscription_status='none',\n",
334 | " subscription_start=None,\n",
335 | " subscription_end=None,\n",
336 | " balance=100.0, # Replace 'Exc=' with a valid float value\n",
337 | " timestamp='2023-03-03T14:37:49Z',\n",
338 | " rate=9.99\n",
339 | ")\n",
340 | "\n",
341 | "process_plate(sample_data)\n",
342 | "\"\"\""
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 9,
348 | "id": "84eb738c-e811-44e1-84e5-a56002973ea6",
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "name": "stdout",
353 | "output_type": "stream",
354 | "text": [
355 | "+-------------------+-----+\n",
356 | "|subscription_status|count|\n",
357 | "+-------------------+-----+\n",
358 | "| active| 1|\n",
359 | "+-------------------+-----+\n",
360 | "\n",
361 | "Daily Metrics:\n",
362 | "+----------+--------------+-------------+\n",
363 | "| date|total_passages|total_revenue|\n",
364 | "+----------+--------------+-------------+\n",
365 | "|2023-03-03| 1| 3.99|\n",
366 | "+----------+--------------+-------------+\n",
367 | "\n",
368 | "Weekly Metrics:\n",
369 | "+----+------------+--------------+-------------+\n",
370 | "|year|week_of_year|total_passages|total_revenue|\n",
371 | "+----+------------+--------------+-------------+\n",
372 | "|2023| 9| 1| 3.99|\n",
373 | "+----+------------+--------------+-------------+\n",
374 | "\n",
375 | "Monthly Metrics:\n",
376 | "+----+-----+--------------+-------------+\n",
377 | "|year|month|total_passages|total_revenue|\n",
378 | "+----+-----+--------------+-------------+\n",
379 | "|2023| 3| 1| 3.99|\n",
380 | "+----+-----+--------------+-------------+\n",
381 | "\n",
382 | "Quarterly Metrics:\n",
383 | "+----+-------+--------------+-------------+\n",
384 | "|year|quarter|total_passages|total_revenue|\n",
385 | "+----+-------+--------------+-------------+\n",
386 | "|2023| 1| 1| 3.99|\n",
387 | "+----+-------+--------------+-------------+\n",
388 | "\n",
389 | "Yearly Metrics:\n",
390 | "+----+--------------+-------------+\n",
391 | "|year|total_passages|total_revenue|\n",
392 | "+----+--------------+-------------+\n",
393 | "|2023| 1| 3.99|\n",
394 | "+----+--------------+-------------+\n",
395 | "\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "gold_data = silver_data.groupBy(\"subscription_status\").count()\n",
401 | "\n",
402 | "gold_data.show()\n",
403 | "\n",
404 | "gold_data.write.parquet(\"s3a://warehouse/inventory/gold_data\", mode=\"overwrite\")\n",
405 | "\n",
406 | "\n",
407 | "import pyspark.sql.functions as F\n",
408 | "from pyspark.sql import SparkSession\n",
409 | "\n",
410 | "class MetricsAdapter:\n",
411 | " def __init__(self, silver_table, warehouse_path):\n",
412 | " self.silver_table = silver_table\n",
413 | " self.warehouse_path = warehouse_path\n",
414 | " \n",
415 | " def show_metrics(self):\n",
416 | " daily_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/daily_metrics')\n",
417 | " weekly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/weekly_metrics')\n",
418 | " monthly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/monthly_metrics')\n",
419 | " quarterly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/quarterly_metrics')\n",
420 | " yearly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/yearly_metrics')\n",
421 | " subscription_status_count = silver_data.groupBy(\"subscription_status\").count()\n",
422 | "\n",
423 | " print(\"Daily Metrics:\")\n",
424 | " daily_metrics.show(5)\n",
425 | "\n",
426 | " print(\"Weekly Metrics:\")\n",
427 | " weekly_metrics.show(5)\n",
428 | "\n",
429 | " print(\"Monthly Metrics:\")\n",
430 | " monthly_metrics.show(5)\n",
431 | "\n",
432 | " print(\"Quarterly Metrics:\")\n",
433 | " quarterly_metrics.show(5)\n",
434 | "\n",
435 | " print(\"Yearly Metrics:\")\n",
436 | " yearly_metrics.show(5) \n",
437 | "\n",
438 | " def transform(self):\n",
439 | " # Calculate the week, month, quarter, and year from the timestamp\n",
440 | " time_based_metrics = self.silver_table.withColumn(\"date\", F.to_date(\"timestamp\")) \\\n",
441 | " .withColumn(\"year\", F.year(\"timestamp\")) \\\n",
442 | " .withColumn(\"quarter\", F.quarter(\"timestamp\")) \\\n",
443 | " .withColumn(\"month\", F.month(\"timestamp\")) \\\n",
444 | " .withColumn(\"week_of_year\", F.weekofyear(\"timestamp\")) \\\n",
445 | " .withColumn(\"total_passages\", F.lit(1)) \\\n",
446 | " .withColumn(\"total_revenue\", F.when(self.silver_table.timestamp.substr(12, 2).cast(\"int\") < 12, 2.99).otherwise(3.99))\n",
447 | "\n",
448 | "\n",
449 | " # Daily metrics\n",
450 | " daily_metrics = time_based_metrics.groupBy(\"date\").agg(\n",
451 | " F.count(\"*\").alias(\"total_passages\"),\n",
452 | " F.sum(F.when(time_based_metrics.timestamp.substr(12, 2).cast(\"int\") < 12, 2.99).otherwise(3.99)).alias(\"total_revenue\")\n",
453 | " )\n",
454 | " daily_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/daily_metrics')\n",
455 | "\n",
456 | " # Weekly metrics\n",
457 | " weekly_metrics = time_based_metrics.groupBy(\"year\", \"week_of_year\").agg(\n",
458 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n",
459 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n",
460 | " )\n",
461 | " weekly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/weekly_metrics')\n",
462 | "\n",
463 | " # Monthly metrics\n",
464 | " monthly_metrics = time_based_metrics.groupBy(\"year\", \"month\").agg(\n",
465 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n",
466 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n",
467 | " )\n",
468 | " monthly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/monthly_metrics')\n",
469 | "\n",
470 | " # Quarterly metrics\n",
471 | " quarterly_metrics = time_based_metrics.groupBy(\"year\", \"quarter\").agg(\n",
472 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n",
473 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n",
474 | " )\n",
475 | " quarterly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/quarterly_metrics')\n",
476 | "\n",
477 | " # Yearly metrics\n",
478 | " yearly_metrics = time_based_metrics.groupBy(\"year\").agg(\n",
479 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n",
480 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n",
481 | " )\n",
482 | " yearly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/yearly_metrics')\n",
483 | "\n",
484 | "# Example usage\n",
485 | "spark = SparkSession.builder.getOrCreate()\n",
486 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")\n",
487 | "warehouse_path = \"s3a://warehouse/inventory/gold_data\"\n",
488 | "metrics_adapter = MetricsAdapter(silver_data, warehouse_path)\n",
489 | "metrics_adapter.transform()\n",
490 | "\n",
491 | "metrics_adapter.show_metrics()"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": null,
497 | "id": "9079a66c-d57f-4ab2-a52d-3ee7a2929490",
498 | "metadata": {},
499 | "outputs": [],
500 | "source": []
501 | }
502 | ],
503 | "metadata": {
504 | "kernelspec": {
505 | "display_name": "Python 3 (ipykernel)",
506 | "language": "python",
507 | "name": "python3"
508 | },
509 | "language_info": {
510 | "codemirror_mode": {
511 | "name": "ipython",
512 | "version": 3
513 | },
514 | "file_extension": ".py",
515 | "mimetype": "text/x-python",
516 | "name": "python",
517 | "nbconvert_exporter": "python",
518 | "pygments_lexer": "ipython3",
519 | "version": "3.8.13"
520 | }
521 | },
522 | "nbformat": 4,
523 | "nbformat_minor": 5
524 | }
525 |
--------------------------------------------------------------------------------