├── api ├── requirements.txt ├── Dockerfile ├── request.py ├── api.py └── template │ └── customers.html ├── img ├── red.png └── download.png ├── src ├── requirements.txt ├── Dockerfile └── setup_connectors.sh ├── generate_data ├── requirements.txt ├── Dockerfile └── generate_data.py ├── kafka └── Dockerfile ├── .env ├── README.md ├── spark ├── Dockerfile └── spark-defaults.conf ├── notebooks ├── spark-defaults.conf ├── Dockerfile └── pyspark.ipynb └── docker-compose.yml /api/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.1.1 2 | mysql-connector-python==8.0.28 3 | pandas==1.3.5 4 | -------------------------------------------------------------------------------- /img/red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stefen-Taime/stream-ingestion-redpanda-minio/HEAD/img/red.png -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.2 2 | SQLAlchemy==1.4.46 3 | PyMySQL==1.0.2 4 | kafka-python==2.0.2 -------------------------------------------------------------------------------- /img/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stefen-Taime/stream-ingestion-redpanda-minio/HEAD/img/download.png -------------------------------------------------------------------------------- /generate_data/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.1.1 2 | mysql-connector-python==8.0.28 3 | pandas==1.3.5 4 | Faker==13.1.0 5 | -------------------------------------------------------------------------------- /src/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | WORKDIR /opt/src 4 | COPY requirements.txt /opt/src 5 | RUN pip install --upgrade pip && pip install -r requirements.txt 6 | 7 | ENTRYPOINT ["tail", "-f", "/dev/null"] -------------------------------------------------------------------------------- /generate_data/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt . 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY . . 9 | 10 | CMD ["python", "generate_data.py"] 11 | -------------------------------------------------------------------------------- /api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt . 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY . . 
9 | 10 | EXPOSE 8000 11 | 12 | CMD ["python", "api.py"] 13 | -------------------------------------------------------------------------------- /kafka/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debezium/connect 2 | 3 | RUN curl -O https://d1i4a15mxbxib1.cloudfront.net/api/plugins/confluentinc/kafka-connect-s3/versions/10.3.1/confluentinc-kafka-connect-s3-10.3.1.zip \ 4 | && unzip confluentinc-kafka-connect-s3-10.3.1.zip \ 5 | && mv confluentinc-kafka-connect-s3-10.3.1 /kafka/connect/ \ 6 | && rm confluentinc-kafka-connect-s3-10.3.1.zip -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | # redpanda-console 2 | KAFKA_BROKERS="redpanda:9092" 3 | 4 | # MinIO 5 | MINIO_ROOT_USER="minio" 6 | MINIO_ROOT_PASSWORD="minio123" 7 | MINIO_ACCESS_KEY="minio" 8 | MINIO_SECRET_KEY="minio123" 9 | 10 | # MySQL 11 | MYSQL_ROOT_PASSWORD="debezium" 12 | MYSQL_USER="admin" 13 | MYSQL_PASSWORD="admin123" 14 | 15 | # kafka connect 16 | BOOTSTRAP_SERVERS="redpanda:9092" 17 | GROUP_ID="1" 18 | CONFIG_STORAGE_TOPIC="debezium.configs" 19 | OFFSET_STORAGE_TOPIC="debezium.offset" 20 | STATUS_STORAGE_TOPIC="debezium.status" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Data Processing and Analytics with Docker, MySQL, Redpanda, MinIO, and Apache Spark Using Delta Lake 2 | 3 | 4 | 5 | ## Architectural overview 6 | 7 | ![Architecture](/img/red.png) 8 | 9 | [Medium](https://medium.com/@stefentaime_10958/real-time-data-processing-and-analytics-with-docker-mysql-redpanda-minio-and-apache-spark-eca83f210ef6) In this article, you will learn how to set up a real-time data processing and analytics environment using Docker, MySQL, Redpanda, MinIO, and Apache Spark. 
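A minimal way to bring the whole stack up (a sketch; the service names, ports and credentials below are taken from the docker-compose.yml, .env and src/setup_connectors.sh files in this repository):

```bash
# build the images and start every service defined in docker-compose.yml
docker compose up -d --build

# register the Debezium MySQL source and the Confluent S3 (MinIO) sink with Kafka Connect
bash src/setup_connectors.sh

# useful endpoints once the containers are up:
#   Redpanda Console   http://localhost:5000
#   MinIO console      http://localhost:9001   (minio / minio123)
#   Adminer (MySQL)    http://localhost:8085
#   JupyterLab         http://localhost:8888
#   Spark master UI    http://localhost:8080
#   Flask API          http://localhost:8000/customers
```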
10 | 11 | -------------------------------------------------------------------------------- /api/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | data = { 4 | "id": "5a5c562e-4386-44ad-bf6f-bab91081781e", 5 | "plate_number": "7695-OOO", 6 | "car_make": "Ford", 7 | "car_year": 2012, 8 | "owner_name": "Stefen", 9 | "owner_address": "92834 Kim Unions\nPort Harryport, MD 61729", 10 | "owner_phone_number": "+1505698632", 11 | "subscription_status": "active", 12 | "subscription_start": None, 13 | "subscription_end": None, 14 | "balance": 100.0, 15 | "timestamp": "2023-03-03T14:37:49", 16 | "rate": 9.99 17 | } 18 | 19 | response = requests.post("http://0.0.0.0:8000/send_data", json=data) 20 | 21 | print(response.status_code) 22 | print(response.json()) 23 | -------------------------------------------------------------------------------- /spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/bitnami/spark:3.3 2 | 3 | USER root 4 | 5 | # Install prerequisites 6 | RUN apt-get update && apt-get install -y curl 7 | 8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \ 13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \ 14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \ 15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars -------------------------------------------------------------------------------- /spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | spark.hadoop.fs.s3a.path.style.access true 8 | spark.hadoop.fs.s3a.connection.ssl.enabled false 9 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem -------------------------------------------------------------------------------- /notebooks/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars /usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | spark.hadoop.fs.s3a.path.style.access true 8 | spark.hadoop.fs.s3a.connection.ssl.enabled false 9 | spark.hadoop.fs.s3a.impl 
org.apache.hadoop.fs.s3a.S3AFileSystem -------------------------------------------------------------------------------- /notebooks/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/all-spark-notebook:python-3.8 2 | 3 | USER root 4 | 5 | RUN curl -O https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz \ 6 | && tar zxvf spark-3.3.2-bin-hadoop3.tgz \ 7 | && rm -rf spark-3.3.2-bin-hadoop3.tgz \ 8 | && mv spark-3.3.2-bin-hadoop3/ /usr/local/ \ 9 | && rm -rf /usr/local/spark \ 10 | && rm -rf /usr/local/spark-3.3.0-bin-hadoop3 \ 11 | && ln -s /usr/local/spark-3.3.2-bin-hadoop3 /usr/local/spark 12 | 13 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 14 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 15 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \ 16 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 17 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 18 | && curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \ 19 | && mv s3-2.18.41.jar /usr/local/spark/jars \ 20 | && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \ 21 | && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \ 22 | && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \ 23 | && mv delta-storage-2.2.0.jar /usr/local/spark/jars \ 24 | && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars -------------------------------------------------------------------------------- /src/setup_connectors.sh: -------------------------------------------------------------------------------- 1 | # create connector source for MySQL 2 | curl --request POST \ 3 | --url http://localhost:8083/connectors \ 4 | --header 'Content-Type: application/json' \ 5 | --data '{ 6 | "name": "src-mys", 7 | "config": { 8 | "connector.class": "io.debezium.connector.mysql.MySqlConnector", 9 | "tasks.max": "1", 10 | "database.hostname": "mysql", 11 | "database.port": "3306", 12 | "database.user": "debezium", 13 | "database.password": "dbz", 14 | "database.server.id": "184054", 15 | "database.include.list": "inventory", 16 | "decimal.handling.mode": "double", 17 | "topic.prefix": "dbserver1", 18 | "schema.history.internal.kafka.bootstrap.servers": "redpanda:9092", 19 | "schema.history.internal.kafka.topic": "schema-changes.inventory" 20 | } 21 | }' 22 | 23 | # create connector sink MySQL to S3 24 | curl --request POST \ 25 | --url http://localhost:8083/connectors \ 26 | --header 'Content-Type: application/json' \ 27 | --data '{ 28 | "name": "sink_aws-s3", 29 | "config": { 30 | "topics.regex": "dbserver1.inventory.*", 31 | "topics.dir": "inventory", 32 | "connector.class": "io.confluent.connect.s3.S3SinkConnector", 33 | "key.converter": "org.apache.kafka.connect.json.JsonConverter", 34 | "value.converter": "org.apache.kafka.connect.json.JsonConverter", 35 | "format.class": "io.confluent.connect.s3.format.json.JsonFormat", 36 | "flush.size": "1", 37 | "store.url": "http://minio:9000", 38 | "storage.class": "io.confluent.connect.s3.storage.S3Storage", 39 | "s3.region": "us-east-1", 40 | "s3.bucket.name": "warehouse", 41 | "aws.access.key.id": "minio", 42 | "aws.secret.access.key": "minio123" 43 | } 44 | }' 45 | 
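The two POST requests above only submit the connector configurations; a quick way to confirm Kafka Connect accepted them (a sketch using the standard Connect REST API, which the kafka-connect service exposes on port 8083) is:

```bash
# list the registered connectors; expect ["src-mys","sink_aws-s3"]
curl -s http://localhost:8083/connectors

# the connector and its task should both report state RUNNING
curl -s http://localhost:8083/connectors/src-mys/status
curl -s http://localhost:8083/connectors/sink_aws-s3/status
```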
-------------------------------------------------------------------------------- /api/api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify, render_template 2 | import mysql.connector 3 | import pandas as pd 4 | 5 | app = Flask(__name__, template_folder='template') 6 | 7 | db_config = { 8 | "host": "10.0.0.25", 9 | "user": "root", 10 | "password": "debezium", 11 | "database": "inventory" 12 | } 13 | 14 | @app.route('/send_data', methods=['POST']) 15 | def send_data(): 16 | data = request.get_json() 17 | 18 | 19 | conn = mysql.connector.connect(**db_config) 20 | 21 | cursor = conn.cursor() 22 | 23 | insert_query = ''' 24 | INSERT INTO customers (id, plate_number, car_make, car_year, owner_name, owner_address, owner_phone_number, subscription_status, subscription_start, subscription_end, balance, timestamp) 25 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 26 | ''' 27 | cursor.execute(insert_query, ( 28 | data['id'], 29 | data['plate_number'], 30 | data['car_make'], 31 | data['car_year'], 32 | data['owner_name'], 33 | data['owner_address'], 34 | data['owner_phone_number'], 35 | data['subscription_status'], 36 | data['subscription_start'], 37 | data['subscription_end'], 38 | data['balance'], 39 | data['timestamp'] 40 | )) 41 | 42 | conn.commit() 43 | 44 | cursor.close() 45 | conn.close() 46 | 47 | return jsonify({"status": "success"}), 200 48 | 49 | @app.route('/customers', methods=['GET']) 50 | def customers(): 51 | plate_number = request.args.get('plate_number', '') 52 | page = int(request.args.get('page', 1)) 53 | items_per_page = 10 54 | 55 | conn = mysql.connector.connect(**db_config) 56 | 57 | # Create a cursor 58 | cursor = conn.cursor() 59 | 60 | # Fetch customers filtered by plate_number and apply pagination 61 | select_query = ''' 62 | SELECT * FROM customers 63 | WHERE plate_number LIKE %s 64 | LIMIT %s OFFSET %s 65 | ''' 66 | cursor.execute(select_query, (f"%{plate_number}%", items_per_page, (page - 1) * items_per_page)) 67 | customers = cursor.fetchall() 68 | 69 | # Get the total number of customers 70 | cursor.execute("SELECT COUNT(*) FROM customers WHERE plate_number LIKE %s", (f"%{plate_number}%",)) 71 | total_customers = cursor.fetchone()[0] 72 | 73 | # Close the cursor and connection 74 | cursor.close() 75 | conn.close() 76 | 77 | return render_template('customers.html', customers=customers, plate_number=plate_number, page=page, total_pages=(total_customers // items_per_page) + 1) 78 | 79 | 80 | if __name__ == '__main__': 81 | app.run(host='0.0.0.0', port=8000) -------------------------------------------------------------------------------- /api/template/customers.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Customers 7 | 8 | 9 | 10 | 11 |
[The markup of customers.html was not preserved in this dump; only its text content survived. The template renders a "Customers" page containing a results table with the columns Plate Number, Car Make, Car Year, Owner Name, Address and Balance; inside a {% for customer in customers %} ... {% endfor %} loop each row prints {{ customer[1] }}, {{ customer[2] }}, {{ customer[3] }}, {{ customer[4] }}, {{ customer[5] }} and {{ customer[10] }}. The surrounding search and pagination controls are driven by the plate_number, page and total_pages values passed in from api.py.]
65 | 66 | 67 | -------------------------------------------------------------------------------- /generate_data/generate_data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import uuid 3 | from faker import Faker 4 | import pandas as pd 5 | import mysql.connector 6 | from datetime import datetime, timedelta 7 | 8 | # Initialize Faker 9 | fake = Faker() 10 | 11 | # Number of data points to generate 12 | num_records = 1000 13 | 14 | # Generate synthetic data 15 | data = [] 16 | 17 | for _ in range(num_records): 18 | unique_id = str(uuid.uuid4()) 19 | plate_number = f"{random.randint(1000, 9999)}-{fake.random_element(elements=('AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG', 'HHH', 'III', 'JJJ', 'KKK', 'LLL', 'MMM', 'NNN', 'OOO', 'PPP', 'QQQ', 'RRR', 'SSS', 'TTT', 'UUU', 'VVV', 'WWW', 'XXX', 'YYY', 'ZZZ'))}" 20 | 21 | car_info = { 22 | "make": fake.random_element(elements=("Toyota", "Honda", "Ford", "Chevrolet", "Nissan", "Volkswagen", "BMW", "Mercedes-Benz")), 23 | "year": random.randint(2000, 2023) 24 | } 25 | 26 | owner_info = { 27 | "name": fake.name(), 28 | "address": fake.address(), 29 | "phone_number": fake.phone_number().replace("x", " ext. ") # Modify phone number format 30 | } 31 | 32 | subscription_status = fake.random_element(elements=("active", "expired", "none")) 33 | 34 | if subscription_status != "none": 35 | subscription_start = fake.date_between(start_date='-3y', end_date='today') 36 | subscription_end = subscription_start + timedelta(days=365) 37 | else: 38 | subscription_start = None 39 | subscription_end = None 40 | 41 | balance = round(random.uniform(0, 500), 2) 42 | 43 | timestamp = fake.date_time_between(start_date='-30d', end_date='now').strftime('%Y-%m-%d %H:%M:%S') 44 | 45 | 46 | record = { 47 | "id": unique_id, 48 | "plate_number": plate_number, 49 | "car_make": car_info["make"], 50 | "car_year": car_info["year"], 51 | "owner_name": owner_info["name"], 52 | "owner_address": owner_info["address"], 53 | "owner_phone_number": owner_info["phone_number"], 54 | "subscription_status": subscription_status, 55 | "subscription_start": subscription_start, 56 | "subscription_end": subscription_end, 57 | "balance": balance, 58 | "timestamp": timestamp 59 | } 60 | 61 | data.append(record) 62 | 63 | # Convert data to a pandas DataFrame 64 | df = pd.DataFrame(data) 65 | 66 | # Connect to the MySQL database 67 | db_config = { 68 | "host": "mysql", 69 | "user": "root", 70 | "password": "debezium", 71 | "database": "inventory" 72 | } 73 | conn = mysql.connector.connect(**db_config) 74 | 75 | # Create a cursor 76 | cursor = conn.cursor() 77 | 78 | # Create the 'customers' table if it doesn't exist 79 | create_table_query = ''' 80 | CREATE TABLE IF NOT EXISTS customers ( 81 | id VARCHAR(255) NOT NULL, 82 | plate_number VARCHAR(255) NOT NULL, 83 | car_make VARCHAR(255) NOT NULL, 84 | car_year INT NOT NULL, 85 | owner_name VARCHAR(255) NOT NULL, 86 | owner_address TEXT NOT NULL, 87 | owner_phone_number VARCHAR(255) NOT NULL, 88 | subscription_status ENUM('active', 'expired', 'none') NOT NULL, 89 | subscription_start DATE, 90 | subscription_end DATE, 91 | balance DECIMAL(10, 2) NOT NULL, 92 | timestamp TIMESTAMP NOT NULL 93 | ) 94 | ''' 95 | cursor.execute(create_table_query) 96 | 97 | # Store the synthetic data in the 'customers' table 98 | for index, row in df.iterrows(): 99 | insert_query = ''' 100 | INSERT INTO customers (id, plate_number, car_make, car_year, owner_name, owner_address, owner_phone_number, subscription_status, 
subscription_start, subscription_end, balance, timestamp) 101 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 102 | ''' 103 | cursor.execute(insert_query, ( 104 | row['id'], 105 | row['plate_number'], 106 | row['car_make'], 107 | row['car_year'], 108 | row['owner_name'], 109 | row['owner_address'], 110 | row['owner_phone_number'], 111 | row['subscription_status'], 112 | row['subscription_start'], 113 | row['subscription_end'], 114 | row['balance'], 115 | row['timestamp'] 116 | )) 117 | 118 | # Commit the changes and close the cursor 119 | conn.commit() 120 | cursor.close() 121 | 122 | # Close the database connection 123 | conn.close() 124 | 125 | print("Synthetic data stored in the 'customers' table in the MySQL database") 126 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | redpanda: 4 | image: vectorized/redpanda 5 | container_name: redpanda 6 | ports: 7 | - "9092:9092" 8 | - "29092:29092" 9 | command: 10 | - redpanda 11 | - start 12 | - --overprovisioned 13 | - --smp 14 | - "1" 15 | - --memory 16 | - "1G" 17 | - --reserve-memory 18 | - "0M" 19 | - --node-id 20 | - "0" 21 | - --kafka-addr 22 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 23 | - --advertise-kafka-addr 24 | - PLAINTEXT://redpanda:29092,OUTSIDE://redpanda:9092 25 | - --check=false 26 | networks: 27 | - spark_network 28 | 29 | redpanda-console: 30 | image: vectorized/console 31 | container_name: redpanda_console 32 | depends_on: 33 | - redpanda 34 | ports: 35 | - "5000:8080" 36 | env_file: 37 | - .env 38 | networks: 39 | - spark_network 40 | 41 | minio: 42 | hostname: minio 43 | image: "minio/minio" 44 | container_name: minio 45 | ports: 46 | - "9001:9001" 47 | - "9000:9000" 48 | command: [ "server", "/data", "--console-address", ":9001" ] 49 | volumes: 50 | - ./minio/data:/data 51 | env_file: 52 | - .env 53 | networks: 54 | - spark_network 55 | 56 | mc: 57 | image: minio/mc 58 | container_name: mc 59 | hostname: mc 60 | environment: 61 | - AWS_ACCESS_KEY_ID=minio 62 | - AWS_SECRET_ACCESS_KEY=minio123 63 | - AWS_REGION=us-east-1 64 | entrypoint: > 65 | /bin/sh -c " until (/usr/bin/mc config host add minio http://minio:9000 minio minio123) do echo '...waiting...' 
&& sleep 1; done; /usr/bin/mc mb minio/warehouse; /usr/bin/mc policy set public minio/warehouse; exit 0; " 66 | depends_on: 67 | - minio 68 | networks: 69 | - spark_network 70 | 71 | mysql: 72 | image: debezium/example-mysql:1.6 73 | container_name: mysql 74 | volumes: 75 | - ./mysql/data:/var/lib/mysql 76 | ports: 77 | - "3306:3306" 78 | env_file: 79 | - .env 80 | networks: 81 | - spark_network 82 | 83 | kafka-connect: 84 | build: 85 | context: ./kafka 86 | dockerfile: ./Dockerfile 87 | container_name: kafka_connect 88 | depends_on: 89 | - redpanda 90 | ports: 91 | - "8083:8083" 92 | env_file: 93 | - .env 94 | networks: 95 | - spark_network 96 | 97 | adminer: 98 | image: adminer:latest 99 | ports: 100 | - 8085:8080/tcp 101 | deploy: 102 | restart_policy: 103 | condition: on-failure 104 | networks: 105 | - spark_network 106 | 107 | spark-master: 108 | build: 109 | context: ./spark 110 | dockerfile: ./Dockerfile 111 | container_name: "spark-master" 112 | environment: 113 | - SPARK_MODE=master 114 | - SPARK_LOCAL_IP=spark-master 115 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 116 | - SPARK_RPC_ENCRYPTION_ENABLED=no 117 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 118 | - SPARK_SSL_ENABLED=no 119 | ports: 120 | - "7077:7077" 121 | - "8080:8080" 122 | volumes: 123 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 124 | networks: 125 | - spark_network 126 | 127 | spark-worker-1: 128 | image: docker.io/bitnami/spark:3.3 129 | container_name: "spark-worker-1" 130 | environment: 131 | - SPARK_MODE=worker 132 | - SPARK_MASTER_URL=spark://spark-master:7077 133 | - SPARK_WORKER_MEMORY=4G 134 | - SPARK_WORKER_CORES=1 135 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 136 | - SPARK_RPC_ENCRYPTION_ENABLED=no 137 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 138 | - SPARK_SSL_ENABLED=no 139 | networks: 140 | - spark_network 141 | 142 | spark-worker-2: 143 | image: docker.io/bitnami/spark:3.3 144 | container_name: "spark-worker-2" 145 | environment: 146 | - SPARK_MODE=worker 147 | - SPARK_MASTER_URL=spark://spark-master:7077 148 | - SPARK_WORKER_MEMORY=4G 149 | - SPARK_WORKER_CORES=1 150 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 151 | - SPARK_RPC_ENCRYPTION_ENABLED=no 152 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 153 | - SPARK_SSL_ENABLED=no 154 | networks: 155 | - spark_network 156 | 157 | spark-notebook: 158 | build: 159 | context: ./notebooks 160 | dockerfile: ./Dockerfile 161 | container_name: "spark-notebook" 162 | user: root 163 | environment: 164 | - JUPYTER_ENABLE_LAB="yes" 165 | - GRANT_SUDO="yes" 166 | volumes: 167 | - ./notebooks:/home/jovyan/work 168 | - ./notebooks/spark-defaults.conf:/usr/local/spark/conf/spark-defaults.conf 169 | ports: 170 | - "8888:8888" 171 | - "4040:4040" 172 | networks: 173 | - spark_network 174 | 175 | generate_data: 176 | build: ./generate_data 177 | container_name: generate_data 178 | command: python generate_data.py 179 | depends_on: 180 | - mysql 181 | networks: 182 | - spark_network 183 | 184 | api: 185 | build: ./api 186 | ports: 187 | - "8000:8000" 188 | depends_on: 189 | - mysql 190 | 191 | 192 | networks: 193 | spark_network: 194 | driver: bridge 195 | name: spark_network -------------------------------------------------------------------------------- /notebooks/pyspark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "2ef67329-7f99-451a-8bdb-e91e369d034b", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": 
"stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Requirement already satisfied: pyspark in /usr/local/spark-3.3.2-bin-hadoop3/python (3.3.2)\n", 14 | "Collecting py4j==0.10.9.5\n", 15 | " Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)\n", 16 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 17 | "\u001b[?25hInstalling collected packages: py4j\n", 18 | "Successfully installed py4j-0.10.9.5\n", 19 | "+--------------------+------+---+--------------------+-----------+-------------+\n", 20 | "| after|before| op| source|transaction| ts_ms|\n", 21 | "+--------------------+------+---+--------------------+-----------+-------------+\n", 22 | "|{100.0, Ford, 201...| null| r|{mysql, inventory...| null|1679965843817|\n", 23 | "+--------------------+------+---+--------------------+-----------+-------------+\n", 24 | "\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "!pip install pyspark\n", 30 | "\n", 31 | "from pyspark.sql import SparkSession\n", 32 | "\n", 33 | "# Initialize Spark session\n", 34 | "spark = SparkSession.builder \\\n", 35 | " .appName(\"Inventory ETL\") \\\n", 36 | " .config(\"spark.sql.parquet.datetimeRebaseModeInWrite\", \"LEGACY\") \\\n", 37 | " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.2.0\") \\\n", 38 | " .getOrCreate()\n", 39 | "\n", 40 | "# Set the necessary AWS credentials\n", 41 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.access.key\", \"minio\")\n", 42 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.secret.key\", \"minio123\")\n", 43 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.endpoint\", \"minio:9000\")\n", 44 | "\n", 45 | "# Set the path to the JSON file\n", 46 | "get_users_file = \"s3a://warehouse/inventory/dbserver1.inventory.customers/partition=0/*.json\"\n", 47 | "\n", 48 | "# Read the JSON file\n", 49 | "raw_data = spark.read \\\n", 50 | " .format(\"json\") \\\n", 51 | " .option(\"inferSchema\", \"true\") \\\n", 52 | " .json(get_users_file)\n", 53 | "\n", 54 | "# Display raw data\n", 55 | "raw_data.show()\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "id": "34027a94-39cd-4dde-9bc0-43e99417bd66", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# Process the data\n", 66 | "silver_data = raw_data.select(\n", 67 | " \"after.id\",\n", 68 | " \"after.plate_number\",\n", 69 | " \"after.car_make\",\n", 70 | " \"after.car_year\",\n", 71 | " \"after.owner_name\",\n", 72 | " \"after.owner_address\",\n", 73 | " \"after.owner_phone_number\",\n", 74 | " \"after.subscription_status\",\n", 75 | " \"after.subscription_start\",\n", 76 | " \"after.subscription_end\",\n", 77 | " \"after.balance\",\n", 78 | " \"after.timestamp\"\n", 79 | ")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "a4e36db0-4039-4792-ad0a-c81d8d8a3ca4", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "silver_data.write.parquet(\"s3a://warehouse/inventory/silver_data\", mode=\"overwrite\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "id": "bc38035e-bccc-419c-9f5d-784b6d8bf5a9", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "id": 
"10794ce4-7037-487d-a163-e3f344047589", 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n", 113 | "| id|plate_number|car_make|car_year|owner_name| owner_address|owner_phone_number|subscription_status|subscription_start|subscription_end|balance| timestamp|\n", 114 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n", 115 | "|5a5c562e-4386-44a...| 7695-OOO| Ford| 2012| Stefen|92834 Kim Unions\\...| +14385064453| active| null| null| 100.0|2023-03-03T14:37:49Z|\n", 116 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n", 117 | "\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "silver_data.show()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "id": "3747067d-5514-4222-af58-4e5ec5d1d1dc", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "Collecting twilio\n", 136 | " Downloading twilio-7.17.0-py2.py3-none-any.whl (1.4 MB)\n", 137 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", 138 | "\u001b[?25hRequirement already satisfied: requests>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from twilio) (2.28.1)\n", 139 | "Requirement already satisfied: PyJWT<3.0.0,>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from twilio) (2.5.0)\n", 140 | "Requirement already satisfied: pytz in /opt/conda/lib/python3.8/site-packages (from twilio) (2022.4)\n", 141 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (1.26.11)\n", 142 | "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (2022.9.24)\n", 143 | "Requirement already satisfied: charset-normalizer<3,>=2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (2.1.1)\n", 144 | "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (3.4)\n", 145 | "Installing collected packages: twilio\n", 146 | "Successfully installed twilio-7.17.0\n", 147 | "Collecting mysql-connector-python\n", 148 | " Downloading mysql_connector_python-8.0.32-cp38-cp38-manylinux1_x86_64.whl (23.5 MB)\n", 149 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.5/23.5 MB\u001b[0m \u001b[31m13.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 150 | "\u001b[?25hCollecting protobuf<=3.20.3,>=3.11.0\n", 151 | " Downloading protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n", 152 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 153 | "\u001b[?25hInstalling collected packages: protobuf, mysql-connector-python\n", 154 | " 
Attempting uninstall: protobuf\n", 155 | " Found existing installation: protobuf 4.21.7\n", 156 | " Uninstalling protobuf-4.21.7:\n", 157 | " Successfully uninstalled protobuf-4.21.7\n", 158 | "Successfully installed mysql-connector-python-8.0.32 protobuf-3.20.3\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "!pip install twilio\n", 164 | "!pip install mysql-connector-python" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "id": "7cdf3ad9-c375-4d99-b528-633a65d026a9", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "from datetime import datetime as dt, timedelta, timezone\n", 175 | "import pytz\n", 176 | "from twilio.rest import Client\n", 177 | "from pyspark.sql import Row\n", 178 | "from datetime import datetime, timezone\n", 179 | "from pyspark.sql import SparkSession\n", 180 | "from pyspark.sql.functions import col, udf\n", 181 | "from pyspark.sql.types import BooleanType\n", 182 | "import datetime\n", 183 | "import mysql.connector\n", 184 | "from typing import Optional\n", 185 | "\n", 186 | "# Additional imports\n", 187 | "from mysql.connector import Error\n", 188 | "\n", 189 | "TWILIO_ACCOUNT_SID = '", 190 | "TWILIO_AUTH_TOKEN = '" , 191 | "TWILIO_PHONE_NUMBER = '", 192 | "\n", 193 | "client = Client(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)\n", 194 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")\n", 195 | "\n", 196 | "def get_rate_for_customer(timestamp, subscription_status):\n", 197 | " if subscription_status == 'active':\n", 198 | " if 0 <= timestamp.hour < 6 or 11 <= timestamp.hour < 16:\n", 199 | " return 2.99\n", 200 | " elif 6 <= timestamp.hour < 11 or 16 <= timestamp.hour < 23:\n", 201 | " return 3.99\n", 202 | " else:\n", 203 | " return 9.99\n", 204 | "\n", 205 | " # Add a default rate value to avoid NoneType issues\n", 206 | " return 0.0\n", 207 | "\n", 208 | "\n", 209 | "def is_subscription_active(subscription_start: dt, subscription_end: dt, current_time: dt) -> bool:\n", 210 | " return subscription_start <= current_time <= subscription_end\n", 211 | "\n", 212 | "def get_subscription_status(subscription_end: dt, current_time: dt) -> bool:\n", 213 | " grace_period = timedelta(days=7)\n", 214 | " return current_time <= subscription_end + grace_period\n", 215 | "\n", 216 | "\n", 217 | "def send_sms(phone_number, message):\n", 218 | " try:\n", 219 | " client.messages.create(\n", 220 | " body=message,\n", 221 | " from_=TWILIO_PHONE_NUMBER,\n", 222 | " to=phone_number\n", 223 | " )\n", 224 | " print(f\"SMS sent to {phone_number}: {message}\")\n", 225 | " except Exception as e:\n", 226 | " print(f\"Error sending SMS: {e}\")\n", 227 | "\n", 228 | "from pyspark.sql.functions import col\n", 229 | "\n", 230 | "def is_valid_balance(value):\n", 231 | " try:\n", 232 | " float(value)\n", 233 | " return True\n", 234 | " except ValueError:\n", 235 | " return False\n", 236 | "\n", 237 | "valid_balance_udf = udf(is_valid_balance, BooleanType())\n", 238 | "\n", 239 | "silver_data = silver_data.filter(valid_balance_udf(col(\"balance\")))\n", 240 | "\n", 241 | "# Database configuration\n", 242 | "db_config = {\n", 243 | " \"host\": \"mysql\",\n", 244 | " \"user\": \"root\",\n", 245 | " \"password\": \"debezium\",\n", 246 | " \"database\": \"inventory\"\n", 247 | "}\n", 248 | "\n", 249 | "def update_customer_balance(customer_id, new_balance):\n", 250 | " try:\n", 251 | " connection = mysql.connector.connect(**db_config)\n", 252 | " cursor = connection.cursor()\n", 253 | " update_query = \"UPDATE customers 
SET balance = %s WHERE id = %s\"\n", 254 | " cursor.execute(update_query, (new_balance, customer_id))\n", 255 | " connection.commit()\n", 256 | " print(f\"Updated balance for customer {customer_id}: {new_balance}\")\n", 257 | " except Error as e:\n", 258 | " print(f\"Error updating balance: {e}\")\n", 259 | " finally:\n", 260 | " if connection.is_connected():\n", 261 | " cursor.close()\n", 262 | " connection.close() \n", 263 | "\n", 264 | "from datetime import datetime, timezone\n", 265 | "\n", 266 | "def safe_date_conversion(date_string: Optional[str]) -> dt:\n", 267 | " if date_string is None or not isinstance(date_string, str):\n", 268 | " return dt(1970, 1, 1, tzinfo=timezone.utc)\n", 269 | " try:\n", 270 | " return dt.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)\n", 271 | " except ValueError:\n", 272 | " return dt(1970, 1, 1, tzinfo=timezone.utc)\n", 273 | "\n", 274 | "def process_plate(row: Row) -> None:\n", 275 | " print(f\"Processing plate: {row.plate_number}\")\n", 276 | " current_time = dt.now(timezone.utc)\n", 277 | " try:\n", 278 | " plate_timestamp = dt.fromisoformat(row.timestamp[:-1]).replace(tzinfo=timezone.utc)\n", 279 | " except ValueError:\n", 280 | " plate_timestamp = dt.fromtimestamp(0, timezone.utc)\n", 281 | "\n", 282 | " subscription_start = safe_date_conversion(row.subscription_start)\n", 283 | " subscription_end = safe_date_conversion(row.subscription_end)\n", 284 | "\n", 285 | " is_active = is_subscription_active(subscription_start, subscription_end, current_time)\n", 286 | " rate = get_rate_for_customer(plate_timestamp, row.subscription_status)\n", 287 | "\n", 288 | " balance = float(row.balance)\n", 289 | " new_balance = balance - rate\n", 290 | "\n", 291 | " if row.subscription_status == 'none':\n", 292 | " message = f\"Dear {row.owner_name}, your car with plate number {row.plate_number} is not registered. The rate of ${rate} has been charged for your recent passage. Your new balance is ${new_balance:.2f}.\"\n", 293 | " send_sms(row.owner_phone_number, message)\n", 294 | " elif is_active: # Changed from row.subscription_status == 'active'\n", 295 | " message = f\"Dear {row.owner_name}, your subscription is active. The rate of ${rate} has been charged for your recent passage. Your new balance is ${new_balance:.2f}.\"\n", 296 | " send_sms(row.owner_phone_number, message)\n", 297 | " elif not get_subscription_status(subscription_end, current_time):\n", 298 | " message = f\"Dear {row.owner_name}, your subscription has expired. The rate of ${rate} has been charged for your recent passage. 
Your new balance is ${new_balance:.2f}.\"\n", 299 | " send_sms(row.owner_phone_number, message)\n", 300 | "\n", 301 | " update_customer_balance(row.id, new_balance)\n", 302 | "\n", 303 | "silver_data.foreach(process_plate)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 8, 309 | "id": "8beefe0b-5fae-43d7-a903-d82a8cab1eae", 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/plain": [ 315 | "\"\\nsample_data = Row(\\n id='5a5c562e-4386-44ad-bf6f-bab91081781e',\\n plate_number='7695-OOO',\\n car_make='Ford',\\n car_year=2012,\\n owner_name='Becky Smith',\\n owner_address='92834 Kim Unions\\nPort Harryport, MD 61729',\\n owner_phone_number='+14385064453',\\n subscription_status='none',\\n subscription_start=None,\\n subscription_end=None,\\n balance=100.0, # Replace 'Exc=' with a valid float value\\n timestamp='2023-03-03T14:37:49Z',\\n rate=9.99\\n)\\n\\nprocess_plate(sample_data)\\n\"" 316 | ] 317 | }, 318 | "execution_count": 8, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "\"\"\"\n", 325 | "sample_data = Row(\n", 326 | " id='5a5c562e-4386-44ad-bf6f-bab91081781e',\n", 327 | " plate_number='7695-OOO',\n", 328 | " car_make='Ford',\n", 329 | " car_year=2012,\n", 330 | " owner_name='Becky Smith',\n", 331 | " owner_address='92834 Kim Unions\\nPort Harryport, MD 61729',\n", 332 | " owner_phone_number='+14354123654',\n", 333 | " subscription_status='none',\n", 334 | " subscription_start=None,\n", 335 | " subscription_end=None,\n", 336 | " balance=100.0, # Replace 'Exc=' with a valid float value\n", 337 | " timestamp='2023-03-03T14:37:49Z',\n", 338 | " rate=9.99\n", 339 | ")\n", 340 | "\n", 341 | "process_plate(sample_data)\n", 342 | "\"\"\"" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 9, 348 | "id": "84eb738c-e811-44e1-84e5-a56002973ea6", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "+-------------------+-----+\n", 356 | "|subscription_status|count|\n", 357 | "+-------------------+-----+\n", 358 | "| active| 1|\n", 359 | "+-------------------+-----+\n", 360 | "\n", 361 | "Daily Metrics:\n", 362 | "+----------+--------------+-------------+\n", 363 | "| date|total_passages|total_revenue|\n", 364 | "+----------+--------------+-------------+\n", 365 | "|2023-03-03| 1| 3.99|\n", 366 | "+----------+--------------+-------------+\n", 367 | "\n", 368 | "Weekly Metrics:\n", 369 | "+----+------------+--------------+-------------+\n", 370 | "|year|week_of_year|total_passages|total_revenue|\n", 371 | "+----+------------+--------------+-------------+\n", 372 | "|2023| 9| 1| 3.99|\n", 373 | "+----+------------+--------------+-------------+\n", 374 | "\n", 375 | "Monthly Metrics:\n", 376 | "+----+-----+--------------+-------------+\n", 377 | "|year|month|total_passages|total_revenue|\n", 378 | "+----+-----+--------------+-------------+\n", 379 | "|2023| 3| 1| 3.99|\n", 380 | "+----+-----+--------------+-------------+\n", 381 | "\n", 382 | "Quarterly Metrics:\n", 383 | "+----+-------+--------------+-------------+\n", 384 | "|year|quarter|total_passages|total_revenue|\n", 385 | "+----+-------+--------------+-------------+\n", 386 | "|2023| 1| 1| 3.99|\n", 387 | "+----+-------+--------------+-------------+\n", 388 | "\n", 389 | "Yearly Metrics:\n", 390 | "+----+--------------+-------------+\n", 391 | "|year|total_passages|total_revenue|\n", 392 | "+----+--------------+-------------+\n", 393 | 
"|2023| 1| 3.99|\n", 394 | "+----+--------------+-------------+\n", 395 | "\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "gold_data = silver_data.groupBy(\"subscription_status\").count()\n", 401 | "\n", 402 | "gold_data.show()\n", 403 | "\n", 404 | "gold_data.write.parquet(\"s3a://warehouse/inventory/gold_data\", mode=\"overwrite\")\n", 405 | "\n", 406 | "\n", 407 | "import pyspark.sql.functions as F\n", 408 | "from pyspark.sql import SparkSession\n", 409 | "\n", 410 | "class MetricsAdapter:\n", 411 | " def __init__(self, silver_table, warehouse_path):\n", 412 | " self.silver_table = silver_table\n", 413 | " self.warehouse_path = warehouse_path\n", 414 | " \n", 415 | " def show_metrics(self):\n", 416 | " daily_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/daily_metrics')\n", 417 | " weekly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/weekly_metrics')\n", 418 | " monthly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/monthly_metrics')\n", 419 | " quarterly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/quarterly_metrics')\n", 420 | " yearly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/yearly_metrics')\n", 421 | " subscription_status_count = silver_data.groupBy(\"subscription_status\").count()\n", 422 | "\n", 423 | " print(\"Daily Metrics:\")\n", 424 | " daily_metrics.show(5)\n", 425 | "\n", 426 | " print(\"Weekly Metrics:\")\n", 427 | " weekly_metrics.show(5)\n", 428 | "\n", 429 | " print(\"Monthly Metrics:\")\n", 430 | " monthly_metrics.show(5)\n", 431 | "\n", 432 | " print(\"Quarterly Metrics:\")\n", 433 | " quarterly_metrics.show(5)\n", 434 | "\n", 435 | " print(\"Yearly Metrics:\")\n", 436 | " yearly_metrics.show(5) \n", 437 | "\n", 438 | " def transform(self):\n", 439 | " # Calculate the week, month, quarter, and year from the timestamp\n", 440 | " time_based_metrics = self.silver_table.withColumn(\"date\", F.to_date(\"timestamp\")) \\\n", 441 | " .withColumn(\"year\", F.year(\"timestamp\")) \\\n", 442 | " .withColumn(\"quarter\", F.quarter(\"timestamp\")) \\\n", 443 | " .withColumn(\"month\", F.month(\"timestamp\")) \\\n", 444 | " .withColumn(\"week_of_year\", F.weekofyear(\"timestamp\")) \\\n", 445 | " .withColumn(\"total_passages\", F.lit(1)) \\\n", 446 | " .withColumn(\"total_revenue\", F.when(self.silver_table.timestamp.substr(12, 2).cast(\"int\") < 12, 2.99).otherwise(3.99))\n", 447 | "\n", 448 | "\n", 449 | " # Daily metrics\n", 450 | " daily_metrics = time_based_metrics.groupBy(\"date\").agg(\n", 451 | " F.count(\"*\").alias(\"total_passages\"),\n", 452 | " F.sum(F.when(time_based_metrics.timestamp.substr(12, 2).cast(\"int\") < 12, 2.99).otherwise(3.99)).alias(\"total_revenue\")\n", 453 | " )\n", 454 | " daily_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/daily_metrics')\n", 455 | "\n", 456 | " # Weekly metrics\n", 457 | " weekly_metrics = time_based_metrics.groupBy(\"year\", \"week_of_year\").agg(\n", 458 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 459 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 460 | " )\n", 461 | " weekly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/weekly_metrics')\n", 462 | "\n", 463 | " # Monthly metrics\n", 464 | " monthly_metrics = time_based_metrics.groupBy(\"year\", \"month\").agg(\n", 465 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 466 
| " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 467 | " )\n", 468 | " monthly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/monthly_metrics')\n", 469 | "\n", 470 | " # Quarterly metrics\n", 471 | " quarterly_metrics = time_based_metrics.groupBy(\"year\", \"quarter\").agg(\n", 472 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 473 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 474 | " )\n", 475 | " quarterly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/quarterly_metrics')\n", 476 | "\n", 477 | " # Yearly metrics\n", 478 | " yearly_metrics = time_based_metrics.groupBy(\"year\").agg(\n", 479 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 480 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 481 | " )\n", 482 | " yearly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/yearly_metrics')\n", 483 | "\n", 484 | "# Example usage\n", 485 | "spark = SparkSession.builder.getOrCreate()\n", 486 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")\n", 487 | "warehouse_path = \"s3a://warehouse/inventory/gold_data\"\n", 488 | "metrics_adapter = MetricsAdapter(silver_data, warehouse_path)\n", 489 | "metrics_adapter.transform()\n", 490 | "\n", 491 | "metrics_adapter.show_metrics()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "id": "9079a66c-d57f-4ab2-a52d-3ee7a2929490", 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | "display_name": "Python 3 (ipykernel)", 506 | "language": "python", 507 | "name": "python3" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 3 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython3", 519 | "version": "3.8.13" 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 5 524 | } 525 | --------------------------------------------------------------------------------
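As an end-to-end smoke test (a sketch: it assumes the compose stack is running and both connectors from src/setup_connectors.sh are registered, and it reuses the container, topic and bucket names defined in the files above), a single row inserted through the API should show up in Redpanda and MinIO within a few seconds:

```bash
# 1. insert one customer row through the Flask API
python api/request.py

# 2. the Debezium source publishes a change event for that row to Redpanda
docker exec redpanda rpk topic consume dbserver1.inventory.customers -n 1

# 3. the S3 sink writes the same event as JSON into MinIO:
#    open http://localhost:9001 (minio / minio123) and browse warehouse/inventory/dbserver1.inventory.customers/
```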