├── api ├── requirements.txt ├── Dockerfile ├── request.py ├── api.py └── template │ └── customers.html ├── img ├── red.png └── download.png ├── src ├── requirements.txt ├── Dockerfile └── setup_connectors.sh ├── generate_data ├── requirements.txt ├── Dockerfile └── generate_data.py ├── kafka └── Dockerfile ├── .env ├── README.md ├── spark ├── Dockerfile └── spark-defaults.conf ├── notebooks ├── spark-defaults.conf ├── Dockerfile └── pyspark.ipynb └── docker-compose.yml /api/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.1.1 2 | mysql-connector-python==8.0.28 3 | pandas==1.3.5 4 | -------------------------------------------------------------------------------- /img/red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stefen-Taime/stream-ingestion-redpanda-minio/HEAD/img/red.png -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.2 2 | SQLAlchemy==1.4.46 3 | PyMySQL==1.0.2 4 | kafka-python==2.0.2 -------------------------------------------------------------------------------- /img/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stefen-Taime/stream-ingestion-redpanda-minio/HEAD/img/download.png -------------------------------------------------------------------------------- /generate_data/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.1.1 2 | mysql-connector-python==8.0.28 3 | pandas==1.3.5 4 | Faker==13.1.0 5 | -------------------------------------------------------------------------------- /src/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | WORKDIR /opt/src 4 | COPY requirements.txt /opt/src 5 | RUN pip install --upgrade pip && pip install -r requirements.txt 6 | 7 | ENTRYPOINT ["tail", "-f", "/dev/null"] -------------------------------------------------------------------------------- /generate_data/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt . 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY . . 9 | 10 | CMD ["python", "generate_data.py"] 11 | -------------------------------------------------------------------------------- /api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt . 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY . . 
9 | 10 | EXPOSE 8000 11 | 12 | CMD ["python", "api.py"] 13 | -------------------------------------------------------------------------------- /kafka/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debezium/connect 2 | 3 | RUN curl -O https://d1i4a15mxbxib1.cloudfront.net/api/plugins/confluentinc/kafka-connect-s3/versions/10.3.1/confluentinc-kafka-connect-s3-10.3.1.zip \ 4 | && unzip confluentinc-kafka-connect-s3-10.3.1.zip \ 5 | && mv confluentinc-kafka-connect-s3-10.3.1 /kafka/connect/ \ 6 | && rm confluentinc-kafka-connect-s3-10.3.1.zip -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | # redpanda-console 2 | KAFKA_BROKERS="redpanda:9092" 3 | 4 | # MinIO 5 | MINIO_ROOT_USER="minio" 6 | MINIO_ROOT_PASSWORD="minio123" 7 | MINIO_ACCESS_KEY="minio" 8 | MINIO_SECRET_KEY="minio123" 9 | 10 | # MySQL 11 | MYSQL_ROOT_PASSWORD="debezium" 12 | MYSQL_USER="admin" 13 | MYSQL_PASSWORD="admin123" 14 | 15 | # kafka connect 16 | BOOTSTRAP_SERVERS="redpanda:9092" 17 | GROUP_ID="1" 18 | CONFIG_STORAGE_TOPIC="debezium.configs" 19 | OFFSET_STORAGE_TOPIC="debezium.offset" 20 | STATUS_STORAGE_TOPIC="debezium.status" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Data Processing and Analytics with Docker, MySQL, Redpanda, MinIO, and Apache Spark Using Delta Lake 2 | 3 | 4 | 5 | ## Architectural overview 6 | 7 | ![Architecture](/img/red.png) 8 | 9 | [Medium](https://medium.com/@stefentaime_10958/real-time-data-processing-and-analytics-with-docker-mysql-redpanda-minio-and-apache-spark-eca83f210ef6) In this article, you will learn how to set up a real-time data processing and analytics environment using Docker, MySQL, Redpanda, MinIO, and Apache Spark. 
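A minimal way to bring the whole stack up (a sketch; the service names, ports and credentials below are taken from the docker-compose.yml, .env and src/setup_connectors.sh files in this repository):

```bash
# build the images and start every service defined in docker-compose.yml
docker compose up -d --build

# register the Debezium MySQL source and the Confluent S3 (MinIO) sink with Kafka Connect
bash src/setup_connectors.sh

# useful endpoints once the containers are up:
#   Redpanda Console   http://localhost:5000
#   MinIO console      http://localhost:9001   (minio / minio123)
#   Adminer (MySQL)    http://localhost:8085
#   JupyterLab         http://localhost:8888
#   Spark master UI    http://localhost:8080
#   Flask API          http://localhost:8000/customers
```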
10 | 11 | -------------------------------------------------------------------------------- /api/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | data = { 4 | "id": "5a5c562e-4386-44ad-bf6f-bab91081781e", 5 | "plate_number": "7695-OOO", 6 | "car_make": "Ford", 7 | "car_year": 2012, 8 | "owner_name": "Stefen", 9 | "owner_address": "92834 Kim Unions\nPort Harryport, MD 61729", 10 | "owner_phone_number": "+1505698632", 11 | "subscription_status": "active", 12 | "subscription_start": None, 13 | "subscription_end": None, 14 | "balance": 100.0, 15 | "timestamp": "2023-03-03T14:37:49", 16 | "rate": 9.99 17 | } 18 | 19 | response = requests.post("http://0.0.0.0:8000/send_data", json=data) 20 | 21 | print(response.status_code) 22 | print(response.json()) 23 | -------------------------------------------------------------------------------- /spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/bitnami/spark:3.3 2 | 3 | USER root 4 | 5 | # Install prerequisites 6 | RUN apt-get update && apt-get install -y curl 7 | 8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \ 13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \ 14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \ 15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars -------------------------------------------------------------------------------- /spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | spark.hadoop.fs.s3a.path.style.access true 8 | spark.hadoop.fs.s3a.connection.ssl.enabled false 9 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem -------------------------------------------------------------------------------- /notebooks/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars /usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | spark.hadoop.fs.s3a.path.style.access true 8 | spark.hadoop.fs.s3a.connection.ssl.enabled false 9 | spark.hadoop.fs.s3a.impl 
org.apache.hadoop.fs.s3a.S3AFileSystem -------------------------------------------------------------------------------- /notebooks/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/all-spark-notebook:python-3.8 2 | 3 | USER root 4 | 5 | RUN curl -O https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz \ 6 | && tar zxvf spark-3.3.2-bin-hadoop3.tgz \ 7 | && rm -rf spark-3.3.2-bin-hadoop3.tgz \ 8 | && mv spark-3.3.2-bin-hadoop3/ /usr/local/ \ 9 | && rm -rf /usr/local/spark \ 10 | && rm -rf /usr/local/spark-3.3.0-bin-hadoop3 \ 11 | && ln -s /usr/local/spark-3.3.2-bin-hadoop3 /usr/local/spark 12 | 13 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 14 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 15 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \ 16 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 17 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 18 | && curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \ 19 | && mv s3-2.18.41.jar /usr/local/spark/jars \ 20 | && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \ 21 | && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \ 22 | && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \ 23 | && mv delta-storage-2.2.0.jar /usr/local/spark/jars \ 24 | && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars -------------------------------------------------------------------------------- /src/setup_connectors.sh: -------------------------------------------------------------------------------- 1 | # create connector source for MySQL 2 | curl --request POST \ 3 | --url http://localhost:8083/connectors \ 4 | --header 'Content-Type: application/json' \ 5 | --data '{ 6 | "name": "src-mys", 7 | "config": { 8 | "connector.class": "io.debezium.connector.mysql.MySqlConnector", 9 | "tasks.max": "1", 10 | "database.hostname": "mysql", 11 | "database.port": "3306", 12 | "database.user": "debezium", 13 | "database.password": "dbz", 14 | "database.server.id": "184054", 15 | "database.include.list": "inventory", 16 | "decimal.handling.mode": "double", 17 | "topic.prefix": "dbserver1", 18 | "schema.history.internal.kafka.bootstrap.servers": "redpanda:9092", 19 | "schema.history.internal.kafka.topic": "schema-changes.inventory" 20 | } 21 | }' 22 | 23 | # create connector sink MySQL to S3 24 | curl --request POST \ 25 | --url http://localhost:8083/connectors \ 26 | --header 'Content-Type: application/json' \ 27 | --data '{ 28 | "name": "sink_aws-s3", 29 | "config": { 30 | "topics.regex": "dbserver1.inventory.*", 31 | "topics.dir": "inventory", 32 | "connector.class": "io.confluent.connect.s3.S3SinkConnector", 33 | "key.converter": "org.apache.kafka.connect.json.JsonConverter", 34 | "value.converter": "org.apache.kafka.connect.json.JsonConverter", 35 | "format.class": "io.confluent.connect.s3.format.json.JsonFormat", 36 | "flush.size": "1", 37 | "store.url": "http://minio:9000", 38 | "storage.class": "io.confluent.connect.s3.storage.S3Storage", 39 | "s3.region": "us-east-1", 40 | "s3.bucket.name": "warehouse", 41 | "aws.access.key.id": "minio", 42 | "aws.secret.access.key": "minio123" 43 | } 44 | }' 45 | 
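The two POST requests above only submit the connector configurations; a quick way to confirm Kafka Connect accepted them (a sketch using the standard Connect REST API, which the kafka-connect service exposes on port 8083) is:

```bash
# list the registered connectors; expect ["src-mys","sink_aws-s3"]
curl -s http://localhost:8083/connectors

# the connector and its task should both report state RUNNING
curl -s http://localhost:8083/connectors/src-mys/status
curl -s http://localhost:8083/connectors/sink_aws-s3/status
```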
-------------------------------------------------------------------------------- /api/api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify, render_template 2 | import mysql.connector 3 | import pandas as pd 4 | 5 | app = Flask(__name__, template_folder='template') 6 | 7 | db_config = { 8 | "host": "10.0.0.25", 9 | "user": "root", 10 | "password": "debezium", 11 | "database": "inventory" 12 | } 13 | 14 | @app.route('/send_data', methods=['POST']) 15 | def send_data(): 16 | data = request.get_json() 17 | 18 | 19 | conn = mysql.connector.connect(**db_config) 20 | 21 | cursor = conn.cursor() 22 | 23 | insert_query = ''' 24 | INSERT INTO customers (id, plate_number, car_make, car_year, owner_name, owner_address, owner_phone_number, subscription_status, subscription_start, subscription_end, balance, timestamp) 25 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 26 | ''' 27 | cursor.execute(insert_query, ( 28 | data['id'], 29 | data['plate_number'], 30 | data['car_make'], 31 | data['car_year'], 32 | data['owner_name'], 33 | data['owner_address'], 34 | data['owner_phone_number'], 35 | data['subscription_status'], 36 | data['subscription_start'], 37 | data['subscription_end'], 38 | data['balance'], 39 | data['timestamp'] 40 | )) 41 | 42 | conn.commit() 43 | 44 | cursor.close() 45 | conn.close() 46 | 47 | return jsonify({"status": "success"}), 200 48 | 49 | @app.route('/customers', methods=['GET']) 50 | def customers(): 51 | plate_number = request.args.get('plate_number', '') 52 | page = int(request.args.get('page', 1)) 53 | items_per_page = 10 54 | 55 | conn = mysql.connector.connect(**db_config) 56 | 57 | # Create a cursor 58 | cursor = conn.cursor() 59 | 60 | # Fetch customers filtered by plate_number and apply pagination 61 | select_query = ''' 62 | SELECT * FROM customers 63 | WHERE plate_number LIKE %s 64 | LIMIT %s OFFSET %s 65 | ''' 66 | cursor.execute(select_query, (f"%{plate_number}%", items_per_page, (page - 1) * items_per_page)) 67 | customers = cursor.fetchall() 68 | 69 | # Get the total number of customers 70 | cursor.execute("SELECT COUNT(*) FROM customers WHERE plate_number LIKE %s", (f"%{plate_number}%",)) 71 | total_customers = cursor.fetchone()[0] 72 | 73 | # Close the cursor and connection 74 | cursor.close() 75 | conn.close() 76 | 77 | return render_template('customers.html', customers=customers, plate_number=plate_number, page=page, total_pages=(total_customers // items_per_page) + 1) 78 | 79 | 80 | if __name__ == '__main__': 81 | app.run(host='0.0.0.0', port=8000) -------------------------------------------------------------------------------- /api/template/customers.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Customers 7 | 8 | 9 | 10 | 11 |
[The markup of customers.html was not preserved in this dump; only its text content survived. The template renders a "Customers" page containing a results table with the columns Plate Number, Car Make, Car Year, Owner Name, Address and Balance; inside a {% for customer in customers %} ... {% endfor %} loop each row prints {{ customer[1] }}, {{ customer[2] }}, {{ customer[3] }}, {{ customer[4] }}, {{ customer[5] }} and {{ customer[10] }}. The surrounding search and pagination controls are driven by the plate_number, page and total_pages values passed in from api.py.]
65 | 66 | 67 | -------------------------------------------------------------------------------- /generate_data/generate_data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import uuid 3 | from faker import Faker 4 | import pandas as pd 5 | import mysql.connector 6 | from datetime import datetime, timedelta 7 | 8 | # Initialize Faker 9 | fake = Faker() 10 | 11 | # Number of data points to generate 12 | num_records = 1000 13 | 14 | # Generate synthetic data 15 | data = [] 16 | 17 | for _ in range(num_records): 18 | unique_id = str(uuid.uuid4()) 19 | plate_number = f"{random.randint(1000, 9999)}-{fake.random_element(elements=('AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG', 'HHH', 'III', 'JJJ', 'KKK', 'LLL', 'MMM', 'NNN', 'OOO', 'PPP', 'QQQ', 'RRR', 'SSS', 'TTT', 'UUU', 'VVV', 'WWW', 'XXX', 'YYY', 'ZZZ'))}" 20 | 21 | car_info = { 22 | "make": fake.random_element(elements=("Toyota", "Honda", "Ford", "Chevrolet", "Nissan", "Volkswagen", "BMW", "Mercedes-Benz")), 23 | "year": random.randint(2000, 2023) 24 | } 25 | 26 | owner_info = { 27 | "name": fake.name(), 28 | "address": fake.address(), 29 | "phone_number": fake.phone_number().replace("x", " ext. ") # Modify phone number format 30 | } 31 | 32 | subscription_status = fake.random_element(elements=("active", "expired", "none")) 33 | 34 | if subscription_status != "none": 35 | subscription_start = fake.date_between(start_date='-3y', end_date='today') 36 | subscription_end = subscription_start + timedelta(days=365) 37 | else: 38 | subscription_start = None 39 | subscription_end = None 40 | 41 | balance = round(random.uniform(0, 500), 2) 42 | 43 | timestamp = fake.date_time_between(start_date='-30d', end_date='now').strftime('%Y-%m-%d %H:%M:%S') 44 | 45 | 46 | record = { 47 | "id": unique_id, 48 | "plate_number": plate_number, 49 | "car_make": car_info["make"], 50 | "car_year": car_info["year"], 51 | "owner_name": owner_info["name"], 52 | "owner_address": owner_info["address"], 53 | "owner_phone_number": owner_info["phone_number"], 54 | "subscription_status": subscription_status, 55 | "subscription_start": subscription_start, 56 | "subscription_end": subscription_end, 57 | "balance": balance, 58 | "timestamp": timestamp 59 | } 60 | 61 | data.append(record) 62 | 63 | # Convert data to a pandas DataFrame 64 | df = pd.DataFrame(data) 65 | 66 | # Connect to the MySQL database 67 | db_config = { 68 | "host": "mysql", 69 | "user": "root", 70 | "password": "debezium", 71 | "database": "inventory" 72 | } 73 | conn = mysql.connector.connect(**db_config) 74 | 75 | # Create a cursor 76 | cursor = conn.cursor() 77 | 78 | # Create the 'customers' table if it doesn't exist 79 | create_table_query = ''' 80 | CREATE TABLE IF NOT EXISTS customers ( 81 | id VARCHAR(255) NOT NULL, 82 | plate_number VARCHAR(255) NOT NULL, 83 | car_make VARCHAR(255) NOT NULL, 84 | car_year INT NOT NULL, 85 | owner_name VARCHAR(255) NOT NULL, 86 | owner_address TEXT NOT NULL, 87 | owner_phone_number VARCHAR(255) NOT NULL, 88 | subscription_status ENUM('active', 'expired', 'none') NOT NULL, 89 | subscription_start DATE, 90 | subscription_end DATE, 91 | balance DECIMAL(10, 2) NOT NULL, 92 | timestamp TIMESTAMP NOT NULL 93 | ) 94 | ''' 95 | cursor.execute(create_table_query) 96 | 97 | # Store the synthetic data in the 'customers' table 98 | for index, row in df.iterrows(): 99 | insert_query = ''' 100 | INSERT INTO customers (id, plate_number, car_make, car_year, owner_name, owner_address, owner_phone_number, subscription_status, 
subscription_start, subscription_end, balance, timestamp) 101 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 102 | ''' 103 | cursor.execute(insert_query, ( 104 | row['id'], 105 | row['plate_number'], 106 | row['car_make'], 107 | row['car_year'], 108 | row['owner_name'], 109 | row['owner_address'], 110 | row['owner_phone_number'], 111 | row['subscription_status'], 112 | row['subscription_start'], 113 | row['subscription_end'], 114 | row['balance'], 115 | row['timestamp'] 116 | )) 117 | 118 | # Commit the changes and close the cursor 119 | conn.commit() 120 | cursor.close() 121 | 122 | # Close the database connection 123 | conn.close() 124 | 125 | print("Synthetic data stored in the 'customers' table in the MySQL database") 126 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | redpanda: 4 | image: vectorized/redpanda 5 | container_name: redpanda 6 | ports: 7 | - "9092:9092" 8 | - "29092:29092" 9 | command: 10 | - redpanda 11 | - start 12 | - --overprovisioned 13 | - --smp 14 | - "1" 15 | - --memory 16 | - "1G" 17 | - --reserve-memory 18 | - "0M" 19 | - --node-id 20 | - "0" 21 | - --kafka-addr 22 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 23 | - --advertise-kafka-addr 24 | - PLAINTEXT://redpanda:29092,OUTSIDE://redpanda:9092 25 | - --check=false 26 | networks: 27 | - spark_network 28 | 29 | redpanda-console: 30 | image: vectorized/console 31 | container_name: redpanda_console 32 | depends_on: 33 | - redpanda 34 | ports: 35 | - "5000:8080" 36 | env_file: 37 | - .env 38 | networks: 39 | - spark_network 40 | 41 | minio: 42 | hostname: minio 43 | image: "minio/minio" 44 | container_name: minio 45 | ports: 46 | - "9001:9001" 47 | - "9000:9000" 48 | command: [ "server", "/data", "--console-address", ":9001" ] 49 | volumes: 50 | - ./minio/data:/data 51 | env_file: 52 | - .env 53 | networks: 54 | - spark_network 55 | 56 | mc: 57 | image: minio/mc 58 | container_name: mc 59 | hostname: mc 60 | environment: 61 | - AWS_ACCESS_KEY_ID=minio 62 | - AWS_SECRET_ACCESS_KEY=minio123 63 | - AWS_REGION=us-east-1 64 | entrypoint: > 65 | /bin/sh -c " until (/usr/bin/mc config host add minio http://minio:9000 minio minio123) do echo '...waiting...' 
&& sleep 1; done; /usr/bin/mc mb minio/warehouse; /usr/bin/mc policy set public minio/warehouse; exit 0; " 66 | depends_on: 67 | - minio 68 | networks: 69 | - spark_network 70 | 71 | mysql: 72 | image: debezium/example-mysql:1.6 73 | container_name: mysql 74 | volumes: 75 | - ./mysql/data:/var/lib/mysql 76 | ports: 77 | - "3306:3306" 78 | env_file: 79 | - .env 80 | networks: 81 | - spark_network 82 | 83 | kafka-connect: 84 | build: 85 | context: ./kafka 86 | dockerfile: ./Dockerfile 87 | container_name: kafka_connect 88 | depends_on: 89 | - redpanda 90 | ports: 91 | - "8083:8083" 92 | env_file: 93 | - .env 94 | networks: 95 | - spark_network 96 | 97 | adminer: 98 | image: adminer:latest 99 | ports: 100 | - 8085:8080/tcp 101 | deploy: 102 | restart_policy: 103 | condition: on-failure 104 | networks: 105 | - spark_network 106 | 107 | spark-master: 108 | build: 109 | context: ./spark 110 | dockerfile: ./Dockerfile 111 | container_name: "spark-master" 112 | environment: 113 | - SPARK_MODE=master 114 | - SPARK_LOCAL_IP=spark-master 115 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 116 | - SPARK_RPC_ENCRYPTION_ENABLED=no 117 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 118 | - SPARK_SSL_ENABLED=no 119 | ports: 120 | - "7077:7077" 121 | - "8080:8080" 122 | volumes: 123 | - ./spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 124 | networks: 125 | - spark_network 126 | 127 | spark-worker-1: 128 | image: docker.io/bitnami/spark:3.3 129 | container_name: "spark-worker-1" 130 | environment: 131 | - SPARK_MODE=worker 132 | - SPARK_MASTER_URL=spark://spark-master:7077 133 | - SPARK_WORKER_MEMORY=4G 134 | - SPARK_WORKER_CORES=1 135 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 136 | - SPARK_RPC_ENCRYPTION_ENABLED=no 137 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 138 | - SPARK_SSL_ENABLED=no 139 | networks: 140 | - spark_network 141 | 142 | spark-worker-2: 143 | image: docker.io/bitnami/spark:3.3 144 | container_name: "spark-worker-2" 145 | environment: 146 | - SPARK_MODE=worker 147 | - SPARK_MASTER_URL=spark://spark-master:7077 148 | - SPARK_WORKER_MEMORY=4G 149 | - SPARK_WORKER_CORES=1 150 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 151 | - SPARK_RPC_ENCRYPTION_ENABLED=no 152 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 153 | - SPARK_SSL_ENABLED=no 154 | networks: 155 | - spark_network 156 | 157 | spark-notebook: 158 | build: 159 | context: ./notebooks 160 | dockerfile: ./Dockerfile 161 | container_name: "spark-notebook" 162 | user: root 163 | environment: 164 | - JUPYTER_ENABLE_LAB="yes" 165 | - GRANT_SUDO="yes" 166 | volumes: 167 | - ./notebooks:/home/jovyan/work 168 | - ./notebooks/spark-defaults.conf:/usr/local/spark/conf/spark-defaults.conf 169 | ports: 170 | - "8888:8888" 171 | - "4040:4040" 172 | networks: 173 | - spark_network 174 | 175 | generate_data: 176 | build: ./generate_data 177 | container_name: generate_data 178 | command: python generate_data.py 179 | depends_on: 180 | - mysql 181 | networks: 182 | - spark_network 183 | 184 | api: 185 | build: ./api 186 | ports: 187 | - "8000:8000" 188 | depends_on: 189 | - mysql 190 | 191 | 192 | networks: 193 | spark_network: 194 | driver: bridge 195 | name: spark_network -------------------------------------------------------------------------------- /notebooks/pyspark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "2ef67329-7f99-451a-8bdb-e91e369d034b", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": 
"stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Requirement already satisfied: pyspark in /usr/local/spark-3.3.2-bin-hadoop3/python (3.3.2)\n", 14 | "Collecting py4j==0.10.9.5\n", 15 | " Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)\n", 16 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 17 | "\u001b[?25hInstalling collected packages: py4j\n", 18 | "Successfully installed py4j-0.10.9.5\n", 19 | "+--------------------+------+---+--------------------+-----------+-------------+\n", 20 | "| after|before| op| source|transaction| ts_ms|\n", 21 | "+--------------------+------+---+--------------------+-----------+-------------+\n", 22 | "|{100.0, Ford, 201...| null| r|{mysql, inventory...| null|1679965843817|\n", 23 | "+--------------------+------+---+--------------------+-----------+-------------+\n", 24 | "\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "!pip install pyspark\n", 30 | "\n", 31 | "from pyspark.sql import SparkSession\n", 32 | "\n", 33 | "# Initialize Spark session\n", 34 | "spark = SparkSession.builder \\\n", 35 | " .appName(\"Inventory ETL\") \\\n", 36 | " .config(\"spark.sql.parquet.datetimeRebaseModeInWrite\", \"LEGACY\") \\\n", 37 | " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.2.0\") \\\n", 38 | " .getOrCreate()\n", 39 | "\n", 40 | "# Set the necessary AWS credentials\n", 41 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.access.key\", \"minio\")\n", 42 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.secret.key\", \"minio123\")\n", 43 | "spark.sparkContext._jsc.hadoopConfiguration().set(\"fs.s3a.endpoint\", \"minio:9000\")\n", 44 | "\n", 45 | "# Set the path to the JSON file\n", 46 | "get_users_file = \"s3a://warehouse/inventory/dbserver1.inventory.customers/partition=0/*.json\"\n", 47 | "\n", 48 | "# Read the JSON file\n", 49 | "raw_data = spark.read \\\n", 50 | " .format(\"json\") \\\n", 51 | " .option(\"inferSchema\", \"true\") \\\n", 52 | " .json(get_users_file)\n", 53 | "\n", 54 | "# Display raw data\n", 55 | "raw_data.show()\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "id": "34027a94-39cd-4dde-9bc0-43e99417bd66", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# Process the data\n", 66 | "silver_data = raw_data.select(\n", 67 | " \"after.id\",\n", 68 | " \"after.plate_number\",\n", 69 | " \"after.car_make\",\n", 70 | " \"after.car_year\",\n", 71 | " \"after.owner_name\",\n", 72 | " \"after.owner_address\",\n", 73 | " \"after.owner_phone_number\",\n", 74 | " \"after.subscription_status\",\n", 75 | " \"after.subscription_start\",\n", 76 | " \"after.subscription_end\",\n", 77 | " \"after.balance\",\n", 78 | " \"after.timestamp\"\n", 79 | ")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "a4e36db0-4039-4792-ad0a-c81d8d8a3ca4", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "silver_data.write.parquet(\"s3a://warehouse/inventory/silver_data\", mode=\"overwrite\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "id": "bc38035e-bccc-419c-9f5d-784b6d8bf5a9", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "id": 
"10794ce4-7037-487d-a163-e3f344047589", 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n", 113 | "| id|plate_number|car_make|car_year|owner_name| owner_address|owner_phone_number|subscription_status|subscription_start|subscription_end|balance| timestamp|\n", 114 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n", 115 | "|5a5c562e-4386-44a...| 7695-OOO| Ford| 2012| Stefen|92834 Kim Unions\\...| +14385064453| active| null| null| 100.0|2023-03-03T14:37:49Z|\n", 116 | "+--------------------+------------+--------+--------+----------+--------------------+------------------+-------------------+------------------+----------------+-------+--------------------+\n", 117 | "\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "silver_data.show()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "id": "3747067d-5514-4222-af58-4e5ec5d1d1dc", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "Collecting twilio\n", 136 | " Downloading twilio-7.17.0-py2.py3-none-any.whl (1.4 MB)\n", 137 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", 138 | "\u001b[?25hRequirement already satisfied: requests>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from twilio) (2.28.1)\n", 139 | "Requirement already satisfied: PyJWT<3.0.0,>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from twilio) (2.5.0)\n", 140 | "Requirement already satisfied: pytz in /opt/conda/lib/python3.8/site-packages (from twilio) (2022.4)\n", 141 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (1.26.11)\n", 142 | "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (2022.9.24)\n", 143 | "Requirement already satisfied: charset-normalizer<3,>=2 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (2.1.1)\n", 144 | "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests>=2.0.0->twilio) (3.4)\n", 145 | "Installing collected packages: twilio\n", 146 | "Successfully installed twilio-7.17.0\n", 147 | "Collecting mysql-connector-python\n", 148 | " Downloading mysql_connector_python-8.0.32-cp38-cp38-manylinux1_x86_64.whl (23.5 MB)\n", 149 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.5/23.5 MB\u001b[0m \u001b[31m13.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 150 | "\u001b[?25hCollecting protobuf<=3.20.3,>=3.11.0\n", 151 | " Downloading protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n", 152 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 153 | "\u001b[?25hInstalling collected packages: protobuf, mysql-connector-python\n", 154 | " 
Attempting uninstall: protobuf\n", 155 | " Found existing installation: protobuf 4.21.7\n", 156 | " Uninstalling protobuf-4.21.7:\n", 157 | " Successfully uninstalled protobuf-4.21.7\n", 158 | "Successfully installed mysql-connector-python-8.0.32 protobuf-3.20.3\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "!pip install twilio\n", 164 | "!pip install mysql-connector-python" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "id": "7cdf3ad9-c375-4d99-b528-633a65d026a9", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "from datetime import datetime as dt, timedelta, timezone\n", 175 | "import pytz\n", 176 | "from twilio.rest import Client\n", 177 | "from pyspark.sql import Row\n", 178 | "from datetime import datetime, timezone\n", 179 | "from pyspark.sql import SparkSession\n", 180 | "from pyspark.sql.functions import col, udf\n", 181 | "from pyspark.sql.types import BooleanType\n", 182 | "import datetime\n", 183 | "import mysql.connector\n", 184 | "from typing import Optional\n", 185 | "\n", 186 | "# Additional imports\n", 187 | "from mysql.connector import Error\n", 188 | "\n", 189 | "TWILIO_ACCOUNT_SID = '", 190 | "TWILIO_AUTH_TOKEN = '" , 191 | "TWILIO_PHONE_NUMBER = '", 192 | "\n", 193 | "client = Client(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)\n", 194 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")\n", 195 | "\n", 196 | "def get_rate_for_customer(timestamp, subscription_status):\n", 197 | " if subscription_status == 'active':\n", 198 | " if 0 <= timestamp.hour < 6 or 11 <= timestamp.hour < 16:\n", 199 | " return 2.99\n", 200 | " elif 6 <= timestamp.hour < 11 or 16 <= timestamp.hour < 23:\n", 201 | " return 3.99\n", 202 | " else:\n", 203 | " return 9.99\n", 204 | "\n", 205 | " # Add a default rate value to avoid NoneType issues\n", 206 | " return 0.0\n", 207 | "\n", 208 | "\n", 209 | "def is_subscription_active(subscription_start: dt, subscription_end: dt, current_time: dt) -> bool:\n", 210 | " return subscription_start <= current_time <= subscription_end\n", 211 | "\n", 212 | "def get_subscription_status(subscription_end: dt, current_time: dt) -> bool:\n", 213 | " grace_period = timedelta(days=7)\n", 214 | " return current_time <= subscription_end + grace_period\n", 215 | "\n", 216 | "\n", 217 | "def send_sms(phone_number, message):\n", 218 | " try:\n", 219 | " client.messages.create(\n", 220 | " body=message,\n", 221 | " from_=TWILIO_PHONE_NUMBER,\n", 222 | " to=phone_number\n", 223 | " )\n", 224 | " print(f\"SMS sent to {phone_number}: {message}\")\n", 225 | " except Exception as e:\n", 226 | " print(f\"Error sending SMS: {e}\")\n", 227 | "\n", 228 | "from pyspark.sql.functions import col\n", 229 | "\n", 230 | "def is_valid_balance(value):\n", 231 | " try:\n", 232 | " float(value)\n", 233 | " return True\n", 234 | " except ValueError:\n", 235 | " return False\n", 236 | "\n", 237 | "valid_balance_udf = udf(is_valid_balance, BooleanType())\n", 238 | "\n", 239 | "silver_data = silver_data.filter(valid_balance_udf(col(\"balance\")))\n", 240 | "\n", 241 | "# Database configuration\n", 242 | "db_config = {\n", 243 | " \"host\": \"mysql\",\n", 244 | " \"user\": \"root\",\n", 245 | " \"password\": \"debezium\",\n", 246 | " \"database\": \"inventory\"\n", 247 | "}\n", 248 | "\n", 249 | "def update_customer_balance(customer_id, new_balance):\n", 250 | " try:\n", 251 | " connection = mysql.connector.connect(**db_config)\n", 252 | " cursor = connection.cursor()\n", 253 | " update_query = \"UPDATE customers 
SET balance = %s WHERE id = %s\"\n", 254 | " cursor.execute(update_query, (new_balance, customer_id))\n", 255 | " connection.commit()\n", 256 | " print(f\"Updated balance for customer {customer_id}: {new_balance}\")\n", 257 | " except Error as e:\n", 258 | " print(f\"Error updating balance: {e}\")\n", 259 | " finally:\n", 260 | " if connection.is_connected():\n", 261 | " cursor.close()\n", 262 | " connection.close() \n", 263 | "\n", 264 | "from datetime import datetime, timezone\n", 265 | "\n", 266 | "def safe_date_conversion(date_string: Optional[str]) -> dt:\n", 267 | " if date_string is None or not isinstance(date_string, str):\n", 268 | " return dt(1970, 1, 1, tzinfo=timezone.utc)\n", 269 | " try:\n", 270 | " return dt.fromisoformat(date_string[:-1]).replace(tzinfo=timezone.utc)\n", 271 | " except ValueError:\n", 272 | " return dt(1970, 1, 1, tzinfo=timezone.utc)\n", 273 | "\n", 274 | "def process_plate(row: Row) -> None:\n", 275 | " print(f\"Processing plate: {row.plate_number}\")\n", 276 | " current_time = dt.now(timezone.utc)\n", 277 | " try:\n", 278 | " plate_timestamp = dt.fromisoformat(row.timestamp[:-1]).replace(tzinfo=timezone.utc)\n", 279 | " except ValueError:\n", 280 | " plate_timestamp = dt.fromtimestamp(0, timezone.utc)\n", 281 | "\n", 282 | " subscription_start = safe_date_conversion(row.subscription_start)\n", 283 | " subscription_end = safe_date_conversion(row.subscription_end)\n", 284 | "\n", 285 | " is_active = is_subscription_active(subscription_start, subscription_end, current_time)\n", 286 | " rate = get_rate_for_customer(plate_timestamp, row.subscription_status)\n", 287 | "\n", 288 | " balance = float(row.balance)\n", 289 | " new_balance = balance - rate\n", 290 | "\n", 291 | " if row.subscription_status == 'none':\n", 292 | " message = f\"Dear {row.owner_name}, your car with plate number {row.plate_number} is not registered. The rate of ${rate} has been charged for your recent passage. Your new balance is ${new_balance:.2f}.\"\n", 293 | " send_sms(row.owner_phone_number, message)\n", 294 | " elif is_active: # Changed from row.subscription_status == 'active'\n", 295 | " message = f\"Dear {row.owner_name}, your subscription is active. The rate of ${rate} has been charged for your recent passage. Your new balance is ${new_balance:.2f}.\"\n", 296 | " send_sms(row.owner_phone_number, message)\n", 297 | " elif not get_subscription_status(subscription_end, current_time):\n", 298 | " message = f\"Dear {row.owner_name}, your subscription has expired. The rate of ${rate} has been charged for your recent passage. 
Your new balance is ${new_balance:.2f}.\"\n", 299 | " send_sms(row.owner_phone_number, message)\n", 300 | "\n", 301 | " update_customer_balance(row.id, new_balance)\n", 302 | "\n", 303 | "silver_data.foreach(process_plate)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 8, 309 | "id": "8beefe0b-5fae-43d7-a903-d82a8cab1eae", 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/plain": [ 315 | "\"\\nsample_data = Row(\\n id='5a5c562e-4386-44ad-bf6f-bab91081781e',\\n plate_number='7695-OOO',\\n car_make='Ford',\\n car_year=2012,\\n owner_name='Becky Smith',\\n owner_address='92834 Kim Unions\\nPort Harryport, MD 61729',\\n owner_phone_number='+14385064453',\\n subscription_status='none',\\n subscription_start=None,\\n subscription_end=None,\\n balance=100.0, # Replace 'Exc=' with a valid float value\\n timestamp='2023-03-03T14:37:49Z',\\n rate=9.99\\n)\\n\\nprocess_plate(sample_data)\\n\"" 316 | ] 317 | }, 318 | "execution_count": 8, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "\"\"\"\n", 325 | "sample_data = Row(\n", 326 | " id='5a5c562e-4386-44ad-bf6f-bab91081781e',\n", 327 | " plate_number='7695-OOO',\n", 328 | " car_make='Ford',\n", 329 | " car_year=2012,\n", 330 | " owner_name='Becky Smith',\n", 331 | " owner_address='92834 Kim Unions\\nPort Harryport, MD 61729',\n", 332 | " owner_phone_number='+14354123654',\n", 333 | " subscription_status='none',\n", 334 | " subscription_start=None,\n", 335 | " subscription_end=None,\n", 336 | " balance=100.0, # Replace 'Exc=' with a valid float value\n", 337 | " timestamp='2023-03-03T14:37:49Z',\n", 338 | " rate=9.99\n", 339 | ")\n", 340 | "\n", 341 | "process_plate(sample_data)\n", 342 | "\"\"\"" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 9, 348 | "id": "84eb738c-e811-44e1-84e5-a56002973ea6", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "+-------------------+-----+\n", 356 | "|subscription_status|count|\n", 357 | "+-------------------+-----+\n", 358 | "| active| 1|\n", 359 | "+-------------------+-----+\n", 360 | "\n", 361 | "Daily Metrics:\n", 362 | "+----------+--------------+-------------+\n", 363 | "| date|total_passages|total_revenue|\n", 364 | "+----------+--------------+-------------+\n", 365 | "|2023-03-03| 1| 3.99|\n", 366 | "+----------+--------------+-------------+\n", 367 | "\n", 368 | "Weekly Metrics:\n", 369 | "+----+------------+--------------+-------------+\n", 370 | "|year|week_of_year|total_passages|total_revenue|\n", 371 | "+----+------------+--------------+-------------+\n", 372 | "|2023| 9| 1| 3.99|\n", 373 | "+----+------------+--------------+-------------+\n", 374 | "\n", 375 | "Monthly Metrics:\n", 376 | "+----+-----+--------------+-------------+\n", 377 | "|year|month|total_passages|total_revenue|\n", 378 | "+----+-----+--------------+-------------+\n", 379 | "|2023| 3| 1| 3.99|\n", 380 | "+----+-----+--------------+-------------+\n", 381 | "\n", 382 | "Quarterly Metrics:\n", 383 | "+----+-------+--------------+-------------+\n", 384 | "|year|quarter|total_passages|total_revenue|\n", 385 | "+----+-------+--------------+-------------+\n", 386 | "|2023| 1| 1| 3.99|\n", 387 | "+----+-------+--------------+-------------+\n", 388 | "\n", 389 | "Yearly Metrics:\n", 390 | "+----+--------------+-------------+\n", 391 | "|year|total_passages|total_revenue|\n", 392 | "+----+--------------+-------------+\n", 393 | 
"|2023| 1| 3.99|\n", 394 | "+----+--------------+-------------+\n", 395 | "\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "gold_data = silver_data.groupBy(\"subscription_status\").count()\n", 401 | "\n", 402 | "gold_data.show()\n", 403 | "\n", 404 | "gold_data.write.parquet(\"s3a://warehouse/inventory/gold_data\", mode=\"overwrite\")\n", 405 | "\n", 406 | "\n", 407 | "import pyspark.sql.functions as F\n", 408 | "from pyspark.sql import SparkSession\n", 409 | "\n", 410 | "class MetricsAdapter:\n", 411 | " def __init__(self, silver_table, warehouse_path):\n", 412 | " self.silver_table = silver_table\n", 413 | " self.warehouse_path = warehouse_path\n", 414 | " \n", 415 | " def show_metrics(self):\n", 416 | " daily_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/daily_metrics')\n", 417 | " weekly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/weekly_metrics')\n", 418 | " monthly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/monthly_metrics')\n", 419 | " quarterly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/quarterly_metrics')\n", 420 | " yearly_metrics = spark.read.format('delta').load(self.warehouse_path + '/gold/yearly_metrics')\n", 421 | " subscription_status_count = silver_data.groupBy(\"subscription_status\").count()\n", 422 | "\n", 423 | " print(\"Daily Metrics:\")\n", 424 | " daily_metrics.show(5)\n", 425 | "\n", 426 | " print(\"Weekly Metrics:\")\n", 427 | " weekly_metrics.show(5)\n", 428 | "\n", 429 | " print(\"Monthly Metrics:\")\n", 430 | " monthly_metrics.show(5)\n", 431 | "\n", 432 | " print(\"Quarterly Metrics:\")\n", 433 | " quarterly_metrics.show(5)\n", 434 | "\n", 435 | " print(\"Yearly Metrics:\")\n", 436 | " yearly_metrics.show(5) \n", 437 | "\n", 438 | " def transform(self):\n", 439 | " # Calculate the week, month, quarter, and year from the timestamp\n", 440 | " time_based_metrics = self.silver_table.withColumn(\"date\", F.to_date(\"timestamp\")) \\\n", 441 | " .withColumn(\"year\", F.year(\"timestamp\")) \\\n", 442 | " .withColumn(\"quarter\", F.quarter(\"timestamp\")) \\\n", 443 | " .withColumn(\"month\", F.month(\"timestamp\")) \\\n", 444 | " .withColumn(\"week_of_year\", F.weekofyear(\"timestamp\")) \\\n", 445 | " .withColumn(\"total_passages\", F.lit(1)) \\\n", 446 | " .withColumn(\"total_revenue\", F.when(self.silver_table.timestamp.substr(12, 2).cast(\"int\") < 12, 2.99).otherwise(3.99))\n", 447 | "\n", 448 | "\n", 449 | " # Daily metrics\n", 450 | " daily_metrics = time_based_metrics.groupBy(\"date\").agg(\n", 451 | " F.count(\"*\").alias(\"total_passages\"),\n", 452 | " F.sum(F.when(time_based_metrics.timestamp.substr(12, 2).cast(\"int\") < 12, 2.99).otherwise(3.99)).alias(\"total_revenue\")\n", 453 | " )\n", 454 | " daily_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/daily_metrics')\n", 455 | "\n", 456 | " # Weekly metrics\n", 457 | " weekly_metrics = time_based_metrics.groupBy(\"year\", \"week_of_year\").agg(\n", 458 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 459 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 460 | " )\n", 461 | " weekly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/weekly_metrics')\n", 462 | "\n", 463 | " # Monthly metrics\n", 464 | " monthly_metrics = time_based_metrics.groupBy(\"year\", \"month\").agg(\n", 465 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 466 
| " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 467 | " )\n", 468 | " monthly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/monthly_metrics')\n", 469 | "\n", 470 | " # Quarterly metrics\n", 471 | " quarterly_metrics = time_based_metrics.groupBy(\"year\", \"quarter\").agg(\n", 472 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 473 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 474 | " )\n", 475 | " quarterly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/quarterly_metrics')\n", 476 | "\n", 477 | " # Yearly metrics\n", 478 | " yearly_metrics = time_based_metrics.groupBy(\"year\").agg(\n", 479 | " F.sum(\"total_passages\").alias(\"total_passages\"),\n", 480 | " F.sum(\"total_revenue\").alias(\"total_revenue\")\n", 481 | " )\n", 482 | " yearly_metrics.write.format('delta').mode('overwrite').option(\"mergeSchema\", \"true\").save(self.warehouse_path + '/gold/yearly_metrics')\n", 483 | "\n", 484 | "# Example usage\n", 485 | "spark = SparkSession.builder.getOrCreate()\n", 486 | "silver_data = spark.read.parquet(\"s3a://warehouse/inventory/silver_data\")\n", 487 | "warehouse_path = \"s3a://warehouse/inventory/gold_data\"\n", 488 | "metrics_adapter = MetricsAdapter(silver_data, warehouse_path)\n", 489 | "metrics_adapter.transform()\n", 490 | "\n", 491 | "metrics_adapter.show_metrics()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "id": "9079a66c-d57f-4ab2-a52d-3ee7a2929490", 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | "display_name": "Python 3 (ipykernel)", 506 | "language": "python", 507 | "name": "python3" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 3 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython3", 519 | "version": "3.8.13" 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 5 524 | } 525 | --------------------------------------------------------------------------------
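As an end-to-end smoke test (a sketch: it assumes the compose stack is running and both connectors from src/setup_connectors.sh are registered, and it reuses the container, topic and bucket names defined in the files above), a single row inserted through the API should show up in Redpanda and MinIO within a few seconds:

```bash
# 1. insert one customer row through the Flask API
python api/request.py

# 2. the Debezium source publishes a change event for that row to Redpanda
docker exec redpanda rpk topic consume dbserver1.inventory.customers -n 1

# 3. the S3 sink writes the same event as JSON into MinIO:
#    open http://localhost:9001 (minio / minio123) and browse warehouse/inventory/dbserver1.inventory.customers/
```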