├── .gitignore
├── Dockerfiles
│   ├── presto
│   │   ├── etc
│   │   │   ├── log.properties
│   │   │   ├── node.properties
│   │   │   ├── jvm.config
│   │   │   └── config.properties
│   │   └── Dockerfile
│   ├── hive-metastore
│   │   ├── entrypoint.sh
│   │   ├── Dockerfile
│   │   └── metastore-site.xml
│   └── spark
│       ├── start-spark.sh
│       └── Dockerfile
├── images
│   └── 1.png
├── presto-config
│   └── hive.properties
├── config.env
├── workspace
│   ├── dependencies
│   │   └── packages_installer.sh
│   ├── postgres_to_s3.py
│   └── clean_data.py
├── README.md
├── CarParts.sql
└── docker-compose.yml

/.gitignore:
--------------------------------------------------------------------------------
database-data
minio
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/log.properties:
--------------------------------------------------------------------------------
com.facebook.presto=INFO
--------------------------------------------------------------------------------
/images/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ysfesr/Building-Data-LakeHouse/HEAD/images/1.png
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/node.properties:
--------------------------------------------------------------------------------
node.environment=production
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
node.data-dir=/var/presto/data
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/jvm.config:
--------------------------------------------------------------------------------
-server
-Xmx16G
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+UseGCOverheadLimit
-XX:+ExplicitGCInvokesConcurrent
-XX:+HeapDumpOnOutOfMemoryError
-XX:+ExitOnOutOfMemoryError
--------------------------------------------------------------------------------
/presto-config/hive.properties:
--------------------------------------------------------------------------------
connector.name=hive-hadoop2
hive.metastore.uri=thrift://hive-metastore:9083
hive.s3.path-style-access=true
hive.s3.endpoint=http://minio:9000
hive.s3.aws-access-key=admin
hive.s3.aws-secret-key=123456789
hive.non-managed-table-writes-enabled=true
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/config.properties:
--------------------------------------------------------------------------------
coordinator=true
node-scheduler.include-coordinator=true
http-server.http.port=8080
query.max-memory=50GB
query.max-memory-per-node=1GB
query.max-total-memory-per-node=2GB
discovery-server.enabled=true
discovery.uri=http://localhost:8080
--------------------------------------------------------------------------------
/config.env:
--------------------------------------------------------------------------------
export AWS_ACCESS_KEY=admin
export AWS_SECRET_KEY=123456789
export AWS_S3_ENDPOINT=http://minio:9000
export AWS_BUCKET_NAME=datalake
export POSTGRES_USER=root
export POSTGRES_PASSWORD=root
export POSTGRES_DB=CarParts
export POSTGRES_ENDPOINT=postgres:5432
export HIVE_METASTORE_URI=thrift://hive-metastore:9083
--------------------------------------------------------------------------------
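config.env is consumed both by docker-compose (via `env_file`) and by the Spark scripts through `os.getenv`. The `datalake` bucket referenced by `AWS_BUCKET_NAME` is not created automatically; the README creates it through the MinIO console, but a minimal sketch with the MinIO client would look like this (assuming `mc` is installed on the host and the stack from docker-compose.yml is running):

```bash
# Sketch only - credentials and ports match config.env / docker-compose.yml.
mc alias set lakehouse http://localhost:9000 admin 123456789
mc mb lakehouse/datalake          # bucket name must match AWS_BUCKET_NAME
mc ls lakehouse
```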
/Dockerfiles/hive-metastore/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/sh

export HADOOP_HOME=/opt/hadoop-3.2.0
export HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar
export JAVA_HOME=/usr/local/openjdk-8

/opt/apache-hive-metastore-3.0.0-bin/bin/schematool -initSchema -dbType mysql
/opt/apache-hive-metastore-3.0.0-bin/bin/start-metastore
--------------------------------------------------------------------------------
/workspace/dependencies/packages_installer.sh:
--------------------------------------------------------------------------------
wget -P /opt/spark/jars https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.0.1/delta-core_2.12-1.0.1.jar
wget -P /opt/spark/jars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar
wget -P /opt/spark/jars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar
wget -P /opt/spark/jars https://jdbc.postgresql.org/download/postgresql-42.3.5.jar
--------------------------------------------------------------------------------
/Dockerfiles/spark/start-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. "/opt/spark/bin/load-spark-env.sh"

if [ "$SPARK_WORKLOAD" == "master" ];
then
    export SPARK_MASTER_HOST=`hostname`
    cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG
elif [ "$SPARK_WORKLOAD" == "worker" ];
then
    cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG
elif [ "$SPARK_WORKLOAD" == "submit" ];
then
    echo "SPARK SUBMIT"
else
    echo "Undefined Workload Type $SPARK_WORKLOAD, must specify: master, worker, submit"
fi
--------------------------------------------------------------------------------
/Dockerfiles/hive-metastore/Dockerfile:
--------------------------------------------------------------------------------
FROM openjdk:8u242-jre

WORKDIR /opt

ENV HADOOP_VERSION=3.2.0
ENV METASTORE_VERSION=3.0.0

ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin

RUN curl -L https://archive.apache.org/dist/hive/hive-standalone-metastore-${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \
    curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf -

RUN curl -o mysql-connector-java-8.0.19.jar https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.19/mysql-connector-java-8.0.19.jar && \
    cp mysql-connector-java-8.0.19.jar ${HIVE_HOME}/lib/

COPY metastore-site.xml ${HIVE_HOME}/conf
COPY entrypoint.sh /entrypoint.sh

RUN groupadd -r hive --gid=1000 && \
    useradd -r -g hive --uid=1000 -d ${HIVE_HOME} hive && \
    chown hive:hive -R ${HIVE_HOME} && \
    chown hive:hive /entrypoint.sh && chmod +x /entrypoint.sh

USER hive
EXPOSE 9083

ENTRYPOINT ["sh", "-c", "/entrypoint.sh"]
--------------------------------------------------------------------------------
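The entrypoint initializes the metastore schema in MariaDB and then starts the Thrift service on port 9083. `schematool -initSchema` only needs to succeed on the first start; on later restarts it will typically report that the schema already exists, and since the entrypoint does not stop on error, `start-metastore` still runs. A rough health check after `docker-compose up`, assuming the service names in docker-compose.yml and that all containers share the default compose network:

```bash
# Tail the metastore logs, then probe the Thrift port from the Spark master
# using bash's /dev/tcp (avoids needing netcat inside the container).
docker logs hive-metastore | tail -n 20
docker exec -it master bash -c "(exec 3<>/dev/tcp/hive-metastore/9083) && echo 'metastore reachable on 9083'"
```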
/Dockerfiles/presto/Dockerfile:
--------------------------------------------------------------------------------
FROM openjdk:8-jre

# Presto version can be overridden at build time
ARG PRESTO_VERSION=0.272.1

# Set the URL to download
ARG PRESTO_BIN=https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz

# Update the base image OS and install wget and python
RUN apt-get update
RUN apt-get install -y wget python less

# Download Presto and unpack it to /opt/presto
RUN wget --quiet ${PRESTO_BIN}
RUN mkdir -p /opt
RUN tar -xf presto-server-${PRESTO_VERSION}.tar.gz -C /opt
RUN rm presto-server-${PRESTO_VERSION}.tar.gz
RUN ln -s /opt/presto-server-${PRESTO_VERSION} /opt/presto

# Copy configuration files on the host into the image
COPY etc /opt/presto/etc

# Download the Presto CLI and put it in the image
RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar
RUN mv presto-cli-${PRESTO_VERSION}-executable.jar /usr/local/bin/presto
RUN chmod +x /usr/local/bin/presto

# Specify the entrypoint to start
ENTRYPOINT /opt/presto/bin/launcher run
--------------------------------------------------------------------------------
/Dockerfiles/spark/Dockerfile:
--------------------------------------------------------------------------------
FROM openjdk:11.0.11-jre-slim-buster as builder

# Add Dependencies for PySpark
RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy

RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1

# Fix the value of PYTHONHASHSEED
# Note: this is needed when you use Python 3.3 or greater
ENV SPARK_VERSION=3.1.1 \
    HADOOP_VERSION=3.2 \
    SPARK_HOME=/opt/spark \
    PYTHONHASHSEED=1

RUN wget --no-verbose -O apache-spark.tgz "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
    && mkdir -p /opt/spark \
    && tar -xf apache-spark.tgz -C /opt/spark --strip-components=1 \
    && rm apache-spark.tgz


FROM builder as apache-spark

WORKDIR /opt/spark

ENV SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080 \
    SPARK_LOG_DIR=/opt/spark/logs \
    SPARK_MASTER_LOG=/opt/spark/logs/spark-master.out \
    SPARK_WORKER_LOG=/opt/spark/logs/spark-worker.out \
    SPARK_WORKER_WEBUI_PORT=8080 \
    SPARK_WORKER_PORT=7000 \
    SPARK_MASTER="spark://spark-master:7077" \
    SPARK_WORKLOAD="master"

EXPOSE 8080 7077 7000

RUN mkdir -p $SPARK_LOG_DIR && \
    touch $SPARK_MASTER_LOG && \
    touch $SPARK_WORKER_LOG && \
    ln -sf /dev/stdout $SPARK_MASTER_LOG && \
    ln -sf /dev/stdout $SPARK_WORKER_LOG

ENV PATH /opt/spark/bin:/opt/spark/sbin:$PATH

COPY start-spark.sh /

CMD ["/bin/bash", "/start-spark.sh"]
--------------------------------------------------------------------------------
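start-spark.sh picks the node's role from `SPARK_WORKLOAD`, so the same image serves both the master and the workers in docker-compose.yml. Once the images are built and the stack is up, a quick smoke test of the standalone cluster can be run with the example jobs that ship in the Spark distribution (illustrative; service names as in docker-compose.yml):

```bash
# Compute Pi on the standalone cluster to confirm master and workers are wired up.
docker exec -it master /opt/spark/bin/run-example --master spark://spark-master:7077 SparkPi 10
```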
/Dockerfiles/hive-metastore/metastore-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>metastore.thrift.uris</name>
        <value>thrift://0.0.0.0:9083</value>
        <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
    </property>
    <property>
        <name>metastore.task.threads.always</name>
        <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask</value>
    </property>
    <property>
        <name>metastore.expression.proxy</name>
        <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
    </property>
    <property>
        <name>metastore.warehouse.dir</name>
        <value>s3a://datalake/warehouse/</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.cj.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://mariadb:3306/metastore_db</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>admin</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>admin</value>
    </property>
    <property>
        <name>fs.s3a.access.key</name>
        <value>admin</value>
    </property>
    <property>
        <name>fs.s3a.secret.key</name>
        <value>123456789</value>
    </property>
    <property>
        <name>fs.s3a.endpoint</name>
        <value>http://minio:9000</value>
    </property>
    <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/workspace/postgres_to_s3.py:
--------------------------------------------------------------------------------
import os
from pyspark.sql import SparkSession
from datetime import date

today = date.today().strftime("%b-%d-%Y")


AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
AWS_S3_ENDPOINT = os.getenv("AWS_S3_ENDPOINT")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")

POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_ENDPOINT = os.getenv("POSTGRES_ENDPOINT")
POSTGRES_DB = os.getenv("POSTGRES_DB")

# spark.jars must be one comma-separated list: repeated .config() calls on the
# same key overwrite each other, so the jars are joined here instead.
extra_jars = ",".join([
    "/opt/spark/jars/aws-java-sdk-bundle-1.11.375.jar",
    "/opt/spark/jars/hadoop-aws-3.2.0.jar",
    "/opt/spark/jars/delta-core_2.12-1.0.1.jar",
    "/opt/spark/jars/postgresql-42.3.5.jar",
])

spark = SparkSession.builder \
    .appName('Postgres to S3 pipeline') \
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY) \
    .config("fs.s3a.endpoint", AWS_S3_ENDPOINT) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("fs.s3a.connection.ssl.enabled", "false") \
    .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider') \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config('spark.jars', extra_jars) \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

tables_names = ['Part_in_Order', 'Supplier', 'Brand', 'Part', 'Part_for_Car', 'Part_Supplier',
                'Customer', 'Customer_Statut', 'Orders', 'Car_Manufacturer', 'Car', 'Part_Maker']

postgres_url = f"jdbc:postgresql://{POSTGRES_ENDPOINT}/{POSTGRES_DB}"

for table_name in tables_names:
    print(f"{table_name} table transformation ...")

    spark.read \
        .format("jdbc") \
        .option("url", postgres_url) \
        .option("dbtable", table_name) \
        .option("user", POSTGRES_USER) \
        .option("password", POSTGRES_PASSWORD) \
        .option("driver", "org.postgresql.Driver") \
        .load() \
        .write \
        .format("delta") \
        .mode("overwrite") \
        .save(f"s3a://{AWS_BUCKET_NAME}/bronze/CarPartsDB/{today}/{table_name}")

    print(f"{table_name} table done!")
--------------------------------------------------------------------------------
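postgres_to_s3.py writes each source table as a Delta table under a dated bronze prefix, one folder per table per run. After a run, the layout can be checked straight from MinIO, reusing the `lakehouse` mc alias sketched earlier (illustrative):

```bash
# Expect one folder per table under the run date, e.g. bronze/CarPartsDB/Feb-03-2023/Brand/
mc ls -r lakehouse/datalake/bronze/CarPartsDB/ | head
```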
print(f"{table_name} table done!") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building Data Lakehouse 2 | 3 | This project is designed to construct a data lakehouse. This data lakehouse will enable organizations to store, manage, and analyze large datasets in a cost-effective, secure, and scalable manner. The data lakehouse will provide a centralized repository for all data, allowing users to easily access and query the data with a unified interface. 4 | 5 | Minio will provide distributed object storage to store the data, Delta Lake will provide ACID-compliant transactions for managing the data, Spark will enable distributed computing for analytics, Presto will provide fast SQL queries, and Hive Metastore will provide a unified catalog for the data. This data lakehouse will enable organizations to quickly and easily access and analyze valuable data, allowing them to make better data-driven decisions. 6 | 7 | This project aims also to create an Extract, Load, and Transform (ELT) pipeline to ingest data from a Postgres database into our lakehouse. The ELT pipeline will make use of Apache Spark, to extract the data from the Postgres database, load it into the lakehouse, and then transform it into the desired format. Once the data is loaded into the lakehouse, it will be available for downstream analytics and reporting. 8 | ## Architecture 9 | 10 | ![Architecture](/images/1.png "Architecture") 11 | 12 | 13 | ## Setup 14 | - First, build Spark and Presto docker image 15 | ```bash 16 | docker build -t presto:0.272.1 ./Dockerfiles/presto 17 | docker build -t cluster-apache-spark:3.1.1 Dockerfiles/spark 18 | ``` 19 | - Run docker compose 20 | ```bash 21 | docker-compose up 22 | ``` 23 | 24 | - Create a bucket in [minio](http://localhost:9001) to store our data (name it datalake) 25 | 26 | - Create a Postgres database (name it CarParts and use CarParts.sql file to create tables) 27 | - Install jar files needed for our spark project 28 | ```bash 29 | docker exec -it master bash /opt/workspace/dependencies/packages_installer.sh 30 | ``` 31 | - Run the first script 32 | ```bash 33 | docker exec -it master spark-submit --master spark://master:7077 \ 34 | --deploy-mode cluster \ 35 | --executor-memory 5G \ 36 | --executor-cores 8 \ 37 | /opt/workspace/postgres_to_s3.py 38 | ``` 39 | 40 | - Run the second script 41 | ```bash 42 | docker exec -it master spark-submit --master spark://master:7077 \ 43 | --deploy-mode cluster \ 44 | --executor-memory 5G \ 45 | --executor-cores 8 \ 46 | /opt/workspace/clean_data.py 47 | ``` 48 | ## links 49 | - **Spark master UI:** http://localhost:9090 50 | - **Spark worker a UI:** http://localhost:9091 51 | - **Spark worker b UI:** http://localhost:9092 52 | - **Minio:** http://localhost:9001 53 | - **Presto:** http://localhost:8000 54 | 55 | ## Built With 56 | 57 | - Spark 58 | - Minio 59 | - PostgreSQL 60 | - Hive Metastore 61 | - Presto 62 | - Delta Lake 63 | 64 | 65 | ## Author 66 | 67 | **Youssef EL ASERY** 68 | 69 | - [Profile](https://github.com/ysfesr "Youssef ELASERY") 70 | - [Linkedin](https://www.linkedin.com/in/youssef-elasery/ "Welcome") 71 | - [Kaggle](https://www.kaggle.com/youssefelasery "Welcome") 72 | 73 | 74 | ## 🤝 Support 75 | 76 | Contributions, issues, and feature requests are welcome! 77 | 78 | Give a ⭐️ if you like this project! 
/CarParts.sql:
--------------------------------------------------------------------------------
-- Clean the database
DROP TABLE IF EXISTS Part_in_Order;
DROP TABLE IF EXISTS Supplier;
DROP TABLE IF EXISTS Brand;
DROP TABLE IF EXISTS Part;
DROP TABLE IF EXISTS Part_for_Car;
DROP TABLE IF EXISTS Part_Supplier;
DROP TABLE IF EXISTS Customer;
DROP TABLE IF EXISTS Customer_Statut;
DROP TABLE IF EXISTS Orders;
DROP TABLE IF EXISTS Car_Manufacturer;
DROP TABLE IF EXISTS Car;
DROP TABLE IF EXISTS Part_Maker;

-- Create the Schema
CREATE TABLE IF NOT EXISTS Customer_Statut(
    statut_id INT NOT NULL PRIMARY KEY,
    statut VARCHAR(255) NOT NULL
);

CREATE TABLE IF NOT EXISTS Customer(
    customer_id INT NOT NULL PRIMARY KEY,
    statut_id INT NOT NULL,
    individual_or_organization VARCHAR(50) NOT NULL,
    organisation_name VARCHAR(50),
    individual_first_name VARCHAR(50),
    individual_last_name VARCHAR(50)
);

CREATE TABLE IF NOT EXISTS Orders(
    order_id INT NOT NULL PRIMARY KEY,
    customer_id INT NOT NULL,
    amount_due INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Car_Manufacturer(
    car_manufacturer_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Car(
    car_id INT NOT NULL PRIMARY KEY,
    car_manufacturer_id INT NOT NULL,
    date_of_manufacture DATE NOT NULL,
    model VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Supplier(
    supplier_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL,
    street_address VARCHAR(50) NOT NULL,
    town VARCHAR(50) NOT NULL,
    country VARCHAR(50) NOT NULL,
    postcode INT NOT NULL,
    phone VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Brand(
    brand_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_Maker(
    part_maker_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Part(
    part_id INT NOT NULL PRIMARY KEY,
    brand_id INT NOT NULL,
    supplier_id INT NOT NULL,
    part_group_id INT NOT NULL,
    part_maker_id INT NOT NULL,
    part_name VARCHAR(50) NOT NULL,
    main_supplier_name VARCHAR(50) NOT NULL,
    price_to_us INT NOT NULL,
    price_to_customer INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_for_Car(
    car_id INT NOT NULL,
    part_id INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_Supplier(
    part_supplier_id INT NOT NULL PRIMARY KEY,
    part_id INT NOT NULL,
    supplier_id INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_in_Order(
    part_in_order_id INT NOT NULL,
    order_id INT NOT NULL,
    part_supplier_id INT NOT NULL,
    actual_sale_price INT NOT NULL,
    quantity INT NOT NULL
);
--------------------------------------------------------------------------------
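Once the schema is loaded, the table list that postgres_to_s3.py iterates over can be cross-checked against what actually exists (unquoted identifiers are lower-cased by Postgres, which the Spark JDBC reads tolerate since they are unquoted too). A quick verification:

```bash
# Expect the 12 tables created by CarParts.sql, stored with lower-cased names.
docker exec -it postgres psql -U root -d CarParts -c '\dt'
```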
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.3"
services:
  spark-master:
    image: cluster-apache-spark:3.1.1 # docker build -t cluster-apache-spark:3.1.1 ./Dockerfiles/spark
    container_name: master
    ports:
      - "9090:8080"
      - "7077:7077"
    volumes:
      - ./workspace:/opt/workspace
    env_file:
      - ./config.env
    environment:
      - SPARK_LOCAL_IP=spark-master
      - SPARK_WORKLOAD=master
      - PATH=/usr/local/openjdk-11/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/spark/bin:/opt/spark/sbin

  spark-worker-a:
    image: cluster-apache-spark:3.1.1
    container_name: worker-a
    ports:
      - "9091:8080"
      - "7000:7000"
    depends_on:
      - spark-master
    environment:
      - SPARK_MASTER=spark://spark-master:7077
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=1G
      - SPARK_DRIVER_MEMORY=1G
      - SPARK_EXECUTOR_MEMORY=1G
      - SPARK_WORKLOAD=worker
      - SPARK_LOCAL_IP=spark-worker-a
    volumes:
      - ./workspace:/opt/workspace

  spark-worker-b:
    image: cluster-apache-spark:3.1.1
    container_name: worker-b
    ports:
      - "9092:8080"
      - "7001:7000"
    depends_on:
      - spark-master
    environment:
      - SPARK_MASTER=spark://spark-master:7077
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=1G
      - SPARK_DRIVER_MEMORY=1G
      - SPARK_EXECUTOR_MEMORY=1G
      - SPARK_WORKLOAD=worker
      - SPARK_LOCAL_IP=spark-worker-b
    volumes:
      - ./workspace:/opt/workspace

  minio:
    image: quay.io/minio/minio
    container_name: minio
    restart: always
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=123456789
    ports:
      - "9001:9001"
      - "9000:9000"
    volumes:
      - ./minio/data:/data
    command: server /data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 30s
      retries: 3

  db:
    image: postgres:14.2
    container_name: postgres
    restart: always
    volumes:
      - ./database-data:/var/lib/postgresql/data/
    environment:
      POSTGRES_USER: root
      POSTGRES_PASSWORD: root
      POSTGRES_DB: test_db
    ports:
      - "5432:5432"

  mariadb:
    image: mariadb:latest
    container_name: mariadb
    ports:
      - 3306:3306
    environment:
      USE_S3_STORAGE: 0
      MYSQL_ROOT_PASSWORD: root
      MYSQL_USER: admin
      MYSQL_PASSWORD: admin
      MYSQL_DATABASE: metastore_db

  hive-metastore:
    build: ./Dockerfiles/hive-metastore
    container_name: hive-metastore
    restart: unless-stopped
    ports:
      - 9083:9083

  presto:
    image: presto:0.272.1 # docker build -t presto:0.272.1 ./Dockerfiles/presto
    container_name: presto
    volumes:
      - ./presto-config:/opt/presto/etc/catalog
    ports:
      - 8000:8080
--------------------------------------------------------------------------------
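clean_data.py, shown next, needs both the Hive metastore (to register tables in the `dwh` database) and the S3A settings for MinIO. For poking at the lakehouse interactively before or after running it, a pyspark shell on the master with the same jars can be useful; a sketch, assuming packages_installer.sh has already downloaded the jars (the `fs.s3a.*` options from the scripts still have to be set inside the session):

```bash
docker exec -it master /opt/spark/bin/pyspark \
  --master spark://spark-master:7077 \
  --jars /opt/spark/jars/delta-core_2.12-1.0.1.jar,/opt/spark/jars/hadoop-aws-3.2.0.jar,/opt/spark/jars/aws-java-sdk-bundle-1.11.375.jar \
  --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
  --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
```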
.config("spark.hadoop.fs.s3a.path.style.access", "true")\ 21 | .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ 22 | .config("fs.s3a.connection.ssl.enabled", "false")\ 23 | .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')\ 24 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \ 25 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\ 26 | .config('spark.jars','/opt/spark/jars/aws-java-sdk-bundle-1.11.375.jar')\ 27 | .config('spark.jars','/opt/spark/jars/hadoop-aws-3.2.0.jar')\ 28 | .config('spark.jars','/opt/spark/jars/delta-core_2.12-1.0.1.jar')\ 29 | .enableHiveSupport()\ 30 | .getOrCreate() 31 | 32 | spark.sparkContext.setLogLevel("ERROR") 33 | 34 | spark.sql("CREATE DATABASE IF NOT EXISTS dwh COMMENT 'Data Warehouse for Car Part'") 35 | 36 | 37 | # Reading tables from landing area 38 | print('\nReading ...') 39 | Brand = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Brand') 40 | Car = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Car') 41 | Customer = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Customer') 42 | Orders = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Orders') 43 | Part_for_Car = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_for_Car') 44 | Part_in_Order = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_in_Order') 45 | Part_Maker = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_Maker') 46 | Part_Supplier = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_Supplier') 47 | Part = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part') 48 | Supplier = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Supplier') 49 | print('End of reading... 

# Transforming tables into a set of dimension tables and one fact table
print('\nTransforming ...')
Brand.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Brand').saveAsTable("dwh.DimBrand")
Car.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Car').saveAsTable("dwh.DimCar")
Customer.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Customer').saveAsTable("dwh.DimCustomer")
Orders.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Orders').saveAsTable("dwh.DimOrders")
Part_Maker.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part_Maker').saveAsTable("dwh.DimPartMaker")
Part_for_Car.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part_for_Car').saveAsTable("dwh.DimPartForCar")
Part_Supplier.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part_Supplier').saveAsTable("dwh.DimPartSupplier")
Supplier.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Supplier').saveAsTable("dwh.DimSupplier")
Part.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part').saveAsTable("dwh.DimPart")

Part_in_Order.join(Orders, 'order_id') \
    .join(Part_Supplier, 'part_supplier_id') \
    .join(Part, 'part_id') \
    .join(Part_for_Car, 'part_id') \
    .join(Car, 'car_id') \
    .select("part_in_order_id", "brand_id", "car_id", "car_manufacturer_id", "customer_id", "order_id", "part_id",
            "part_maker_id", "part_supplier_id", Part.supplier_id, "actual_sale_price", "quantity") \
    .write.format('delta').mode('overwrite') \
    .option('path', 's3a://datalake/silver/warehouse/CarParts/Fact_part_in_Order').saveAsTable("dwh.FactPartInOrder")
print('End of transforming')
--------------------------------------------------------------------------------
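Once clean_data.py has run, the `dwh` tables registered in the metastore are visible to Presto through the `hive` catalog defined in presto-config/hive.properties (table names are stored lower-cased). An illustrative session with the CLI baked into the Presto image, started with `docker exec -it presto presto --catalog hive --schema dwh`; the aggregate query is only a sketch of the query shape and assumes the configured hive-hadoop2 connector can read the registered Delta tables:

```sql
SHOW TABLES;

-- Revenue per brand from the star schema built by clean_data.py
SELECT b.name AS brand,
       SUM(f.actual_sale_price * f.quantity) AS revenue
FROM factpartinorder f
JOIN dimbrand b ON f.brand_id = b.brand_id
GROUP BY b.name
ORDER BY revenue DESC;
```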