├── .gitignore
├── Dockerfiles
│   ├── presto
│   │   ├── etc
│   │   │   ├── log.properties
│   │   │   ├── node.properties
│   │   │   ├── jvm.config
│   │   │   └── config.properties
│   │   └── Dockerfile
│   ├── hive-metastore
│   │   ├── entrypoint.sh
│   │   ├── Dockerfile
│   │   └── metastore-site.xml
│   └── spark
│       ├── start-spark.sh
│       └── Dockerfile
├── images
│   └── 1.png
├── presto-config
│   └── hive.properties
├── config.env
├── workspace
│   ├── dependencies
│   │   └── packages_installer.sh
│   ├── postgres_to_s3.py
│   └── clean_data.py
├── README.md
├── CarParts.sql
└── docker-compose.yml

/.gitignore:
--------------------------------------------------------------------------------
database-data
minio
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/log.properties:
--------------------------------------------------------------------------------
com.facebook.presto=INFO
--------------------------------------------------------------------------------
/images/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ysfesr/Building-Data-LakeHouse/HEAD/images/1.png
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/node.properties:
--------------------------------------------------------------------------------
node.environment=production
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
node.data-dir=/var/presto/data
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/jvm.config:
--------------------------------------------------------------------------------
-server
-Xmx16G
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+UseGCOverheadLimit
-XX:+ExplicitGCInvokesConcurrent
-XX:+HeapDumpOnOutOfMemoryError
-XX:+ExitOnOutOfMemoryError
--------------------------------------------------------------------------------
/presto-config/hive.properties:
--------------------------------------------------------------------------------
connector.name=hive-hadoop2
hive.metastore.uri=thrift://hive-metastore:9083
hive.s3.path-style-access=true
hive.s3.endpoint=http://minio:9000
hive.s3.aws-access-key=admin
hive.s3.aws-secret-key=123456789
hive.non-managed-table-writes-enabled=true
--------------------------------------------------------------------------------
/Dockerfiles/presto/etc/config.properties:
--------------------------------------------------------------------------------
coordinator=true
node-scheduler.include-coordinator=true
http-server.http.port=8080
query.max-memory=50GB
query.max-memory-per-node=1GB
query.max-total-memory-per-node=2GB
discovery-server.enabled=true
discovery.uri=http://localhost:8080
--------------------------------------------------------------------------------
/config.env:
--------------------------------------------------------------------------------
export AWS_ACCESS_KEY=admin
export AWS_SECRET_KEY=123456789
export AWS_S3_ENDPOINT=http://minio:9000
export AWS_BUCKET_NAME=datalake
export POSTGRES_USER=root
export POSTGRES_PASSWORD=root
export POSTGRES_DB=CarParts
export POSTGRES_ENDPOINT=postgres:5432
export HIVE_METASTORE_URI=thrift://hive-metastore:9083
--------------------------------------------------------------------------------
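config.env is consumed both by docker-compose (via `env_file`) and by the Spark scripts through `os.getenv`. The `datalake` bucket referenced by `AWS_BUCKET_NAME` is not created automatically; the README creates it through the MinIO console, but a minimal sketch with the MinIO client would look like this (assuming `mc` is installed on the host and the stack from docker-compose.yml is running):

```bash
# Sketch only - credentials and ports match config.env / docker-compose.yml.
mc alias set lakehouse http://localhost:9000 admin 123456789
mc mb lakehouse/datalake          # bucket name must match AWS_BUCKET_NAME
mc ls lakehouse
```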
/Dockerfiles/hive-metastore/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/sh

export HADOOP_HOME=/opt/hadoop-3.2.0
export HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar
export JAVA_HOME=/usr/local/openjdk-8

/opt/apache-hive-metastore-3.0.0-bin/bin/schematool -initSchema -dbType mysql
/opt/apache-hive-metastore-3.0.0-bin/bin/start-metastore
--------------------------------------------------------------------------------
/workspace/dependencies/packages_installer.sh:
--------------------------------------------------------------------------------
wget -P /opt/spark/jars https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.0.1/delta-core_2.12-1.0.1.jar
wget -P /opt/spark/jars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar
wget -P /opt/spark/jars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar
wget -P /opt/spark/jars https://jdbc.postgresql.org/download/postgresql-42.3.5.jar
--------------------------------------------------------------------------------
/Dockerfiles/spark/start-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. "/opt/spark/bin/load-spark-env.sh"

if [ "$SPARK_WORKLOAD" == "master" ];
then
    export SPARK_MASTER_HOST=`hostname`
    cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG
elif [ "$SPARK_WORKLOAD" == "worker" ];
then
    cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG
elif [ "$SPARK_WORKLOAD" == "submit" ];
then
    echo "SPARK SUBMIT"
else
    echo "Undefined Workload Type $SPARK_WORKLOAD, must specify: master, worker, submit"
fi
--------------------------------------------------------------------------------
/Dockerfiles/hive-metastore/Dockerfile:
--------------------------------------------------------------------------------
FROM openjdk:8u242-jre

WORKDIR /opt

ENV HADOOP_VERSION=3.2.0
ENV METASTORE_VERSION=3.0.0

ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
ENV HIVE_HOME=/opt/apache-hive-metastore-${METASTORE_VERSION}-bin

RUN curl -L https://archive.apache.org/dist/hive/hive-standalone-metastore-${METASTORE_VERSION}/hive-standalone-metastore-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \
    curl -L https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf -

RUN curl -o mysql-connector-java-8.0.19.jar https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.19/mysql-connector-java-8.0.19.jar && \
    cp mysql-connector-java-8.0.19.jar ${HIVE_HOME}/lib/

COPY metastore-site.xml ${HIVE_HOME}/conf
COPY entrypoint.sh /entrypoint.sh

RUN groupadd -r hive --gid=1000 && \
    useradd -r -g hive --uid=1000 -d ${HIVE_HOME} hive && \
    chown hive:hive -R ${HIVE_HOME} && \
    chown hive:hive /entrypoint.sh && chmod +x /entrypoint.sh

USER hive
EXPOSE 9083

ENTRYPOINT ["sh", "-c", "/entrypoint.sh"]
--------------------------------------------------------------------------------
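The entrypoint initializes the metastore schema in MariaDB and then starts the Thrift service on port 9083. `schematool -initSchema` only needs to succeed on the first start; on later restarts it will typically report that the schema already exists, and since the entrypoint does not stop on error, `start-metastore` still runs. A rough health check after `docker-compose up`, assuming the service names in docker-compose.yml and that all containers share the default compose network:

```bash
# Tail the metastore logs, then probe the Thrift port from the Spark master
# using bash's /dev/tcp (avoids needing netcat inside the container).
docker logs hive-metastore | tail -n 20
docker exec -it master bash -c "(exec 3<>/dev/tcp/hive-metastore/9083) && echo 'metastore reachable on 9083'"
```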
/Dockerfiles/presto/Dockerfile:
--------------------------------------------------------------------------------
FROM openjdk:8-jre

# Presto version can be overridden at build time
ARG PRESTO_VERSION=0.272.1

# Set the URL to download
ARG PRESTO_BIN=https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz

# Update the base image OS and install wget and python
RUN apt-get update
RUN apt-get install -y wget python less

# Download Presto and unpack it to /opt/presto
RUN wget --quiet ${PRESTO_BIN}
RUN mkdir -p /opt
RUN tar -xf presto-server-${PRESTO_VERSION}.tar.gz -C /opt
RUN rm presto-server-${PRESTO_VERSION}.tar.gz
RUN ln -s /opt/presto-server-${PRESTO_VERSION} /opt/presto

# Copy configuration files on the host into the image
COPY etc /opt/presto/etc

# Download the Presto CLI and put it in the image
RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar
RUN mv presto-cli-${PRESTO_VERSION}-executable.jar /usr/local/bin/presto
RUN chmod +x /usr/local/bin/presto

# Specify the entrypoint to start
ENTRYPOINT /opt/presto/bin/launcher run
--------------------------------------------------------------------------------
/Dockerfiles/spark/Dockerfile:
--------------------------------------------------------------------------------
FROM openjdk:11.0.11-jre-slim-buster as builder

# Add Dependencies for PySpark
RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy

RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1

# Fix the value of PYTHONHASHSEED
# Note: this is needed when you use Python 3.3 or greater
ENV SPARK_VERSION=3.1.1 \
    HADOOP_VERSION=3.2 \
    SPARK_HOME=/opt/spark \
    PYTHONHASHSEED=1

RUN wget --no-verbose -O apache-spark.tgz "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
    && mkdir -p /opt/spark \
    && tar -xf apache-spark.tgz -C /opt/spark --strip-components=1 \
    && rm apache-spark.tgz


FROM builder as apache-spark

WORKDIR /opt/spark

ENV SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080 \
    SPARK_LOG_DIR=/opt/spark/logs \
    SPARK_MASTER_LOG=/opt/spark/logs/spark-master.out \
    SPARK_WORKER_LOG=/opt/spark/logs/spark-worker.out \
    SPARK_WORKER_WEBUI_PORT=8080 \
    SPARK_WORKER_PORT=7000 \
    SPARK_MASTER="spark://spark-master:7077" \
    SPARK_WORKLOAD="master"

EXPOSE 8080 7077 7000

RUN mkdir -p $SPARK_LOG_DIR && \
    touch $SPARK_MASTER_LOG && \
    touch $SPARK_WORKER_LOG && \
    ln -sf /dev/stdout $SPARK_MASTER_LOG && \
    ln -sf /dev/stdout $SPARK_WORKER_LOG

ENV PATH /opt/spark/bin:/opt/spark/sbin:$PATH

COPY start-spark.sh /

CMD ["/bin/bash", "/start-spark.sh"]
--------------------------------------------------------------------------------
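start-spark.sh picks the node's role from `SPARK_WORKLOAD`, so the same image serves both the master and the workers in docker-compose.yml. Once the images are built and the stack is up, a quick smoke test of the standalone cluster can be run with the example jobs that ship in the Spark distribution (illustrative; service names as in docker-compose.yml):

```bash
# Compute Pi on the standalone cluster to confirm master and workers are wired up.
docker exec -it master /opt/spark/bin/run-example --master spark://spark-master:7077 SparkPi 10
```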
/Dockerfiles/hive-metastore/metastore-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>metastore.thrift.uris</name>
        <value>thrift://0.0.0.0:9083</value>
        <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
    </property>
    <property>
        <name>metastore.task.threads.always</name>
        <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask</value>
    </property>
    <property>
        <name>metastore.expression.proxy</name>
        <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
    </property>
    <property>
        <name>metastore.warehouse.dir</name>
        <value>s3a://datalake/warehouse/</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.cj.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://mariadb:3306/metastore_db</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>admin</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>admin</value>
    </property>
    <property>
        <name>fs.s3a.access.key</name>
        <value>admin</value>
    </property>
    <property>
        <name>fs.s3a.secret.key</name>
        <value>123456789</value>
    </property>
    <property>
        <name>fs.s3a.endpoint</name>
        <value>http://minio:9000</value>
    </property>
    <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/workspace/postgres_to_s3.py:
--------------------------------------------------------------------------------
import os
from pyspark.sql import SparkSession
from datetime import date

today = date.today().strftime("%b-%d-%Y")


AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
AWS_S3_ENDPOINT = os.getenv("AWS_S3_ENDPOINT")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")

POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_ENDPOINT = os.getenv("POSTGRES_ENDPOINT")
POSTGRES_DB = os.getenv("POSTGRES_DB")

# spark.jars must be one comma-separated list: repeated .config() calls on the
# same key overwrite each other, so the jars are joined here instead.
extra_jars = ",".join([
    "/opt/spark/jars/aws-java-sdk-bundle-1.11.375.jar",
    "/opt/spark/jars/hadoop-aws-3.2.0.jar",
    "/opt/spark/jars/delta-core_2.12-1.0.1.jar",
    "/opt/spark/jars/postgresql-42.3.5.jar",
])

spark = SparkSession.builder \
    .appName('Postgres to S3 pipeline') \
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY) \
    .config("fs.s3a.endpoint", AWS_S3_ENDPOINT) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("fs.s3a.connection.ssl.enabled", "false") \
    .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider') \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config('spark.jars', extra_jars) \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

tables_names = ['Part_in_Order', 'Supplier', 'Brand', 'Part', 'Part_for_Car', 'Part_Supplier',
                'Customer', 'Customer_Statut', 'Orders', 'Car_Manufacturer', 'Car', 'Part_Maker']

postgres_url = f"jdbc:postgresql://{POSTGRES_ENDPOINT}/{POSTGRES_DB}"

for table_name in tables_names:
    print(f"{table_name} table transformation ...")

    spark.read \
        .format("jdbc") \
        .option("url", postgres_url) \
        .option("dbtable", table_name) \
        .option("user", POSTGRES_USER) \
        .option("password", POSTGRES_PASSWORD) \
        .option("driver", "org.postgresql.Driver") \
        .load() \
        .write \
        .format("delta") \
        .mode("overwrite") \
        .save(f"s3a://{AWS_BUCKET_NAME}/bronze/CarPartsDB/{today}/{table_name}")

    print(f"{table_name} table done!")
--------------------------------------------------------------------------------
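postgres_to_s3.py writes each source table as a Delta table under a dated bronze prefix, one folder per table per run. After a run, the layout can be checked straight from MinIO, reusing the `lakehouse` mc alias sketched earlier (illustrative):

```bash
# Expect one folder per table under the run date, e.g. bronze/CarPartsDB/Feb-03-2023/Brand/
mc ls -r lakehouse/datalake/bronze/CarPartsDB/ | head
```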
print(f"{table_name} table done!") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building Data Lakehouse 2 | 3 | This project is designed to construct a data lakehouse. This data lakehouse will enable organizations to store, manage, and analyze large datasets in a cost-effective, secure, and scalable manner. The data lakehouse will provide a centralized repository for all data, allowing users to easily access and query the data with a unified interface. 4 | 5 | Minio will provide distributed object storage to store the data, Delta Lake will provide ACID-compliant transactions for managing the data, Spark will enable distributed computing for analytics, Presto will provide fast SQL queries, and Hive Metastore will provide a unified catalog for the data. This data lakehouse will enable organizations to quickly and easily access and analyze valuable data, allowing them to make better data-driven decisions. 6 | 7 | This project aims also to create an Extract, Load, and Transform (ELT) pipeline to ingest data from a Postgres database into our lakehouse. The ELT pipeline will make use of Apache Spark, to extract the data from the Postgres database, load it into the lakehouse, and then transform it into the desired format. Once the data is loaded into the lakehouse, it will be available for downstream analytics and reporting. 8 | ## Architecture 9 | 10 | ![Architecture](/images/1.png "Architecture") 11 | 12 | 13 | ## Setup 14 | - First, build Spark and Presto docker image 15 | ```bash 16 | docker build -t presto:0.272.1 ./Dockerfiles/presto 17 | docker build -t cluster-apache-spark:3.1.1 Dockerfiles/spark 18 | ``` 19 | - Run docker compose 20 | ```bash 21 | docker-compose up 22 | ``` 23 | 24 | - Create a bucket in [minio](http://localhost:9001) to store our data (name it datalake) 25 | 26 | - Create a Postgres database (name it CarParts and use CarParts.sql file to create tables) 27 | - Install jar files needed for our spark project 28 | ```bash 29 | docker exec -it master bash /opt/workspace/dependencies/packages_installer.sh 30 | ``` 31 | - Run the first script 32 | ```bash 33 | docker exec -it master spark-submit --master spark://master:7077 \ 34 | --deploy-mode cluster \ 35 | --executor-memory 5G \ 36 | --executor-cores 8 \ 37 | /opt/workspace/postgres_to_s3.py 38 | ``` 39 | 40 | - Run the second script 41 | ```bash 42 | docker exec -it master spark-submit --master spark://master:7077 \ 43 | --deploy-mode cluster \ 44 | --executor-memory 5G \ 45 | --executor-cores 8 \ 46 | /opt/workspace/clean_data.py 47 | ``` 48 | ## links 49 | - **Spark master UI:** http://localhost:9090 50 | - **Spark worker a UI:** http://localhost:9091 51 | - **Spark worker b UI:** http://localhost:9092 52 | - **Minio:** http://localhost:9001 53 | - **Presto:** http://localhost:8000 54 | 55 | ## Built With 56 | 57 | - Spark 58 | - Minio 59 | - PostgreSQL 60 | - Hive Metastore 61 | - Presto 62 | - Delta Lake 63 | 64 | 65 | ## Author 66 | 67 | **Youssef EL ASERY** 68 | 69 | - [Profile](https://github.com/ysfesr "Youssef ELASERY") 70 | - [Linkedin](https://www.linkedin.com/in/youssef-elasery/ "Welcome") 71 | - [Kaggle](https://www.kaggle.com/youssefelasery "Welcome") 72 | 73 | 74 | ## 🤝 Support 75 | 76 | Contributions, issues, and feature requests are welcome! 77 | 78 | Give a ⭐️ if you like this project! 
/CarParts.sql:
--------------------------------------------------------------------------------
-- Clean the database
DROP TABLE IF EXISTS Part_in_Order;
DROP TABLE IF EXISTS Supplier;
DROP TABLE IF EXISTS Brand;
DROP TABLE IF EXISTS Part;
DROP TABLE IF EXISTS Part_for_Car;
DROP TABLE IF EXISTS Part_Supplier;
DROP TABLE IF EXISTS Customer;
DROP TABLE IF EXISTS Customer_Statut;
DROP TABLE IF EXISTS Orders;
DROP TABLE IF EXISTS Car_Manufacturer;
DROP TABLE IF EXISTS Car;
DROP TABLE IF EXISTS Part_Maker;

-- Create the Schema
CREATE TABLE IF NOT EXISTS Customer_Statut(
    statut_id INT NOT NULL PRIMARY KEY,
    statut VARCHAR(255) NOT NULL
);

CREATE TABLE IF NOT EXISTS Customer(
    customer_id INT NOT NULL PRIMARY KEY,
    statut_id INT NOT NULL,
    individual_or_organization VARCHAR(50) NOT NULL,
    organisation_name VARCHAR(50),
    individual_first_name VARCHAR(50),
    individual_last_name VARCHAR(50)
);

CREATE TABLE IF NOT EXISTS Orders(
    order_id INT NOT NULL PRIMARY KEY,
    customer_id INT NOT NULL,
    amount_due INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Car_Manufacturer(
    car_manufacturer_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Car(
    car_id INT NOT NULL PRIMARY KEY,
    car_manufacturer_id INT NOT NULL,
    date_of_manufacture DATE NOT NULL,
    model VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Supplier(
    supplier_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL,
    street_address VARCHAR(50) NOT NULL,
    town VARCHAR(50) NOT NULL,
    country VARCHAR(50) NOT NULL,
    postcode INT NOT NULL,
    phone VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Brand(
    brand_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_Maker(
    part_maker_id INT NOT NULL PRIMARY KEY,
    name VARCHAR(50) NOT NULL
);

CREATE TABLE IF NOT EXISTS Part(
    part_id INT NOT NULL PRIMARY KEY,
    brand_id INT NOT NULL,
    supplier_id INT NOT NULL,
    part_group_id INT NOT NULL,
    part_maker_id INT NOT NULL,
    part_name VARCHAR(50) NOT NULL,
    main_supplier_name VARCHAR(50) NOT NULL,
    price_to_us INT NOT NULL,
    price_to_customer INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_for_Car(
    car_id INT NOT NULL,
    part_id INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_Supplier(
    part_supplier_id INT NOT NULL PRIMARY KEY,
    part_id INT NOT NULL,
    supplier_id INT NOT NULL
);

CREATE TABLE IF NOT EXISTS Part_in_Order(
    part_in_order_id INT NOT NULL,
    order_id INT NOT NULL,
    part_supplier_id INT NOT NULL,
    actual_sale_price INT NOT NULL,
    quantity INT NOT NULL
);
--------------------------------------------------------------------------------
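Once the schema is loaded, the table list that postgres_to_s3.py iterates over can be cross-checked against what actually exists (unquoted identifiers are lower-cased by Postgres, which the Spark JDBC reads tolerate since they are unquoted too). A quick verification:

```bash
# Expect the 12 tables created by CarParts.sql, stored with lower-cased names.
docker exec -it postgres psql -U root -d CarParts -c '\dt'
```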
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.3"
services:
  spark-master:
    image: cluster-apache-spark:3.1.1 # docker build -t cluster-apache-spark:3.1.1 ./Dockerfiles/spark
    container_name: master
    ports:
      - "9090:8080"
      - "7077:7077"
    volumes:
      - ./workspace:/opt/workspace
    env_file:
      - ./config.env
    environment:
      - SPARK_LOCAL_IP=spark-master
      - SPARK_WORKLOAD=master
      - PATH=/usr/local/openjdk-11/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/spark/bin:/opt/spark/sbin

  spark-worker-a:
    image: cluster-apache-spark:3.1.1
    container_name: worker-a
    ports:
      - "9091:8080"
      - "7000:7000"
    depends_on:
      - spark-master
    environment:
      - SPARK_MASTER=spark://spark-master:7077
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=1G
      - SPARK_DRIVER_MEMORY=1G
      - SPARK_EXECUTOR_MEMORY=1G
      - SPARK_WORKLOAD=worker
      - SPARK_LOCAL_IP=spark-worker-a
    volumes:
      - ./workspace:/opt/workspace

  spark-worker-b:
    image: cluster-apache-spark:3.1.1
    container_name: worker-b
    ports:
      - "9092:8080"
      - "7001:7000"
    depends_on:
      - spark-master
    environment:
      - SPARK_MASTER=spark://spark-master:7077
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=1G
      - SPARK_DRIVER_MEMORY=1G
      - SPARK_EXECUTOR_MEMORY=1G
      - SPARK_WORKLOAD=worker
      - SPARK_LOCAL_IP=spark-worker-b
    volumes:
      - ./workspace:/opt/workspace

  minio:
    image: quay.io/minio/minio
    container_name: minio
    restart: always
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=123456789
    ports:
      - "9001:9001"
      - "9000:9000"
    volumes:
      - ./minio/data:/data
    command: server /data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 30s
      retries: 3

  db:
    image: postgres:14.2
    container_name: postgres
    restart: always
    volumes:
      - ./database-data:/var/lib/postgresql/data/
    environment:
      POSTGRES_USER: root
      POSTGRES_PASSWORD: root
      POSTGRES_DB: test_db
    ports:
      - "5432:5432"

  mariadb:
    image: mariadb:latest
    container_name: mariadb
    ports:
      - 3306:3306
    environment:
      USE_S3_STORAGE: 0
      MYSQL_ROOT_PASSWORD: root
      MYSQL_USER: admin
      MYSQL_PASSWORD: admin
      MYSQL_DATABASE: metastore_db

  hive-metastore:
    build: ./Dockerfiles/hive-metastore
    container_name: hive-metastore
    restart: unless-stopped
    ports:
      - 9083:9083

  presto:
    image: presto:0.272.1 # docker build -t presto:0.272.1 ./Dockerfiles/presto
    container_name: presto
    volumes:
      - ./presto-config:/opt/presto/etc/catalog
    ports:
      - 8000:8080
--------------------------------------------------------------------------------
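clean_data.py, shown next, needs both the Hive metastore (to register tables in the `dwh` database) and the S3A settings for MinIO. For poking at the lakehouse interactively before or after running it, a pyspark shell on the master with the same jars can be useful; a sketch, assuming packages_installer.sh has already downloaded the jars (the `fs.s3a.*` options from the scripts still have to be set inside the session):

```bash
docker exec -it master /opt/spark/bin/pyspark \
  --master spark://spark-master:7077 \
  --jars /opt/spark/jars/delta-core_2.12-1.0.1.jar,/opt/spark/jars/hadoop-aws-3.2.0.jar,/opt/spark/jars/aws-java-sdk-bundle-1.11.375.jar \
  --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
  --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
```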
.config("spark.hadoop.fs.s3a.path.style.access", "true")\ 21 | .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ 22 | .config("fs.s3a.connection.ssl.enabled", "false")\ 23 | .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')\ 24 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \ 25 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\ 26 | .config('spark.jars','/opt/spark/jars/aws-java-sdk-bundle-1.11.375.jar')\ 27 | .config('spark.jars','/opt/spark/jars/hadoop-aws-3.2.0.jar')\ 28 | .config('spark.jars','/opt/spark/jars/delta-core_2.12-1.0.1.jar')\ 29 | .enableHiveSupport()\ 30 | .getOrCreate() 31 | 32 | spark.sparkContext.setLogLevel("ERROR") 33 | 34 | spark.sql("CREATE DATABASE IF NOT EXISTS dwh COMMENT 'Data Warehouse for Car Part'") 35 | 36 | 37 | # Reading tables from landing area 38 | print('\nReading ...') 39 | Brand = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Brand') 40 | Car = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Car') 41 | Customer = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Customer') 42 | Orders = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Orders') 43 | Part_for_Car = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_for_Car') 44 | Part_in_Order = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_in_Order') 45 | Part_Maker = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_Maker') 46 | Part_Supplier = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part_Supplier') 47 | Part = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Part') 48 | Supplier = spark.read.format("delta").load(f's3a://datalake/bronze/CarPartsDB/{today}/Supplier') 49 | print('End of reading... 

# Transforming tables into a set of dimension tables and one fact table
print('\nTransforming ...')
Brand.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Brand').saveAsTable("dwh.DimBrand")
Car.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Car').saveAsTable("dwh.DimCar")
Customer.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Customer').saveAsTable("dwh.DimCustomer")
Orders.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Orders').saveAsTable("dwh.DimOrders")
Part_Maker.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part_Maker').saveAsTable("dwh.DimPartMaker")
Part_for_Car.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part_for_Car').saveAsTable("dwh.DimPartForCar")
Part_Supplier.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part_Supplier').saveAsTable("dwh.DimPartSupplier")
Supplier.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Supplier').saveAsTable("dwh.DimSupplier")
Part.write.format('delta').mode('overwrite').option('path', 's3a://datalake/silver/warehouse/CarParts/Dim_Part').saveAsTable("dwh.DimPart")

Part_in_Order.join(Orders, 'order_id') \
    .join(Part_Supplier, 'part_supplier_id') \
    .join(Part, 'part_id') \
    .join(Part_for_Car, 'part_id') \
    .join(Car, 'car_id') \
    .select("part_in_order_id", "brand_id", "car_id", "car_manufacturer_id", "customer_id", "order_id", "part_id",
            "part_maker_id", "part_supplier_id", Part.supplier_id, "actual_sale_price", "quantity") \
    .write.format('delta').mode('overwrite') \
    .option('path', 's3a://datalake/silver/warehouse/CarParts/Fact_part_in_Order').saveAsTable("dwh.FactPartInOrder")
print('End of transforming')
--------------------------------------------------------------------------------
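Once clean_data.py has run, the `dwh` tables registered in the metastore are visible to Presto through the `hive` catalog defined in presto-config/hive.properties (table names are stored lower-cased). An illustrative session with the CLI baked into the Presto image, started with `docker exec -it presto presto --catalog hive --schema dwh`; the aggregate query is only a sketch of the query shape and assumes the configured hive-hadoop2 connector can read the registered Delta tables:

```sql
SHOW TABLES;

-- Revenue per brand from the star schema built by clean_data.py
SELECT b.name AS brand,
       SUM(f.actual_sale_price * f.quantity) AS revenue
FROM factpartinorder f
JOIN dimbrand b ON f.brand_id = b.brand_id
GROUP BY b.name
ORDER BY revenue DESC;
```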