├── .gitignore ├── README.md ├── kafka-cluster └── docker-compose.yml ├── postgres └── docker-compose.yml ├── pyspark-cluster-3_5_5 ├── README.md ├── base │ ├── Dockerfile │ ├── execute-step.sh │ ├── finish-step.sh │ └── wait-for-step.sh ├── docker-compose.yml ├── jupyter │ └── Dockerfile ├── master │ ├── Dockerfile │ └── master.sh ├── readme_docs │ ├── img.png │ ├── img_1.png │ ├── img_2.png │ └── img_3.png └── worker │ ├── Dockerfile │ └── worker.sh ├── pyspark-cluster-with-jupyter ├── README.md └── docker-compose.yml ├── pyspark-jupyter-kafka ├── README.md └── docker-compose.yml ├── pyspark-jupyter-lab-old ├── Dockerfile └── README.md ├── pyspark-jupyter-lab ├── Dockerfile └── README.md └── pyspark-kafka-cluster ├── README.md └── docker-compose.yml /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Docker Images 2 | Public Docker images for a few of the popular services. 3 | 4 | For the Ease With Apache Spark notebooks, check out: https://github.com/subhamkharwal/ease-with-apache-spark 5 | -------------------------------------------------------------------------------- /kafka-cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | cf-zookeeper: 5 | image: confluentinc/cp-zookeeper:latest 6 | environment: 7 | ZOOKEEPER_CLIENT_PORT: 2181 8 | ZOOKEEPER_SERVER_ID: 1 9 | ports: 10 | - "2181:2181" 11 | 12 | 13 | cf-kafka-1: 14 | image: confluentinc/cp-kafka:latest 15 | ports: 16 | - "9092:9092" 17 | - "29092:29092" 18 | environment: 19 | KAFKA_ADVERTISED_LISTENERS: INTERNAL://cf-kafka-1:19092,EXTERNAL://${DOCKER_HOST_IP:-127.0.0.1}:9092,DOCKER://host.docker.internal:29092 20 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT,DOCKER:PLAINTEXT 21 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 22 | KAFKA_ZOOKEEPER_CONNECT: "cf-zookeeper:2181" 23 | KAFKA_BROKER_ID: 1 24 | depends_on: 25 | - cf-zookeeper 26 | 27 | cf-kafka-2: 28 | image: confluentinc/cp-kafka:latest 29 | ports: 30 | - "9093:9093" 31 | - "29093:29093" 32 | environment: 33 | KAFKA_ADVERTISED_LISTENERS: INTERNAL://cf-kafka-2:19093,EXTERNAL://${DOCKER_HOST_IP:-127.0.0.1}:9093,DOCKER://host.docker.internal:29093 34 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT,DOCKER:PLAINTEXT 35 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 36 | KAFKA_ZOOKEEPER_CONNECT: "cf-zookeeper:2181" 37 | KAFKA_BROKER_ID: 2 38 | depends_on: 39 | - cf-zookeeper 40 | 41 | cf-kafka-3: 42 | image: confluentinc/cp-kafka:latest 43 | ports: 44 | - "9094:9094" 45 | - "29094:29094" 46 | environment: 47 | KAFKA_ADVERTISED_LISTENERS: INTERNAL://cf-kafka-3:19094,EXTERNAL://${DOCKER_HOST_IP:-127.0.0.1}:9094,DOCKER://host.docker.internal:29094 48 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT,DOCKER:PLAINTEXT 49 | KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL 50 | KAFKA_ZOOKEEPER_CONNECT: "cf-zookeeper:2181" 51 | KAFKA_BROKER_ID: 3 52 | depends_on: 53 | - cf-zookeeper -------------------------------------------------------------------------------- /postgres/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Uses sqlpad/sqlpad user/password credentials 2 | version: '3.1' 3 | 4 | services: 5 | db: 6 | image: postgres 7 |
restart: always 8 | ports: 9 | - '5432:5432' 10 | environment: 11 | POSTGRES_USER: sqlpad 12 | POSTGRES_PASSWORD: sqlpad 13 | 14 | sqlpad: 15 | image: sqlpad/sqlpad:5 16 | hostname: 'sqlpad' 17 | ports: 18 | - '3000:3000' 19 | environment: 20 | SQLPAD_ADMIN: 'admin@sqlpad.com' 21 | SQLPAD_ADMIN_PASSWORD: 'admin' 22 | SQLPAD_APP_LOG_LEVEL: debug 23 | SQLPAD_WEB_LOG_LEVEL: warn 24 | SQLPAD_SEED_DATA_PATH: /etc/sqlpad/seed-data 25 | SQLPAD_CONNECTIONS__pgdemo__name: Postgres demo 26 | SQLPAD_CONNECTIONS__pgdemo__driver: postgres 27 | SQLPAD_CONNECTIONS__pgdemo__host: db # must match the Postgres service name above 28 | SQLPAD_CONNECTIONS__pgdemo__database: sqlpad 29 | SQLPAD_CONNECTIONS__pgdemo__username: sqlpad 30 | SQLPAD_CONNECTIONS__pgdemo__password: sqlpad 31 | SQLPAD_CONNECTIONS__pgdemo__multiStatementTransactionEnabled: 'true' 32 | SQLPAD_CONNECTIONS__pgdemo__idleTimeoutSeconds: 86400 33 | volumes: 34 | - ./seed-data:/etc/sqlpad/seed-data -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/README.md: -------------------------------------------------------------------------------- 1 | # Spark Cluster with Jupyter v3.5.5 2 | 3 | **Apache Spark version 3.5.5** Cluster with 1 master, 2 worker nodes & PySpark Jupyter Lab. 4 | 5 | ### To set up the complete cluster in Docker 6 | ```shell 7 | docker compose up 8 | ``` 9 | 10 | #### Spark Cluster with Jupyter on Docker 11 | ![img.png](readme_docs/img.png) 12 | 13 | ## Spark Connect 14 | To start the Spark Connect server, open a terminal and run the following command 15 | 16 | ```shell 17 | docker exec -it ewd-spark-master /bin/bash 18 | ``` 19 | 20 | 21 | ![img_1.png](readme_docs/img_1.png) 22 | 23 | ### 1. Run Spark Connect without Cluster 24 | Once the terminal is connected to the `ewd-spark-master` container, run the following command to start the Spark Connect server 25 | 26 | ```shell 27 | /spark/sbin/start-connect-server.sh --jars /spark/jars/spark-connect_2.12-3.5.5.jar --conf spark.ui.port=4050 28 | ``` 29 | 30 | #### Note 31 | - The Spark Connect server starts on port `15002` with only the driver (no executors). 32 | - The Spark UI for the Spark Connect server is on port `4050` & can be accessed via `http://localhost:4050`. 33 | 34 | ### 2. Run Spark Connect on Cluster 35 | 36 | Once the terminal is connected to the `ewd-spark-master` container, run the following command to start the Spark Connect server 37 | 38 | ```shell 39 | /spark/sbin/start-connect-server.sh --jars /spark/jars/spark-connect_2.12-3.5.5.jar --master spark://0.0.0.0:7077 --total-executor-cores 4 --executor-cores 2 --conf spark.ui.port=4050 40 | ``` 41 | 42 | ![img_2.png](readme_docs/img_2.png) 43 | 44 | #### Note 45 | - The Spark Connect server starts on port `15002` with **2 executors**, each with **2 cores**. 46 | - The Spark UI for the Spark Connect server is on port `4050` & can be accessed via `http://localhost:4050`. 47 | - To change the number of cores or executors, adjust the parameters `--total-executor-cores` and `--executor-cores` in the above command. 48 | 49 | 50 | ## Code Example for Spark Connect 51 | To run Spark code through Spark Connect, use `remote` in the SparkSession builder.
52 | ```python 53 | from pyspark.sql import SparkSession 54 | 55 | # Generate Spark Connect Session 56 | spark = SparkSession\ 57 | .builder\ 58 | .remote("sc://localhost:15002")\ 59 | .getOrCreate() 60 | 61 | # Generate Dataframe using range 62 | spark.range(10).show() 63 | ``` 64 | 65 | ![img_3.png](readme_docs/img_3.png) 66 | 67 | #### Note 68 | In case of issues, install the below python libraries before using Spark Connect 69 | ``` 70 | pip install pandas 71 | pip install pyarrow 72 | pip install grpcio 73 | pip install protobuf 74 | pip install grpcio-status 75 | ``` 76 | 77 | ### References & Credits 78 | 1. **Ease With Data YouTube Channel (https://youtube.com/@easewithdata)** 79 | 2. Docker Image references - BDE2020 (https://hub.docker.com/u/bde2020) 80 | 81 | ### Maintainer 82 | Ease With Data (easewithdata@gmail.com) -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.10 2 | 3 | LABEL maintainer="Ease With Data (Original 3.3.0 version published by bde20200)" 4 | 5 | ENV ENABLE_INIT_DAEMON false 6 | ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon 7 | ENV INIT_DAEMON_STEP spark_master_init 8 | 9 | ENV BASE_URL=https://archive.apache.org/dist/spark/ 10 | ENV SPARK_VERSION=3.5.5 11 | ENV HADOOP_VERSION=3 12 | 13 | COPY wait-for-step.sh / 14 | COPY execute-step.sh / 15 | COPY finish-step.sh / 16 | 17 | RUN apk add --no-cache curl bash openjdk8-jre python3 py-pip nss libc6-compat coreutils procps \ 18 | && ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2 \ 19 | && chmod +x *.sh \ 20 | && wget ${BASE_URL}/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ 21 | && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ 22 | && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \ 23 | && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ 24 | && cd / 25 | 26 | #Give permission to execute scripts 27 | RUN chmod +x /wait-for-step.sh && chmod +x /execute-step.sh && chmod +x /finish-step.sh 28 | 29 | # Fix the value of PYTHONHASHSEED 30 | # Note: this is needed when you use Python 3.3 or greater 31 | ENV PYTHONHASHSEED 1 32 | -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/base/execute-step.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $ENABLE_INIT_DAEMON = "true" ] 4 | then 5 | echo "Execute step ${INIT_DAEMON_STEP} in pipeline" 6 | while true; do 7 | sleep 5 8 | echo -n '.' 9 | string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/execute?step=$INIT_DAEMON_STEP -o /dev/null) 10 | [ "$string" = "204" ] && break 11 | done 12 | echo "Notified execution of step ${INIT_DAEMON_STEP}" 13 | fi 14 | 15 | -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/base/finish-step.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $ENABLE_INIT_DAEMON = "true" ] 4 | then 5 | echo "Finish step ${INIT_DAEMON_STEP} in pipeline" 6 | while true; do 7 | sleep 5 8 | echo -n '.' 
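      # Poll the init-daemon REST endpoint until it returns HTTP 204,
      # i.e. the daemon has acknowledged that this step is finished.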
9 | string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/finish?step=$INIT_DAEMON_STEP -o /dev/null) 10 | [ "$string" = "204" ] && break 11 | done 12 | echo "Notified finish of step ${INIT_DAEMON_STEP}" 13 | fi 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/base/wait-for-step.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $ENABLE_INIT_DAEMON = "true" ] 4 | then 5 | echo "Validating if step ${INIT_DAEMON_STEP} can start in pipeline" 6 | while true; do 7 | sleep 5 8 | echo -n '.' 9 | string=$(curl -s $INIT_DAEMON_BASE_URI/canStart?step=$INIT_DAEMON_STEP) 10 | [ "$string" = "true" ] && break 11 | done 12 | echo "Can start step ${INIT_DAEMON_STEP}" 13 | fi 14 | -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | ewd-pyspark-jupyter: 4 | image: easewithdata/pyspark-jupyter:3.5.5 5 | container_name: ewd-pyspark-jupyterlab 6 | ports: 7 | - 8888:8888 8 | - 4040:4040 9 | - 4041:4041 10 | environment: 11 | JUPYTER_PORT: 8888 12 | SPARK_UI_PORT: 4040 13 | volumes: 14 | - spark_data:/data:rw 15 | ewd-spark-master: 16 | image: easewithdata/spark-master:3.5.5 17 | #command: ln -s /usr/bin/python3 /usr/local/bin/python 18 | container_name: ewd-spark-master 19 | ports: 20 | - "8080:8080" 21 | - "7077:7077" 22 | - 15002:15002 23 | - 4050:4050 24 | environment: 25 | - INIT_DAEMON_STEP=setup_spark 26 | volumes: 27 | - spark_data:/data:rw 28 | 29 | ewd-spark-worker-1: 30 | image: easewithdata/spark-worker:3.5.5 31 | #command: ln -s /usr/bin/python3 /usr/local/bin/python 32 | container_name: ewd-spark-worker-1 33 | depends_on: 34 | - ewd-spark-master 35 | ports: 36 | - "8081:8081" 37 | environment: 38 | - "SPARK_MASTER=spark://ewd-spark-master:7077" 39 | volumes: 40 | - spark_data:/data:rw 41 | ewd-spark-worker-2: 42 | image: easewithdata/spark-worker:3.5.5 43 | #command: ln -s /usr/bin/python3 /usr/local/bin/python 44 | container_name: ewd-spark-worker-2 45 | depends_on: 46 | - ewd-spark-master 47 | ports: 48 | - "8082:8081" 49 | environment: 50 | - "SPARK_MASTER=spark://ewd-spark-master:7077" 51 | volumes: 52 | - spark_data:/data:rw 53 | volumes: 54 | spark_data: 55 | -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/jupyter/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Python 3.10 image 2 | FROM python:3.10-bullseye 3 | 4 | # Expose Port 5 | EXPOSE 8888 4040 6 | 7 | # Change shell to /bin/bash 8 | SHELL ["/bin/bash", "-c"] 9 | 10 | # Upgrade pip 11 | RUN pip install --upgrade pip 12 | 13 | # Install OpenJDK 14 | RUN apt-get update && \ 15 | apt install -y openjdk-11-jdk && \ 16 | apt-get clean; 17 | 18 | # Fix certificate issues 19 | RUN apt-get install ca-certificates-java && \ 20 | apt-get clean && \ 21 | update-ca-certificates -f; 22 | 23 | # Insatall nano & vi 24 | RUN apt-get install -y nano && \ 25 | apt-get install -y vim; 26 | 27 | # Setup JAVA_HOME -- useful for docker commandline 28 | ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/ 29 | RUN export JAVA_HOME 30 | 31 | # Download and Setup Spark binaries 32 | WORKDIR /tmp 33 | RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz 34 | RUN tar -xvf 
spark-3.5.5-bin-hadoop3.tgz 35 | RUN mv spark-3.5.5-bin-hadoop3 spark 36 | RUN mv spark / 37 | RUN rm spark-3.5.5-bin-hadoop3.tgz 38 | 39 | # Set up environment variables 40 | ENV SPARK_HOME /spark 41 | RUN export SPARK_HOME 42 | ENV PYSPARK_PYTHON /usr/local/bin/python 43 | RUN export PYSPARK_PYTHON 44 | ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip 45 | RUN export PYTHONPATH 46 | ENV PATH $PATH:$SPARK_HOME/bin 47 | RUN export PATH 48 | 49 | # Fix configuration files 50 | RUN mv $SPARK_HOME/conf/log4j2.properties.template $SPARK_HOME/conf/log4j2.properties 51 | RUN mv $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf 52 | RUN mv $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh 53 | 54 | # Install Jupyter Lab, PySpark, Kafka, boto & Delta Lake 55 | RUN pip install jupyterlab 56 | RUN pip install pyspark==3.5.5 57 | RUN pip install kafka-python==2.0.2 58 | RUN pip install delta-spark==3.3.0 59 | RUN pip install boto3 60 | 61 | # Change to working directory and clone git repo 62 | WORKDIR /home/jupyter 63 | 64 | # Clone Ease with Apache Spark Repo to Start 65 | RUN git clone https://github.com/subhamkharwal/ease-with-apache-spark.git 66 | 67 | # Download Spark Connect JAR 68 | RUN wget https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.5/spark-connect_2.12-3.5.5.jar 69 | RUN mv spark-connect_2.12-3.5.5.jar $SPARK_HOME/jars/ 70 | 71 | # PIP install for Spark Connect 72 | RUN pip install pandas 73 | RUN pip install pyarrow 74 | RUN pip install grpcio 75 | RUN pip install protobuf 76 | RUN pip install grpcio-status 77 | 78 | # Fix Jupyter logging issue 79 | RUN ipython profile create 80 | RUN echo "c.IPKernelApp.capture_fd_output = False" >> "/root/.ipython/profile_default/ipython_kernel_config.py" 81 | 82 | # Start the container with root privilages 83 | CMD ["python3", "-m", "jupyterlab", "--ip", "0.0.0.0", "--allow-root"] 84 | -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/master/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM easewithdata/spark-base:3.5.5 2 | 3 | LABEL maintainer="Ease With Data (Original 3.3.0 version published by bde2020)" 4 | 5 | COPY master.sh / 6 | 7 | ENV SPARK_MASTER_PORT 7077 8 | ENV SPARK_MASTER_WEBUI_PORT 8080 9 | ENV SPARK_MASTER_LOG /spark/logs 10 | 11 | EXPOSE 8080 7077 6066 12 | 13 | # Download Spark Connect JAR 14 | RUN wget https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.5/spark-connect_2.12-3.5.5.jar 15 | RUN mv spark-connect_2.12-3.5.5.jar /spark/jars/ 16 | 17 | CMD ["/bin/bash", "/master.sh"] -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/master/master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export SPARK_MASTER_HOST=${SPARK_MASTER_HOST:-`hostname`} 4 | 5 | export SPARK_HOME=/spark 6 | 7 | . "/spark/sbin/spark-config.sh" 8 | 9 | . 
"/spark/bin/load-spark-env.sh" 10 | 11 | mkdir -p $SPARK_MASTER_LOG 12 | 13 | ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out 14 | 15 | cd /spark/bin && /spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master \ 16 | --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out 17 | -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/readme_docs/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/subhamkharwal/docker-images/edad6b525964c2eb79ceda2a4a6c0e23c3fbf932/pyspark-cluster-3_5_5/readme_docs/img.png -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/readme_docs/img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/subhamkharwal/docker-images/edad6b525964c2eb79ceda2a4a6c0e23c3fbf932/pyspark-cluster-3_5_5/readme_docs/img_1.png -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/readme_docs/img_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/subhamkharwal/docker-images/edad6b525964c2eb79ceda2a4a6c0e23c3fbf932/pyspark-cluster-3_5_5/readme_docs/img_2.png -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/readme_docs/img_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/subhamkharwal/docker-images/edad6b525964c2eb79ceda2a4a6c0e23c3fbf932/pyspark-cluster-3_5_5/readme_docs/img_3.png -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/worker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM easewithdata/spark-base:3.5.5 2 | 3 | LABEL maintainer="Ease With Data (Original 3.3.0 version published by bde2020)" 4 | 5 | COPY worker.sh / 6 | 7 | ENV SPARK_WORKER_WEBUI_PORT 8081 8 | ENV SPARK_WORKER_LOG /spark/logs 9 | ENV SPARK_MASTER "spark://spark-master:7077" 10 | 11 | EXPOSE 8081 12 | 13 | CMD ["/bin/bash", "/worker.sh"] -------------------------------------------------------------------------------- /pyspark-cluster-3_5_5/worker/worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export SPARK_HOME=/spark 4 | 5 | . "/spark/sbin/spark-config.sh" 6 | 7 | . "/spark/bin/load-spark-env.sh" 8 | 9 | mkdir -p $SPARK_WORKER_LOG 10 | 11 | ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out 12 | 13 | /spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \ 14 | --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out 15 | -------------------------------------------------------------------------------- /pyspark-cluster-with-jupyter/README.md: -------------------------------------------------------------------------------- 1 | # Spark Cluster v3.3.0 2 | 3 | Apache Spark Cluster with 1 master and 2 worker nodes. 
4 | 5 | #### To start the cluster 6 | docker compose up 7 | 8 | #### Note: 9 | If you run into the following error: 10 | 11 | `ln: failed to create symbolic link 'python': File Exists` 12 | 13 | Please keep the `command: ln -s ...` entries of the docker-compose.yml commented out (lines 16, 27 and 39 in this version). 14 | 15 | You can run those commands manually once the containers are created, by logging in to the master and worker nodes with the `docker exec` command. 16 | -------------------------------------------------------------------------------- /pyspark-cluster-with-jupyter/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | bd-pyspark-jupyter: 4 | image: easewithdata/pyspark-jupyter-lab-old 5 | container_name: pyspark-jupyter-lab-old 6 | ports: 7 | - 8888:8888 8 | - 4040:4040 9 | environment: 10 | JUPYTER_PORT: 8888 11 | SPARK_UI_PORT: 4040 12 | volumes: 13 | - spark_data:/data:rw 14 | bd-spark-master: 15 | image: bde2020/spark-master:3.3.0-hadoop3.3 16 | #command: ln -s /usr/bin/python3 /usr/local/bin/python 17 | container_name: bd-spark-master 18 | ports: 19 | - "8080:8080" 20 | - "7077:7077" 21 | environment: 22 | - INIT_DAEMON_STEP=setup_spark 23 | volumes: 24 | - spark_data:/data:rw 25 | bd-spark-worker-1: 26 | image: bde2020/spark-worker:3.3.0-hadoop3.3 27 | #command: ln -s /usr/bin/python3 /usr/local/bin/python 28 | container_name: bd-spark-worker-1 29 | depends_on: 30 | - bd-spark-master 31 | ports: 32 | - "8081:8081" 33 | environment: 34 | - "SPARK_MASTER=spark://bd-spark-master:7077" 35 | volumes: 36 | - spark_data:/data:rw 37 | bd-spark-worker-2: 38 | image: bde2020/spark-worker:3.3.0-hadoop3.3 39 | #command: ln -s /usr/bin/python3 /usr/local/bin/python 40 | container_name: bd-spark-worker-2 41 | depends_on: 42 | - bd-spark-master 43 | ports: 44 | - "8082:8081" 45 | environment: 46 | - "SPARK_MASTER=spark://bd-spark-master:7077" 47 | volumes: 48 | - spark_data:/data:rw 49 | bd-spark-history-server: 50 | image: bde2020/spark-history-server:3.3.0-hadoop3.3 51 | container_name: bd-spark-history-server 52 | depends_on: 53 | - bd-spark-master 54 | ports: 55 | - "18081:18081" 56 | volumes: 57 | - /tmp/spark-events-local:/tmp/spark-events 58 | volumes: 59 | spark_data: 60 | -------------------------------------------------------------------------------- /pyspark-jupyter-kafka/README.md: -------------------------------------------------------------------------------- 1 | # Kafka Cluster with Spark 2 | 3 | Kafka Cluster with Zookeeper and PySpark Jupyter Notebook.
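Once the containers are up (see the steps below), you can consume the stream from the notebook. A minimal sketch, assuming the broker and topic defined in the docker-compose.yml in this folder (`ed-kafka:29092`, topic `raw`) and that the Spark-Kafka connector package can be downloaded when the session starts; the app name is only an example:

```python
from pyspark.sql import SparkSession

# Pull the Kafka connector matching the Spark 3.3.0 build shipped in the Jupyter image
spark = SparkSession \
    .builder \
    .appName("kafka-stream-sketch") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0") \
    .getOrCreate()

# Subscribe to the 'raw' topic on the broker advertised inside the compose network
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "ed-kafka:29092") \
    .option("subscribe", "raw") \
    .load()

# Kafka values arrive as bytes; cast to string and print each micro-batch to the console
query = df.selectExpr("CAST(value AS STRING) AS value") \
    .writeStream \
    .format("console") \
    .start()

query.awaitTermination()
```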
4 | 5 | ### To start the cluster and create containers 6 | docker compose build 7 | 8 | #### Once Build is complete 9 | docker compose up 10 | 11 | -------------------------------------------------------------------------------- /pyspark-jupyter-kafka/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | ed-pyspark-jupyter: 4 | image: easewithdata/pyspark-jupyter-lab 5 | user: root 6 | container_name: ed-pyspark-jupyter-lab 7 | ports: 8 | - 8888:8888 9 | - 4040:4040 10 | environment: 11 | JUPYTER_PORT: 8888 12 | SPARK_UI_PORT: 4040 13 | GRANT_SUDO: yes 14 | volumes: 15 | - streaming_data:/data:rw 16 | 17 | ed-zookeeper: 18 | image: confluentinc/cp-zookeeper:latest 19 | container_name: ed-zookeeper 20 | ports: 21 | - 2181:2181 22 | environment: 23 | ZOOKEEPER_CLIENT_PORT: 2181 24 | ZOOKEEPER_TICK_TIME: 2000 25 | 26 | ed-kafka: 27 | image: confluentinc/cp-kafka:latest 28 | container_name: ed-kafka 29 | depends_on: 30 | - ed-zookeeper 31 | ports: 32 | - 9092:9092 33 | volumes: 34 | - streaming_data:/data:rw 35 | environment: 36 | KAFKA_BROKER_ID: 1 37 | KAFKA_ZOOKEEPER_CONNECT: ed-zookeeper:2181 38 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://ed-kafka:29092,PLAINTEXT_HOST://127.0.0.1:9092 39 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 40 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 41 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 42 | KAFKA_CREATE_TOPICS: "raw:1:1" 43 | 44 | volumes: 45 | streaming_data: 46 | -------------------------------------------------------------------------------- /pyspark-jupyter-lab-old/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Python 3.7.10 image 2 | FROM python:3.7-bullseye 3 | 4 | # Expose Port 5 | EXPOSE 8888 4040 6 | 7 | # Change shell to /bin/bash 8 | SHELL ["/bin/bash", "-c"] 9 | 10 | # Upgrade pip 11 | RUN pip install --upgrade pip 12 | 13 | # Install OpenJDK 14 | RUN apt-get update && \ 15 | apt-get install -y openjdk-11-jdk && \ 16 | apt-get clean; 17 | 18 | # Fix certificate issues 19 | RUN apt-get install ca-certificates-java && \ 20 | apt-get clean && \ 21 | update-ca-certificates -f; 22 | 23 | # Insatall nano & vi 24 | RUN apt-get install -y nano && \ 25 | apt-get install -y vim; 26 | 27 | # Setup JAVA_HOME -- useful for docker commandline 28 | ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/ 29 | RUN export JAVA_HOME 30 | 31 | # Download and Setup Spark binaries 32 | WORKDIR /tmp 33 | RUN wget https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz 34 | RUN tar -xvf spark-3.3.0-bin-hadoop3.tgz 35 | RUN mv spark-3.3.0-bin-hadoop3 spark 36 | RUN mv spark / 37 | RUN rm spark-3.3.0-bin-hadoop3.tgz 38 | 39 | # Set up environment variables 40 | ENV SPARK_HOME /spark 41 | RUN export SPARK_HOME 42 | ENV PYSPARK_PYTHON /usr/local/bin/python 43 | RUN export PYSPARK_PYTHON 44 | ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.5-src.zip 45 | RUN export PYTHONPATH 46 | ENV PATH $PATH:$SPARK_HOME/bin 47 | RUN export PATH 48 | 49 | # Fix configuration files 50 | RUN mv $SPARK_HOME/conf/log4j2.properties.template $SPARK_HOME/conf/log4j2.properties 51 | RUN mv $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf 52 | RUN mv $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh 53 | 54 | # Install Jupyter Lab, PySpark, Kafka, boto & Delta Lake 55 | RUN pip install jupyterlab 56 | RUN pip install 
pyspark==3.3.0 57 | RUN pip install kafka-python==2.0.2 58 | RUN pip install delta-spark==2.2.0 59 | RUN pip install boto3 60 | 61 | # Change to working directory and clone git repo 62 | WORKDIR /home/jupyter 63 | 64 | # Clone Ease with Apache Spark Repo to Start 65 | RUN git clone https://github.com/subhamkharwal/ease-with-apache-spark.git 66 | 67 | # Fix Jupyter logging issue 68 | RUN ipython profile create 69 | RUN echo "c.IPKernelApp.capture_fd_output = False" >> "/root/.ipython/profile_default/ipython_kernel_config.py" 70 | 71 | # Start the container with root privilages 72 | CMD ["python3", "-m", "jupyterlab", "--ip", "0.0.0.0", "--allow-root"] 73 | -------------------------------------------------------------------------------- /pyspark-jupyter-lab-old/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Jupyter Lab Notebook - Python v3.7.10 2 | 3 | Jupyter Lab Notebook with root access. 4 | EaseWithApacheSpark notebooks provided to start with. 5 | 6 | ### To build image from the Dockerfile: 7 | docker build --tag easewithdata/pyspark-jupyter-lab-old . 8 | 9 | ### To create container from image 10 | docker run -d -p 8888:8888 -p 4040:4040 --name jupyter-lab easewithdata/pyspark-jupyter-lab-old 11 | 12 | > [!CAUTION] 13 | > In case you want to make setup easy, follow the below step. 14 | 15 | ## In case you are still not able to setup, just pull the image from docker hub 16 | docker pull easewithdata/pyspark-jupyter-lab-old 17 | -------------------------------------------------------------------------------- /pyspark-jupyter-lab/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Python 3.10 image 2 | FROM python:3.10-bullseye 3 | 4 | # Expose Port 5 | EXPOSE 8888 4040 6 | 7 | # Change shell to /bin/bash 8 | SHELL ["/bin/bash", "-c"] 9 | 10 | # Upgrade pip 11 | RUN pip install --upgrade pip 12 | 13 | # Install OpenJDK 14 | RUN apt-get update && \ 15 | apt install -y openjdk-11-jdk && \ 16 | apt-get clean; 17 | 18 | # Fix certificate issues 19 | RUN apt-get install ca-certificates-java && \ 20 | apt-get clean && \ 21 | update-ca-certificates -f; 22 | 23 | # Insatall nano & vi 24 | RUN apt-get install -y nano && \ 25 | apt-get install -y vim; 26 | 27 | # Setup JAVA_HOME -- useful for docker commandline 28 | ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/ 29 | RUN export JAVA_HOME 30 | 31 | # Download and Setup Spark binaries 32 | WORKDIR /tmp 33 | RUN wget https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz 34 | RUN tar -xvf spark-3.3.0-bin-hadoop3.tgz 35 | RUN mv spark-3.3.0-bin-hadoop3 spark 36 | RUN mv spark / 37 | RUN rm spark-3.3.0-bin-hadoop3.tgz 38 | 39 | # Set up environment variables 40 | ENV SPARK_HOME /spark 41 | RUN export SPARK_HOME 42 | ENV PYSPARK_PYTHON /usr/local/bin/python 43 | RUN export PYSPARK_PYTHON 44 | ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.5-src.zip 45 | RUN export PYTHONPATH 46 | ENV PATH $PATH:$SPARK_HOME/bin 47 | RUN export PATH 48 | 49 | # Fix configuration files 50 | RUN mv $SPARK_HOME/conf/log4j2.properties.template $SPARK_HOME/conf/log4j2.properties 51 | RUN mv $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf 52 | RUN mv $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh 53 | 54 | # Install Jupyter Lab, PySpark, Kafka, boto & Delta Lake 55 | RUN pip install jupyterlab 56 | RUN pip install pyspark==3.3.0 57 | RUN pip install kafka-python==2.0.2 
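# delta-spark 2.2.0 below is the Delta Lake release that targets the Spark 3.3.x installed in this image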
58 | RUN pip install delta-spark==2.2.0 59 | RUN pip install boto3 60 | 61 | # Change to working directory and clone git repo 62 | WORKDIR /home/jupyter 63 | 64 | # Clone Ease with Apache Spark Repo to Start 65 | RUN git clone https://github.com/subhamkharwal/ease-with-apache-spark.git 66 | 67 | # Fix Jupyter logging issue 68 | RUN ipython profile create 69 | RUN echo "c.IPKernelApp.capture_fd_output = False" >> "/root/.ipython/profile_default/ipython_kernel_config.py" 70 | 71 | # Start the container with root privileges 72 | CMD ["python3", "-m", "jupyterlab", "--ip", "0.0.0.0", "--allow-root"] 73 | -------------------------------------------------------------------------------- /pyspark-jupyter-lab/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Jupyter Lab Notebook - Python v3.10 2 | 3 | Jupyter Lab Notebook with root access. 4 | EaseWithApacheSpark notebooks provided to start with. 5 | 6 | ### To build the image from the Dockerfile: 7 | docker build --tag easewithdata/pyspark-jupyter-lab . 8 | 9 | ### To create a container from the image 10 | docker run -d -p 8888:8888 -p 4040:4040 --name jupyter-lab easewithdata/pyspark-jupyter-lab 11 | 12 | > [!CAUTION] 13 | > In case you want to make the setup easy, follow the step below. 14 | 15 | ## If you are still not able to set it up, just pull the image from Docker Hub 16 | docker pull easewithdata/pyspark-jupyter-lab 17 | -------------------------------------------------------------------------------- /pyspark-kafka-cluster/README.md: -------------------------------------------------------------------------------- 1 | # Kafka Cluster with Spark v3.3.0 with Python 3.7.10 2 | 3 | Kafka Cluster with Zookeeper and PySpark Notebook, to be used with the `easewithdata/pyspark-jupyter-lab` image (see docker-compose.yml). 4 | 5 | ### To start the cluster and create containers 6 | docker compose build 7 | 8 | #### Once Build is complete 9 | docker compose up 10 | -------------------------------------------------------------------------------- /pyspark-kafka-cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | pyspark-jupyter: 4 | image: easewithdata/pyspark-jupyter-lab 5 | container_name: pyspark-jupyter-lab 6 | ports: 7 | - 8888:8888 8 | - 4040:4040 9 | environment: 10 | JUPYTER_PORT: 8888 11 | SPARK_UI_PORT: 4040 12 | volumes: 13 | - spark_data:/data:rw 14 | 15 | zookeeper: 16 | image: confluentinc/cp-zookeeper:latest 17 | container_name: zookeeper 18 | ports: 19 | - 2181:2181 20 | environment: 21 | ZOOKEEPER_CLIENT_PORT: 2181 22 | ZOOKEEPER_TICK_TIME: 2000 23 | 24 | kafka: 25 | image: confluentinc/cp-kafka:latest 26 | container_name: kafka 27 | depends_on: 28 | - zookeeper 29 | ports: 30 | - 9092:9092 31 | volumes: 32 | - spark_data:/data:rw 33 | environment: 34 | KAFKA_BROKER_ID: 1 35 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 36 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://127.0.0.1:9092 37 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 38 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 39 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 40 | KAFKA_CREATE_TOPICS: "raw:1:1" 41 | 42 | volumes: 43 | spark_data: 44 | --------------------------------------------------------------------------------
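For a quick end-to-end test of this last compose file, the sketch below produces a few messages with the `kafka-python` client installed in the Jupyter image. The broker address and topic come from the docker-compose.yml above (`kafka:29092` inside the compose network, topic `raw`); the payload fields are only examples:

```python
from kafka import KafkaProducer
import json
import time

# Inside the compose network the broker is advertised as kafka:29092;
# from the host machine use 127.0.0.1:9092 instead.
producer = KafkaProducer(
    bootstrap_servers="kafka:29092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)

# Publish a few test events to the 'raw' topic declared in the compose file
for i in range(5):
    producer.send("raw", {"event_id": i, "ts": time.time()})

producer.flush()
producer.close()
```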