├── .github
    └── workflows
    │   └── docker-build-publish.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── config
    ├── core-site.xml
    ├── hdfs-site.xml
    ├── log4j.properties
    ├── mapred-site.xml
    ├── requirements.txt
    ├── spark-cmd.sh
    ├── spark-defaults.conf
    ├── spark-env.sh
    └── yarn-site.xml
├── docker-compose.yml
├── docker-compose_cluster.yml
├── toy-cluster.sh
└── version.txt


/.github/workflows/docker-build-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Build, Publish Docker Image, and Tag Release
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main  # Trigger workflow on pushes to the main branch
 7 |   workflow_dispatch: # Allow manual triggering of the workflow
 8 | 
 9 | jobs:
10 |   build-and-publish:
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |     - name: Checkout code
15 |       uses: actions/checkout@v3
16 | 
17 |     - name: Read version from version.txt
18 |       id: read_version
19 |       run: |
20 |         version=$(cat version.txt)
21 |         echo "version=$version" >> $GITHUB_ENV
22 | 
23 |     - name: Log in to Docker Hub
24 |       uses: docker/login-action@v2
25 |       with:
26 |         username: ${{ secrets.DOCKER_USERNAME }}
27 |         password: ${{ secrets.DOCKER_TOKEN }}
28 | 
29 |     - name: Build and tag Docker image
30 |       run: |
31 |         docker image build -t jwaresolutions/big-data-cluster:$version .
32 |         docker tag jwaresolutions/big-data-cluster:$version jwaresolutions/big-data-cluster:latest
33 | 
34 |     - name: Push Docker image
35 |       run: |
36 |         docker image push jwaresolutions/big-data-cluster:$version
37 |         docker image push jwaresolutions/big-data-cluster:latest
38 | 
39 |     - name: Create Tag and Release
40 |       uses: avakar/tag-and-release@v1
41 |       with:
42 |         tag_name: "v${{ env.version }}"
43 |       env:
44 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
45 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/settings.json
2 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Base image
 2 | FROM ubuntu:24.10
 3 | 
 4 | # Set environment variables for non-interactive installation
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | # Adds some needed environment variables
 8 | ENV HDFS_NAMENODE_USER=root
 9 | ENV HDFS_DATANODE_USER=root
10 | ENV HDFS_SECONDARYNAMENODE_USER=root
11 | ENV YARN_RESOURCEMANAGER_USER=root
12 | ENV YARN_NODEMANAGER_USER=root
13 | ENV PYSPARK_PYTHON=python3
14 | 
15 | # Install required packages. NOTE: sudo is needed as it's called in some Spark scripts
16 | ENV OPEN_JDK_VERSION=21
17 | RUN apt update && apt install -y \
18 |     openjdk-${OPEN_JDK_VERSION}-jdk \
19 |     wget \
20 |     curl \
21 |     vim \
22 |     ssh \
23 |     rsync \
24 |     git \
25 |     net-tools \
26 |     python3-pip \
27 |     python3-venv \
28 |     sudo \ 
29 |     && rm -rf /var/lib/apt/lists/*
30 | 
31 | # Set JAVA_HOME environment variable
32 | ENV JAVA_HOME=/usr/lib/jvm/java-${OPEN_JDK_VERSION}-openjdk-amd64
33 | ENV PATH=$JAVA_HOME/bin:$PATH
34 | 
35 | # Install Hadoop
36 | ENV HADOOP_VERSION=3.4.0
37 | RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \
38 |     && tar -xzf hadoop-${HADOOP_VERSION}.tar.gz -C /opt/ \
39 |     && rm hadoop-${HADOOP_VERSION}.tar.gz
40 | ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
41 | ENV PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
42 | 
43 | # Creates the necessary directories for Hadoop
44 | RUN mkdir -p ${HADOOP_VERSION}/logs
45 | 
46 | # Install Spark
47 | ENV SPARK_VERSION=3.5.5
48 | RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz \
49 |     && tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz -C /opt/ \
50 |     && rm spark-${SPARK_VERSION}-bin-hadoop3.tgz
51 | ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}-bin-hadoop3
52 | ENV PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
53 | 
54 | # Set up SSH (for Hadoop to communicate across nodes)
55 | RUN ssh-keygen -t rsa -f ~/.ssh/id_rsa -P '' \
56 |     && cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys \
57 |     && chmod 0600 ~/.ssh/authorized_keys
58 | 
59 | # Create and activate a virtual environment for Python.
60 | ENV VIRTUAL_ENV=/opt/venv
61 | RUN python3 -m venv $VIRTUAL_ENV
62 | ENV PATH="$VIRTUAL_ENV/bin:$PATH"
63 | 
64 | # Copy requirements.txt and install Python dependencies in the virtual environment
65 | COPY ./config/requirements.txt /tmp/
66 | RUN pip install --upgrade pip \
67 |     && pip install -r /tmp/requirements.txt
68 | 
69 | # Hadoop settings
70 | WORKDIR ${HADOOP_HOME}/etc/hadoop
71 | COPY ./config/core-site.xml .
72 | COPY ./config/hdfs-site.xml .
73 | COPY ./config/mapred-site.xml .
74 | COPY ./config/yarn-site.xml .
75 | 
76 | # Spark settings
77 | WORKDIR ${SPARK_HOME}/conf
78 | COPY ./config/spark-env.sh .
79 | COPY ./config/spark-defaults.conf .
80 | COPY ./config/log4j.properties .
81 | 
82 | # Cluster cmd
83 | WORKDIR /home/big_data
84 | COPY ./config/spark-cmd.sh .
85 | RUN chmod +x /home/big_data/spark-cmd.sh
86 | 
87 | # Add an explicit step to set JAVA_HOME in the bash profile to make it available to all users
88 | RUN echo "export JAVA_HOME=$JAVA_HOME" >> /etc/profile \
89 |     && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
90 |     && echo "export PATH=$JAVA_HOME/bin:$PATH" >> /etc/profile \
91 |     && echo 'export PATH=$PATH:$HADOOP_HOME/bin' >> ~/.bashrc \
92 |     && echo 'export PATH=$PATH:$HADOOP_HOME/sbin' >> ~/.bashrc
93 | 
94 | # Expose necessary ports (8080 -> Spark UI, 18080 -> Spark applications logs, 9870 -> Hadoop NameNode UI)
95 | EXPOSE 8080 18080 9870
96 | 
97 | # Start SSH service. The entrypoint is defined in the docker-compose file
98 | CMD ["service", "ssh", "start", "&&", "sleep", "infinity"]
99 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 JWare Solutions
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Docker Big Data Cluster
  3 | 
  4 | A ready to go Big Data cluster (Hadoop + Hadoop Streaming + Spark + PySpark) with Docker and Docker Swarm!
  5 | 
  6 | 
  7 | ## Index
  8 | 
  9 | 1. [Why?](#why)
 10 | 1. [Features](#features)
 11 | 1. [Running toy cluster](#running-toy-cluster)
 12 | 1. [Running a real cluster in Docker Swarm](#running-a-real-cluster-in-docker-swarm)
 13 | 1. [Usage](#usage)
 14 | 	1. [HDFS](#hdfs)
 15 | 	1. [Spark and PySpark](#spark-and-pyspark)
 16 | 1. [Going further](#going-further)
 17 | 1. [Frequent problems](#frequent-problems)
 18 | 1. [Contributing](#contributing)
 19 | 
 20 | 
 21 | ## Why?
 22 | 
 23 | Although today you can find several repositories ready to deploy a Spark or Hadoop cluster, they all run into the same problem: they do not work when deployed on Docker Swarm due to several issues ranging from the definition of the worker nodes to connection problems with Docker network interfaces.
 24 | 
 25 | This repository seeks to solve the problem by offering a functional alternative, both a toy cluster to deploy on a single machine, as well as a real cluster that works on multiple nodes that conform a Docker Swarm cluster.
 26 | 
 27 | 
 28 | ## Features
 29 | 
 30 | This repository is inspired by and uses several scripts taken from [Rubenafo's repo][rubenafo-repo] and [Sdesilva26's repo][sdesilva26-repo], however there are several changes introduced; the API is simpler, there is more documentation about usage and some extra features:
 31 | 
 32 | - ✅ Ready to deploy in a Docker Swarm cluster: all the networking and port configuration issues have been fixed so you can scale your cluster to as many worker nodes as you need.
 33 | - ⚡️ Hadoop, HDFS, Spark, Scala and PySpark ready to use: all the tools are available inside the container globally so you don't have to fight with environment variables and executable paths.
 34 | - 🌟 New technology: our image offers Hadoop 3.4.0, Spark 3.5.5 and Python 3.12.6
 35 | - ⚙️ Less configuration: we have removed some settings to keep the minimum possible configuration, this way you prevent errors, unexpected behaviors and get the freedom to set parameters via environment variables and have an agile development that does not require rebuilding the Docker image. 
 36 | - 🐍 Python dependencies: we include the most used Python dependencies like Pandas, Numpy and Scipy to be able to work on datasets and perform mathematical operations (you can remove them if you don't need them!)
 37 | 
 38 | 
 39 | ## Running toy cluster
 40 | 
 41 | You have two ways to run a cluster on a single machine:
 42 | 
 43 | - Use `toy-cluster.sh` script...
 44 | - Or use `docker-compose.yml` file
 45 | 
 46 | 
 47 | ### Using toy-cluster.sh script
 48 | 
 49 | The script has the following commands:
 50 | 
 51 | - deploy: create a new Docker network, containers (a master and 3 workers) and start these last
 52 | - start: start the existing containers
 53 | - stop: stop the running containers
 54 | - remove: remove all the created containers
 55 | - info:: useful URLs
 56 | 
 57 | So, if you want to try your new cluster run `./toy-cluster.sh deploy` to create a network, containers and format namenode HDFS (note that this script will start the containers too). To stop, start again or remove just run the `stop`, `start` or `remove` respectively.
 58 | 
 59 | Use `./toy-cluster.sh info` to see the URLs to check Hadoop and Spark clusters status.
 60 | 
 61 | 
 62 | ### Using docker-compose.yml file
 63 | 
 64 | The `docker-compose.yml` file has the same structure than `toy-cluster.sh` script except for the use of volumes to preserve HDFS data.
 65 | 
 66 | Only for the first time, you need to format the namenode information directory. **Do not execute this command when you are in production with valid data stored as you will lose all your data stored in the HDFS**:
 67 | 
 68 | `docker container run --rm -v hdfs_master_data_swarm:/home/hadoop/data/nameNode jwaresolutions/big-data-cluster:<tag> /usr/local/hadoop/bin/hadoop namenode -format`
 69 | 
 70 | Then you can manage your toy cluster with the following commands:
 71 | 
 72 | - To start the cluster run: `docker-compose up -d`
 73 | - To stop the cluster run: `docker-compose down`
 74 | 
 75 | **Important:** the use of `./toy-cluster.sh info` works with this! So you can get the useful cluster URLs.
 76 | 
 77 | 
 78 | ## Running a real cluster in Docker Swarm
 79 | 
 80 | Here is the important stuff, there are some minors steps to do to make it work: first of all you need a Docker Swarm cluster:
 81 | 
 82 | 1. Start the cluster in your master node: `docker swarm init`.
 83 | 1. Generate a token for the workers to be added ([official doc][swarm-docs]): `docker swarm join-token worker`. It will print on screen a token in a command that must be executed in all the workers to be added.
 84 | 1. Run the command generated in the previous step in all workers node: `docker swarm join: --token <token generated> <HOST>:<PORT>`
 85 | 
 86 | You have your Docker Swarm cluster! Now you have to label all the nodes to indicate which one will be the *master* and *workers*. On master node run:
 87 | 
 88 | 1. List all cluster nodes to get their ID: `docker node ls`
 89 | 1. Label master node as master: `docker node update --label-add role=master <MASTER NODE ID>`
 90 | 1. **For every** worker ID node run: `docker node update --label-add role=worker <WORKER NODE ID>`
 91 | 
 92 | Create needed network and volumes:
 93 | 
 94 | ```
 95 | docker network create -d overlay cluster_net_swarm
 96 | docker volume create --name=hdfs_master_data_swarm
 97 | docker volume create --name=hdfs_master_checkpoint_data_swarm
 98 | docker volume create --name=hdfs_worker_data_swarm
 99 | ```
100 | 
101 | Now it is time to select a tag of the Docker image. The default is latest but it is not recommended to use it in production. After choose one, set it version on `docker-compose_cluster.yml` and the command below.
102 | 
103 | Only for the first time, you need to format the namenode information directory **in Master and Workers nodes. Do not execute this command when you are in production with valid data stored as you will lose all your data stored in the HDFS**:
104 | 
105 | `docker container run --rm -v hdfs_master_data_swarm:/home/hadoop/data/nameNode jwaresolutions/big-data-cluster:<tag> /opt/hadoop-3.4.0/bin/hadoop namenode -format`
106 | 
107 | Now you are ready to deploy your production cluster!
108 | 
109 | `docker stack deploy -c docker-compose_cluster.yml big-data-cluster`
110 | 
111 | <!-- TODO: add ports -->
112 | 
113 | ## Usage
114 | 
115 | Finally you can use your cluster! Like the toy cluster, you have available some useful URLs:
116 | 
117 | - \<MASTER IP>:8080 -> Spark panel
118 | - \<MASTER IP>:18080 -> Spark applications logs
119 | - \<MASTER IP>:9870 -> HDFS panel
120 | 
121 |  Enter the master node:
122 | 
123 | `docker container exec -it <MASTER CONTAINER ID> bash`
124 | 
125 | 
126 | ### HDFS
127 | 
128 | You can store files in the Hadoop Distributed File System:
129 | 
130 | ```
131 | echo "test" > test.txt
132 | hdfs dfs -copyFromLocal ./test.txt /test.txt
133 | ```
134 | 
135 | If you check in a worker node that the file is visible in the entire cluster:
136 | 
137 | `hdfs dfs -ls /`
138 | 
139 | <!-- ### TODO: add Hadoop -->
140 | 
141 | ### Spark and PySpark
142 | 
143 | 1. You can initiate a PySpark console: `pyspark --master spark://master-node:7077`
144 | 	1. Now, for example, read a file and count lines:
145 | 
146 | 	```python
147 | 	lines = sc.textFile('hdfs://master-node:9000/test.txt')
148 | 	lines_count = lines.count()
149 | 	print(f'Line count -> {lines_count}')
150 | 	```
151 | 1. Or you can submit an script:
152 | 	1. Make the script:
153 | 	
154 | 	```python
155 | 	from pyspark import SparkContext
156 | 	import random
157 | 
158 | 	NUM_SAMPLES = 1000
159 | 
160 | 	sc = SparkContext("spark://master-node:7077", "Pi Estimation")
161 | 
162 | 
163 | 	def inside(p):
164 | 		x, y = random.random(), random.random()
165 | 		return x*x + y*y < 1
166 | 
167 | 	count = sc.parallelize(range(0, NUM_SAMPLES)) \
168 | 				.filter(inside).count()
169 | 	print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))
170 | 	```
171 | 	
172 | 	
173 | 	2. Submit it: `spark-submit your-script.py` 
174 | 
175 | 
176 | ## Going further
177 | 
178 | 
179 | ### Expand number of workers
180 | 
181 | Adding workers to cluster is easy:
182 | 
183 | 1. Add a worker to your Swarm cluster as explained in [Running a real cluster in Docker Swarm](##running-a-real-cluster-in-Docker-Swarm) and label it with `role=worker`.
184 | 1. Increment the number of replicas in `docker-compose_cluster.yml` for `worker` service.
185 | 1. Deploy the stack again with `docker stack deploy -c docker-compose_cluster.yml big-data-cluster` (restart is no required).
186 | 
187 | 
188 | ### Add files/folder inside cluster
189 | 
190 | In both `docker-compose.yml` (toy cluster) and `docker-compose_cluster.yml` (real cluster) there is a commented line in `volumes` section. Just uncomment it and set the the file/folder in host file and the destination inside master node in cluster! For more information read [official documentation][volumes-docs] about `volumes` setting in Docker Compose.
191 | 
192 | 
193 | ### Add Python dependencies
194 | 
195 | 1. Add the dependency to the `requirements.txt` file.
196 | 1. Build the image again.
197 | 
198 | 
199 | ### Check Spark logs
200 | 
201 | To check Spark `stderr` and `stdout` files you can run `bash` inside the Worker container and then run the following commands:
202 | 
203 | - stderr: `cat /sbin/spark-3.5.2-bin-without-hadoop/work/<app id>/<partition id>/stderr`
204 | - stdout: `cat /sbin/spark-3.5.2-bin-without-hadoop/work/<app id>/<partition id>/stdout`
205 | 
206 | 
207 | ## Frequent problems
208 | 
209 | 
210 | ### Connection refused error
211 | 
212 | Sometimes it throws a *Connection refused* error when run a HDFS command or try to access to DFS from Hadoop/Spark. There is [official documentation][connection-refused-docs] about this problem. The solution that worked for this repository was running the commands listed in [this Stack Overflow answer][connection-refused-answer]. That is why you need to format the namenode directory the first time you are deploying the real cluster (see [Running a real cluster in Docker Swarm](##running-a-real-cluster-in-docker-swarm)).
213 | 
214 | 
215 | ### Port 9870 is not working
216 | 
217 | This problem means that Namenode is now running in master node, is associated with [Connection refused for HDFS](###connection-refused-for-hdfs) problem and has the same solution. Once Namenode is running the port should be working correctly.
218 | 
219 | 
220 | ### HDFS panel does not show some living nodes
221 | 
222 | If there are nodes that are not listed as active in the HDFS panel you may also need to run the nanemode directory formatting command on the Workers nodes, not just the Driver. See [Running a real cluster in Docker Swarm](##running-a-real-cluster-in-docker-swarm) to get the command.
223 | 
224 | 
225 | ## Contributing
226 | 
227 | Any kind of help is welcome and appreciated! If you find a bug please submit an issue or make a PR:
228 | 
229 | 1. Fork this repo.
230 | 1. Create a branch where you will develop some changes.
231 | 1. Make a PR.
232 | 
233 | There are some TODOs to complete:
234 | 
235 | - [ ] Find a way to prevent *Connection refused* error to avoid format the namenode information directory
236 | - [ ] Add examples for Hadoop
237 | - [ ] Add examples for Hadoop Streaming
238 | - [ ] Add examples for Spark Streaming
239 | 
240 | 
241 | [rubenafo-repo]: https://github.com/rubenafo/docker-spark-cluster
242 | [sdesilva26-repo]: https://github.com/sdesilva26/docker-spark
243 | [swarm-docs]: https://docs.docker.com/engine/swarm/join-nodes/
244 | [volumes-docs]: https://docs.docker.com/compose/compose-file/compose-file-v3/#volumes
245 | [connection-refused-docs]: https://cwiki.apache.org/confluence/display/HADOOP2/ConnectionRefused
246 | [connection-refused-answer]: https://stackoverflow.com/a/42281292/7058363
247 | 


--------------------------------------------------------------------------------
/config/core-site.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |     <configuration>
 5 |         <property>
 6 |             <name>fs.defaultFS</name>
 7 |             <value>hdfs://master-node:9000</value>
 8 |         </property>
 9 |     </configuration>
10 | </configuration>


--------------------------------------------------------------------------------
/config/hdfs-site.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |     <property>
 5 |       <name>dfs.namenode.name.dir</name>
 6 |       <value>/home/hadoop/data/nameNode</value>
 7 |     </property>
 8 |     <property>
 9 |       <name>dfs.namenode.checkpoint.dir</name>
10 |       <value>/home/hadoop/data/namesecondary</value>
11 |     </property>    
12 |     <property>
13 |       <name>dfs.datanode.data.dir</name>
14 |       <value>/home/hadoop/data/dataNode</value>
15 |     </property>
16 |     <property>
17 |       <name>dfs.replication</name>
18 |       <value>1</value>
19 |     </property>
20 | </configuration>


--------------------------------------------------------------------------------
/config/log4j.properties:
--------------------------------------------------------------------------------
 1 | # it logs in only from Warning upwards.
 2 | # The rest of the configuration and comments are left as default by the framework.
 3 | log4j.rootCategory=WARN, console
 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender
 5 | log4j.appender.console.target=System.err
 6 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
 7 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
 8 | 
 9 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
10 | # log level for this class is used to overwrite the root logger's log level, so that
11 | # the user can have different defaults for the shell and regular Spark apps.
12 | log4j.logger.org.apache.spark.repl.Main=WARN
13 | 
14 | # Settings to quiet third party logs that are too verbose
15 | log4j.logger.org.spark_project.jetty=WARN
16 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
17 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
18 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
19 | log4j.logger.org.apache.parquet=ERROR
20 | log4j.logger.parquet=ERROR
21 | 
22 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
23 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
24 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
25 | 


--------------------------------------------------------------------------------
/config/mapred-site.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |     <property>
 5 |       <name>mapreduce.framework.name</name>
 6 |       <value>yarn</value>
 7 |     </property>
 8 |     <property>
 9 |       <name>mapreduce.application.classpath</name>
10 |       <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
11 |     </property>
12 |     <property>
13 |       <name>mapreduce.job.tracker</name>
14 |       <value>master-node:9001</value>
15 |     </property>    
16 | </configuration>


--------------------------------------------------------------------------------
/config/requirements.txt:
--------------------------------------------------------------------------------
 1 | filelock==3.16.0
 2 | matplotlib==3.9.2
 3 | numpy==2.1.1
 4 | pandas==2.2.2
 5 | pyarrow==17.0.0
 6 | pyspark==3.5.2
 7 | scikit-learn==1.5.2
 8 | scikit-survival==0.23.0
 9 | scipy==1.14.1
10 | seaborn==0.13.2
11 | setuptools==68.0.0
12 | 


--------------------------------------------------------------------------------
/config/spark-cmd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | service ssh start
 3 | 
 4 | # NOTE: SPARK_VERSION and HADOOP_HOME are defined in Dockerfile
 5 | 
 6 | echo "Starting HDFS and Yarn"
 7 | $HADOOP_HOME/sbin/start-dfs.sh
 8 | sleep 5
 9 | $HADOOP_HOME/sbin/start-yarn.sh
10 | sleep 5
11 | 
12 | if [[ $1 = "start" ]]; then
13 |     if [[ $2 = "master-node" ]]; then
14 |         ${SPARK_HOME}/sbin/start-master.sh
15 | 
16 |         # Starts history server to check running and completed applications
17 |         ${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /spark-logs
18 |         ${SPARK_HOME}/sbin/start-history-server.sh
19 | 
20 |         # Disables safe mode to prevent errors in small clusters
21 |         # ${HADOOP_HOME}/bin/hdfs dfsadmin -safemode leave
22 | 
23 |         sleep infinity
24 |         exit
25 |     fi
26 |     
27 |     # Sleeps to prevent connection issues with master
28 |     sleep 5
29 |     ${SPARK_HOME}/sbin/start-worker.sh master-node:7077
30 |     sleep infinity
31 |     exit
32 | fi
33 | 
34 | if [[ $1 = "stop" ]]; then
35 |     if [[ $2 = "master-node" ]]; then
36 |         ${SPARK_HOME}/sbin/stop-master.sh
37 |         exit
38 |     fi
39 |     ${SPARK_HOME}/sbin/stop-worker.sh
40 | fi
41 | 


--------------------------------------------------------------------------------
/config/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.eventLog.enabled          true
2 | spark.eventLog.dir              hdfs://master-node:9000/spark-logs
3 | spark.history.fs.logDirectory   hdfs://master-node:9000/spark-logs


--------------------------------------------------------------------------------
/config/spark-env.sh:
--------------------------------------------------------------------------------
1 | export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
2 | export SPARK_DIST_CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath)


--------------------------------------------------------------------------------
/config/yarn-site.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <configuration>
 3 |     <property>
 4 |         <name>yarn.nodemanager.aux-services</name>
 5 |         <value>mapreduce_shuffle</value>
 6 |     </property>
 7 |     <property>
 8 |         <name>yarn.nodemanager.env-whitelist</name>
 9 |         <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
10 |     </property>
11 |     <property>
12 |       <name>yarn.resourcemanager.hostname</name>
13 |       <value>master-node</value>
14 |     </property>
15 |     <!-- Port 8088 uses tcp6 protocol which does not work correctly with published ports using host mode in 
16 |     docker-compose file (source: https://stackoverflow.com/q/23128593/7058363).
17 |     Note that this setting is used only in master node -->
18 |     <property>
19 |       <name>yarn.resourcemanager.webapp.address</name>
20 |       <value>0.0.0.0:8088</value>
21 |     </property>
22 |     <!-- These two properties prevent some memory issues -->
23 |     <property>
24 |       <name>yarn.nodemanager.vmem-check-enabled</name>
25 |       <value>false</value>
26 |       <description>Whether virtual memory limits will be enforced for containers</description>
27 |     </property>
28 |     <property>
29 |       <name>yarn.nodemanager.vmem-pmem-ratio</name>
30 |       <value>4</value>
31 |       <description>Ratio between virtual memory to physical memory when setting memory limits for containers</description>
32 |     </property>
33 | </configuration>


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   # Master
 3 |   master-node:
 4 |     image: "jwaresolutions/big-data-cluster:1.0.1"
 5 |     container_name: "master-node"
 6 |     restart: "always"
 7 |     command: bash -c "/home/big_data/spark-cmd.sh start master-node"
 8 |     ports:
 9 |       - 8080:8080
10 |       - 9870:9870
11 |       - 18080:18080
12 |     networks:
13 |       - cluster-net
14 |     volumes:
15 |       # - "./data:/home/big_data/data" # Your data
16 |       - hdfs-master-data:/home/hadoop/data/nameNode
17 |       - hdfs-master-checkpoint-data:/home/hadoop/data/namesecondary
18 | 
19 |   # Workers
20 |   worker:
21 |     image: "jwaresolutions/big-data-cluster:1.0.1"
22 |     restart: "always"
23 |     command: bash -c "/home/big_data/spark-cmd.sh start"
24 |     deploy:
25 |     depends_on:
26 |       - "master-node"
27 |     volumes:
28 |       - hdfs-worker-data:/home/hadoop/data/dataNode
29 |     networks:
30 |       - cluster-net
31 | 
32 | volumes:
33 |   hdfs-master-data:
34 |   hdfs-master-checkpoint-data:
35 |   hdfs-worker-data:
36 | 
37 | # Create the cluster-net network
38 | networks:
39 |   cluster-net:
40 |     external: true
41 |     name: "cluster_net" # Useful for format as it does not allow '-' char on command
42 |     driver: bridge
43 |     attachable: false # Attachable: true prevents user to connect to Hadoop panels
44 | 


--------------------------------------------------------------------------------
/docker-compose_cluster.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   # Master
 3 |   master-node:
 4 |     image: "jwaresolutions/big-data-cluster:1.0.1"
 5 |     command: bash -c "/home/big_data/spark-cmd.sh start master-node"
 6 |     ports:
 7 |       - target: 8080
 8 |         published: 8080
 9 |         protocol: tcp
10 |         mode: host
11 |       - target: 9870
12 |         published: 9870
13 |         protocol: tcp
14 |         mode: host
15 |       - target: 18080
16 |         published: 18080
17 |         protocol: tcp
18 |         mode: host
19 |     networks:
20 |       - cluster-net
21 |     volumes:
22 |       # - "./data:/home/big_data/data" # Your data
23 |       - hdfs-master-data:/home/hadoop/data/nameNode
24 |       - hdfs-master-checkpoint-data:/home/hadoop/data/namesecondary
25 |     deploy:
26 |       mode: global # Required by Docker Swarm to make published ports work with other services
27 |       endpoint_mode: dnsrr # Required to prevent java.net.ConnectException
28 |       placement:
29 |         # Set node labels using `docker node update --label-add role=master <NODE ID>` from swarm manager
30 |         constraints:
31 |           - node.labels.role==master
32 | 
33 |   # Workers
34 |   worker:
35 |     image: "jwaresolutions/big-data-cluster:1.0.1"
36 |     command: bash -c "/home/big_data/spark-cmd.sh start"
37 |     depends_on:
38 |       - "master-node"
39 |     volumes:
40 |       - hdfs-worker-data:/home/hadoop/data/dataNode
41 |     deploy:
42 |       placement:
43 |         # Set node labels using `docker node update --label-add role=worker <NODE ID>` from swarm manager
44 |         constraints:
45 |           - node.labels.role==worker
46 |       # Deploy N containers for this service
47 |       replicas: 3
48 |     networks:
49 |       - cluster-net
50 | 
51 | volumes:
52 |   hdfs-master-data:
53 |     external: true
54 |     name: 'hdsf_master_data_swarm'
55 |   hdfs-master-checkpoint-data:
56 |     external: true
57 |     name: 'hdsf_master_checkpoint_data_swarm'
58 |   hdfs-worker-data:
59 |     external: true
60 |     name: 'hdsf_worker_data_swarm'
61 | 
62 | # Uses cluster-net network
63 | networks:
64 |   cluster-net:
65 |     external: true
66 |     name: cluster_net_swarm
67 | 


--------------------------------------------------------------------------------
/toy-cluster.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | imageName="jwaresolutions/big-data-cluster:1.0.1"
 4 | 
 5 | # Bring the services up
 6 | function startServices {
 7 |   docker start master-node worker-1 worker-2 worker-3
 8 |   sleep 5
 9 |   echo ">> Starting Master and Workers ..."
10 |   docker exec -d master-node /home/big_data/spark-cmd.sh start master-node
11 |   docker exec -d worker-1 /home/big_data/spark-cmd.sh start
12 |   docker exec -d worker-2 /home/big_data/spark-cmd.sh start
13 |   docker exec -d worker-3 /home/big_data/spark-cmd.sh start
14 |   show_info
15 | }
16 | 
17 | function show_info {
18 |   masterIp=`docker inspect -f "{{ .NetworkSettings.Networks.cluster_net.IPAddress }}" master-node`
19 |   echo "Hadoop info @ master-node: http://$masterIp:8088/cluster"
20 |   echo "Spark info @ master-node:  http://$masterIp:8080/"
21 |   echo "Spark applications logs @ master-node:  http://$masterIp:18080/"
22 |   echo "DFS Health @ master-node:  http://$masterIp:9870/dfshealth.html"
23 | }
24 | 
25 | if [[ $1 = "start" ]]; then
26 |   startServices
27 |   exit
28 | fi
29 | 
30 | if [[ $1 = "stop" ]]; then
31 |   docker exec -d master-node /home/big_data/spark-cmd.sh stop master-node
32 |   docker exec -d worker-1 /home/big_data/spark-cmd.sh stop
33 |   docker exec -d worker-2 /home/big_data/spark-cmd.sh stop
34 |   docker exec -d worker-3 /home/big_data/spark-cmd.sh stop
35 |   docker stop master-node worker-1 worker-2 worker-3
36 |   exit
37 | fi
38 | 
39 | if [[ $1 = "remove" ]]; then
40 |   docker rm master-node worker-1 worker-2 worker-3
41 |   exit
42 | fi
43 | 
44 | if [[ $1 = "deploy" ]]; then
45 |   docker container rm -f `docker ps -a | grep $imageName | awk '{ print $1 }'` # delete old containers
46 |   docker network rm cluster_net
47 |   docker network create --driver bridge cluster_net # create custom network
48 | 
49 |   # 3 nodes
50 |   echo ">> Starting nodes master and worker nodes ..."
51 |   docker run -dP --network cluster_net --name master-node -h master-node -it $imageName
52 |   docker run -dP --network cluster_net --name worker-1 -it -h worker-1 $imageName
53 |   docker run -dP --network cluster_net --name worker-2 -it -h worker-2 $imageName
54 |   docker run -dP --network cluster_net --name worker-3 -it -h worker-3 $imageName
55 | 
56 |   # Format master
57 |   echo ">> Formatting hdfs ..."
58 |   docker exec -it master-node ${HADOOP_HOME}/bin/hdfs namenode -format
59 | 
60 |   startServices
61 |   exit
62 | fi
63 | 
64 | if [[ $1 = "info" ]]; then
65 |   show_info
66 |   exit
67 | fi
68 | 
69 | echo "Usage: cluster.sh deploy|start|stop"
70 | echo "    deploy - create a new Docker network, containers (a master and 3 workers) and start these last"
71 | echo "    start  - start the existing containers"
72 | echo "    stop   - stop the running containers"
73 | echo "    remove - remove all the created containers"
74 | echo "    info   - useful URLs"
75 | 


--------------------------------------------------------------------------------
/version.txt:
--------------------------------------------------------------------------------
1 | 1.0.1


--------------------------------------------------------------------------------