├── .github
│   └── workflows
│       └── docker-build-publish.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── config
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   ├── log4j.properties
│   ├── mapred-site.xml
│   ├── requirements.txt
│   ├── spark-cmd.sh
│   ├── spark-defaults.conf
│   ├── spark-env.sh
│   └── yarn-site.xml
├── docker-compose.yml
├── docker-compose_cluster.yml
├── toy-cluster.sh
└── version.txt
/.github/workflows/docker-build-publish.yml:
--------------------------------------------------------------------------------
1 | name: Build, Publish Docker Image, and Tag Release
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - main  # Trigger workflow on pushes to the main branch
7 |   workflow_dispatch:  # Allow manual triggering of the workflow
8 |
9 | jobs:
10 |   build-and-publish:
11 |     runs-on: ubuntu-latest
12 |
13 |     steps:
14 |       - name: Checkout code
15 |         uses: actions/checkout@v3
16 |
17 |       - name: Read version from version.txt
18 |         id: read_version
19 |         run: |
20 |           version=$(cat version.txt)
21 |           echo "version=$version" >> $GITHUB_ENV
22 |
23 |       - name: Log in to Docker Hub
24 |         uses: docker/login-action@v2
25 |         with:
26 |           username: ${{ secrets.DOCKER_USERNAME }}
27 |           password: ${{ secrets.DOCKER_TOKEN }}
28 |
29 |       - name: Build and tag Docker image
30 |         run: |
31 |           docker image build -t jwaresolutions/big-data-cluster:$version .
32 |           docker tag jwaresolutions/big-data-cluster:$version jwaresolutions/big-data-cluster:latest
33 |
34 |       - name: Push Docker image
35 |         run: |
36 |           docker image push jwaresolutions/big-data-cluster:$version
37 |           docker image push jwaresolutions/big-data-cluster:latest
38 |
39 |       - name: Create Tag and Release
40 |         uses: avakar/tag-and-release@v1
41 |         with:
42 |           tag_name: "v${{ env.version }}"
43 |         env:
44 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/settings.json
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Base image
2 | FROM ubuntu:24.10
3 |
4 | # Set environment variables for non-interactive installation
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | # Adds some needed environment variables
8 | ENV HDFS_NAMENODE_USER=root
9 | ENV HDFS_DATANODE_USER=root
10 | ENV HDFS_SECONDARYNAMENODE_USER=root
11 | ENV YARN_RESOURCEMANAGER_USER=root
12 | ENV YARN_NODEMANAGER_USER=root
13 | ENV PYSPARK_PYTHON=python3
14 |
15 | # Install required packages. NOTE: sudo is needed as it's called in some Spark scripts
16 | ENV OPEN_JDK_VERSION=21
17 | RUN apt update && apt install -y \
18 |     openjdk-${OPEN_JDK_VERSION}-jdk \
19 |     wget \
20 |     curl \
21 |     vim \
22 |     ssh \
23 |     rsync \
24 |     git \
25 |     net-tools \
26 |     python3-pip \
27 |     python3-venv \
28 |     sudo \
29 |     && rm -rf /var/lib/apt/lists/*
30 |
31 | # Set JAVA_HOME environment variable
32 | ENV JAVA_HOME=/usr/lib/jvm/java-${OPEN_JDK_VERSION}-openjdk-amd64
33 | ENV PATH=$JAVA_HOME/bin:$PATH
34 |
35 | # Install Hadoop
36 | ENV HADOOP_VERSION=3.4.0
37 | RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \
38 |     && tar -xzf hadoop-${HADOOP_VERSION}.tar.gz -C /opt/ \
39 |     && rm hadoop-${HADOOP_VERSION}.tar.gz
40 | ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
41 | ENV PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
42 |
43 | # Creates the necessary directories for Hadoop
44 | RUN mkdir -p ${HADOOP_HOME}/logs
45 |
46 | # Install Spark
47 | ENV SPARK_VERSION=3.5.5
48 | RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz \
49 |     && tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz -C /opt/ \
50 |     && rm spark-${SPARK_VERSION}-bin-hadoop3.tgz
51 | ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}-bin-hadoop3
52 | ENV PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
53 |
54 | # Set up SSH (for Hadoop to communicate across nodes)
55 | RUN ssh-keygen -t rsa -f ~/.ssh/id_rsa -P '' \
56 |     && cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys \
57 |     && chmod 0600 ~/.ssh/authorized_keys
58 |
59 | # Create and activate a virtual environment for Python.
60 | ENV VIRTUAL_ENV=/opt/venv
61 | RUN python3 -m venv $VIRTUAL_ENV
62 | ENV PATH="$VIRTUAL_ENV/bin:$PATH"
63 |
64 | # Copy requirements.txt and install Python dependencies in the virtual environment
65 | COPY ./config/requirements.txt /tmp/
66 | RUN pip install --upgrade pip \
67 |     && pip install -r /tmp/requirements.txt
68 |
69 | # Hadoop settings
70 | WORKDIR ${HADOOP_HOME}/etc/hadoop
71 | COPY ./config/core-site.xml .
72 | COPY ./config/hdfs-site.xml .
73 | COPY ./config/mapred-site.xml .
74 | COPY ./config/yarn-site.xml .
75 |
76 | # Spark settings
77 | WORKDIR ${SPARK_HOME}/conf
78 | COPY ./config/spark-env.sh .
79 | COPY ./config/spark-defaults.conf .
80 | COPY ./config/log4j.properties .
81 |
82 | # Cluster cmd
83 | WORKDIR /home/big_data
84 | COPY ./config/spark-cmd.sh .
85 | RUN chmod +x /home/big_data/spark-cmd.sh
86 |
87 | # Add an explicit step to set JAVA_HOME in the bash profile to make it available to all users
88 | RUN echo "export JAVA_HOME=$JAVA_HOME" >> /etc/profile \
89 |     && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
90 |     && echo "export PATH=$JAVA_HOME/bin:$PATH" >> /etc/profile \
91 |     && echo 'export PATH=$PATH:$HADOOP_HOME/bin' >> ~/.bashrc \
92 |     && echo 'export PATH=$PATH:$HADOOP_HOME/sbin' >> ~/.bashrc
93 |
94 | # Expose necessary ports (8080 -> Spark UI, 18080 -> Spark applications logs, 9870 -> Hadoop NameNode UI)
95 | EXPOSE 8080 18080 9870
96 |
97 | # Start SSH service. The entrypoint is defined in the docker-compose file
98 | CMD ["bash", "-c", "service ssh start && sleep infinity"]
99 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 JWare Solutions
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Docker Big Data Cluster
3 |
4 | A ready-to-go Big Data cluster (Hadoop + Hadoop Streaming + Spark + PySpark) with Docker and Docker Swarm!
5 |
6 |
7 | ## Index
8 |
9 | 1. [Why?](#why)
10 | 1. [Features](#features)
11 | 1. [Running toy cluster](#running-toy-cluster)
12 | 1. [Running a real cluster in Docker Swarm](#running-a-real-cluster-in-docker-swarm)
13 | 1. [Usage](#usage)
14 | 1. [HDFS](#hdfs)
15 | 1. [Spark and PySpark](#spark-and-pyspark)
16 | 1. [Going further](#going-further)
17 | 1. [Frequent problems](#frequent-problems)
18 | 1. [Contributing](#contributing)
19 |
20 |
21 | ## Why?
22 |
23 | Although today you can find several repositories ready to deploy a Spark or Hadoop cluster, they all run into the same problem: they do not work when deployed on Docker Swarm, due to several issues ranging from the definition of the worker nodes to connection problems with Docker network interfaces.
24 |
25 | This repository seeks to solve that problem by offering a functional alternative: both a toy cluster to deploy on a single machine and a real cluster that works on the multiple nodes that form a Docker Swarm cluster.
26 |
27 |
28 | ## Features
29 |
30 | This repository is inspired by and uses several scripts taken from [Rubenafo's repo][rubenafo-repo] and [Sdesilva26's repo][sdesilva26-repo]. However, several changes have been introduced: the API is simpler, there is more documentation about usage, and there are some extra features:
31 |
32 | - ✅ Ready to deploy in a Docker Swarm cluster: all the networking and port configuration issues have been fixed so you can scale your cluster to as many worker nodes as you need.
33 | - ⚡️ Hadoop, HDFS, Spark, Scala and PySpark ready to use: all the tools are available inside the container globally so you don't have to fight with environment variables and executable paths.
34 | - 🌟 New technology: our image offers Hadoop 3.4.0, Spark 3.5.5 and Python 3.12.6
35 | - ⚙️ Less configuration: we have removed some settings to keep the minimum possible configuration; this way you prevent errors and unexpected behaviors, and you get the freedom to set parameters via environment variables and an agile development flow that does not require rebuilding the Docker image.
36 | - 🐍 Python dependencies: we include the most used Python dependencies like Pandas, Numpy and Scipy to be able to work on datasets and perform mathematical operations (you can remove them if you don't need them!)
37 |
38 |
39 | ## Running toy cluster
40 |
41 | You have two ways to run a cluster on a single machine:
42 |
43 | - Use the `toy-cluster.sh` script...
44 | - Or use the `docker-compose.yml` file
45 |
46 |
47 | ### Using toy-cluster.sh script
48 |
49 | The script has the following commands:
50 |
51 | - deploy: create a new Docker network, containers (a master and 3 workers) and start them
52 | - start: start the existing containers
53 | - stop: stop the running containers
54 | - remove: remove all the created containers
55 | - info: useful URLs
56 |
57 | So, if you want to try your new cluster, run `./toy-cluster.sh deploy` to create a network and containers and format the HDFS namenode (note that this script will start the containers too). To stop, start again or remove them, just run `stop`, `start` or `remove` respectively.
58 |
59 | Use `./toy-cluster.sh info` to see the URLs to check the Hadoop and Spark clusters' status.
60 |
61 |
62 | ### Using docker-compose.yml file
63 |
64 | The `docker-compose.yml` file has the same structure as the `toy-cluster.sh` script, except for the use of volumes to preserve HDFS data.
65 |
66 | Only the first time, you need to format the namenode information directory. **Do not execute this command when you are in production with valid data stored, as you will lose all the data stored in HDFS**:
67 |
68 | `docker container run --rm -v hdfs_master_data_swarm:/home/hadoop/data/nameNode jwaresolutions/big-data-cluster:<tag> /opt/hadoop-3.4.0/bin/hadoop namenode -format`
69 |
70 | Then you can manage your toy cluster with the following commands:
71 |
72 | - To start the cluster run: `docker-compose up -d`
73 | - To stop the cluster run: `docker-compose down`
74 |
75 | **Important:** `./toy-cluster.sh info` works with this setup too, so you can still get the useful cluster URLs.
76 |
77 |
78 | ## Running a real cluster in Docker Swarm
79 |
80 | Here is the important stuff. There are some minor steps to follow to make it work: first of all, you need a Docker Swarm cluster:
81 |
82 | 1. Start the cluster in your master node: `docker swarm init`.
83 | 1. Generate a token for the workers to be added ([official doc][swarm-docs]): `docker swarm join-token worker`. It will print on screen the join command (including the token) that must be executed on all the workers to be added.
84 | 1. Run the command generated in the previous step on all worker nodes: `docker swarm join --token <token> <manager-IP>:<port>`
85 |
86 | You have your Docker Swarm cluster! Now you have to label all the nodes to indicate which ones will be the *master* and the *workers*. On the master node run:
87 |
88 | 1. List all cluster nodes to get their IDs: `docker node ls`
89 | 1. Label the master node as master: `docker node update --label-add role=master <node-ID>`
90 | 1. **For every** worker node ID run: `docker node update --label-add role=worker <node-ID>`
91 |
92 | Create the needed network and volumes:
93 |
94 | ```
95 | docker network create -d overlay cluster_net_swarm
96 | docker volume create --name=hdfs_master_data_swarm
97 | docker volume create --name=hdfs_master_checkpoint_data_swarm
98 | docker volume create --name=hdfs_worker_data_swarm
99 | ```
100 |
101 | Now it is time to select a tag of the Docker image. The default is `latest`, but it is not recommended to use it in production. After choosing one, set its version in `docker-compose_cluster.yml` and in the command below.
102 |
103 | Only the first time, you need to format the namenode information directory **on the master and worker nodes. Do not execute this command when you are in production with valid data stored, as you will lose all the data stored in HDFS**:
104 |
105 | `docker container run --rm -v hdfs_master_data_swarm:/home/hadoop/data/nameNode jwaresolutions/big-data-cluster:<tag> /opt/hadoop-3.4.0/bin/hadoop namenode -format`
106 |
107 | Now you are ready to deploy your production cluster!
108 |
109 | `docker stack deploy -c docker-compose_cluster.yml big-data-cluster`
110 |
111 |
112 |
113 | ## Usage
114 |
115 | Finally, you can use your cluster! Like the toy cluster, you have some useful URLs available:
116 |
117 | - \<master-node-IP\>:8080 -> Spark panel
118 | - \<master-node-IP\>:18080 -> Spark applications logs
119 | - \<master-node-IP\>:9870 -> HDFS panel
120 |
121 | Enter the master node:
122 |
123 | `docker container exec -it <container-ID> bash`
124 |
125 |
126 | ### HDFS
127 |
128 | You can store files in the Hadoop Distributed File System:
129 |
130 | ```
131 | echo "test" > test.txt
132 | hdfs dfs -copyFromLocal ./test.txt /test.txt
133 | ```
134 |
135 | You can check from a worker node that the file is visible to the entire cluster:
136 |
137 | `hdfs dfs -ls /`
138 |
139 |
140 |
141 | ### Spark and PySpark
142 |
143 | 1. You can initiate a PySpark console: `pyspark --master spark://master-node:7077`
144 | 1. Now, for example, read a file and count its lines:
145 |
146 | ```python
147 | lines = sc.textFile('hdfs://master-node:9000/test.txt')
148 | lines_count = lines.count()
149 | print(f'Line count -> {lines_count}')
150 | ```
151 | 1. Or you can submit a script:
152 |     1. Make the script:
153 |
154 | ```python
155 | from pyspark import SparkContext
156 | import random
157 |
158 | NUM_SAMPLES = 1000
159 |
160 | sc = SparkContext("spark://master-node:7077", "Pi Estimation")
161 |
162 |
163 | def inside(p):
164 |     x, y = random.random(), random.random()
165 |     return x*x + y*y < 1
166 |
167 | count = sc.parallelize(range(0, NUM_SAMPLES)) \
168 |     .filter(inside).count()
169 | print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))
170 | ```
171 |
172 |
173 |     2. Submit it: `spark-submit your-script.py`
174 |
175 |
176 | ## Going further
177 |
178 |
179 | ### Expand number of workers
180 |
181 | Adding workers to the cluster is easy:
182 |
183 | 1. Add a worker to your Swarm cluster as explained in [Running a real cluster in Docker Swarm](#running-a-real-cluster-in-docker-swarm) and label it with `role=worker`.
184 | 1. Increment the number of replicas in `docker-compose_cluster.yml` for the `worker` service.
185 | 1. Deploy the stack again with `docker stack deploy -c docker-compose_cluster.yml big-data-cluster` (a restart is not required).
186 |
187 |
188 | ### Add files/folder inside cluster
189 |
190 | In both `docker-compose.yml` (toy cluster) and `docker-compose_cluster.yml` (real cluster) there is a commented line in the `volumes` section. Just uncomment it and set the file/folder on the host and the destination inside the master node! For more information, read the [official documentation][volumes-docs] about the `volumes` setting in Docker Compose.
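
As a quick sketch, this is what that mapping looks like once uncommented; the `./data` path below is just the sample value already present (commented out) in both compose files, so swap in your own host path and target:

```yaml
services:
  master-node:
    volumes:
      # host path on the left, path inside the master node on the right
      - "./data:/home/big_data/data"
```

Note that bind mounts are resolved on the node where the container runs, so on Swarm the host path must exist on the machine labeled as master.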
191 |
192 |
193 | ### Add Python dependencies
194 |
195 | 1. Add the dependency to the `requirements.txt` file.
196 | 1. Build the image again.
197 |
198 |
199 | ### Check Spark logs
200 |
201 | To check the Spark `stderr` and `stdout` files you can run `bash` inside a worker container and then run the following commands:
202 |
203 | - stderr: `cat $SPARK_HOME/work/<app-ID>/<executor-ID>/stderr`
204 | - stdout: `cat $SPARK_HOME/work/<app-ID>/<executor-ID>/stdout`
205 |
206 |
207 | ## Frequent problems
208 |
209 |
210 | ### Connection refused error
211 |
212 | Sometimes a *Connection refused* error is thrown when running an HDFS command or trying to access the DFS from Hadoop/Spark. There is [official documentation][connection-refused-docs] about this problem. The solution that worked for this repository was running the commands listed in [this Stack Overflow answer][connection-refused-answer]. That is why you need to format the namenode directory the first time you deploy the real cluster (see [Running a real cluster in Docker Swarm](#running-a-real-cluster-in-docker-swarm)).
213 |
214 |
215 | ### Port 9870 is not working
216 |
217 | This problem means that the Namenode is not running on the master node. It is associated with the [Connection refused error](#connection-refused-error) problem and has the same solution. Once the Namenode is running, the port should work correctly.
218 |
219 |
220 | ### HDFS panel does not show some living nodes
221 |
222 | If there are nodes that are not listed as active in the HDFS panel, you may also need to run the namenode directory formatting command on the worker nodes, not just on the master node. See [Running a real cluster in Docker Swarm](#running-a-real-cluster-in-docker-swarm) to get the command.
223 |
224 |
225 | ## Contributing
226 |
227 | Any kind of help is welcome and appreciated! If you find a bug, please submit an issue or make a PR:
228 |
229 | 1. Fork this repo.
230 | 1. Create a branch where you will develop some changes.
231 | 1. Make a PR.
232 |
233 | There are some TODOs to complete:
234 |
235 | - [ ] Find a way to prevent the *Connection refused* error, to avoid having to format the namenode information directory
236 | - [ ] Add examples for Hadoop
237 | - [ ] Add examples for Hadoop Streaming
238 | - [ ] Add examples for Spark Streaming
239 |
240 |
241 | [rubenafo-repo]: https://github.com/rubenafo/docker-spark-cluster
242 | [sdesilva26-repo]: https://github.com/sdesilva26/docker-spark
243 | [swarm-docs]: https://docs.docker.com/engine/swarm/join-nodes/
244 | [volumes-docs]: https://docs.docker.com/compose/compose-file/compose-file-v3/#volumes
245 | [connection-refused-docs]: https://cwiki.apache.org/confluence/display/HADOOP2/ConnectionRefused
246 | [connection-refused-answer]: https://stackoverflow.com/a/42281292/7058363
--------------------------------------------------------------------------------
/config/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <configuration>
5 |     <property>
6 |         <name>fs.defaultFS</name>
7 |         <value>hdfs://master-node:9000</value>
8 |     </property>
9 | </configuration>
10 |
--------------------------------------------------------------------------------
/config/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |     <property>
5 |         <name>dfs.namenode.name.dir</name>
6 |         <value>/home/hadoop/data/nameNode</value>
7 |     </property>
8 |     <property>
9 |         <name>dfs.namenode.checkpoint.dir</name>
10 |         <value>/home/hadoop/data/namesecondary</value>
11 |     </property>
12 |     <property>
13 |         <name>dfs.datanode.data.dir</name>
14 |         <value>/home/hadoop/data/dataNode</value>
15 |     </property>
16 |     <property>
17 |         <name>dfs.replication</name>
18 |         <value>1</value>
19 |     </property>
20 | </configuration>
--------------------------------------------------------------------------------
/config/log4j.properties:
--------------------------------------------------------------------------------
1 | # It only logs from WARN level upwards.
2 | # The rest of the configuration and comments are left as default by the framework.
3 | log4j.rootCategory=WARN, console
4 | log4j.appender.console=org.apache.log4j.ConsoleAppender
5 | log4j.appender.console.target=System.err
6 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
7 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
8 |
9 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
10 | # log level for this class is used to overwrite the root logger's log level, so that
11 | # the user can have different defaults for the shell and regular Spark apps.
12 | log4j.logger.org.apache.spark.repl.Main=WARN
13 |
14 | # Settings to quiet third party logs that are too verbose
15 | log4j.logger.org.spark_project.jetty=WARN
16 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
17 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
18 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
19 | log4j.logger.org.apache.parquet=ERROR
20 | log4j.logger.parquet=ERROR
21 |
22 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
23 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
24 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
25 |
--------------------------------------------------------------------------------
/config/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |     <property>
5 |         <name>mapreduce.framework.name</name>
6 |         <value>yarn</value>
7 |     </property>
8 |     <property>
9 |         <name>mapreduce.application.classpath</name>
10 |         <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
11 |     </property>
12 |     <property>
13 |         <name>mapreduce.job.tracker</name>
14 |         <value>master-node:9001</value>
15 |     </property>
16 | </configuration>
--------------------------------------------------------------------------------
/config/requirements.txt:
--------------------------------------------------------------------------------
1 | filelock==3.16.0
2 | matplotlib==3.9.2
3 | numpy==2.1.1
4 | pandas==2.2.2
5 | pyarrow==17.0.0
6 | pyspark==3.5.2
7 | scikit-learn==1.5.2
8 | scikit-survival==0.23.0
9 | scipy==1.14.1
10 | seaborn==0.13.2
11 | setuptools==68.0.0
--------------------------------------------------------------------------------
/config/spark-cmd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | service ssh start
3 |
4 | # NOTE: SPARK_VERSION and HADOOP_HOME are defined in Dockerfile
5 |
6 | echo "Starting HDFS and Yarn"
7 | $HADOOP_HOME/sbin/start-dfs.sh
8 | sleep 5
9 | $HADOOP_HOME/sbin/start-yarn.sh
10 | sleep 5
11 |
12 | if [[ $1 = "start" ]]; then
13 |     if [[ $2 = "master-node" ]]; then
14 |         ${SPARK_HOME}/sbin/start-master.sh
15 |
16 |         # Starts history server to check running and completed applications
17 |         ${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /spark-logs
18 |         ${SPARK_HOME}/sbin/start-history-server.sh
19 |
20 |         # Disables safe mode to prevent errors in small clusters
21 |         # ${HADOOP_HOME}/bin/hdfs dfsadmin -safemode leave
22 |
23 |         sleep infinity
24 |         exit
25 |     fi
26 |
27 |     # Sleeps to prevent connection issues with master
28 |     sleep 5
29 |     ${SPARK_HOME}/sbin/start-worker.sh master-node:7077
30 |     sleep infinity
31 |     exit
32 | fi
33 |
34 | if [[ $1 = "stop" ]]; then
35 |     if [[ $2 = "master-node" ]]; then
36 |         ${SPARK_HOME}/sbin/stop-master.sh
37 |         exit
38 |     fi
39 |     ${SPARK_HOME}/sbin/stop-worker.sh
40 | fi
41 |
--------------------------------------------------------------------------------
/config/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.eventLog.enabled           true
2 | spark.eventLog.dir               hdfs://master-node:9000/spark-logs
3 | spark.history.fs.logDirectory    hdfs://master-node:9000/spark-logs
--------------------------------------------------------------------------------
/config/spark-env.sh:
--------------------------------------------------------------------------------
1 | export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
2 | export SPARK_DIST_CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath)
--------------------------------------------------------------------------------
/config/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |     <property>
4 |         <name>yarn.nodemanager.aux-services</name>
5 |         <value>mapreduce_shuffle</value>
6 |     </property>
7 |     <property>
8 |         <name>yarn.nodemanager.env-whitelist</name>
9 |         <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
10 |     </property>
11 |     <property>
12 |         <name>yarn.resourcemanager.hostname</name>
13 |         <value>master-node</value>
14 |     </property>
15 |
18 |     <property>
19 |         <name>yarn.resourcemanager.webapp.address</name>
20 |         <value>0.0.0.0:8088</value>
21 |     </property>
22 |
23 |     <property>
24 |         <name>yarn.nodemanager.vmem-check-enabled</name>
25 |         <value>false</value>
26 |         <description>Whether virtual memory limits will be enforced for containers</description>
27 |     </property>
28 |     <property>
29 |         <name>yarn.nodemanager.vmem-pmem-ratio</name>
30 |         <value>4</value>
31 |         <description>Ratio between virtual memory to physical memory when setting memory limits for containers</description>
32 |     </property>
33 | </configuration>
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   # Master
3 |   master-node:
4 |     image: "jwaresolutions/big-data-cluster:1.0.1"
5 |     container_name: "master-node"
6 |     restart: "always"
7 |     command: bash -c "/home/big_data/spark-cmd.sh start master-node"
8 |     ports:
9 |       - 8080:8080
10 |       - 9870:9870
11 |       - 18080:18080
12 |     networks:
13 |       - cluster-net
14 |     volumes:
15 |       # - "./data:/home/big_data/data" # Your data
16 |       - hdfs-master-data:/home/hadoop/data/nameNode
17 |       - hdfs-master-checkpoint-data:/home/hadoop/data/namesecondary
18 |
19 |   # Workers
20 |   worker:
21 |     image: "jwaresolutions/big-data-cluster:1.0.1"
22 |     restart: "always"
23 |     command: bash -c "/home/big_data/spark-cmd.sh start"
24 |
25 |     depends_on:
26 |       - "master-node"
27 |     volumes:
28 |       - hdfs-worker-data:/home/hadoop/data/dataNode
29 |     networks:
30 |       - cluster-net
31 |
32 | volumes:
33 |   hdfs-master-data:
34 |   hdfs-master-checkpoint-data:
35 |   hdfs-worker-data:
36 |
37 | # Create the cluster-net network
38 | networks:
39 |   cluster-net:
40 |     external: true
41 |     name: "cluster_net" # Useful for the format command as it does not allow the '-' char
42 |     driver: bridge
43 |     attachable: false # attachable: true prevents the user from connecting to the Hadoop panels
44 |
--------------------------------------------------------------------------------
/docker-compose_cluster.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   # Master
3 |   master-node:
4 |     image: "jwaresolutions/big-data-cluster:1.0.1"
5 |     command: bash -c "/home/big_data/spark-cmd.sh start master-node"
6 |     ports:
7 |       - target: 8080
8 |         published: 8080
9 |         protocol: tcp
10 |         mode: host
11 |       - target: 9870
12 |         published: 9870
13 |         protocol: tcp
14 |         mode: host
15 |       - target: 18080
16 |         published: 18080
17 |         protocol: tcp
18 |         mode: host
19 |     networks:
20 |       - cluster-net
21 |     volumes:
22 |       # - "./data:/home/big_data/data" # Your data
23 |       - hdfs-master-data:/home/hadoop/data/nameNode
24 |       - hdfs-master-checkpoint-data:/home/hadoop/data/namesecondary
25 |     deploy:
26 |       mode: global # Required by Docker Swarm to make published ports work with other services
27 |       endpoint_mode: dnsrr # Required to prevent java.net.ConnectException
28 |       placement:
29 |         # Set node labels using `docker node update --label-add role=master <node-ID>` from swarm manager
30 |         constraints:
31 |           - node.labels.role==master
32 |
33 |   # Workers
34 |   worker:
35 |     image: "jwaresolutions/big-data-cluster:1.0.1"
36 |     command: bash -c "/home/big_data/spark-cmd.sh start"
37 |     depends_on:
38 |       - "master-node"
39 |     volumes:
40 |       - hdfs-worker-data:/home/hadoop/data/dataNode
41 |     deploy:
42 |       placement:
43 |         # Set node labels using `docker node update --label-add role=worker <node-ID>` from swarm manager
44 |         constraints:
45 |           - node.labels.role==worker
46 |       # Deploy N containers for this service
47 |       replicas: 3
48 |     networks:
49 |       - cluster-net
50 |
51 | volumes:
52 |   hdfs-master-data:
53 |     external: true
54 |     name: 'hdfs_master_data_swarm'
55 |   hdfs-master-checkpoint-data:
56 |     external: true
57 |     name: 'hdfs_master_checkpoint_data_swarm'
58 |   hdfs-worker-data:
59 |     external: true
60 |     name: 'hdfs_worker_data_swarm'
61 |
62 | # Uses cluster-net network
63 | networks:
64 |   cluster-net:
65 |     external: true
66 |     name: cluster_net_swarm
67 |
--------------------------------------------------------------------------------
/toy-cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | imageName="jwaresolutions/big-data-cluster:1.0.1"
4 |
5 | # Bring the services up
6 | function startServices {
7 |     docker start master-node worker-1 worker-2 worker-3
8 |     sleep 5
9 |     echo ">> Starting Master and Workers ..."
10 |     docker exec -d master-node /home/big_data/spark-cmd.sh start master-node
11 |     docker exec -d worker-1 /home/big_data/spark-cmd.sh start
12 |     docker exec -d worker-2 /home/big_data/spark-cmd.sh start
13 |     docker exec -d worker-3 /home/big_data/spark-cmd.sh start
14 |     show_info
15 | }
16 |
17 | function show_info {
18 |     masterIp=`docker inspect -f "{{ .NetworkSettings.Networks.cluster_net.IPAddress }}" master-node`
19 |     echo "Hadoop info @ master-node: http://$masterIp:8088/cluster"
20 |     echo "Spark info @ master-node: http://$masterIp:8080/"
21 |     echo "Spark applications logs @ master-node: http://$masterIp:18080/"
22 |     echo "DFS Health @ master-node: http://$masterIp:9870/dfshealth.html"
23 | }
24 |
25 | if [[ $1 = "start" ]]; then
26 |     startServices
27 |     exit
28 | fi
29 |
30 | if [[ $1 = "stop" ]]; then
31 |     docker exec -d master-node /home/big_data/spark-cmd.sh stop master-node
32 |     docker exec -d worker-1 /home/big_data/spark-cmd.sh stop
33 |     docker exec -d worker-2 /home/big_data/spark-cmd.sh stop
34 |     docker exec -d worker-3 /home/big_data/spark-cmd.sh stop
35 |     docker stop master-node worker-1 worker-2 worker-3
36 |     exit
37 | fi
38 |
39 | if [[ $1 = "remove" ]]; then
40 |     docker rm master-node worker-1 worker-2 worker-3
41 |     exit
42 | fi
43 |
44 | if [[ $1 = "deploy" ]]; then
45 |     docker container rm -f `docker ps -a | grep $imageName | awk '{ print $1 }'` # delete old containers
46 |     docker network rm cluster_net
47 |     docker network create --driver bridge cluster_net # create custom network
48 |
49 |     # 3 nodes
50 |     echo ">> Starting master and worker nodes ..."
51 |     docker run -dP --network cluster_net --name master-node -h master-node -it $imageName
52 |     docker run -dP --network cluster_net --name worker-1 -it -h worker-1 $imageName
53 |     docker run -dP --network cluster_net --name worker-2 -it -h worker-2 $imageName
54 |     docker run -dP --network cluster_net --name worker-3 -it -h worker-3 $imageName
55 |
56 |     # Format master
57 |     echo ">> Formatting hdfs ..."
58 |     docker exec -it master-node bash -c '${HADOOP_HOME}/bin/hdfs namenode -format'
59 |
60 |     startServices
61 |     exit
62 | fi
63 |
64 | if [[ $1 = "info" ]]; then
65 |     show_info
66 |     exit
67 | fi
68 |
69 | echo "Usage: toy-cluster.sh deploy|start|stop|remove|info"
70 | echo "  deploy - create a new Docker network, containers (a master and 3 workers) and start them"
71 | echo "  start  - start the existing containers"
72 | echo "  stop   - stop the running containers"
73 | echo "  remove - remove all the created containers"
74 | echo "  info   - useful URLs"
75 |
--------------------------------------------------------------------------------
/version.txt:
--------------------------------------------------------------------------------
1 | 1.0.1
--------------------------------------------------------------------------------