├── Dockerfile
├── LICENSE
├── README.md
├── count.py
├── docker-compose.yml
└── scripts
    ├── create.sh
    ├── destroy.sh
    └── run.sh

/Dockerfile:
--------------------------------------------------------------------------------
FROM debian:jessie

RUN apt-get update \
  && apt-get install -y curl unzip \
     python3 python3-setuptools \
  && ln -s /usr/bin/python3 /usr/bin/python \
  && easy_install3 pip py4j \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*

ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# JAVA
ARG JAVA_MAJOR_VERSION=8
ARG JAVA_UPDATE_VERSION=131
ARG JAVA_BUILD_NUMBER=11
ENV JAVA_HOME /usr/jdk1.${JAVA_MAJOR_VERSION}.0_${JAVA_UPDATE_VERSION}

ENV PATH $PATH:$JAVA_HOME/bin
RUN curl -sL --retry 3 --insecure \
  --header "Cookie: oraclelicense=accept-securebackup-cookie;" \
  "http://download.oracle.com/otn-pub/java/jdk/${JAVA_MAJOR_VERSION}u${JAVA_UPDATE_VERSION}-b${JAVA_BUILD_NUMBER}/d54c1d3a095b4ff2b6607d096fa80163/server-jre-${JAVA_MAJOR_VERSION}u${JAVA_UPDATE_VERSION}-linux-x64.tar.gz" \
  | gunzip \
  | tar x -C /usr/ \
  && ln -s $JAVA_HOME /usr/java \
  && rm -rf $JAVA_HOME/man

# HADOOP
ENV HADOOP_VERSION 3.2.2
ENV HADOOP_HOME /usr/hadoop-$HADOOP_VERSION
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $PATH:$HADOOP_HOME/bin
RUN curl -sL --retry 3 \
  "http://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" \
  | gunzip \
  | tar -x -C /usr/ \
  && rm -rf $HADOOP_HOME/share/doc \
  && chown -R root:root $HADOOP_HOME

# SPARK
ENV SPARK_VERSION 3.0.2
ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-without-hadoop
ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*"
ENV PATH $PATH:${SPARK_HOME}/bin
RUN curl -sL --retry 3 \
  "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz" \
  | gunzip \
  | tar x -C /usr/ \
  && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
  && chown -R root:root $SPARK_HOME

WORKDIR $SPARK_HOME
--------------------------------------------------------------------------------
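If you would rather rebuild the image yourself than pull the prebuilt `mjhea0/spark:3.0.2` referenced in *docker-compose.yml*, a minimal sketch is below; the tag simply mirrors that Compose file, so substitute your own Docker Hub namespace if you plan to push. Note that the Oracle JDK step relies on the old `oraclelicense` cookie trick against `download.oracle.com/otn-pub`, which may no longer resolve, so the build can fail there on a fresh machine.

```sh
# Build from the repo root, reusing the tag from docker-compose.yml.
$ docker build -t mjhea0/spark:3.0.2 .

# Smoke test: Spark lives in the WORKDIR and Hadoop is on the PATH,
# so each command should print its version.
$ docker run --rm mjhea0/spark:3.0.2 bin/spark-submit --version
$ docker run --rm mjhea0/spark:3.0.2 hadoop version
```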
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Michael Herman

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Running Spark with Docker Swarm on DigitalOcean

## Want to learn how to build this?

Check out the [post](https://testdriven.io/running-spark-with-docker-swarm-on-digitalocean).

## Want to use this project?

1. Fork/Clone

1. [Sign up](https://m.do.co/c/d8f211a4b4c2) for DigitalOcean and [generate](https://www.digitalocean.com/community/tutorials/how-to-use-the-digitalocean-api-v2) an access token

1. Add the token to your environment:

    ```sh
    $ export DIGITAL_OCEAN_ACCESS_TOKEN=[your_token]
    ```

1. Spin up three droplets and deploy Docker Swarm:

    ```sh
    $ sh scripts/create.sh
    ```

1. Run the sample Spark job (`count.py`):

    ```sh
    $ sh scripts/run.sh
    ```

1. Bring down the resources:

    ```sh
    $ sh scripts/destroy.sh
    ```
--------------------------------------------------------------------------------
/count.py:
--------------------------------------------------------------------------------
try:
    from pyspark import SparkContext, SparkConf
    from operator import add
except Exception as e:
    print(e)


def get_counts():
    """Count the occurrences of each word in a small test string and print the result."""
    words = "test test"
    conf = SparkConf().setAppName('letter count')
    sc = SparkContext(conf=conf)
    seq = words.split()
    data = sc.parallelize(seq)
    counts = data.map(lambda word: (word, 1)).reduceByKey(add).collect()
    sc.stop()
    print('\n{0}\n'.format(dict(counts)))


if __name__ == "__main__":
    get_counts()
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:

  master:
    image: mjhea0/spark:3.0.2
    command: bin/spark-class org.apache.spark.deploy.master.Master -h master
    hostname: master
    environment:
      MASTER: spark://master:7077
      SPARK_CONF_DIR: /conf
      SPARK_PUBLIC_DNS: ${EXTERNAL_IP}
    ports:
      - 4040:4040
      - 6066:6066
      - 7077:7077
      - 8080:8080

  worker:
    image: mjhea0/spark:3.0.2
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_PUBLIC_DNS: ${EXTERNAL_IP}
    depends_on:
      - master
    ports:
      - 8081:8081
--------------------------------------------------------------------------------
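Before provisioning any droplets, you can sanity-check the Compose file and the image on a single machine. A rough local smoke test is sketched below, assuming Docker Compose is installed and the image is available locally; `EXTERNAL_IP` only affects the URLs the Spark web UI advertises, so `localhost` is fine here.

```sh
# Start a one-master, one-worker cluster locally.
$ export EXTERNAL_IP=localhost
$ docker-compose up -d

# The master web UI should now be reachable at http://localhost:8080.
# Copy the sample job into the master container and submit it.
$ docker cp count.py "$(docker-compose ps -q master)":/tmp/count.py
$ docker-compose exec master bin/spark-submit --master spark://master:7077 /tmp/count.py

# Tear the local cluster back down.
$ docker-compose down
```

Leave the worker at a single replica locally: it publishes host port 8081, so scaling it past one container would collide on that port. On the Swarm, `scripts/create.sh` scales it with `docker service scale` instead.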
/scripts/create.sh:
--------------------------------------------------------------------------------
#!/bin/bash


echo "Spinning up three droplets..."

for i in 1 2 3; do
  docker-machine create \
    --driver digitalocean \
    --digitalocean-access-token $DIGITAL_OCEAN_ACCESS_TOKEN \
    --engine-install-url "https://releases.rancher.com/install-docker/19.03.9.sh" \
    node-$i;
done


echo "Initializing Swarm mode..."

docker-machine ssh node-1 -- docker swarm init --advertise-addr $(docker-machine ip node-1)

docker-machine ssh node-1 -- docker node update --availability drain node-1


echo "Adding the nodes to the Swarm..."

TOKEN=$(docker-machine ssh node-1 docker swarm join-token worker | grep token | awk '{ print $5 }')

docker-machine ssh node-2 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"
docker-machine ssh node-3 "docker swarm join --token ${TOKEN} $(docker-machine ip node-1):2377"


echo "Deploying Spark..."

eval $(docker-machine env node-1)
export EXTERNAL_IP=$(docker-machine ip node-2)
docker stack deploy --compose-file=docker-compose.yml spark
docker service scale spark_worker=2


echo "Getting the master node's address..."

NODE=$(docker service ps --format "{{.Node}}" spark_master)
docker-machine ip $NODE
--------------------------------------------------------------------------------
/scripts/destroy.sh:
--------------------------------------------------------------------------------
#!/bin/bash

docker-machine rm node-1 node-2 node-3 -y
--------------------------------------------------------------------------------
/scripts/run.sh:
--------------------------------------------------------------------------------
#!/bin/sh

echo "Getting container ID of the Spark master..."

eval $(docker-machine env node-1)
NODE=$(docker service ps --format "{{.Node}}" spark_master)
eval $(docker-machine env $NODE)
CONTAINER_ID=$(docker ps --filter name=master --format "{{.ID}}")


echo "Copying count.py script to the Spark master..."

docker cp count.py $CONTAINER_ID:/tmp


echo "Running Spark job..."

docker exec $CONTAINER_ID \
  bin/spark-submit \
    --master spark://master:7077 \
    /tmp/count.py
--------------------------------------------------------------------------------
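After `scripts/run.sh` finishes, the word counts from `count.py` appear in the driver output. To watch the cluster while a job runs, the master's web UI is published on port 8080 of whichever node hosts the `spark_master` service; the short lookup below just reuses the same `docker-machine` commands the scripts already rely on.

```sh
# Point the Docker CLI at the Swarm manager, find the node running the master,
# and print the web UI address.
eval $(docker-machine env node-1)
NODE=$(docker service ps --format "{{.Node}}" spark_master)
echo "Spark master UI: http://$(docker-machine ip $NODE):8080"
```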