├── .gitignore
├── README.md
├── create.sh
├── delete.sh
├── docker
│   ├── Dockerfile
│   ├── common.sh
│   ├── spark-defaults.conf
│   ├── spark-master
│   └── spark-worker
└── kubernetes
    ├── _old
    │   └── spark-kubernetes.yaml
    ├── minikube-ingress.yaml
    ├── spark-master-deployment.yaml
    ├── spark-master-service.yaml
    └── spark-worker-deployment.yaml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/testdrivenio/spark-kubernetes/75fcaa3d6da0b68f705f05030970efba62e0f3b7/.gitignore

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deploying Spark on Kubernetes

## Want to learn how to build this?

Check out the [post](https://testdriven.io/deploying-spark-on-kubernetes).

## Want to use this project?

### Minikube Setup

Install and run [Minikube](https://kubernetes.io/docs/setup/minikube/):

1. Install a [Hypervisor](https://kubernetes.io/docs/tasks/tools/install-minikube/#install-a-hypervisor) (like [VirtualBox](https://www.virtualbox.org/wiki/Downloads) or [HyperKit](https://github.com/moby/hyperkit)) to manage virtual machines
1. Install and set up [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) to deploy and manage apps on Kubernetes
1. Install [Minikube](https://github.com/kubernetes/minikube/releases)

Start the cluster:

```sh
$ minikube start --memory 8192 --cpus 4
$ minikube dashboard
```

Build the Docker image (tagged to match the image referenced by the Kubernetes manifests):

```sh
$ eval $(minikube docker-env)
$ docker build -t spark-hadoop:3.2.0 -f ./docker/Dockerfile ./docker
```

Create the deployments and services:

```sh
$ kubectl create -f ./kubernetes/spark-master-deployment.yaml
$ kubectl create -f ./kubernetes/spark-master-service.yaml
$ kubectl create -f ./kubernetes/spark-worker-deployment.yaml
$ minikube addons enable ingress
$ kubectl apply -f ./kubernetes/minikube-ingress.yaml
```

Add an entry to /etc/hosts:

```sh
$ echo "$(minikube ip) spark-kubernetes" | sudo tee -a /etc/hosts
```

Test it out in the browser at [http://spark-kubernetes/](http://spark-kubernetes/).
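You can also verify that the workers can actually run a job by submitting one of the bundled Spark examples from the master pod (a minimal sketch: `<spark-master-pod>` is a placeholder, look yours up with `kubectl get pods`, and the examples jar name assumed below is the one shipped with the prebuilt Spark 3.2.0 distribution):

```sh
# spark-defaults.conf in the image already points spark.master at spark://spark-master:7077
$ kubectl exec <spark-master-pod> -it -- \
    spark-submit --class org.apache.spark.examples.SparkPi \
    /opt/spark/examples/jars/spark-examples_2.12-3.2.0.jar 10
```

The driver runs in client mode inside the master pod, so the job's output is printed straight to your terminal.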
--------------------------------------------------------------------------------
/create.sh:
--------------------------------------------------------------------------------
#!/bin/bash

kubectl create -f ./kubernetes/spark-master-deployment.yaml
kubectl create -f ./kubernetes/spark-master-service.yaml

# give the master a head start before the workers try to resolve it
sleep 10

kubectl create -f ./kubernetes/spark-worker-deployment.yaml
kubectl apply -f ./kubernetes/minikube-ingress.yaml

--------------------------------------------------------------------------------
/delete.sh:
--------------------------------------------------------------------------------
#!/bin/bash

kubectl delete -f ./kubernetes/spark-master-deployment.yaml
kubectl delete -f ./kubernetes/spark-master-service.yaml
kubectl delete -f ./kubernetes/spark-worker-deployment.yaml
kubectl delete -f ./kubernetes/minikube-ingress.yaml

--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
# base image
FROM openjdk:11

# define spark and hadoop versions
ENV SPARK_VERSION=3.2.0
ENV HADOOP_VERSION=3.3.1

# download the hadoop native libraries
# (-fsSL makes curl fail on HTTP errors instead of piping an error page into tar)
RUN mkdir -p /opt && \
    cd /opt && \
    curl -fsSL http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
        tar -zx hadoop-${HADOOP_VERSION}/lib/native && \
    ln -s hadoop-${HADOOP_VERSION} hadoop && \
    echo Hadoop ${HADOOP_VERSION} native libraries installed in /opt/hadoop/lib/native

# download and install spark
RUN mkdir -p /opt && \
    cd /opt && \
    curl -fsSL http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz | \
        tar -zx && \
    ln -s spark-${SPARK_VERSION}-bin-hadoop2.7 spark && \
    echo Spark ${SPARK_VERSION} installed in /opt

# add the startup scripts and update the spark default config
ADD common.sh spark-master spark-worker /
ADD spark-defaults.conf /opt/spark/conf/spark-defaults.conf
ENV PATH $PATH:/opt/spark/bin

--------------------------------------------------------------------------------
/docker/common.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# unset the variable injected by kubernetes so it doesn't clash with spark's own config
unset SPARK_MASTER_PORT

--------------------------------------------------------------------------------
/docker/spark-defaults.conf:
--------------------------------------------------------------------------------
spark.master                     spark://spark-master:7077
spark.driver.extraLibraryPath    /opt/hadoop/lib/native
spark.app.id                     KubernetesSpark

--------------------------------------------------------------------------------
/docker/spark-master:
--------------------------------------------------------------------------------
#!/bin/bash

. /common.sh

# make the service name resolve to this pod's IP so spark can bind to it
echo "$(hostname -i) spark-master" >> /etc/hosts

/opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080

--------------------------------------------------------------------------------
/docker/spark-worker:
--------------------------------------------------------------------------------
#!/bin/bash

. /common.sh

# if the master isn't resolvable yet, back off and exit;
# kubernetes restarts the pod, which retries the lookup
if ! getent hosts spark-master; then
  sleep 5
  exit 0
fi

/opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 --webui-port 8081
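The image can also be smoke-tested locally before anything is deployed (a sketch: `--master 'local[*]'` overrides the cluster master configured in spark-defaults.conf, so no master pod is needed):

```sh
# start an interactive Spark shell inside the image in local mode
$ docker run --rm -it spark-hadoop:3.2.0 spark-shell --master 'local[*]'
```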
--------------------------------------------------------------------------------
/kubernetes/_old/spark-kubernetes.yaml:
--------------------------------------------------------------------------------
kind: Deployment
apiVersion: apps/v1
metadata:
  name: spark-master-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      component: spark-master
  template:
    metadata:
      labels:
        component: spark-master
    spec:
      containers:
        - name: spark-master
          image: mjhea0/spark-hadoop:2.2.1
          command: ["/spark-master"]
          ports:
            - containerPort: 7077
            - containerPort: 8080
          resources:
            requests:
              cpu: 100m

---

kind: Service
apiVersion: v1
metadata:
  name: spark-master
spec:
  ports:
    - port: 7077
      targetPort: 7077
  selector:
    component: spark-master

---

kind: Service
apiVersion: v1
metadata:
  name: spark-webui
spec:
  ports:
    - port: 8080
      targetPort: 8080
  selector:
    component: spark-master

---

kind: Deployment
apiVersion: apps/v1
metadata:
  name: spark-worker-deployment
spec:
  replicas: 2
  selector:
    matchLabels:
      component: spark-worker
  template:
    metadata:
      labels:
        component: spark-worker
    spec:
      containers:
        - name: spark-worker
          image: mjhea0/spark-hadoop:2.2.1
          command: ["/spark-worker"]
          ports:
            - containerPort: 8081
          resources:
            requests:
              cpu: 100m

--------------------------------------------------------------------------------
/kubernetes/minikube-ingress.yaml:
--------------------------------------------------------------------------------
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: minikube-ingress
spec:
  rules:
    - host: spark-kubernetes
      http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: spark-master
                port:
                  number: 8080

--------------------------------------------------------------------------------
/kubernetes/spark-master-deployment.yaml:
--------------------------------------------------------------------------------
kind: Deployment
apiVersion: apps/v1
metadata:
  name: spark-master
spec:
  replicas: 1
  selector:
    matchLabels:
      component: spark-master
  template:
    metadata:
      labels:
        component: spark-master
    spec:
      containers:
        - name: spark-master
          image: spark-hadoop:3.2.0
          command: ["/spark-master"]
          ports:
            - containerPort: 7077
            - containerPort: 8080
          resources:
            requests:
              cpu: 100m

--------------------------------------------------------------------------------
/kubernetes/spark-master-service.yaml:
--------------------------------------------------------------------------------
kind: Service
apiVersion: v1
metadata:
  name: spark-master
spec:
  ports:
    - name: webui
      port: 8080
      targetPort: 8080
    - name: spark
      port: 7077
      targetPort: 7077
  selector:
    component: spark-master
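If the Ingress isn't set up yet, the master web UI exposed by this service can also be reached with a plain port-forward (a debugging convenience, not part of the deployment itself):

```sh
# forward localhost:8080 to the master's web UI
$ kubectl port-forward svc/spark-master 8080:8080
```

Then browse to http://localhost:8080.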
--------------------------------------------------------------------------------
/kubernetes/spark-worker-deployment.yaml:
--------------------------------------------------------------------------------
kind: Deployment
apiVersion: apps/v1
metadata:
  name: spark-worker
spec:
  replicas: 2
  selector:
    matchLabels:
      component: spark-worker
  template:
    metadata:
      labels:
        component: spark-worker
    spec:
      containers:
        - name: spark-worker
          image: spark-hadoop:3.2.0
          command: ["/spark-worker"]
          ports:
            - containerPort: 8081
          resources:
            requests:
              cpu: 100m
--------------------------------------------------------------------------------
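The worker Deployment above declares two replicas. Since each new worker pod registers itself with the master on startup, the cluster can be grown by scaling the Deployment (assuming Minikube has spare CPU and memory):

```sh
$ kubectl scale deployment spark-worker --replicas=3
```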