├── spark
│   ├── zeppelin
│   │   ├── .gitignore
│   │   ├── zeppelin-log4j.properties
│   │   ├── docker-zeppelin.sh
│   │   ├── zeppelin-env.sh
│   │   └── Dockerfile
│   ├── spark-defaults.conf
│   ├── log4j.properties
│   ├── core-site.xml
│   ├── README.md
│   ├── start-master
│   ├── Makefile
│   ├── start-worker
│   ├── start-common.sh
│   └── Dockerfile
├── images
│   ├── spark-ui.jpg
│   └── zeppelin-ui.jpg
├── spark-native
│   ├── scripts
│   │   └── spark
│   │       ├── added
│   │       │   ├── executor.sh
│   │       │   ├── driver.sh
│   │       │   └── launch.sh
│   │       └── install
│   ├── Makefile
│   ├── README.md
│   └── Dockerfile
├── manifests
│   ├── namespace-spark-cluster.yaml
│   ├── zeppelin-service.yaml
│   ├── spark-ui-proxy-service.yaml
│   ├── spark-gluster
│   │   ├── spark-master-service.yaml
│   │   ├── glusterfs-endpoints.yaml
│   │   ├── spark-master-controller.yaml
│   │   ├── spark-worker-controller.yaml
│   │   └── README.md
│   ├── spark-master-service.yaml
│   ├── spark-ingress.yaml
│   ├── zeppelin-controller.yaml
│   ├── spark-worker-controller.yaml
│   ├── spark-master-controller.yaml
│   ├── spark-ui-proxy-controller.yaml
│   └── README.md
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/spark/zeppelin/.gitignore:
--------------------------------------------------------------------------------
1 | zeppelin.tgz
2 |

--------------------------------------------------------------------------------
/images/spark-ui.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rootsongjc/spark-on-kubernetes/HEAD/images/spark-ui.jpg

--------------------------------------------------------------------------------
/images/zeppelin-ui.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rootsongjc/spark-on-kubernetes/HEAD/images/zeppelin-ui.jpg

--------------------------------------------------------------------------------
/spark-native/scripts/spark/added/executor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 | set -x
5 |
6 | # Run the requested Spark class with the arguments handed to us; quote
7 | # them so arguments containing spaces survive intact.
8 | cd "$SPARK_HOME"
9 | "$SPARK_HOME"/bin/spark-class "$@"

--------------------------------------------------------------------------------
/manifests/namespace-spark-cluster.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 |   name: "spark-cluster"
5 |   labels:
6 |     name: "spark-cluster"

--------------------------------------------------------------------------------
/spark-native/scripts/spark/added/driver.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 | set -x
5 |
6 | # The first argument is the URL of the client jar; everything after it
7 | # is passed through to spark-submit.
8 | CLIENT_JAR=$1
9 | shift
10 |
11 | curl -L -o "$SPARK_HOME"/kubernetes/client.jar "$CLIENT_JAR"
12 |
13 | "$SPARK_HOME"/bin/spark-submit "$@"

--------------------------------------------------------------------------------
/manifests/zeppelin-service.yaml:
--------------------------------------------------------------------------------
1 | kind: Service
2 | apiVersion: v1
3 | metadata:
4 |   name: zeppelin
5 |   namespace: spark-cluster
6 | spec:
7 |   ports:
8 |     - port: 80
9 |       targetPort: 8080
10 |   selector:
11 |     component: zeppelin
12 |   type: ClusterIP

--------------------------------------------------------------------------------
/manifests/spark-ui-proxy-service.yaml:
--------------------------------------------------------------------------------
1 | kind: Service
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-ui-proxy
5 |   namespace: spark-cluster
6 | spec:
7 |   ports:
8 |     - port: 80
9 |       targetPort: 80
10 |   selector:
11 |     component: spark-ui-proxy
12 |   type: ClusterIP
13 |

--------------------------------------------------------------------------------
/spark/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.master                   spark://spark-master:7077
2 | spark.executor.extraClassPath  /opt/spark/lib/gcs-connector-latest-hadoop2.jar
3 | spark.driver.extraClassPath    /opt/spark/lib/gcs-connector-latest-hadoop2.jar
4 | spark.driver.extraLibraryPath  /opt/hadoop/lib/native
5 | spark.app.id                   KubernetesSpark

--------------------------------------------------------------------------------
/manifests/spark-gluster/spark-master-service.yaml:
--------------------------------------------------------------------------------
1 | kind: Service
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-master
5 |   namespace: spark-cluster
6 |   labels:
7 |     component: spark-master-service
8 | spec:
9 |   ports:
10 |     - port: 7077
11 |       targetPort: 7077
12 |   selector:
13 |     component: spark-master

--------------------------------------------------------------------------------
/manifests/spark-gluster/glusterfs-endpoints.yaml:
--------------------------------------------------------------------------------
1 | kind: Endpoints
2 | apiVersion: v1
3 | metadata:
4 |   name: glusterfs-cluster
5 |   namespace: spark-cluster
6 | subsets:
7 |   - addresses:
8 |       - ip: 192.168.30.104
9 |     ports:
10 |       - port: 1
11 |   - addresses:
12 |       - ip: 192.168.30.105
13 |     ports:
14 |       - port: 1

--------------------------------------------------------------------------------
/manifests/spark-master-service.yaml:
--------------------------------------------------------------------------------
1 | kind: Service
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-master
5 |   namespace: spark-cluster
6 | spec:
7 |   ports:
8 |     - port: 7077
9 |       targetPort: 7077
10 |       name: spark
11 |     - port: 8080
12 |       targetPort: 8080
13 |       name: http
14 |   selector:
15 |     component: spark-master

--------------------------------------------------------------------------------
/spark/zeppelin/zeppelin-log4j.properties:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console.
2 | log4j.rootCategory=INFO, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.target=System.err
5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.console.layout.ConversionPattern=%5p [%d] ({%t} %F[%M]:%L) - %m%n

--------------------------------------------------------------------------------
/spark-native/Makefile:
--------------------------------------------------------------------------------
1 | DISTRO_VERSION ?= 0.10
2 | ifndef DISTRO_PATH
3 | $(error DISTRO_PATH is undefined)
4 | endif
5 |
6 | ifndef REPO
7 | $(error REPO is undefined)
8 | endif
9 |
10 | build:
11 | 	cp $(DISTRO_PATH) ./spark-distro.tgz
12 | 	docker build -t "$(REPO):$(DISTRO_VERSION)" .
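# Example invocation, taken from the README in this directory
# (adjust DISTRO_PATH and REPO for your own distribution and registry):
#   make push DISTRO_PATH=~/spark.tgz REPO=docker.io/foxish/kube-spark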
13 |
14 | clean:
15 | 	docker rmi $(REPO):$(DISTRO_VERSION)
16 |
17 | push: build
18 | 	docker push $(REPO):$(DISTRO_VERSION)

--------------------------------------------------------------------------------
/manifests/spark-ingress.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: extensions/v1beta1
2 | kind: Ingress
3 | metadata:
4 |   name: traefik-ingress
5 |   namespace: spark-cluster
6 | spec:
7 |   rules:
8 |   - host: spark.traefik.io
9 |     http:
10 |       paths:
11 |       - path: /
12 |         backend:
13 |           serviceName: spark-ui-proxy
14 |           servicePort: 80
15 |   - host: zeppelin.traefik.io
16 |     http:
17 |       paths:
18 |       - path: /
19 |         backend:
20 |           serviceName: zeppelin
21 |           servicePort: 80

--------------------------------------------------------------------------------
/manifests/zeppelin-controller.yaml:
--------------------------------------------------------------------------------
1 | kind: ReplicationController
2 | apiVersion: v1
3 | metadata:
4 |   name: zeppelin-controller
5 |   namespace: spark-cluster
6 | spec:
7 |   replicas: 1
8 |   selector:
9 |     component: zeppelin
10 |   template:
11 |     metadata:
12 |       labels:
13 |         component: zeppelin
14 |     spec:
15 |       containers:
16 |         - name: zeppelin
17 |           image: sz-pg-oam-docker-hub-001.tendcloud.com/library/zeppelin:0.7.1
18 |           ports:
19 |             - containerPort: 8080
20 |           resources:
21 |             requests:
22 |               cpu: 100m

--------------------------------------------------------------------------------
/manifests/spark-worker-controller.yaml:
--------------------------------------------------------------------------------
1 | kind: ReplicationController
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-worker-controller
5 |   namespace: spark-cluster
6 | spec:
7 |   replicas: 3
8 |   selector:
9 |     component: spark-worker
10 |   template:
11 |     metadata:
12 |       labels:
13 |         component: spark-worker
14 |     spec:
15 |       containers:
16 |         - name: spark-worker
17 |           image: sz-pg-oam-docker-hub-001.tendcloud.com/library/spark:1.5.2_v1
18 |           command: ["/start-worker"]
19 |           ports:
20 |             - containerPort: 8081
21 |           resources:
22 |             requests:
23 |               cpu: 100m

--------------------------------------------------------------------------------
/spark-native/scripts/spark/install:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 |
5 | SCRIPT_DIR=$(dirname "$0")
6 | ADDED_DIR=${SCRIPT_DIR}/added
7 |
8 | mv $ADDED_DIR/launch.sh $SPARK_HOME/bin/
9 | chmod +x $SPARK_HOME/bin/launch.sh
10 |
11 | mv $ADDED_DIR/driver.sh $ADDED_DIR/executor.sh /opt/
12 | chmod a+rx /opt/driver.sh /opt/executor.sh
13 |
14 | # SPARK_WORKER_DIR defaults to SPARK_HOME/work and is created on
15 | # worker startup if it does not exist. Instead of making SPARK_HOME
16 | # world writable, create SPARK_HOME/work.
17 | mkdir $SPARK_HOME/work
18 | chmod a+rwx $SPARK_HOME/work
19 |
20 | mkdir $SPARK_HOME/kubernetes
21 | chmod a+rwx $SPARK_HOME/kubernetes
22 |
23 | chmod -R a+rX $SPARK_HOME

--------------------------------------------------------------------------------
/spark/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console
2 | log4j.rootCategory=INFO, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.target=System.err
5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
7 |
8 | # Settings to quiet third party logs that are too verbose
9 | log4j.logger.org.spark-project.jetty=WARN
10 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

--------------------------------------------------------------------------------
/manifests/spark-master-controller.yaml:
--------------------------------------------------------------------------------
1 | kind: ReplicationController
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-master-controller
5 |   namespace: spark-cluster
6 | spec:
7 |   replicas: 1
8 |   selector:
9 |     component: spark-master
10 |   template:
11 |     metadata:
12 |       labels:
13 |         component: spark-master
14 |     spec:
15 |       containers:
16 |         - name: spark-master
17 |           image: sz-pg-oam-docker-hub-001.tendcloud.com/library/spark:1.5.2_v1
18 |           command: ["/start-master"]
19 |           ports:
20 |             - containerPort: 7077
21 |             - containerPort: 8080
22 |           resources:
23 |             requests:
24 |               cpu: 100m

--------------------------------------------------------------------------------
/spark-native/README.md:
--------------------------------------------------------------------------------
1 | The repository this points to is k8s4spark/spark.
2 |
3 | # Steps to build the docker image
4 |
5 | 1. Build your Spark distribution (typically from sources) with Kubernetes support.
6 |
7 | ```
8 | ./dev/make-distribution.sh --tgz -Pkubernetes -Phadoop-2.4 -Darguments="-DskipTests" -Dhadoop.version=2.4.0
9 | ```
10 |
11 | For further details, refer to: https://github.com/foxish/spark/tree/k8s-support/kubernetes
12 |
13 |
14 | 2. Build and push the docker image by running the following:
15 |
16 | ```
17 | make push DISTRO_PATH=~/spark.tgz REPO=docker.io/foxish/kube-spark
18 | ```
19 |
20 | 3. Use the newly pushed image when launching a new Spark job with k8s support via spark-submit.

--------------------------------------------------------------------------------
/spark/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <configuration>
5 |   <property>
6 |     <name>fs.gs.impl</name>
7 |     <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
8 |     <description>The FileSystem for gs: (GCS) uris.</description>
9 |   </property>
10 |   <property>
11 |     <name>fs.AbstractFileSystem.gs.impl</name>
12 |     <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
13 |     <description>The AbstractFileSystem for gs: (GCS) uris. Only necessary for use with Hadoop 2.</description>
14 |   </property>
15 |   <property>
16 |     <name>fs.gs.project.id</name>
17 |     <value>NOT_RUNNING_INSIDE_GCE</value>
18 |   </property>
19 | </configuration>
20 |

--------------------------------------------------------------------------------
/spark-native/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM centos:latest
2 |
3 | MAINTAINER Erik Erlandson
4 |
5 | # when the containers are not run w/ uid 0, the uid may not map in
6 | # /etc/passwd and it may not be possible to modify things like
7 | # /etc/hosts. nss_wrapper provides an LD_PRELOAD way to modify passwd
8 | # and hosts.
9 | RUN yum install -y epel-release tar java && \
10 |     yum install -y nss_wrapper && \
11 |     yum clean all
12 |
13 | ENV PATH=$PATH:/opt/spark/bin
14 | ENV SPARK_HOME=/opt/spark
15 |
16 | # Add scripts used to configure the image
17 | COPY scripts /opt/scripts/
18 |
19 | COPY spark-distro.tgz /opt/spark/
20 |
21 | RUN cd /opt/spark && tar --strip-components=1 -xzf spark-distro.tgz && rm spark-distro.tgz && bash -x /opt/scripts/spark/install

--------------------------------------------------------------------------------
/spark/README.md:
--------------------------------------------------------------------------------
1 | # Spark
2 |
3 | This directory builds Docker images appropriate for running Spark on Kubernetes. It produces three main images:
4 | * `spark-master` - Runs a Spark master in Standalone mode and exposes a port for Spark and a port for the WebUI.
5 | * `spark-worker` - Runs a Spark worker in Standalone mode and connects to the Spark master via DNS name `spark-master`.
6 | * `zeppelin` - Runs a Zeppelin web notebook, connects to the Spark master via DNS name `spark-master`, and exposes a port for the WebUI.
7 |
8 | In addition, two more images are pushed:
9 | * `spark-base` - The base image for `spark-master` and `spark-worker`; it starts nothing on its own.
10 | * `spark-driver` - This image, just like the `zeppelin` image, allows running things like `pyspark` to connect to `spark-master`, but is lighter weight than the `zeppelin` image.

--------------------------------------------------------------------------------
/manifests/spark-ui-proxy-controller.yaml:
--------------------------------------------------------------------------------
1 | kind: ReplicationController
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-ui-proxy-controller
5 |   namespace: spark-cluster
6 | spec:
7 |   replicas: 1
8 |   selector:
9 |     component: spark-ui-proxy
10 |   template:
11 |     metadata:
12 |       labels:
13 |         component: spark-ui-proxy
14 |     spec:
15 |       containers:
16 |         - name: spark-ui-proxy
17 |           image: sz-pg-oam-docker-hub-001.tendcloud.com/library/spark-ui-proxy:1.0
18 |           ports:
19 |             - containerPort: 80
20 |           resources:
21 |             requests:
22 |               cpu: 100m
23 |           args:
24 |             - spark-master:8080
25 |           livenessProbe:
26 |             httpGet:
27 |               path: /
28 |               port: 80
29 |             initialDelaySeconds: 120
30 |             timeoutSeconds: 5

--------------------------------------------------------------------------------
/spark/zeppelin/docker-zeppelin.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright 2015 The Kubernetes Authors All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | export ZEPPELIN_HOME=/opt/zeppelin
18 | export ZEPPELIN_CONF_DIR="${ZEPPELIN_HOME}/conf"
19 |
20 | echo "=== Launching Zeppelin under Docker ==="
21 | /opt/zeppelin/bin/zeppelin.sh "${ZEPPELIN_CONF_DIR}"

--------------------------------------------------------------------------------
/spark/start-master:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright 2015 The Kubernetes Authors All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | . /start-common.sh
18 |
19 | echo "$(hostname -i) spark-master" >> /etc/hosts
20 |
21 | # Run spark-class directly so that when it exits (or crashes), the pod restarts.
22 | /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080

--------------------------------------------------------------------------------
/manifests/spark-gluster/spark-master-controller.yaml:
--------------------------------------------------------------------------------
1 | kind: ReplicationController
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-master-controller
5 |   namespace: spark-cluster
6 |   labels:
7 |     component: spark-master
8 | spec:
9 |   replicas: 1
10 |   selector:
11 |     component: spark-master
12 |   template:
13 |     metadata:
14 |       labels:
15 |         component: spark-master
16 |     spec:
17 |       containers:
18 |         - name: spark-master
19 |           image: sz-pg-oam-docker-hub-001.tendcloud.com/library/spark:1.5.2_v1
20 |           command: ["/start-master"]
21 |           ports:
22 |             - containerPort: 7077
23 |           volumeMounts:
24 |             - mountPath: /mnt/glusterfs
25 |               name: glusterfsvol
26 |           resources:
27 |             requests:
28 |               cpu: 100m
29 |       volumes:
30 |         - name: glusterfsvol
31 |           glusterfs:
32 |             endpoints: glusterfs-cluster
33 |             path: MyVolume
34 |             readOnly: false

--------------------------------------------------------------------------------
/manifests/spark-gluster/spark-worker-controller.yaml:
--------------------------------------------------------------------------------
1 | kind: ReplicationController
2 | apiVersion: v1
3 | metadata:
4 |   name: spark-gluster-worker-controller
5 |   namespace: spark-cluster
6 |   labels:
7 |     component: spark-worker
8 | spec:
9 |   replicas: 2
10 |   selector:
11 |     component: spark-worker
12 |   template:
13 |     metadata:
14 |       labels:
15 |         component: spark-worker
16 |         uses: spark-master
17 |     spec:
18 |       containers:
19 |         - name: spark-worker
20 |           image: sz-pg-oam-docker-hub-001.tendcloud.com/library/spark:1.5.2_v1
21 |           command: ["/start-worker"]
22 |           ports:
23 |             - containerPort: 8081
24 |           volumeMounts:
25 |             - mountPath: /mnt/glusterfs
26 |               name: glusterfsvol
27 |           resources:
28 |             requests:
29 |               cpu: 100m
30 |       volumes:
31 |         - name: glusterfsvol
32 |           glusterfs:
33 |             endpoints: glusterfs-cluster
34 |             path: MyVolume
35 |             readOnly: false

--------------------------------------------------------------------------------
/spark/Makefile:
--------------------------------------------------------------------------------
1 | all: spark zeppelin
2 | push: push-spark push-zeppelin
3 | .PHONY: push push-spark push-zeppelin spark zeppelin zeppelin-build
4 |
5 | # To bump the Spark version, bump the spark_ver in Dockerfile, bump
6 | # this tag and reset to v1. You should also double check the native
7 | # Hadoop libs at that point (we grab the 2.6.1 libs, which are
8 | # appropriate for 1.5.2-with-2.6). Note that you'll need to re-test
9 | # Zeppelin (and it may not have caught up to newest Spark).
10 | SPARK_TAG = 1.5.2_v1
11 |
12 | ZEPPELIN_TAG = 0.7.1
13 |
14 | DOCKER_REGISTRY = sz-pg-oam-docker-hub-001.tendcloud.com/library
15 |
16 | spark:
17 | 	docker build -t $(DOCKER_REGISTRY)/spark:$(SPARK_TAG) .
18 |
19 | zeppelin:
20 | 	docker build -t $(DOCKER_REGISTRY)/zeppelin:$(ZEPPELIN_TAG) zeppelin
21 |
22 | push-spark: spark
23 | 	docker push $(DOCKER_REGISTRY)/spark:$(SPARK_TAG)
24 |
25 | push-zeppelin: zeppelin
26 | 	docker push $(DOCKER_REGISTRY)/zeppelin:$(ZEPPELIN_TAG)
27 |
28 | clean:
29 | 	docker rmi $(DOCKER_REGISTRY)/spark:$(SPARK_TAG) || :
30 | 	docker rmi $(DOCKER_REGISTRY)/spark || :
31 |
32 | 	docker rmi $(DOCKER_REGISTRY)/zeppelin:$(ZEPPELIN_TAG) || :
33 | 	docker rmi $(DOCKER_REGISTRY)/zeppelin || :

--------------------------------------------------------------------------------
/spark/start-worker:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright 2015 The Kubernetes Authors All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | . /start-common.sh
18 |
19 | if ! getent hosts spark-master; then
20 |   echo "=== Cannot resolve the DNS entry for spark-master. Has the service been created yet, and is SkyDNS functional?"
21 |   echo "=== See http://kubernetes.io/v1.1/docs/admin/dns.html for more details on DNS integration."
22 |   echo "=== Sleeping 10s before pod exit."
23 |   sleep 10
24 |   exit 0
25 | fi
26 |
27 | # Run spark-class directly so that when it exits (or crashes), the pod restarts.
28 | /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 --webui-port 8081

--------------------------------------------------------------------------------
/spark/zeppelin/zeppelin-env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright 2015 The Kubernetes Authors All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | export MASTER="spark://spark-master:7077"
18 | export SPARK_HOME=/opt/spark
19 | export ZEPPELIN_JAVA_OPTS="-Dspark.jars=/opt/spark/lib/gcs-connector-latest-hadoop2.jar"
20 | # TODO(zmerlynn): Setting global CLASSPATH *should* be unnecessary,
21 | # but ZEPPELIN_JAVA_OPTS isn't enough here. :(
22 | export CLASSPATH="/opt/spark/lib/gcs-connector-latest-hadoop2.jar"
23 | export ZEPPELIN_NOTEBOOK_DIR="${ZEPPELIN_HOME}/notebook"
24 | export ZEPPELIN_MEM=-Xmx1024m
25 | export ZEPPELIN_PORT=8080
26 | export PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark on Kubernetes
2 |
3 | Run a Spark cluster on Kubernetes, in containers.
4 |
5 | ## Building the images
6 |
7 | **Manual build**
8 |
9 | Change `DOCKER_REGISTRY` in `spark/Makefile` to the address of your own private image registry.
10 |
11 | ```bash
12 | $ cd spark
13 | $ make all
14 | $ make push
15 | ```
16 |
17 | **TenxCloud images**
18 |
19 | Alternatively, pull the images I have already built and uploaded to the TenxCloud registry:
20 |
21 | ```
22 | index.tenxcloud.com/jimmy/spark:1.5.2_v1
23 | index.tenxcloud.com/jimmy/zeppelin:0.7.1
24 | ```
25 |
26 | ## Starting Spark on Kubernetes
27 |
28 | Create a namespace named spark-cluster; all the operations below happen in that namespace.
29 |
30 | All the yaml files are in the `manifests` directory.
31 |
32 | ```bash
33 | $ kubectl create -f manifests/
34 | ```
35 |
36 | This starts a Spark cluster with three workers, plus Zeppelin.
37 |
38 | It also adds an ingress configuration to the namespace that exposes the Spark UI and the Zeppelin page, so both can be reached from outside the cluster.
39 |
40 | The ingress backend uses [traefik](https://traefik.io).
41 |
42 | ## Accessing Spark
43 |
44 | The services are exposed through the ingress configured above. Add the following entries to your local /etc/hosts file so that the hostnames resolve to those services:
45 |
46 | ```
47 | 172.20.0.119 zeppelin.traefik.io
48 | 172.20.0.119 spark.traefik.io
49 | ```
50 |
51 | 172.20.0.119 is the VIP address I configured; for the VIP setup and the traefik configuration, see the [kubernetes-handbook](https://github.com/rootsongjc/kubernetes-handbook).
52 |
53 | **Spark UI**
54 |
55 | Visit http://spark.traefik.io
56 |
57 | ![spark-ui](images/spark-ui.jpg)
58 |
59 | **Zeppelin UI**
60 |
61 | Visit http://zeppelin.traefik.io
62 |
63 | ![zeppelin-ui](images/zeppelin-ui.jpg)

--------------------------------------------------------------------------------
/spark/start-common.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright 2015 The Kubernetes Authors All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
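# start-common.sh is sourced by both start-master and start-worker. It
# substitutes the GCE project id into core-site.xml when the container is
# actually running on GCE, unsets the Kubernetes-injected service variable
# that conflicts with Spark, and puts the Hadoop native libraries on the
# library path.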
16 |
17 | PROJECT_ID=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/project/project-id)
18 |
19 | if [[ -n "${PROJECT_ID}" ]]; then
20 |   sed -i "s/NOT_RUNNING_INSIDE_GCE/${PROJECT_ID}/" /opt/spark/conf/core-site.xml
21 | fi
22 |
23 | # We don't want any of the incoming service variables, we'd rather use
24 | # DNS. But this one interferes directly with Spark.
25 | unset SPARK_MASTER_PORT
26 |
27 | # spark.{executor,driver}.extraLibraryPath don't actually seem to
28 | # work, so this seems to be the only reliable way to get the native libs
29 | # picked up.
30 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/hadoop/lib/native

--------------------------------------------------------------------------------
/spark/zeppelin/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2015 The Kubernetes Authors All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | FROM index.tenxcloud.com/jimmy/spark:1.5.2_v1
16 |
17 | ENV ZEPPELIN_VER 0.7.1
18 |
19 | # Use the BIT Apache mirror
20 | RUN mkdir -p /opt && \
21 |     cd /opt && \
22 |     curl http://mirror.bit.edu.cn/apache/zeppelin/zeppelin-${ZEPPELIN_VER}/zeppelin-${ZEPPELIN_VER}-bin-all.tgz | \
23 |         tar -zx && \
24 |     ln -s zeppelin-${ZEPPELIN_VER}-bin-all zeppelin && \
25 |     echo Zeppelin ${ZEPPELIN_VER} installed in /opt
26 |
27 | ADD zeppelin-log4j.properties /opt/zeppelin/conf/log4j.properties
28 | ADD zeppelin-env.sh /opt/zeppelin/conf/zeppelin-env.sh
29 | ADD docker-zeppelin.sh /opt/zeppelin/bin/docker-zeppelin.sh
30 | EXPOSE 8080
31 | ENTRYPOINT ["/opt/zeppelin/bin/docker-zeppelin.sh"]

--------------------------------------------------------------------------------
/spark-native/scripts/spark/added/launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # spark likes to be able to look up a username for the running UID; if
4 | # no name is present, fake it.
5 | cat /etc/passwd > /tmp/passwd
6 | echo "$(id -u):x:$(id -u):$(id -g):dynamic uid:$SPARK_HOME:/bin/false" >> /tmp/passwd
7 |
8 | export NSS_WRAPPER_PASSWD=/tmp/passwd
9 | # NSS_WRAPPER_GROUP must be set for NSS_WRAPPER_PASSWD to be used
10 | export NSS_WRAPPER_GROUP=/etc/group
11 |
12 | export LD_PRELOAD=libnss_wrapper.so
13 |
14 | # If the SPARK_MASTER_ADDRESS env variable is not provided, start the master;
15 | # otherwise start a worker and connect it to SPARK_MASTER_ADDRESS.
16 | if [ -z ${SPARK_MASTER_ADDRESS+_} ]; then
17 |   echo "Starting master"
18 |
19 |   # run the spark master directly (instead of sbin/start-master.sh) to
20 |   # link master and container lifecycle
21 |   exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.master.Master
22 | else
23 |   echo "Starting worker, will connect to: $SPARK_MASTER_ADDRESS"
24 |   while true; do
25 |     echo "Waiting for spark master to be available ..."
26 |     curl --connect-timeout 1 -s -X GET $SPARK_MASTER_UI_ADDRESS > /dev/null
27 |     if [ $? -eq 0 ]; then
28 |       break
29 |     fi
30 |     sleep 1
31 |   done
32 |   exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.worker.Worker $SPARK_MASTER_ADDRESS
33 | fi

--------------------------------------------------------------------------------
/spark/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM java:openjdk-8-jdk
2 |
3 | ENV hadoop_ver 2.6.1
4 | ENV spark_ver 1.5.2
5 |
6 | # Get Hadoop from the BIT Apache mirror and extract just the native
7 | # libs. (Until we care about running HDFS with these containers, this
8 | # is all we need.)
9 | RUN mkdir -p /opt && \
10 |     cd /opt && \
11 |     curl http://mirror.bit.edu.cn/apache/hadoop/common/hadoop-${hadoop_ver}/hadoop-${hadoop_ver}.tar.gz | \
12 |         tar -zx hadoop-${hadoop_ver}/lib/native && \
13 |     ln -s hadoop-${hadoop_ver} hadoop && \
14 |     echo Hadoop ${hadoop_ver} native libraries installed in /opt/hadoop/lib/native
15 |
16 | # Get Spark from Apache mirror.
17 | RUN mkdir -p /opt && \
18 |     cd /opt && \
19 |     curl http://archive.apache.org/dist/spark/spark-${spark_ver}/spark-${spark_ver}-bin-hadoop2.6.tgz | \
20 |         tar -zx && \
21 |     ln -s spark-${spark_ver}-bin-hadoop2.6 spark && \
22 |     echo Spark ${spark_ver} installed in /opt
23 |
24 | # if numpy is installed on a driver it needs to be installed on all
25 | # workers, so install it everywhere
26 | RUN apt-get update && \
27 |     apt-get install -y python-numpy && \
28 |     apt-get clean && \
29 |     rm -rf /var/lib/apt/lists/*
30 |
31 | ADD log4j.properties /opt/spark/conf/log4j.properties
32 | ADD start-common.sh start-worker start-master /
33 | ADD core-site.xml /opt/spark/conf/core-site.xml
34 | ADD spark-defaults.conf /opt/spark/conf/spark-defaults.conf
35 | ENV PATH $PATH:/opt/spark/bin

--------------------------------------------------------------------------------
/manifests/spark-gluster/README.md:
--------------------------------------------------------------------------------
1 | # Spark on GlusterFS example
2 |
3 | This guide is an extension of the standard [Spark on Kubernetes Guide](../../../examples/spark/) and describes how to run Spark on GlusterFS using the [Kubernetes Volume Plugin for GlusterFS](../../../examples/volumes/glusterfs/).
4 |
5 | The setup is the same in that you will set up a Spark Master Service just as in the standard Spark guide, but you will deploy modified Spark Master and Spark Worker ReplicationControllers that use the GlusterFS volume plugin to mount a GlusterFS volume into the Spark Master and Spark Worker containers. Note that this example can be used as a guide for implementing any of the Kubernetes Volume Plugins with the Spark Example.
6 |
7 | [There is also a video available that provides a walkthrough for how to set this solution up](https://youtu.be/xyIaoM0-gM0)
8 |
9 | ## Step Zero: Prerequisites
10 |
11 | This example assumes that you have been able to successfully get the standard Spark Example working in Kubernetes and that you have a GlusterFS cluster that is accessible from your Kubernetes cluster. It is also recommended that you are familiar with the GlusterFS Volume Plugin and how to configure it.
12 |
13 | ## Step One: Define the endpoints for your GlusterFS Cluster
14 |
15 | Modify the `examples/spark/spark-gluster/glusterfs-endpoints.yaml` file to list the IP addresses of some of the servers in your GlusterFS cluster.
The GlusterFS Volume Plugin uses these IP addresses to perform a Fuse Mount of the GlusterFS Volume into the Spark Worker containers that are launched by the ReplicationController in the next section.
16 |
17 | Register your endpoints by running the following command:
18 |
19 | ```console
20 | $ kubectl create -f examples/spark/spark-gluster/glusterfs-endpoints.yaml
21 | ```
22 |
23 | ## Step Two: Modify and Submit your Spark Master ReplicationController
24 |
25 | Modify the `examples/spark/spark-gluster/spark-master-controller.yaml` file to reflect the GlusterFS Volume that you wish to use in the PATH parameter of the volumes subsection.
26 |
27 | Submit the Spark Master Pod
28 |
29 | ```console
30 | $ kubectl create -f examples/spark/spark-gluster/spark-master-controller.yaml
31 | ```
32 |
33 | Verify that the Spark Master Pod deployed successfully.
34 |
35 | ```console
36 | $ kubectl get pods
37 | ```
38 |
39 | Submit the Spark Master Service
40 |
41 | ```console
42 | $ kubectl create -f examples/spark/spark-gluster/spark-master-service.yaml
43 | ```
44 |
45 | Verify that the Spark Master Service deployed successfully.
46 |
47 | ```console
48 | $ kubectl get services
49 | ```
50 |
51 | ## Step Three: Start your Spark workers
52 |
53 | Modify the `examples/spark/spark-gluster/spark-worker-controller.yaml` file to reflect the GlusterFS Volume that you wish to use in the PATH parameter of the Volumes subsection.
54 |
55 | Make sure that the replication factor for the pods is not greater than the number of Kubernetes nodes available in your Kubernetes cluster.
56 |
57 | Submit your Spark Worker ReplicationController by running the following command:
58 |
59 | ```console
60 | $ kubectl create -f examples/spark/spark-gluster/spark-worker-controller.yaml
61 | ```
62 |
63 | Verify that the Spark Worker ReplicationController deployed its pods successfully.
64 |
65 | ```console
66 | $ kubectl get pods
67 | ```
68 |
69 | Follow the steps from the standard example to verify the Spark Worker pods have registered successfully with the Spark Master.
70 |
71 | ## Step Four: Submit a Spark Job
72 |
73 | All the Spark Workers and the Spark Master in your cluster have a mount to GlusterFS. This means that any of them can be used as the Spark Client to submit a job. For simplicity, let's use the Spark Master as an example.
74 |
75 | The Spark Worker and Spark Master containers include a setup_client utility script that takes two parameters, the Service IP of the Spark Master and the port that it is running on. This must be done to set up the container as a Spark client prior to submitting any Spark Jobs.
76 |
77 | Obtain the Service IP (listed as IP:) and Full Pod Name by running
78 |
79 | ```console
80 | $ kubectl describe pod spark-master-controller
81 | ```
82 |
83 | Now we will shell into the Spark Master Container and run a Spark Job. In the example below, we are running the Spark Wordcount example and specifying the input and output directory at the location where GlusterFS is mounted in the Spark Master Container. This will submit the job to the Spark Master, which will distribute the work to all the Spark Worker containers.
84 |
85 | All the Spark Worker containers will be able to access the data as they all have the same GlusterFS volume mounted at /mnt/glusterfs. The reason we are submitting the job from a Spark Worker and not an additional Spark Base container (as in the standard Spark Example) is that the Spark instance submitting the job must be able to access the data.
Only the Spark Master and Spark Worker containers have GlusterFS mounted.
86 |
87 | Shell into the Master Spark Node (spark-master-controller) by running
88 |
89 | ```console
90 | kubectl exec spark-master-controller- -i -t -- bash -i
91 |
92 | root@spark-master-controller-c1sqd:/# . /setup_client.sh 7077
93 | root@spark-master-controller-c1sqd:/# pyspark
94 |
95 | Python 2.7.9 (default, Mar 1 2015, 12:57:24)
96 | [GCC 4.9.2] on linux2
97 | Type "help", "copyright", "credits" or "license" for more information.
98 | 15/06/26 14:25:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
99 | Welcome to
100 |       ____              __
101 |      / __/__  ___ _____/ /__
102 |     _\ \/ _ \/ _ `/ __/  '_/
103 |    /__ / .__/\_,_/_/ /_/\_\   version 1.4.0
104 |       /_/
105 | Using Python version 2.7.9 (default, Mar 1 2015 12:57:24)
106 | SparkContext available as sc, HiveContext available as sqlContext.
107 | >>> file = sc.textFile("/mnt/glusterfs/somefile.txt")
108 | >>> counts = file.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
109 | >>> counts.saveAsTextFile("/mnt/glusterfs/output")
110 | ```
111 |
112 | While still in the container, you can see the output of your Spark Job in the Distributed File System by running the following:
113 |
114 | ```console
115 | root@spark-master-controller-c1sqd:/# ls -l /mnt/glusterfs/output
116 | ```

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
203 |

--------------------------------------------------------------------------------
/manifests/README.md:
--------------------------------------------------------------------------------
1 | # Spark example
2 |
3 | Following this example, you will create a functional [Apache
4 | Spark](http://spark.apache.org/) cluster using Kubernetes and
5 | [Docker](http://docker.io).
6 |
7 | You will set up a Spark master service and a set of Spark workers using Spark's [standalone mode](http://spark.apache.org/docs/latest/spark-standalone.html).
8 |
9 | For the impatient expert, jump straight to the [tl;dr](#tldr)
10 | section.
11 |
12 | ### Sources
13 |
14 | The Docker images are heavily based on https://github.com/mattf/docker-spark
15 | and are curated in https://github.com/kubernetes/application-images/tree/master/spark
16 |
17 | The Spark UI Proxy is taken from https://github.com/aseigneurin/spark-ui-proxy.
18 |
19 | The PySpark examples are taken from http://stackoverflow.com/questions/4114167/checking-if-a-number-is-a-prime-number-in-python/27946768#27946768
20 |
21 | ## Step Zero: Prerequisites
22 |
23 | This example assumes
24 |
25 | - You have a Kubernetes cluster installed and running.
26 | - You have the ```kubectl``` command line tool installed in your path and configured to talk to your Kubernetes cluster.
27 | - Your Kubernetes cluster is running [kube-dns](https://github.com/kubernetes/dns) or an equivalent integration.
28 |
29 | Optionally, your Kubernetes cluster should be configured with a Loadbalancer integration (automatically configured via kube-up or GKE).
30 |
31 | ## Step One: Create namespace
32 |
33 | ```sh
34 | $ kubectl create -f examples/spark/namespace-spark-cluster.yaml
35 | ```
36 |
37 | Now list all namespaces:
38 |
39 | ```sh
40 | $ kubectl get namespaces
41 | NAME            LABELS               STATUS
42 | default                              Active
43 | spark-cluster   name=spark-cluster   Active
44 | ```
45 |
46 | To configure kubectl to work with our namespace, we will create a new context using our current context as a base:
47 |
48 | ```sh
49 | $ CURRENT_CONTEXT=$(kubectl config view -o jsonpath='{.current-context}')
50 | $ USER_NAME=$(kubectl config view -o jsonpath='{.contexts[?(@.name == "'"${CURRENT_CONTEXT}"'")].context.user}')
51 | $ CLUSTER_NAME=$(kubectl config view -o jsonpath='{.contexts[?(@.name == "'"${CURRENT_CONTEXT}"'")].context.cluster}')
52 | $ kubectl config set-context spark --namespace=spark-cluster --cluster=${CLUSTER_NAME} --user=${USER_NAME}
53 | $ kubectl config use-context spark
54 | ```
55 |
56 | ## Step Two: Start your Master service
57 |
58 | The Master [service](../../docs/user-guide/services.md) is the controlling service
59 | for a Spark cluster.
60 |
61 | Use the
62 | [`examples/spark/spark-master-controller.yaml`](spark-master-controller.yaml)
63 | file to create a
64 | [replication controller](../../docs/user-guide/replication-controller.md)
65 | running the Spark Master service.
66 |
67 | ```console
68 | $ kubectl create -f examples/spark/spark-master-controller.yaml
69 | replicationcontroller "spark-master-controller" created
70 | ```
71 |
72 | Then, use the
73 | [`examples/spark/spark-master-service.yaml`](spark-master-service.yaml) file to
74 | create a logical service endpoint that Spark workers can use to access the
75 | Master pod:
76 |
77 | ```console
78 | $ kubectl create -f examples/spark/spark-master-service.yaml
79 | service "spark-master" created
80 | ```
81 |
82 | ### Check to see if Master is running and accessible
83 |
84 | ```console
85 | $ kubectl get pods
86 | NAME                            READY     STATUS    RESTARTS   AGE
87 | spark-master-controller-5u0q5   1/1       Running   0          8m
88 | ```
89 |
90 | Check logs to see the status of the master. (Use the pod retrieved from the previous output.)
91 |
92 | ```sh
93 | $ kubectl logs spark-master-controller-5u0q5
94 | starting org.apache.spark.deploy.master.Master, logging to /opt/spark-1.5.1-bin-hadoop2.6/sbin/../logs/spark--org.apache.spark.deploy.master.Master-1-spark-master-controller-g0oao.out
95 | Spark Command: /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java -cp /opt/spark-1.5.1-bin-hadoop2.6/sbin/../conf/:/opt/spark-1.5.1-bin-hadoop2.6/lib/spark-assembly-1.5.1-hadoop2.6.0.jar:/opt/spark-1.5.1-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar:/opt/spark-1.5.1-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/opt/spark-1.5.1-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar -Xms1g -Xmx1g org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080
96 | ========================================
97 | 15/10/27 21:25:05 INFO Master: Registered signal handlers for [TERM, HUP, INT]
98 | 15/10/27 21:25:05 INFO SecurityManager: Changing view acls to: root
99 | 15/10/27 21:25:05 INFO SecurityManager: Changing modify acls to: root
100 | 15/10/27 21:25:05 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(root); users with modify permissions: Set(root)
101 | 15/10/27 21:25:06 INFO Slf4jLogger: Slf4jLogger started
102 | 15/10/27 21:25:06 INFO Remoting: Starting remoting
103 | 15/10/27 21:25:06 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkMaster@spark-master:7077]
104 | 15/10/27 21:25:06 INFO Utils: Successfully started service 'sparkMaster' on port 7077.
105 | 15/10/27 21:25:07 INFO Master: Starting Spark master at spark://spark-master:7077
106 | 15/10/27 21:25:07 INFO Master: Running Spark version 1.5.1
107 | 15/10/27 21:25:07 INFO Utils: Successfully started service 'MasterUI' on port 8080.
108 | 15/10/27 21:25:07 INFO MasterWebUI: Started MasterWebUI at http://spark-master:8080
109 | 15/10/27 21:25:07 INFO Utils: Successfully started service on port 6066.
110 | 15/10/27 21:25:07 INFO StandaloneRestServer: Started REST server for submitting applications on port 6066
111 | 15/10/27 21:25:07 INFO Master: I have been elected leader! New state: ALIVE
112 | ```
113 |
114 | Once the master is started, we'll want to check the Spark WebUI. To access it, we will deploy a [specialized proxy](https://github.com/aseigneurin/spark-ui-proxy). This proxy is necessary to access worker logs from the Spark UI.
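If you just want a quick look at the master UI before the proxy is in place, you can also `kubectl port-forward` straight to the master pod (a temporary check; the pod name is the one from the output above and will differ in your cluster):

```console
$ kubectl port-forward spark-master-controller-5u0q5 8080:8080
```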
115 |
116 | Deploy the proxy controller with [`examples/spark/spark-ui-proxy-controller.yaml`](spark-ui-proxy-controller.yaml):
117 |
118 | ```console
119 | $ kubectl create -f examples/spark/spark-ui-proxy-controller.yaml
120 | replicationcontroller "spark-ui-proxy-controller" created
121 | ```
122 |
123 | We'll also need a corresponding Loadbalanced service for our Spark Proxy [`examples/spark/spark-ui-proxy-service.yaml`](spark-ui-proxy-service.yaml):
124 |
125 | ```console
126 | $ kubectl create -f examples/spark/spark-ui-proxy-service.yaml
127 | service "spark-ui-proxy" created
128 | ```
129 |
130 | After creating the service, you should eventually get a loadbalanced endpoint:
131 |
132 | ```console
133 | $ kubectl get svc spark-ui-proxy -o wide
134 | NAME             CLUSTER-IP   EXTERNAL-IP                                                              PORT(S)   AGE   SELECTOR
135 | spark-ui-proxy   10.0.51.107  aad59283284d611e6839606c214502b5-833417581.us-east-1.elb.amazonaws.com   80/TCP    9m    component=spark-ui-proxy
136 | ```
137 |
138 | The Spark UI in the above example output will be available at http://aad59283284d611e6839606c214502b5-833417581.us-east-1.elb.amazonaws.com
139 |
140 | If your Kubernetes cluster is not equipped with a Loadbalancer integration, you will need to use the [kubectl proxy](../../docs/user-guide/accessing-the-cluster.md#using-kubectl-proxy) to
141 | connect to the Spark WebUI:
142 |
143 | ```console
144 | kubectl proxy --port=8001
145 | ```
146 |
147 | At which point the UI will be available at
148 | [http://localhost:8001/api/v1/proxy/namespaces/spark-cluster/services/spark-master:8080/](http://localhost:8001/api/v1/proxy/namespaces/spark-cluster/services/spark-master:8080/).
149 |
150 | ## Step Three: Start your Spark workers
151 |
152 | The Spark workers do the heavy lifting in a Spark cluster. They
153 | provide execution resources and data cache capabilities for your
154 | program.
155 |
156 | The Spark workers need the Master service to be running.
157 |
158 | Use the [`examples/spark/spark-worker-controller.yaml`](spark-worker-controller.yaml) file to create a
159 | [replication controller](../../docs/user-guide/replication-controller.md) that manages the worker pods.
160 |
161 | ```console
162 | $ kubectl create -f examples/spark/spark-worker-controller.yaml
163 | replicationcontroller "spark-worker-controller" created
164 | ```
165 |
166 | ### Check to see if the workers are running
167 |
168 | If you launched the Spark WebUI, your workers should just appear in the UI when
169 | they're ready. (It may take a little bit to pull the images and launch the
170 | pods.) You can also interrogate the status in the following way:
171 |
172 | ```console
173 | $ kubectl get pods
174 | NAME                            READY     STATUS    RESTARTS   AGE
175 | spark-master-controller-5u0q5   1/1       Running   0          25m
176 | spark-worker-controller-e8otp   1/1       Running   0          6m
177 | spark-worker-controller-fiivl   1/1       Running   0          6m
178 | spark-worker-controller-ytc7o   1/1       Running   0          6m
179 |
180 | $ kubectl logs spark-master-controller-5u0q5
181 | [...]
182 | 15/10/26 18:20:14 INFO Master: Registering worker 10.244.1.13:53567 with 2 cores, 6.3 GB RAM
183 | 15/10/26 18:20:14 INFO Master: Registering worker 10.244.2.7:46195 with 2 cores, 6.3 GB RAM
184 | 15/10/26 18:20:14 INFO Master: Registering worker 10.244.3.8:39926 with 2 cores, 6.3 GB RAM
185 | ```
186 |
187 | ## Step Four: Start the Zeppelin UI to launch jobs on your Spark cluster
188 |
189 | The Zeppelin UI pod can be used to launch jobs into the Spark cluster either via
190 | a web notebook frontend or the traditional Spark command line.
## Step Four: Start the Zeppelin UI to launch jobs on your Spark cluster

The Zeppelin UI pod can be used to launch jobs into the Spark cluster, either via
a web notebook frontend or the traditional Spark command line. See
[Zeppelin](https://zeppelin.incubator.apache.org/) and
[Spark architecture](https://spark.apache.org/docs/latest/cluster-overview.html)
for more details.

Zeppelin needs the spark-master service to be running, so make sure the previous steps have completed first.

Deploy Zeppelin:

```console
$ kubectl create -f examples/spark/zeppelin-controller.yaml
replicationcontroller "zeppelin-controller" created
```

And the corresponding service:

```console
$ kubectl create -f examples/spark/zeppelin-service.yaml
service "zeppelin" created
```

### Check to see if Zeppelin is running

```console
$ kubectl get pods -l component=zeppelin
NAME                        READY     STATUS    RESTARTS   AGE
zeppelin-controller-ja09s   1/1       Running   0          53s
```

## Step Five: Do something with the cluster

Now you have two choices, depending on your predilections. You can do something
graphical with the Spark cluster, or you can stay in the CLI.

For both choices, we will be working with this Python snippet, which counts the primes below ten million by trial division:

```python
from math import sqrt; from itertools import count, islice

def isprime(n):
    return n > 1 and all(n%i for i in islice(count(2), int(sqrt(n)-1)))

nums = sc.parallelize(xrange(10000000))
print nums.filter(isprime).count()
```

### Do something fast with pyspark!

Copy and paste the Python snippet into pyspark from within the Zeppelin pod:

```console
$ kubectl exec zeppelin-controller-ja09s -it pyspark
Python 2.7.9 (default, Mar  1 2015, 12:57:24)
[GCC 4.9.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.5.1
      /_/

Using Python version 2.7.9 (default, Mar  1 2015 12:57:24)
SparkContext available as sc, HiveContext available as sqlContext.
>>> from math import sqrt; from itertools import count, islice
>>>
>>> def isprime(n):
...     return n > 1 and all(n%i for i in islice(count(2), int(sqrt(n)-1)))
...
>>> nums = sc.parallelize(xrange(10000000))

>>> print nums.filter(isprime).count()
664579
```

Congratulations, you now know how many prime numbers there are within the first 10 million numbers!
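If you want to convince yourself the predicate is correct before burning cluster time, you can run the same `isprime` on a small range locally with plain Python (no Spark needed). There are 25 primes below 100, so this should print 25; for the record, 664579 is indeed the count of primes below ten million:

```python
# Local sanity check of the isprime predicate used above (no Spark needed).
from math import sqrt
from itertools import count, islice

def isprime(n):
    # Trial division by candidate divisors from 2 up to about sqrt(n),
    # same logic as the cluster snippet.
    return n > 1 and all(n % i for i in islice(count(2), int(sqrt(n) - 1)))

# There are 25 primes below 100, so this prints 25.
print(sum(1 for n in range(100) if isprime(n)))
```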
### Do something graphical and shiny!

Creating the Zeppelin service should have yielded you a load-balanced endpoint:

```console
$ kubectl get svc zeppelin -o wide
NAME       CLUSTER-IP   EXTERNAL-IP                                                               PORT(S)   AGE       SELECTOR
zeppelin   10.0.154.1   a596f143884da11e6839506c114532b5-121893930.us-east-1.elb.amazonaws.com   80/TCP    3m        component=zeppelin
```

If your Kubernetes cluster does not have a load balancer integration, we will have to use port forwarding instead.

Take the Zeppelin pod from before and port-forward the WebUI port:

```console
$ kubectl port-forward zeppelin-controller-ja09s 8080:8080
```

This forwards `localhost` port 8080 to container port 8080. You can then find
Zeppelin at [http://localhost:8080/](http://localhost:8080/).

Once you've loaded up the Zeppelin UI, create a "New Notebook". In there, paste our Python snippet, adding a `%pyspark` hint so that Zeppelin knows which interpreter to use:

```
%pyspark
from math import sqrt; from itertools import count, islice

def isprime(n):
    return n > 1 and all(n%i for i in islice(count(2), int(sqrt(n)-1)))

nums = sc.parallelize(xrange(10000000))
print nums.filter(isprime).count()
```

After pasting in the code, press Shift+Enter or click the play icon to the right of the snippet. The Spark job will run, and once again we'll have our result!

## Result

You now have services and replication controllers for the Spark master, the Spark
workers, and the Spark driver. To take this example to the next step and start
using the Apache Spark cluster you just created, see the
[Spark documentation](https://spark.apache.org/documentation.html) for more
information.

## tl;dr

```console
kubectl create -f examples/spark
```

After it's set up:

```console
kubectl get pods        # Make sure everything is running
kubectl get svc -o wide # Get the load balancer endpoints for spark-ui-proxy and zeppelin
```

At that point, the Master UI and Zeppelin will be available at the URLs under the `EXTERNAL-IP` field.

You can also interact with the Spark cluster using the traditional `spark-shell` /
`spark-submit` / `pyspark` commands by using `kubectl exec` against the
`zeppelin-controller` pod.

If your Kubernetes cluster does not have a load balancer integration, use `kubectl proxy` and `kubectl port-forward` to access the Spark UI and Zeppelin.

For the Spark UI:

```console
kubectl proxy --port=8001
```

Then visit [http://localhost:8001/api/v1/proxy/namespaces/spark-cluster/services/spark-ui-proxy/](http://localhost:8001/api/v1/proxy/namespaces/spark-cluster/services/spark-ui-proxy/).

For Zeppelin (substitute your actual Zeppelin pod name):

```console
kubectl port-forward zeppelin-controller-abc123 8080:8080 &
```

Then visit [http://localhost:8080/](http://localhost:8080/).

## Known Issues With Spark

* This example provides a Spark configuration that is restricted to the cluster network,
  meaning the Spark master is only available as a cluster service. If you need
  to submit jobs using an external client other than Zeppelin or `spark-submit` on
  the `zeppelin` pod, you will need to provide a way for your clients to reach the
  service defined in
  [`examples/spark/spark-master-service.yaml`](spark-master-service.yaml). See
  [Services](../../docs/user-guide/services.md) for more information.

## Known Issues With Zeppelin

* The Zeppelin pod is large, so it may take a while to pull depending on your
  network. The size of the Zeppelin image is something we're working on; see issue #17231.

* Zeppelin may take some time (about a minute) to run this pipeline the first
  time, as it seems to take considerable time to load.

* On GKE, `kubectl port-forward` may not be stable over long periods of time. If
  you see Zeppelin go into `Disconnected` state (there will be a red dot on the
  top right as well), the `port-forward` probably failed and needs to be
  restarted (see the retry loop sketch below). See #12179.
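As a stopgap, you can wrap the port-forward in a shell loop that re-establishes the tunnel whenever it drops. This is a workaround, not a fix; substitute your actual Zeppelin pod name:

```console
$ while true; do kubectl port-forward zeppelin-controller-ja09s 8080:8080; done
```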