├── Dockerfile
├── README.md
├── Readme.txt
├── bootstrap.sh
├── run-sparkshell.sh
├── run_spark.sh
├── spark-defaults.conf
└── spark-env.sh

/Dockerfile:
--------------------------------------------------------------------------------
FROM kmubigdata/ubuntu-hadoop
MAINTAINER kimjeongchul

USER root

# scala
RUN apt-get update
RUN apt-get install -y scala

# python
RUN apt-get install -y python
RUN apt-get install -y python3

# spark 3.0.1 without Hadoop
RUN wget https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-without-hadoop.tgz
RUN tar -xvzf spark-3.0.1-bin-without-hadoop.tgz -C /usr/local
RUN cd /usr/local && ln -s ./spark-3.0.1-bin-without-hadoop spark
RUN rm -f /spark-3.0.1-bin-without-hadoop.tgz

# extra jars for S3A support
RUN mkdir /usr/local/spark/extrajars
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.11.860/aws-java-sdk-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/1.11.860/aws-java-sdk-s3-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/1.11.860/aws-java-sdk-core-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/1.11.860/aws-java-sdk-dynamodb-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.1.1/hadoop-aws-3.1.1.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/joda-time/joda-time/2.10.6/joda-time-2.10.6.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/4.5.9/httpclient-4.5.9.jar

# Hadoop environment
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV LD_LIBRARY_PATH=/usr/local/hadoop/lib/native/:$LD_LIBRARY_PATH

# Spark environment
ENV SPARK_HOME /usr/local/spark
ENV PATH $PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

ADD spark-env.sh $SPARK_HOME/conf/spark-env.sh
ADD spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf
ADD run-sparkshell.sh $SPARK_HOME/run-sparkshell.sh
RUN cp $HADOOP_HOME/etc/hadoop/workers $SPARK_HOME/conf/slaves

COPY bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh
RUN chmod 700 /etc/bootstrap.sh
RUN chmod +x /usr/local/spark/run-sparkshell.sh

# Spark Web UI and History Server ports
EXPOSE 8080 18080

# Spark master port (standalone mode)
EXPOSE 7077

# install sbt
RUN apt-get install -y apt-transport-https
RUN echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
RUN apt-get update
RUN apt-get -y install sbt

ENTRYPOINT ["/etc/bootstrap.sh"]
--------------------------------------------------------------------------------
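A minimal sketch for building the image locally (the tag `kmubigdata/ubuntu-spark:latest` is what `run_spark.sh` expects, but any tag works as long as the two agree):

```bash
$ sudo docker build -t kmubigdata/ubuntu-spark:latest .
```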
/README.md:
--------------------------------------------------------------------------------
# ubuntu-spark

ubuntu 16.04, hadoop 3.1.1, spark 3.0.1

### create container
```bash
$ sudo docker run -dit --name [spark-container-name] --network [network-hadoop-container-is-connected] [image-name] /bin/bash
```
or
```bash
$ ./run_spark.sh [network-name] [spark-container-name]
```
If `./run_spark.sh` fails with a `Permission denied` error, make it executable:
```bash
$ chmod +x run_spark.sh
```

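A concrete end-to-end sketch, assuming a Hadoop container (for example one based on `kmubigdata/ubuntu-hadoop`) is already running on a user-defined bridge network; the network name `hadoop-net` and container name `spark0` are placeholders:

```bash
# create the shared network (skip if the hadoop container's network already exists)
$ sudo docker network create hadoop-net

# start the Spark container on that network
$ ./run_spark.sh hadoop-net spark0

# equivalent direct form
$ sudo docker run -dit --name spark0 --network hadoop-net kmubigdata/ubuntu-spark:latest /bin/bash
```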

### attach to the container
```bash
$ docker exec -it [spark-container-name] bash
```

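Once inside, a quick sanity check (a rough sketch; the paths and version come from the Dockerfile above):

```bash
# spark-submit --version             # should report Spark 3.0.1
# hadoop version
# echo $SPARK_HOME $HADOOP_CONF_DIR  # /usr/local/spark /usr/local/hadoop/etc/hadoop
```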

### run spark-shell in client mode
```bash
# spark-shell --master yarn --deploy-mode client
```
or
```bash
# cd /usr/local/spark/
# ./run-sparkshell.sh
```

### quit spark-shell
`:quit` or Ctrl+D

---

Check that the master and the slave nodes are connected; this lists all running nodes:
```bash
# yarn node -list
```

If spark-shell gets stuck, check the running applications and kill any unnecessary ones:
```bash
# yarn application -list
# yarn application -kill [application-id]
```

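To see why an application is stuck or has failed, its aggregated logs can be pulled (a sketch; this assumes YARN log aggregation is enabled on the cluster):

```bash
# yarn logs -applicationId [application-id]
```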
---

Install sbt, a build tool for Scala; see [sbt install](https://www.scala-sbt.org/1.0/docs/Installing-sbt-on-Linux.html).

```bash
# echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
# sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
# sudo apt-get update
# sudo apt-get install sbt
```

---

If `sudo apt-get update` fails with `E: The method driver /usr/lib/apt/methods/https could not be found.`, install the HTTPS transport:
```bash
# sudo apt-get install apt-transport-https
```
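With sbt installed, a typical workflow is to package an application and hand the resulting jar to `spark-submit`. A rough sketch, in which the project path, main class, and jar name are placeholders; Spark dependencies should be declared as `"provided"` in `build.sbt` so the cluster's own jars are used at runtime:

```bash
# cd /path/to/my-app    # an sbt project whose build.sbt marks Spark dependencies as "provided"
# sbt package           # emits something like target/scala-2.12/my-app_2.12-0.1.jar
# spark-submit --master yarn --deploy-mode client --class com.example.Main target/scala-2.12/my-app_2.12-0.1.jar
```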
--------------------------------------------------------------------------------
/Readme.txt:
--------------------------------------------------------------------------------
These files are for building the Docker image and creating containers easily.

run_spark.sh is a script for creating containers.

References:
https://hub.docker.com/r/sequenceiq/spark/
https://github.com/sequenceiq/docker-spark
--------------------------------------------------------------------------------
/bootstrap.sh:
--------------------------------------------------------------------------------
#!/bin/bash

: ${HADOOP_PREFIX:=/usr/local/hadoop}

# installing libraries if any - (resource urls added comma separated to the ACP system variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -

service ssh start
#$HADOOP_PREFIX/sbin/start-dfs.sh
#$HADOOP_PREFIX/sbin/start-yarn.sh

#hdfs dfs -put $SPARK_HOME/jars /spark
#echo spark.yarn.jars hdfs:///spark/*.jar > $SPARK_HOME/conf/spark-defaults.conf

# create the Spark event-log directories in HDFS
hdfs dfs -mkdir /spark/
hdfs dfs -mkdir /spark/shared-logs/

#spark.yarn.archive
#apt-get install zip
#cd /usr/local/spark/jars/ && zip /usr/local/spark/spark-jars.zip ./*
#hdfs dfs -put /usr/local/spark/spark-jars.zip /spark/

cp $SPARK_HOME/conf/metrics.properties.template $SPARK_HOME/conf/metrics.properties

# Create a user at startup if the NEW_USER environment variable is given
# EX: docker run -e NEW_USER=kmucs -e RSA_PUBLIC_KEY="...." ...
if [[ ! -z $NEW_USER ]];
then
  adduser --disabled-password --gecos "" "$NEW_USER" > /dev/null
  usermod -aG sudo "$NEW_USER" > /dev/null
  sudo -u "$NEW_USER" mkdir /home/"$NEW_USER"/.ssh
  sudo -u "$NEW_USER" chmod 700 /home/"$NEW_USER"/.ssh
  sudo -u "$NEW_USER" touch /home/"$NEW_USER"/.ssh/authorized_keys

  if [[ ! -z $RSA_PUBLIC_KEY ]];
  then
    sudo -u "$NEW_USER" echo "$RSA_PUBLIC_KEY" >> /home/"$NEW_USER"/.ssh/authorized_keys
  else
    sudo -u "$NEW_USER" cat /tmp/id_rsa.pub >> /home/"$NEW_USER"/.ssh/authorized_keys
  fi
  sudo -u "$NEW_USER" chmod 600 /home/"$NEW_USER"/.ssh/authorized_keys

  echo "export HADOOP_HOME=$HADOOP_HOME" >> /home/"$NEW_USER"/.bashrc
  echo "export SPARK_HOME=$SPARK_HOME" >> /home/"$NEW_USER"/.bashrc
  echo "export HADOOP_CONF_DIR=$HADOOP_CONF_DIR" >> /home/"$NEW_USER"/.bashrc

  echo "export PATH=\$PATH:$PATH" >> /home/"$NEW_USER"/.bashrc
  echo "export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:$LD_LIBRARY_PATH" >> /home/"$NEW_USER"/.bashrc
fi

# with "-d", run sshd in the foreground (keeps the container alive); otherwise run the given command
CMD=${1:-"exit 0"}
if [[ "$CMD" == "-d" ]];
then
  service ssh stop
  /usr/sbin/sshd -D -d
else
  /bin/bash -c "$*"
fi
--------------------------------------------------------------------------------
/run-sparkshell.sh:
--------------------------------------------------------------------------------
#!/bin/bash
spark-shell --master yarn --deploy-mode client
--------------------------------------------------------------------------------
/run_spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# usage: ./run_spark.sh [network-name] [spark-container-name]

sudo docker run -dit --name $2 --network $1 kmubigdata/ubuntu-spark:latest /bin/bash
--------------------------------------------------------------------------------
/spark-defaults.conf:
--------------------------------------------------------------------------------
spark.eventLog.enabled           true
spark.eventLog.dir               hdfs://master:9000/spark/shared-logs/
spark.history.fs.logDirectory    hdfs://master:9000/spark/shared-logs/
spark.driver.extraClassPath      :/usr/local/spark/extrajars/*
spark.executor.extraClassPath    :/usr/local/spark/extrajars/*
--------------------------------------------------------------------------------
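With `spark.eventLog.dir` and `spark.history.fs.logDirectory` both pointing at `hdfs://master:9000/spark/shared-logs/` (the directory created by `bootstrap.sh`), finished applications can be inspected through the Spark History Server on port 18080, which the Dockerfile exposes. A minimal sketch, run inside the container:

```bash
# $SPARK_HOME/sbin/start-history-server.sh    # web UI on port 18080
```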
/spark-env.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos

# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of executors to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)

# Options for the daemons used in the standalone deploy mode
# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers

# Generic options for the daemons used in the standalone deploy mode
# - SPARK_CONF_DIR      Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - SPARK_LOG_DIR       Where log files are stored. (Default: ${SPARK_HOME}/logs)
# - SPARK_PID_DIR       Where the pid file is stored. (Default: /tmp)
# - SPARK_IDENT_STRING  A string representing this instance of spark. (Default: $USER)
# - SPARK_NICENESS      The scheduling priority for daemons. (Default: 0)

export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_CLASSPATH=$SPARK_HOME/jars
export JAVA_HOME=/usr/java/default
--------------------------------------------------------------------------------