├── Dockerfile
├── README.md
├── Readme.txt
├── bootstrap.sh
├── run-sparkshell.sh
├── run_spark.sh
├── spark-defaults.conf
└── spark-env.sh

/Dockerfile:
--------------------------------------------------------------------------------
FROM kmubigdata/ubuntu-hadoop
MAINTAINER kimjeongchul

USER root

# scala
RUN apt-get update
RUN apt-get install -y scala

# python
RUN apt-get install -y python
RUN apt-get install -y python3

# spark 3.0.1 without Hadoop
RUN wget https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-without-hadoop.tgz
RUN tar -xvzf spark-3.0.1-bin-without-hadoop.tgz -C /usr/local
RUN cd /usr/local && ln -s ./spark-3.0.1-bin-without-hadoop spark
RUN rm -f /spark-3.0.1-bin-without-hadoop.tgz

# extra jars for S3A support
RUN mkdir /usr/local/spark/extrajars
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.11.860/aws-java-sdk-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/1.11.860/aws-java-sdk-s3-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/1.11.860/aws-java-sdk-core-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/1.11.860/aws-java-sdk-dynamodb-1.11.860.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.1.1/hadoop-aws-3.1.1.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/joda-time/joda-time/2.10.6/joda-time-2.10.6.jar
RUN wget -P /usr/local/spark/extrajars https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/4.5.9/httpclient-4.5.9.jar

# Hadoop environment
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV LD_LIBRARY_PATH=/usr/local/hadoop/lib/native/:$LD_LIBRARY_PATH

# Spark environment
ENV SPARK_HOME /usr/local/spark
ENV PATH $PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

ADD spark-env.sh $SPARK_HOME/conf/spark-env.sh
ADD spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf
ADD run-sparkshell.sh $SPARK_HOME/run-sparkshell.sh
RUN cp $HADOOP_HOME/etc/hadoop/workers $SPARK_HOME/conf/slaves

COPY bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh
RUN chmod 700 /etc/bootstrap.sh
RUN chmod +x /usr/local/spark/run-sparkshell.sh

# Spark Web UI and History Server ports
EXPOSE 8080 18080

# Spark master port (standalone mode)
EXPOSE 7077

# install sbt
RUN apt-get install -y apt-transport-https
RUN echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
RUN apt-get update
RUN apt-get -y install sbt

ENTRYPOINT ["/etc/bootstrap.sh"]
--------------------------------------------------------------------------------
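A minimal sketch for building the image locally (the tag `kmubigdata/ubuntu-spark:latest` is what `run_spark.sh` expects, but any tag works as long as the two agree):

```bash
$ sudo docker build -t kmubigdata/ubuntu-spark:latest .
```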
/README.md:
--------------------------------------------------------------------------------
# ubuntu-spark

ubuntu 16.04, hadoop 3.1.1, spark 3.0.1

### create container
```bash
$ sudo docker run -dit --name [spark-container-name] --network [network-hadoop-container-is-connected] [image-name] /bin/bash
```
or
```bash
$ ./run_spark.sh [network-name] [spark-container-name]
```
If `./run_spark.sh` fails with a `Permission denied` error, make it executable:
```bash
$ chmod +x run_spark.sh
```

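A concrete end-to-end sketch, assuming a Hadoop container (for example one based on `kmubigdata/ubuntu-hadoop`) is already running on a user-defined bridge network; the network name `hadoop-net` and container name `spark0` are placeholders:

```bash
# create the shared network (skip if the hadoop container's network already exists)
$ sudo docker network create hadoop-net

# start the Spark container on that network
$ ./run_spark.sh hadoop-net spark0

# equivalent direct form
$ sudo docker run -dit --name spark0 --network hadoop-net kmubigdata/ubuntu-spark:latest /bin/bash
```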

### attach to the container
```bash
$ docker exec -it [spark-container-name] bash
```

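Once inside, a quick sanity check (a rough sketch; the paths and version come from the Dockerfile above):

```bash
# spark-submit --version             # should report Spark 3.0.1
# hadoop version
# echo $SPARK_HOME $HADOOP_CONF_DIR  # /usr/local/spark /usr/local/hadoop/etc/hadoop
```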

### run spark-shell in client mode
```bash
# spark-shell --master yarn --deploy-mode client
```
or
```bash
# cd /usr/local/spark/
# ./run-sparkshell.sh
```

### quit spark-shell
`:quit` or Ctrl+D

---

Check that the master and the slave nodes are connected; this lists all running nodes:
```bash
# yarn node -list
```

If spark-shell gets stuck, check the running applications and kill any unnecessary ones:
```bash
# yarn application -list
# yarn application -kill [application-id]
```

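To see why an application is stuck or has failed, its aggregated logs can be pulled (a sketch; this assumes YARN log aggregation is enabled on the cluster):

```bash
# yarn logs -applicationId [application-id]
```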
---

Install sbt, a build tool for Scala; see [sbt install](https://www.scala-sbt.org/1.0/docs/Installing-sbt-on-Linux.html).

```bash
# echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
# sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
# sudo apt-get update
# sudo apt-get install sbt
```

---

If `sudo apt-get update` fails with `E: The method driver /usr/lib/apt/methods/https could not be found.`, install the HTTPS transport:
```bash
# sudo apt-get install apt-transport-https
```
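With sbt installed, a typical workflow is to package an application and hand the resulting jar to `spark-submit`. A rough sketch, in which the project path, main class, and jar name are placeholders; Spark dependencies should be declared as `"provided"` in `build.sbt` so the cluster's own jars are used at runtime:

```bash
# cd /path/to/my-app    # an sbt project whose build.sbt marks Spark dependencies as "provided"
# sbt package           # emits something like target/scala-2.12/my-app_2.12-0.1.jar
# spark-submit --master yarn --deploy-mode client --class com.example.Main target/scala-2.12/my-app_2.12-0.1.jar
```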
--------------------------------------------------------------------------------
/Readme.txt:
--------------------------------------------------------------------------------
These files are for building the Docker image and creating containers easily.

run_spark.sh is a script for creating containers.

References:
https://hub.docker.com/r/sequenceiq/spark/
https://github.com/sequenceiq/docker-spark
--------------------------------------------------------------------------------
/bootstrap.sh:
--------------------------------------------------------------------------------
#!/bin/bash

: ${HADOOP_PREFIX:=/usr/local/hadoop}

# installing libraries if any - (resource urls added comma separated to the ACP system variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -

service ssh start
#$HADOOP_PREFIX/sbin/start-dfs.sh
#$HADOOP_PREFIX/sbin/start-yarn.sh

#hdfs dfs -put $SPARK_HOME/jars /spark
#echo spark.yarn.jars hdfs:///spark/*.jar > $SPARK_HOME/conf/spark-defaults.conf

# create the Spark event-log directories in HDFS
hdfs dfs -mkdir /spark/
hdfs dfs -mkdir /spark/shared-logs/

#spark.yarn.archive
#apt-get install zip
#cd /usr/local/spark/jars/ && zip /usr/local/spark/spark-jars.zip ./*
#hdfs dfs -put /usr/local/spark/spark-jars.zip /spark/

cp $SPARK_HOME/conf/metrics.properties.template $SPARK_HOME/conf/metrics.properties

# Create a user at startup if the NEW_USER environment variable is given
# EX: docker run -e NEW_USER=kmucs -e RSA_PUBLIC_KEY="...." ...
if [[ ! -z $NEW_USER ]];
then
  adduser --disabled-password --gecos "" "$NEW_USER" > /dev/null
  usermod -aG sudo "$NEW_USER" > /dev/null
  sudo -u "$NEW_USER" mkdir /home/"$NEW_USER"/.ssh
  sudo -u "$NEW_USER" chmod 700 /home/"$NEW_USER"/.ssh
  sudo -u "$NEW_USER" touch /home/"$NEW_USER"/.ssh/authorized_keys

  if [[ ! -z $RSA_PUBLIC_KEY ]];
  then
    sudo -u "$NEW_USER" echo "$RSA_PUBLIC_KEY" >> /home/"$NEW_USER"/.ssh/authorized_keys
  else
    sudo -u "$NEW_USER" cat /tmp/id_rsa.pub >> /home/"$NEW_USER"/.ssh/authorized_keys
  fi
  sudo -u "$NEW_USER" chmod 600 /home/"$NEW_USER"/.ssh/authorized_keys

  echo "export HADOOP_HOME=$HADOOP_HOME" >> /home/"$NEW_USER"/.bashrc
  echo "export SPARK_HOME=$SPARK_HOME" >> /home/"$NEW_USER"/.bashrc
  echo "export HADOOP_CONF_DIR=$HADOOP_CONF_DIR" >> /home/"$NEW_USER"/.bashrc

  echo "export PATH=\$PATH:$PATH" >> /home/"$NEW_USER"/.bashrc
  echo "export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:$LD_LIBRARY_PATH" >> /home/"$NEW_USER"/.bashrc
fi

# with "-d", run sshd in the foreground (keeps the container alive); otherwise run the given command
CMD=${1:-"exit 0"}
if [[ "$CMD" == "-d" ]];
then
  service ssh stop
  /usr/sbin/sshd -D -d
else
  /bin/bash -c "$*"
fi
--------------------------------------------------------------------------------
/run-sparkshell.sh:
--------------------------------------------------------------------------------
#!/bin/bash
spark-shell --master yarn --deploy-mode client
--------------------------------------------------------------------------------
/run_spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# usage: ./run_spark.sh [network-name] [spark-container-name]

sudo docker run -dit --name $2 --network $1 kmubigdata/ubuntu-spark:latest /bin/bash
--------------------------------------------------------------------------------
/spark-defaults.conf:
--------------------------------------------------------------------------------
spark.eventLog.enabled           true
spark.eventLog.dir               hdfs://master:9000/spark/shared-logs/
spark.history.fs.logDirectory    hdfs://master:9000/spark/shared-logs/
spark.driver.extraClassPath      :/usr/local/spark/extrajars/*
spark.executor.extraClassPath    :/usr/local/spark/extrajars/*
--------------------------------------------------------------------------------
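With `spark.eventLog.dir` and `spark.history.fs.logDirectory` both pointing at `hdfs://master:9000/spark/shared-logs/` (the directory created by `bootstrap.sh`), finished applications can be inspected through the Spark History Server on port 18080, which the Dockerfile exposes. A minimal sketch, run inside the container:

```bash
# $SPARK_HOME/sbin/start-history-server.sh    # web UI on port 18080
```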
/spark-env.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos

# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of executors to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)

# Options for the daemons used in the standalone deploy mode
# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers

# Generic options for the daemons used in the standalone deploy mode
# - SPARK_CONF_DIR      Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - SPARK_LOG_DIR       Where log files are stored. (Default: ${SPARK_HOME}/logs)
# - SPARK_PID_DIR       Where the pid file is stored. (Default: /tmp)
# - SPARK_IDENT_STRING  A string representing this instance of spark. (Default: $USER)
# - SPARK_NICENESS      The scheduling priority for daemons. (Default: 0)

export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_CLASSPATH=$SPARK_HOME/jars
export JAVA_HOME=/usr/java/default
--------------------------------------------------------------------------------