├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── conf
│   ├── hadoop
│   │   ├── core-site.xml
│   │   ├── hadoop-env.sh
│   │   ├── hdfs-site.xml
│   │   ├── mapred-site.xml
│   │   ├── workers
│   │   └── yarn-site.xml
│   ├── hive
│   │   └── hive-site.xml
│   └── spark
│       └── spark-defaults.conf
├── entrypoint.sh
└── scripts
    ├── parallel_commands.sh
    └── watchdir.c

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# OS garbage
.DS_Store
desktop.ini


# IDE garbage
.idea/

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Alpine 3.11 ships Python 3.8, but PySpark only supports Python up to 3.7,
# so stay on Alpine 3.10.
FROM alpine:3.10.4

# curl and unzip: download and extract Hive, Hadoop, Spark etc.
# bash: Hadoop is not compatible with Alpine's `ash` shell
# openjdk8: Java
# coreutils: Spark launcher script relies on the GNU implementation of `nice`
# procps: Hadoop needs the GNU `ps` utility
# findutils: Spark needs GNU `find` to run jobs (weird but true)
# ncurses: so that you can run `yarn top`
RUN apk add --no-cache \
    'curl=~7.66' \
    'unzip=~6.0' \
    'openjdk8=~8' \
    'bash=~5.0' \
    'coreutils=~8.31' \
    'procps=~3.3' \
    'findutils=~4.6' \
    'ncurses=~6.1' \
    'g++=~8.3' \
    'libc6-compat=~1.1' \
    && ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2

# https://github.com/hadolint/hadolint/wiki/DL4006
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Hadoop
ENV HADOOP_VERSION=3.2.0
ENV HADOOP_HOME /usr/hadoop
RUN curl --progress-bar -L --retry 3 \
    "http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
    | gunzip \
    | tar -x -C /usr/ \
    && mv "/usr/hadoop-${HADOOP_VERSION}" "${HADOOP_HOME}" \
    && rm -rf "${HADOOP_HOME}/share/doc" \
    && chown -R root:root "${HADOOP_HOME}"

# Hive
ENV HIVE_VERSION=3.1.2
ENV HIVE_HOME=/usr/hive
ENV HIVE_CONF_DIR="${HIVE_HOME}/conf"
ENV PATH "${PATH}:${HIVE_HOME}/bin"
RUN curl --progress-bar -L \
    "https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz" \
    | gunzip \
    | tar -x -C /usr/ \
    && mv "/usr/apache-hive-${HIVE_VERSION}-bin" "${HIVE_HOME}" \
    && chown -R root:root "${HIVE_HOME}" \
    && mkdir -p "${HIVE_HOME}/hcatalog/var/log" \
    && mkdir -p "${HIVE_HOME}/var/log" \
    && mkdir -p "${HIVE_CONF_DIR}" \
    && chmod 777 "${HIVE_HOME}/hcatalog/var/log" \
    && chmod 777 "${HIVE_HOME}/var/log"

# Spark
ENV SPARK_VERSION=2.4.5
ENV SPARK_PACKAGE "spark-${SPARK_VERSION}-bin-without-hadoop"
ENV SPARK_HOME /usr/spark
RUN curl --progress-bar -L --retry 3 \
    "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz" \
    | gunzip \
    | tar -x -C /usr/ \
    && mv "/usr/${SPARK_PACKAGE}" "${SPARK_HOME}" \
    && chown -R root:root "${SPARK_HOME}"
# For inscrutable reasons, the Spark distribution doesn't include spark-hive.jar.
# Livy attempts to load it though, and will throw
# java.lang.ClassNotFoundException: org.apache.spark.sql.hive.HiveContext
ARG SCALA_VERSION=2.11
RUN curl --progress-bar -L \
    "https://repo1.maven.org/maven2/org/apache/spark/spark-hive_${SCALA_VERSION}/${SPARK_VERSION}/spark-hive_${SCALA_VERSION}-${SPARK_VERSION}.jar" \
    --output "${SPARK_HOME}/jars/spark-hive_${SCALA_VERSION}-${SPARK_VERSION}.jar"

# PySpark - comment this out if you don't need it, to save image space
RUN apk add --no-cache \
    'python3=~3.7' \
    'python3-dev=~3.7' \
    && ln -s /usr/bin/python3 /usr/bin/python

# SparkR - comment this out if you don't need it, to save image space
RUN apk add --no-cache \
    'R=~3.6' \
    'R-dev=~3.6' \
    'libc-dev=~0.7' \
    && R -e 'install.packages("knitr", repos = "http://cran.us.r-project.org")'

# Common settings
ENV JAVA_HOME "/usr/lib/jvm/java-1.8-openjdk"
ENV PATH="${PATH}:${JAVA_HOME}/bin"
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Hadoop setup
ENV PATH="${PATH}:${HADOOP_HOME}/bin"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV LD_LIBRARY_PATH="${HADOOP_HOME}/lib/native:${LD_LIBRARY_PATH}"
ENV HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
ENV HADOOP_LOG_DIR="${HADOOP_HOME}/logs"
COPY conf/hadoop/core-site.xml "${HADOOP_CONF_DIR}"
COPY conf/hadoop/hadoop-env.sh "${HADOOP_CONF_DIR}"
COPY conf/hadoop/hdfs-site.xml "${HADOOP_CONF_DIR}"
COPY conf/hadoop/mapred-site.xml "${HADOOP_CONF_DIR}"
COPY conf/hadoop/workers "${HADOOP_CONF_DIR}"
COPY conf/hadoop/yarn-site.xml "${HADOOP_CONF_DIR}"

# Needed for S3 to work. Without this line you'll get a
# "Class org.apache.hadoop.fs.s3a.S3AFileSystem not found" exception
# when accessing S3 from Hadoop.
ENV HADOOP_CLASSPATH="${HADOOP_HOME}/share/hadoop/tools/lib/*"

# The Hadoop JVM crashes on Alpine when it tries to load native libraries.
# Solution? Delete them altogether.
# Alternatively, you can try to compile them yourself:
# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/NativeLibraries.html
RUN mkdir "${HADOOP_LOG_DIR}" \
    && rm -rf "${HADOOP_HOME}/lib/native"

# Hive setup
ENV PATH="${PATH}:${HIVE_HOME}/bin"
ENV HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HIVE_HOME}/lib/*"
COPY conf/hive/hive-site.xml "${HIVE_CONF_DIR}/"

# Spark setup
ENV PATH="${PATH}:${SPARK_HOME}/bin"
ENV SPARK_CONF_DIR="${SPARK_HOME}/conf"
ENV SPARK_LOG_DIR="${SPARK_HOME}/logs"
ENV SPARK_DIST_CLASSPATH="${HADOOP_CONF_DIR}:${HADOOP_HOME}/share/hadoop/tools/lib/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/hdfs:${HADOOP_HOME}/share/hadoop/hdfs/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/yarn:${HADOOP_HOME}/share/hadoop/yarn/lib/*:${HADOOP_HOME}/share/hadoop/yarn/*"
COPY conf/hadoop/core-site.xml "${SPARK_CONF_DIR}"/
COPY conf/hadoop/hdfs-site.xml "${SPARK_CONF_DIR}"/
COPY conf/spark/spark-defaults.conf "${SPARK_CONF_DIR}"/

# Spark with Hive
# TODO enable in Spark 3.0
#ENV SPARK_DIST_CLASSPATH=$SPARK_DIST_CLASSPATH:$HIVE_HOME/lib/*
#COPY conf/hive/hive-site.xml $SPARK_CONF_DIR/
#RUN ln -s $SPARK_HOME/jars/scala-library-*.jar $HIVE_HOME/lib \
#    && ln -s $SPARK_HOME/jars/spark-core_*.jar $HIVE_HOME/lib \
#    && ln -s $SPARK_HOME/jars/spark-network-common_*.jar $HIVE_HOME/lib

# Clean up
RUN rm -rf "${HIVE_HOME}/examples" \
    && rm -rf "${SPARK_HOME}/examples/src"

# Returns 0 if both the YARN web UI and the Spark UI are up, 1 otherwise.
HEALTHCHECK CMD curl -f http://host.docker.internal:8080/ \
    && curl -f http://host.docker.internal:8088/ || exit 1

# Multitail for logging
COPY scripts/ /scripts
RUN apk add --no-cache 'linux-headers=~4.19' \
    && gcc /scripts/watchdir.c -o /scripts/watchdir \
    && chmod +x /scripts/parallel_commands.sh

# Entry point: start all services and applications.
COPY entrypoint.sh /
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
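
# A minimal build-and-run sketch for a quick local smoke test. Illustrative only:
# the image tag and published ports are arbitrary, and the entrypoint's behaviour
# is driven by HADOOP_NODE, HIVE_CONFIGURE and SPARK_MASTER_ADDRESS (see
# entrypoint.sh). 8088 is the YARN ResourceManager UI, 8080 the Spark master UI.
#
#   docker build -t hadoop-hive-spark .
#   docker run -d --name master --hostname master \
#       -e HADOOP_NODE=namenode \
#       -p 8088:8088 -p 8080:8080 \
#       hadoop-hive-spark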

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License

Copyright (c) 2020 Vadim Panov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Big data playground: Hadoop + Hive + Spark

[![Docker Build Status](https://img.shields.io/docker/cloud/build/panovvv/hadoop-hive-spark.svg)](https://cloud.docker.com/repository/docker/panovvv/hadoop-hive-spark/builds)
[![Docker Pulls](https://img.shields.io/docker/pulls/panovvv/hadoop-hive-spark.svg)](https://hub.docker.com/r/panovvv/hadoop-hive-spark)
[![Docker Stars](https://img.shields.io/docker/stars/panovvv/hadoop-hive-spark.svg)](https://hub.docker.com/r/panovvv/hadoop-hive-spark)

Base Docker image with just the essentials: Hadoop, Hive and Spark.

## Software

* [Hadoop 3.2.0](http://hadoop.apache.org/docs/r3.2.0/) in Fully Distributed (Multi-node) Mode

* [Hive 3.1.2](http://hive.apache.org/) with HiveServer2 exposed to the host.

* [Spark 2.4.5](https://spark.apache.org/docs/2.4.5/) in YARN mode (Spark Scala, PySpark and SparkR)

## Usage

Take a look [at this repo](https://github.com/panovvv/bigdata-docker-compose)
to see how I use it as a part of a Docker Compose cluster.

The Hive JDBC port is exposed to the host:
* URI: `jdbc:hive2://localhost:10000`
* Driver: `org.apache.hive.jdbc.HiveDriver` (org.apache.hive:hive-jdbc:3.1.2)
* User and password: unused (see the Beeline example at the bottom of this README).

## Version compatibility notes
* Hadoop 3.2.1 and Hive 3.1.2 are incompatible due to a Guava version
  mismatch (Hadoop: Guava 27.0, Hive: Guava 19.0). Hive fails with
  `java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)`
* Spark 2.4.4 cannot
  [use Hive higher than 1.2.2 as a SparkSQL engine](https://spark.apache.org/docs/2.4.4/sql-data-sources-hive-tables.html)
  because of this bug: [Spark need to support reading data from Hive 2.0.0 metastore](https://issues.apache.org/jira/browse/SPARK-13446)
  and the associated issue [Dealing with TimeVars removed in Hive 2.x](https://issues.apache.org/jira/browse/SPARK-27349).
  Trying to make it work results in this exception:
  `java.lang.NoSuchFieldError: HIVE_STATS_JDBC_TIMEOUT`.
  Once this is fixed in Spark 3.0, it will be able to use Hive as a
  backend for SparkSQL. Alternatively, you can try downgrading Hive :)

## Maintaining

* Dockerfile linting: `docker run --rm -i hadolint/hadolint < Dockerfile`
* [To trim the fat from the Docker image](https://github.com/wagoodman/dive)

## TODO
* Upgrade Spark to 3.0
* When upgraded, enable Spark-Hive integration.
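
## Connecting to HiveServer2 from the host

A quick smoke test, assuming the Beeline CLI is available on the host (any JDBC
client works; the query below is just an illustration):

```bash
beeline -u jdbc:hive2://localhost:10000 -e 'SHOW DATABASES;'
```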

--------------------------------------------------------------------------------
/conf/hadoop/core-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>

    <property>
        <name>fs.default.name</name>
        <value>hdfs://master:9000</value>
    </property>

    <property>
        <name>hadoop.tmp.dir</name>
        <value></value>
    </property>

    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hadoop/hadoop-env.sh:
--------------------------------------------------------------------------------
# The java implementation to use. By default, this environment
# variable is REQUIRED on ALL platforms except OS X!
export JAVA_HOME=/usr/lib/jvm/java-1.8-openjdk

--------------------------------------------------------------------------------
/conf/hadoop/hdfs-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/dfs/name</value>
    </property>

    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/dfs/data</value>
    </property>

    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hadoop/mapred-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>

    <property>
        <name>yarn.app.mapreduce.am.resource.memory-mb</name>
        <value>512</value>
    </property>

    <property>
        <name>mapreduce.map.resource.memory-mb</name>
        <value>512</value>
    </property>

    <property>
        <name>mapreduce.reduce.resource.memory-mb</name>
        <value>2048</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hadoop/workers:
--------------------------------------------------------------------------------
worker1
worker2

--------------------------------------------------------------------------------
/conf/hadoop/yarn-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>

    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>

    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>8192</value>
    </property>

    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>4096</value>
    </property>
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>512</value>
    </property>

    <property>
        <name>yarn.scheduler.maximum-allocation-vcores</name>
        <value>4</value>
    </property>
    <property>
        <name>yarn.scheduler.minimum-allocation-vcores</name>
        <value>1</value>
    </property>

    <property>
        <name>yarn.resourcemanager.scheduler.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
    </property>

    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>

    <property>
        <description>To enable RM web ui2 application.</description>
        <name>yarn.webapp.ui2.enable</name>
        <value>true</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hive/hive-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/usr/hive/warehouse</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>org.postgresql.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:postgresql://hivemetastore:5432/hivemetastoredb</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>postgres</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>new_password</value>
    </property>

    <property>
        <name>hive.execution.engine</name>
        <value>spark</value>
    </property>
    <property>
        <name>spark.yarn.jars</name>
        <value>hdfs://master:9000/spark-jars/*</value>
    </property>

    <property>
        <name>hive.server2.enable.doAs</name>
        <value>false</value>
        <description>
            Setting this property to true will have HiveServer2 execute
            Hive operations as the user making the calls to it.
        </description>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/spark/spark-defaults.conf:
--------------------------------------------------------------------------------
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Port settings
spark.ui.port                   4040
spark.driver.port               7001
spark.blockManager.port         7002
spark.port.maxRetries           4
spark.history.ui.port           18080

# Run Spark jobs in YARN
spark.master                    yarn
spark.yarn.jars                 hdfs://master:9000/spark-jars/*

# Spark history server
spark.eventLog.enabled          true
spark.eventLog.dir              hdfs:///log/spark
spark.history.fs.logDirectory   hdfs:///log/spark

# Spark with Hive
# TODO enable when they remove HIVE_STATS_JDBC_TIMEOUT
# https://github.com/apache/spark/commit/1d95dea30788b9f64c5e304d908b85936aafb238#diff-842e3447fc453de26c706db1cac8f2c4
# https://issues.apache.org/jira/browse/SPARK-13446
#spark.sql.catalogImplementation hive
#spark.sql.hive.metastore.version 2.3.0
#spark.sql.hive.metastore.jars /usr/hive/lib/*:/usr/hadoop/etc/hadoop/*:/usr/hadoop/share/hadoop/common/*:/usr/hadoop/share/hadoop/common/lib/*:/usr/hadoop/share/hadoop/common/sources/*:/usr/hadoop/share/hadoop/hdfs/*:/usr/hadoop/share/hadoop/hdfs/lib/*:/usr/hadoop/share/hadoop/hdfs/sources/*:/usr/hadoop/share/hadoop/mapreduce/*:/usr/hadoop/share/hadoop/mapreduce/lib/*:/usr/hadoop/share/hadoop/mapreduce/sources/*:/usr/hadoop/share/hadoop/yarn/*:/usr/hadoop/share/hadoop/yarn/lib/*:/usr/hadoop/share/hadoop/yarn/sources/*:/usr/hadoop/share/hadoop/yarn/timelineservice/*:/usr/hadoop/share/hadoop/client/*:/usr/hadoop/share/hadoop/tools/lib/*:/usr/hadoop/share/hadoop/tools/sources/*
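
# A hypothetical submission against these defaults, for illustration only
# (the jar and class name below are placeholders; `yarn` master and the HDFS
# log/jar locations come from the settings above):
#   spark-submit --deploy-mode client --class org.example.YourApp your-app.jar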

--------------------------------------------------------------------------------
/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ -n "${HADOOP_DATANODE_UI_PORT}" ]; then
  echo "Replacing default datanode UI port 9864 with ${HADOOP_DATANODE_UI_PORT}"
  sed -i "$ i\<property><name>dfs.datanode.http.address</name><value>0.0.0.0:${HADOOP_DATANODE_UI_PORT}</value></property>" "${HADOOP_CONF_DIR}/hdfs-site.xml"
fi
if [ "${HADOOP_NODE}" == "namenode" ]; then
  echo "Starting Hadoop name node..."
  yes | hdfs namenode -format
  hdfs --daemon start namenode
  hdfs --daemon start secondarynamenode
  yarn --daemon start resourcemanager
  mapred --daemon start historyserver
fi
if [ "${HADOOP_NODE}" == "datanode" ]; then
  echo "Starting Hadoop data node..."
  hdfs --daemon start datanode
  yarn --daemon start nodemanager
fi

if [ -n "${HIVE_CONFIGURE}" ]; then
  echo "Configuring Hive..."
  schematool -dbType postgres -initSchema

  # Start metastore service.
  hive --service metastore &

  # JDBC Server.
  hiveserver2 &
fi

if [ -z "${SPARK_MASTER_ADDRESS}" ]; then
  echo "Starting Spark master node..."
  # Create directory for Spark logs
  SPARK_LOGS_HDFS_PATH=/log/spark
  if ! hadoop fs -test -d "${SPARK_LOGS_HDFS_PATH}"
  then
    hadoop fs -mkdir -p ${SPARK_LOGS_HDFS_PATH}
    hadoop fs -chmod -R 755 ${SPARK_LOGS_HDFS_PATH}
  fi

  # Spark on YARN
  SPARK_JARS_HDFS_PATH=/spark-jars
  if ! hadoop fs -test -d "${SPARK_JARS_HDFS_PATH}"
  then
    hadoop fs -copyFromLocal "${SPARK_HOME}/jars" "${SPARK_JARS_HDFS_PATH}"
  fi

  "${SPARK_HOME}/sbin/start-master.sh" -h master &
  "${SPARK_HOME}/sbin/start-history-server.sh" &
else
  echo "Starting Spark slave node..."
  "${SPARK_HOME}/sbin/start-slave.sh" "${SPARK_MASTER_ADDRESS}" &
fi

echo "All initializations finished!"

# Blocking call to view all logs. This is what keeps the container from exiting right away.
/scripts/parallel_commands.sh "scripts/watchdir ${HADOOP_LOG_DIR}" "scripts/watchdir ${SPARK_LOG_DIR}"

# Stop all services
if [ "${HADOOP_NODE}" == "namenode" ]; then
  hdfs --daemon stop namenode
  hdfs --daemon stop secondarynamenode
  yarn --daemon stop resourcemanager
  mapred --daemon stop historyserver
fi
if [ "${HADOOP_NODE}" == "datanode" ]; then
  hdfs --daemon stop datanode
  yarn --daemon stop nodemanager
fi

--------------------------------------------------------------------------------
/scripts/parallel_commands.sh:
--------------------------------------------------------------------------------
#!/bin/bash

for cmd in "$@"; do {
  echo "Process \"$cmd\" started";
  $cmd & pid=$!
  PID_LIST+=" $pid";
} done

trap "kill $PID_LIST" SIGINT

echo "Parallel processes have started";

wait $PID_LIST

echo
echo "All processes have completed";

--------------------------------------------------------------------------------
/scripts/watchdir.c:
--------------------------------------------------------------------------------
// Watch a directory and tail every regular file that appears or changes in it.
// Used by entrypoint.sh to stream Hadoop and Spark logs to stdout.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <limits.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/select.h>
#include <sys/inotify.h>
#define CHAR_BACK 500

// * File handler structure
struct file_followed { long last_position; char filename[NAME_MAX]; struct file_followed * next; };
struct file_followed * file_list = NULL;

// * To quit peacefully
int cycle = 1;
void stopCycle(int u) { cycle = 0; }

// * Last tailed filename
char last_tailed[NAME_MAX];

void fileAdd(char * file) {
  struct file_followed ** list = &file_list;
  struct stat statdesc;

  if(stat(file, &statdesc) || !S_ISREG(statdesc.st_mode)) { return; }
  while(*list) { list = &((*list)->next); }
  *list = (struct file_followed*)malloc(sizeof(struct file_followed));
  (*list)->last_position = -1;
  strcpy((*list)->filename, file);
  (*list)->next = NULL;
}

void fileMod(char* fileName, struct file_followed* file_list) {
  struct file_followed* item = file_list;
  while(item) {
    if(strcmp(item->filename, fileName) == 0) {
      FILE* fp = fopen(item->filename, "r");
      fseek(fp, 0, SEEK_END);
      long end_position = ftell(fp);
      fclose(fp);
      if (end_position <= item->last_position) {
        printf("\n** %s truncated **\n", fileName);
        item->last_position = -1;
      }
      usleep(100);
      return;
    }
    item = item->next;
  }
}

int fileTail(struct file_followed * item) {
  int ret = 0;
  FILE * fp = fopen(item->filename, "r");
  fseek(fp, 0, SEEK_END);
  long end_position = ftell(fp);

  if( end_position != item->last_position ) {
    if(strcmp(item->filename, last_tailed)) { strcpy(last_tailed, item->filename); printf("\n** %s **:\n", item->filename); }

    int start_position = item->last_position == -1 || item->last_position > end_position ? (end_position-CHAR_BACK > 0 ? end_position-CHAR_BACK : 0) : item->last_position;
    fseek(fp, start_position, SEEK_SET);

    int len = end_position - start_position;
    char * buf = (char*)malloc(len+1);
    fread(buf, len, 1, fp);
    buf[len] = '\0';
    printf("%s%s", len == CHAR_BACK ? "[...]" : "", buf);
    free(buf);

    item->last_position = end_position;
    ret = 1;
  }

  fclose(fp);
  return ret;
}

void fileRem(char * file) {
  struct file_followed ** list = &file_list;
  while(*list && strcmp((*list)->filename, file)) { list = &((*list)->next); }
  if(*list) { struct file_followed * todel = *list; *list = (*list)->next; free(todel); }
}

int main(int argc, char ** argv) {

  struct dirent **namelist;
  struct stat statdesc;
  struct timeval tv;
  fd_set set;
  int fd;
  int wd;
  int r;

  // * Help
  if(stat(argv[1], &statdesc) || !S_ISDIR(statdesc.st_mode)) { printf("[usage] %s dir-to-monitor\n", argv[0]); exit(EXIT_FAILURE); }

  // * Init
  chdir(argv[1]);
  memset(last_tailed, 0, sizeof(last_tailed));
  signal(SIGINT, stopCycle);
  signal(SIGTERM, stopCycle);

  // * Inotify
  if( (fd = inotify_init()) < 0) { perror("inotify_init"); }
  if( (wd = inotify_add_watch( fd, ".", IN_CREATE | IN_MODIFY | IN_DELETE )) < 0) { perror("inotify_add_watch"); }

  // * File add recursively on dirscan
  if( (r = scandir(".", &namelist, 0, alphasort)) < 0) { perror("scandir"); }
  while (r--) { fileAdd(namelist[r]->d_name); free(namelist[r]); }
  free(namelist);

  // * Neverending cycle
  while(cycle) {
    // * Select on inotify
    FD_ZERO(&set);
    FD_SET(fd, &set);
    tv.tv_sec = 0;
    tv.tv_usec = 1000;
    if( (r = select(fd+1, &set, NULL, NULL, &tv)) == -1) { perror("select"); }

    // * New add or del on inotify
    if(r) {
      struct inotify_event * event;
      char buf[1024];
      if(read(fd, buf, 1024) <= 0) { perror("read"); }
      event = (struct inotify_event *) buf;
      if(event->mask & IN_MODIFY) { fileMod(event->name, file_list); }
      else if(event->mask & IN_CREATE) { fileAdd(event->name); }
      else if(event->mask & IN_DELETE) { fileRem(event->name); }
    }

    // * Check for new tails
    struct file_followed * list = file_list;
    int tailers = 0;
    while(list) { tailers += fileTail(list); list = list->next; }
    if(!tailers) { usleep(500000); }
  }

  // * Stop inotify
  inotify_rm_watch( fd, wd );
  close(fd);

  return EXIT_SUCCESS;
}

--------------------------------------------------------------------------------