├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── conf
│   ├── hadoop
│   │   ├── core-site.xml
│   │   ├── hadoop-env.sh
│   │   ├── hdfs-site.xml
│   │   ├── mapred-site.xml
│   │   ├── workers
│   │   └── yarn-site.xml
│   ├── hive
│   │   └── hive-site.xml
│   └── spark
│       └── spark-defaults.conf
├── entrypoint.sh
└── scripts
    ├── parallel_commands.sh
    └── watchdir.c

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# OS garbage
.DS_Store
desktop.ini


# IDE garbage
.idea/

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Alpine 3.11 ships Python 3.8, but PySpark only supports Python up to 3.7,
# so stay on Alpine 3.10.
FROM alpine:3.10.4

# curl and unzip: download and extract Hive, Hadoop, Spark etc.
# bash: Hadoop is not compatible with Alpine's `ash` shell
# openjdk8: Java
# coreutils: Spark launcher script relies on the GNU implementation of `nice`
# procps: Hadoop needs the GNU `ps` utility
# findutils: Spark needs GNU `find` to run jobs (weird but true)
# ncurses: so that you can run `yarn top`
RUN apk add --no-cache \
    'curl=~7.66' \
    'unzip=~6.0' \
    'openjdk8=~8' \
    'bash=~5.0' \
    'coreutils=~8.31' \
    'procps=~3.3' \
    'findutils=~4.6' \
    'ncurses=~6.1' \
    'g++=~8.3' \
    'libc6-compat=~1.1' \
    && ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2

# https://github.com/hadolint/hadolint/wiki/DL4006
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Hadoop
ENV HADOOP_VERSION=3.2.0
ENV HADOOP_HOME /usr/hadoop
RUN curl --progress-bar -L --retry 3 \
    "http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
    | gunzip \
    | tar -x -C /usr/ \
    && mv "/usr/hadoop-${HADOOP_VERSION}" "${HADOOP_HOME}" \
    && rm -rf "${HADOOP_HOME}/share/doc" \
    && chown -R root:root "${HADOOP_HOME}"

# Hive
ENV HIVE_VERSION=3.1.2
ENV HIVE_HOME=/usr/hive
ENV HIVE_CONF_DIR="${HIVE_HOME}/conf"
ENV PATH "${PATH}:${HIVE_HOME}/bin"
RUN curl --progress-bar -L \
    "https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz" \
    | gunzip \
    | tar -x -C /usr/ \
    && mv "/usr/apache-hive-${HIVE_VERSION}-bin" "${HIVE_HOME}" \
    && chown -R root:root "${HIVE_HOME}" \
    && mkdir -p "${HIVE_HOME}/hcatalog/var/log" \
    && mkdir -p "${HIVE_HOME}/var/log" \
    && mkdir -p "${HIVE_CONF_DIR}" \
    && chmod 777 "${HIVE_HOME}/hcatalog/var/log" \
    && chmod 777 "${HIVE_HOME}/var/log"

# Spark
ENV SPARK_VERSION=2.4.5
ENV SPARK_PACKAGE "spark-${SPARK_VERSION}-bin-without-hadoop"
ENV SPARK_HOME /usr/spark
RUN curl --progress-bar -L --retry 3 \
    "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz" \
    | gunzip \
    | tar -x -C /usr/ \
    && mv "/usr/${SPARK_PACKAGE}" "${SPARK_HOME}" \
    && chown -R root:root "${SPARK_HOME}"
# For inscrutable reasons, the Spark distribution doesn't include spark-hive.jar.
# Livy attempts to load it though, and will throw
# java.lang.ClassNotFoundException: org.apache.spark.sql.hive.HiveContext
ARG SCALA_VERSION=2.11
RUN curl --progress-bar -L \
    "https://repo1.maven.org/maven2/org/apache/spark/spark-hive_${SCALA_VERSION}/${SPARK_VERSION}/spark-hive_${SCALA_VERSION}-${SPARK_VERSION}.jar" \
    --output "${SPARK_HOME}/jars/spark-hive_${SCALA_VERSION}-${SPARK_VERSION}.jar"

# PySpark - comment this out if you don't need it, to save image space
RUN apk add --no-cache \
    'python3=~3.7' \
    'python3-dev=~3.7' \
    && ln -s /usr/bin/python3 /usr/bin/python

# SparkR - comment this out if you don't need it, to save image space
RUN apk add --no-cache \
    'R=~3.6' \
    'R-dev=~3.6' \
    'libc-dev=~0.7' \
    && R -e 'install.packages("knitr", repos = "http://cran.us.r-project.org")'

# Common settings
ENV JAVA_HOME "/usr/lib/jvm/java-1.8-openjdk"
ENV PATH="${PATH}:${JAVA_HOME}/bin"
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Hadoop setup
ENV PATH="${PATH}:${HADOOP_HOME}/bin"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV LD_LIBRARY_PATH="${HADOOP_HOME}/lib/native:${LD_LIBRARY_PATH}"
ENV HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
ENV HADOOP_LOG_DIR="${HADOOP_HOME}/logs"
COPY conf/hadoop/core-site.xml "${HADOOP_CONF_DIR}"
COPY conf/hadoop/hadoop-env.sh "${HADOOP_CONF_DIR}"
COPY conf/hadoop/hdfs-site.xml "${HADOOP_CONF_DIR}"
COPY conf/hadoop/mapred-site.xml "${HADOOP_CONF_DIR}"
COPY conf/hadoop/workers "${HADOOP_CONF_DIR}"
COPY conf/hadoop/yarn-site.xml "${HADOOP_CONF_DIR}"

# Needed for S3 to work. Without this line you'll get a
# "Class org.apache.hadoop.fs.s3a.S3AFileSystem not found" exception
# when accessing S3 from Hadoop.
ENV HADOOP_CLASSPATH="${HADOOP_HOME}/share/hadoop/tools/lib/*"

# The Hadoop JVM crashes on Alpine when it tries to load native libraries.
# Solution? Delete them altogether.
# Alternatively, you can try to compile them yourself:
# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/NativeLibraries.html
RUN mkdir "${HADOOP_LOG_DIR}" \
    && rm -rf "${HADOOP_HOME}/lib/native"

# Hive setup
ENV PATH="${PATH}:${HIVE_HOME}/bin"
ENV HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HIVE_HOME}/lib/*"
COPY conf/hive/hive-site.xml "${HIVE_CONF_DIR}/"

# Spark setup
ENV PATH="${PATH}:${SPARK_HOME}/bin"
ENV SPARK_CONF_DIR="${SPARK_HOME}/conf"
ENV SPARK_LOG_DIR="${SPARK_HOME}/logs"
ENV SPARK_DIST_CLASSPATH="${HADOOP_CONF_DIR}:${HADOOP_HOME}/share/hadoop/tools/lib/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/hdfs:${HADOOP_HOME}/share/hadoop/hdfs/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/yarn:${HADOOP_HOME}/share/hadoop/yarn/lib/*:${HADOOP_HOME}/share/hadoop/yarn/*"
COPY conf/hadoop/core-site.xml "${SPARK_CONF_DIR}"/
COPY conf/hadoop/hdfs-site.xml "${SPARK_CONF_DIR}"/
COPY conf/spark/spark-defaults.conf "${SPARK_CONF_DIR}"/

# Spark with Hive
# TODO enable in Spark 3.0
#ENV SPARK_DIST_CLASSPATH=$SPARK_DIST_CLASSPATH:$HIVE_HOME/lib/*
#COPY conf/hive/hive-site.xml $SPARK_CONF_DIR/
#RUN ln -s $SPARK_HOME/jars/scala-library-*.jar $HIVE_HOME/lib \
#    && ln -s $SPARK_HOME/jars/spark-core_*.jar $HIVE_HOME/lib \
#    && ln -s $SPARK_HOME/jars/spark-network-common_*.jar $HIVE_HOME/lib

# Clean up
RUN rm -rf "${HIVE_HOME}/examples" \
    && rm -rf "${SPARK_HOME}/examples/src"

# Returns 0 if both the YARN web UI and the Spark UI are up, 1 otherwise.
HEALTHCHECK CMD curl -f http://host.docker.internal:8080/ \
    && curl -f http://host.docker.internal:8088/ || exit 1

# Multitail for logging
COPY scripts/ /scripts
RUN apk add --no-cache 'linux-headers=~4.19' \
    && gcc /scripts/watchdir.c -o /scripts/watchdir \
    && chmod +x /scripts/parallel_commands.sh

# Entry point: start all services and applications.
COPY entrypoint.sh /
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
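
# A minimal build-and-run sketch for a quick local smoke test. Illustrative only:
# the image tag and published ports are arbitrary, and the entrypoint's behaviour
# is driven by HADOOP_NODE, HIVE_CONFIGURE and SPARK_MASTER_ADDRESS (see
# entrypoint.sh). 8088 is the YARN ResourceManager UI, 8080 the Spark master UI.
#
#   docker build -t hadoop-hive-spark .
#   docker run -d --name master --hostname master \
#       -e HADOOP_NODE=namenode \
#       -p 8088:8088 -p 8080:8080 \
#       hadoop-hive-spark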

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License

Copyright (c) 2020 Vadim Panov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Big data playground: Hadoop + Hive + Spark

[![Docker Build Status](https://img.shields.io/docker/cloud/build/panovvv/hadoop-hive-spark.svg)](https://cloud.docker.com/repository/docker/panovvv/hadoop-hive-spark/builds)
[![Docker Pulls](https://img.shields.io/docker/pulls/panovvv/hadoop-hive-spark.svg)](https://hub.docker.com/r/panovvv/hadoop-hive-spark)
[![Docker Stars](https://img.shields.io/docker/stars/panovvv/hadoop-hive-spark.svg)](https://hub.docker.com/r/panovvv/hadoop-hive-spark)

Base Docker image with just the essentials: Hadoop, Hive and Spark.

## Software

* [Hadoop 3.2.0](http://hadoop.apache.org/docs/r3.2.0/) in Fully Distributed (Multi-node) Mode

* [Hive 3.1.2](http://hive.apache.org/) with HiveServer2 exposed to the host.

* [Spark 2.4.5](https://spark.apache.org/docs/2.4.5/) in YARN mode (Spark Scala, PySpark and SparkR)

## Usage

Take a look [at this repo](https://github.com/panovvv/bigdata-docker-compose)
to see how I use it as a part of a Docker Compose cluster.

The Hive JDBC port is exposed to the host:
* URI: `jdbc:hive2://localhost:10000`
* Driver: `org.apache.hive.jdbc.HiveDriver` (org.apache.hive:hive-jdbc:3.1.2)
* User and password: unused (see the Beeline example at the bottom of this README).

## Version compatibility notes
* Hadoop 3.2.1 and Hive 3.1.2 are incompatible due to a Guava version
  mismatch (Hadoop: Guava 27.0, Hive: Guava 19.0). Hive fails with
  `java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)`
* Spark 2.4.4 cannot
  [use Hive higher than 1.2.2 as a SparkSQL engine](https://spark.apache.org/docs/2.4.4/sql-data-sources-hive-tables.html)
  because of this bug: [Spark need to support reading data from Hive 2.0.0 metastore](https://issues.apache.org/jira/browse/SPARK-13446)
  and the associated issue [Dealing with TimeVars removed in Hive 2.x](https://issues.apache.org/jira/browse/SPARK-27349).
  Trying to make it work results in this exception:
  `java.lang.NoSuchFieldError: HIVE_STATS_JDBC_TIMEOUT`.
  Once this is fixed in Spark 3.0, it will be able to use Hive as a
  backend for SparkSQL. Alternatively, you can try downgrading Hive :)

## Maintaining

* Dockerfile linting: `docker run --rm -i hadolint/hadolint < Dockerfile`
* [To trim the fat from the Docker image](https://github.com/wagoodman/dive)

## TODO
* Upgrade Spark to 3.0
* When upgraded, enable Spark-Hive integration.
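
## Connecting to HiveServer2 from the host

A quick smoke test, assuming the Beeline CLI is available on the host (any JDBC
client works; the query below is just an illustration):

```bash
beeline -u jdbc:hive2://localhost:10000 -e 'SHOW DATABASES;'
```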

--------------------------------------------------------------------------------
/conf/hadoop/core-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>

    <property>
        <name>fs.default.name</name>
        <value>hdfs://master:9000</value>
    </property>

    <property>
        <name>hadoop.tmp.dir</name>
        <value></value>
    </property>

    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hadoop/hadoop-env.sh:
--------------------------------------------------------------------------------
# The java implementation to use. By default, this environment
# variable is REQUIRED on ALL platforms except OS X!
export JAVA_HOME=/usr/lib/jvm/java-1.8-openjdk

--------------------------------------------------------------------------------
/conf/hadoop/hdfs-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/dfs/name</value>
    </property>

    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/dfs/data</value>
    </property>

    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hadoop/mapred-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>

    <property>
        <name>yarn.app.mapreduce.am.resource.memory-mb</name>
        <value>512</value>
    </property>

    <property>
        <name>mapreduce.map.resource.memory-mb</name>
        <value>512</value>
    </property>

    <property>
        <name>mapreduce.reduce.resource.memory-mb</name>
        <value>2048</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hadoop/workers:
--------------------------------------------------------------------------------
worker1
worker2

--------------------------------------------------------------------------------
/conf/hadoop/yarn-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>

    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>

    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>8192</value>
    </property>

    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>4096</value>
    </property>
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>512</value>
    </property>

    <property>
        <name>yarn.scheduler.maximum-allocation-vcores</name>
        <value>4</value>
    </property>
    <property>
        <name>yarn.scheduler.minimum-allocation-vcores</name>
        <value>1</value>
    </property>

    <property>
        <name>yarn.resourcemanager.scheduler.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
    </property>

    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>

    <property>
        <description>To enable RM web ui2 application.</description>
        <name>yarn.webapp.ui2.enable</name>
        <value>true</value>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/hive/hive-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/usr/hive/warehouse</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>org.postgresql.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:postgresql://hivemetastore:5432/hivemetastoredb</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>postgres</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>new_password</value>
    </property>

    <property>
        <name>hive.execution.engine</name>
        <value>spark</value>
    </property>
    <property>
        <name>spark.yarn.jars</name>
        <value>hdfs://master:9000/spark-jars/*</value>
    </property>

    <property>
        <name>hive.server2.enable.doAs</name>
        <value>false</value>
        <description>
            Setting this property to true will have HiveServer2 execute
            Hive operations as the user making the calls to it.
        </description>
    </property>

</configuration>

--------------------------------------------------------------------------------
/conf/spark/spark-defaults.conf:
--------------------------------------------------------------------------------
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Port settings
spark.ui.port                   4040
spark.driver.port               7001
spark.blockManager.port         7002
spark.port.maxRetries           4
spark.history.ui.port           18080

# Run Spark jobs in YARN
spark.master                    yarn
spark.yarn.jars                 hdfs://master:9000/spark-jars/*

# Spark history server
spark.eventLog.enabled          true
spark.eventLog.dir              hdfs:///log/spark
spark.history.fs.logDirectory   hdfs:///log/spark

# Spark with Hive
# TODO enable when they remove HIVE_STATS_JDBC_TIMEOUT
# https://github.com/apache/spark/commit/1d95dea30788b9f64c5e304d908b85936aafb238#diff-842e3447fc453de26c706db1cac8f2c4
# https://issues.apache.org/jira/browse/SPARK-13446
#spark.sql.catalogImplementation hive
#spark.sql.hive.metastore.version 2.3.0
#spark.sql.hive.metastore.jars /usr/hive/lib/*:/usr/hadoop/etc/hadoop/*:/usr/hadoop/share/hadoop/common/*:/usr/hadoop/share/hadoop/common/lib/*:/usr/hadoop/share/hadoop/common/sources/*:/usr/hadoop/share/hadoop/hdfs/*:/usr/hadoop/share/hadoop/hdfs/lib/*:/usr/hadoop/share/hadoop/hdfs/sources/*:/usr/hadoop/share/hadoop/mapreduce/*:/usr/hadoop/share/hadoop/mapreduce/lib/*:/usr/hadoop/share/hadoop/mapreduce/sources/*:/usr/hadoop/share/hadoop/yarn/*:/usr/hadoop/share/hadoop/yarn/lib/*:/usr/hadoop/share/hadoop/yarn/sources/*:/usr/hadoop/share/hadoop/yarn/timelineservice/*:/usr/hadoop/share/hadoop/client/*:/usr/hadoop/share/hadoop/tools/lib/*:/usr/hadoop/share/hadoop/tools/sources/*
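
# A hypothetical submission against these defaults, for illustration only
# (the jar and class name below are placeholders; `yarn` master and the HDFS
# log/jar locations come from the settings above):
#   spark-submit --deploy-mode client --class org.example.YourApp your-app.jar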

--------------------------------------------------------------------------------
/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ -n "${HADOOP_DATANODE_UI_PORT}" ]; then
  echo "Replacing default datanode UI port 9864 with ${HADOOP_DATANODE_UI_PORT}"
  sed -i "$ i\<property><name>dfs.datanode.http.address</name><value>0.0.0.0:${HADOOP_DATANODE_UI_PORT}</value></property>" "${HADOOP_CONF_DIR}/hdfs-site.xml"
fi
if [ "${HADOOP_NODE}" == "namenode" ]; then
  echo "Starting Hadoop name node..."
  yes | hdfs namenode -format
  hdfs --daemon start namenode
  hdfs --daemon start secondarynamenode
  yarn --daemon start resourcemanager
  mapred --daemon start historyserver
fi
if [ "${HADOOP_NODE}" == "datanode" ]; then
  echo "Starting Hadoop data node..."
  hdfs --daemon start datanode
  yarn --daemon start nodemanager
fi

if [ -n "${HIVE_CONFIGURE}" ]; then
  echo "Configuring Hive..."
  schematool -dbType postgres -initSchema

  # Start metastore service.
  hive --service metastore &

  # JDBC Server.
  hiveserver2 &
fi

if [ -z "${SPARK_MASTER_ADDRESS}" ]; then
  echo "Starting Spark master node..."
  # Create directory for Spark logs
  SPARK_LOGS_HDFS_PATH=/log/spark
  if ! hadoop fs -test -d "${SPARK_LOGS_HDFS_PATH}"
  then
    hadoop fs -mkdir -p ${SPARK_LOGS_HDFS_PATH}
    hadoop fs -chmod -R 755 ${SPARK_LOGS_HDFS_PATH}
  fi

  # Spark on YARN
  SPARK_JARS_HDFS_PATH=/spark-jars
  if ! hadoop fs -test -d "${SPARK_JARS_HDFS_PATH}"
  then
    hadoop fs -copyFromLocal "${SPARK_HOME}/jars" "${SPARK_JARS_HDFS_PATH}"
  fi

  "${SPARK_HOME}/sbin/start-master.sh" -h master &
  "${SPARK_HOME}/sbin/start-history-server.sh" &
else
  echo "Starting Spark slave node..."
  "${SPARK_HOME}/sbin/start-slave.sh" "${SPARK_MASTER_ADDRESS}" &
fi

echo "All initializations finished!"

# Blocking call to view all logs. This is what keeps the container from exiting right away.
/scripts/parallel_commands.sh "scripts/watchdir ${HADOOP_LOG_DIR}" "scripts/watchdir ${SPARK_LOG_DIR}"

# Stop all services
if [ "${HADOOP_NODE}" == "namenode" ]; then
  hdfs --daemon stop namenode
  hdfs --daemon stop secondarynamenode
  yarn --daemon stop resourcemanager
  mapred --daemon stop historyserver
fi
if [ "${HADOOP_NODE}" == "datanode" ]; then
  hdfs --daemon stop datanode
  yarn --daemon stop nodemanager
fi

--------------------------------------------------------------------------------
/scripts/parallel_commands.sh:
--------------------------------------------------------------------------------
#!/bin/bash

for cmd in "$@"; do {
  echo "Process \"$cmd\" started";
  $cmd & pid=$!
  PID_LIST+=" $pid";
} done

trap "kill $PID_LIST" SIGINT

echo "Parallel processes have started";

wait $PID_LIST

echo
echo "All processes have completed";

--------------------------------------------------------------------------------
/scripts/watchdir.c:
--------------------------------------------------------------------------------
// Watch a directory and tail every regular file that appears or changes in it.
// Used by entrypoint.sh to stream Hadoop and Spark logs to stdout.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <limits.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/select.h>
#include <sys/inotify.h>
#define CHAR_BACK 500

// * File handler structure
struct file_followed { long last_position; char filename[NAME_MAX]; struct file_followed * next; };
struct file_followed * file_list = NULL;

// * To quit peacefully
int cycle = 1;
void stopCycle(int u) { cycle = 0; }

// * Last tailed filename
char last_tailed[NAME_MAX];

void fileAdd(char * file) {
  struct file_followed ** list = &file_list;
  struct stat statdesc;

  if(stat(file, &statdesc) || !S_ISREG(statdesc.st_mode)) { return; }
  while(*list) { list = &((*list)->next); }
  *list = (struct file_followed*)malloc(sizeof(struct file_followed));
  (*list)->last_position = -1;
  strcpy((*list)->filename, file);
  (*list)->next = NULL;
}

void fileMod(char* fileName, struct file_followed* file_list) {
  struct file_followed* item = file_list;
  while(item) {
    if(strcmp(item->filename, fileName) == 0) {
      FILE* fp = fopen(item->filename, "r");
      fseek(fp, 0, SEEK_END);
      long end_position = ftell(fp);
      fclose(fp);
      if (end_position <= item->last_position) {
        printf("\n** %s truncated **\n", fileName);
        item->last_position = -1;
      }
      usleep(100);
      return;
    }
    item = item->next;
  }
}

int fileTail(struct file_followed * item) {
  int ret = 0;
  FILE * fp = fopen(item->filename, "r");
  fseek(fp, 0, SEEK_END);
  long end_position = ftell(fp);

  if( end_position != item->last_position ) {
    if(strcmp(item->filename, last_tailed)) { strcpy(last_tailed, item->filename); printf("\n** %s **:\n", item->filename); }

    int start_position = item->last_position == -1 || item->last_position > end_position ? (end_position-CHAR_BACK > 0 ? end_position-CHAR_BACK : 0) : item->last_position;
    fseek(fp, start_position, SEEK_SET);

    int len = end_position - start_position;
    char * buf = (char*)malloc(len+1);
    fread(buf, len, 1, fp);
    buf[len] = '\0';
    printf("%s%s", len == CHAR_BACK ? "[...]" : "", buf);
    free(buf);

    item->last_position = end_position;
    ret = 1;
  }

  fclose(fp);
  return ret;
}

void fileRem(char * file) {
  struct file_followed ** list = &file_list;
  while(*list && strcmp((*list)->filename, file)) { list = &((*list)->next); }
  if(*list) { struct file_followed * todel = *list; *list = (*list)->next; free(todel); }
}

int main(int argc, char ** argv) {

  struct dirent **namelist;
  struct stat statdesc;
  struct timeval tv;
  fd_set set;
  int fd;
  int wd;
  int r;

  // * Help
  if(stat(argv[1], &statdesc) || !S_ISDIR(statdesc.st_mode)) { printf("[usage] %s dir-to-monitor\n", argv[0]); exit(EXIT_FAILURE); }

  // * Init
  chdir(argv[1]);
  memset(last_tailed, 0, sizeof(last_tailed));
  signal(SIGINT, stopCycle);
  signal(SIGTERM, stopCycle);

  // * Inotify
  if( (fd = inotify_init()) < 0) { perror("inotify_init"); }
  if( (wd = inotify_add_watch( fd, ".", IN_CREATE | IN_MODIFY | IN_DELETE )) < 0) { perror("inotify_add_watch"); }

  // * File add recursively on dirscan
  if( (r = scandir(".", &namelist, 0, alphasort)) < 0) { perror("scandir"); }
  while (r--) { fileAdd(namelist[r]->d_name); free(namelist[r]); }
  free(namelist);

  // * Neverending cycle
  while(cycle) {
    // * Select on inotify
    FD_ZERO(&set);
    FD_SET(fd, &set);
    tv.tv_sec = 0;
    tv.tv_usec = 1000;
    if( (r = select(fd+1, &set, NULL, NULL, &tv)) == -1) { perror("select"); }

    // * New add or del on inotify
    if(r) {
      struct inotify_event * event;
      char buf[1024];
      if(read(fd, buf, 1024) <= 0) { perror("read"); }
      event = (struct inotify_event *) buf;
      if(event->mask & IN_MODIFY) { fileMod(event->name, file_list); }
      else if(event->mask & IN_CREATE) { fileAdd(event->name); }
      else if(event->mask & IN_DELETE) { fileRem(event->name); }
    }

    // * Check for new tails
    struct file_followed * list = file_list;
    int tailers = 0;
    while(list) { tailers += fileTail(list); list = list->next; }
    if(!tailers) { usleep(500000); }
  }

  // * Stop inotify
  inotify_rm_watch( fd, wd );
  close(fd);

  return EXIT_SUCCESS;
}

--------------------------------------------------------------------------------