├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── conf
│   ├── hadoop
│   │   ├── core-site.xml
│   │   ├── hadoop-env.sh
│   │   ├── hdfs-site.xml
│   │   ├── mapred-site.xml
│   │   ├── workers
│   │   └── yarn-site.xml
│   ├── hive
│   │   └── hive-site.xml
│   └── spark
│       └── spark-defaults.conf
├── entrypoint.sh
└── scripts
    ├── parallel_commands.sh
    └── watchdir.c
/.gitignore:
--------------------------------------------------------------------------------
1 | # OS garbage
2 | .DS_Store
3 | desktop.ini
4 |
5 |
6 | # IDE garbage
7 | .idea/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Alpine 3.11 ships Python 3.8, but PySpark only supports Python up to 3.7, so stay on Alpine 3.10
2 | FROM alpine:3.10.4
3 |
4 | # curl and unzip: download and extract Hive, Hadoop, Spark etc.
5 | # bash: Hadoop is not compatible with Alpine's `ash` shell
6 | # openjdk8: Java
7 | # coreutils: Spark launcher script relies on GNU implementation of `nice`
8 | # procps: Hadoop needs GNU `ps` utility
9 | # findutils: Spark needs GNU `find` to run jobs (weird but true)
10 | # ncurses: so that you can run `yarn top`
11 | RUN apk add --no-cache \
12 | 'curl=~7.66' \
13 | 'unzip=~6.0' \
14 | 'openjdk8=~8' \
15 | 'bash=~5.0' \
16 | 'coreutils=~8.31' \
17 | 'procps=~3.3' \
18 | 'findutils=~4.6' \
19 | 'ncurses=~6.1' \
20 | 'g++=~8.3' \
21 | 'libc6-compat=~1.1' \
22 | && ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2
23 |
24 | # https://github.com/hadolint/hadolint/wiki/DL4006
25 | SHELL ["/bin/bash", "-o", "pipefail", "-c"]
26 |
27 | # Hadoop
28 | ENV HADOOP_VERSION=3.2.0
29 | ENV HADOOP_HOME /usr/hadoop
30 | RUN curl --progress-bar -L --retry 3 \
31 | "http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
32 | | gunzip \
33 | | tar -x -C /usr/ \
34 | && mv "/usr/hadoop-${HADOOP_VERSION}" "${HADOOP_HOME}" \
35 | && rm -rf "${HADOOP_HOME}/share/doc" \
36 | && chown -R root:root "${HADOOP_HOME}"
37 |
38 | # Hive
39 | ENV HIVE_VERSION=3.1.2
40 | ENV HIVE_HOME=/usr/hive
41 | ENV HIVE_CONF_DIR="${HIVE_HOME}/conf"
42 | ENV PATH "${PATH}:${HIVE_HOME}/bin"
43 | RUN curl --progress-bar -L \
44 | "https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz" \
45 | | gunzip \
46 | | tar -x -C /usr/ \
47 | && mv "/usr/apache-hive-${HIVE_VERSION}-bin" "${HIVE_HOME}" \
48 | && chown -R root:root "${HIVE_HOME}" \
49 | && mkdir -p "${HIVE_HOME}/hcatalog/var/log" \
50 | && mkdir -p "${HIVE_HOME}/var/log" \
51 | && mkdir -p "${HIVE_CONF_DIR}" \
52 | && chmod 777 "${HIVE_HOME}/hcatalog/var/log" \
53 | && chmod 777 "${HIVE_HOME}/var/log"
54 |
55 | # Spark
56 | ENV SPARK_VERSION=2.4.5
57 | ENV SPARK_PACKAGE "spark-${SPARK_VERSION}-bin-without-hadoop"
58 | ENV SPARK_HOME /usr/spark
59 | RUN curl --progress-bar -L --retry 3 \
60 | "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz" \
61 | | gunzip \
62 | | tar -x -C /usr/ \
63 | && mv "/usr/${SPARK_PACKAGE}" "${SPARK_HOME}" \
64 | && chown -R root:root "${SPARK_HOME}"
65 | # For inscrutable reasons, the Spark distribution doesn't include spark-hive.jar.
66 | # Livy attempts to load it though, and will throw
67 | # java.lang.ClassNotFoundException: org.apache.spark.sql.hive.HiveContext
68 | ARG SCALA_VERSION=2.11
69 | RUN curl --progress-bar -L \
70 | "https://repo1.maven.org/maven2/org/apache/spark/spark-hive_${SCALA_VERSION}/${SPARK_VERSION}/spark-hive_${SCALA_VERSION}-${SPARK_VERSION}.jar" \
71 | --output "${SPARK_HOME}/jars/spark-hive_${SCALA_VERSION}-${SPARK_VERSION}.jar"
72 |
73 | # PySpark - comment this section out if you don't need it, to save image space
74 | RUN apk add --no-cache \
75 | 'python3=~3.7' \
76 | 'python3-dev=~3.7' \
77 | && ln -s /usr/bin/python3 /usr/bin/python
78 |
79 | # SparkR - comment this section out if you don't need it, to save image space
80 | RUN apk add --no-cache \
81 | 'R=~3.6' \
82 | 'R-dev=~3.6' \
83 | 'libc-dev=~0.7' \
84 | && R -e 'install.packages("knitr", repos = "http://cran.us.r-project.org")'
85 |
86 | # Common settings
87 | ENV JAVA_HOME "/usr/lib/jvm/java-1.8-openjdk"
88 | ENV PATH="${PATH}:${JAVA_HOME}/bin"
89 | # http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
90 | ENV PYTHONHASHSEED 0
91 | ENV PYTHONIOENCODING UTF-8
92 | ENV PIP_DISABLE_PIP_VERSION_CHECK 1
93 |
94 | # Hadoop setup
95 | ENV PATH="${PATH}:${HADOOP_HOME}/bin"
96 | ENV HDFS_NAMENODE_USER="root"
97 | ENV HDFS_DATANODE_USER="root"
98 | ENV HDFS_SECONDARYNAMENODE_USER="root"
99 | ENV YARN_RESOURCEMANAGER_USER="root"
100 | ENV YARN_NODEMANAGER_USER="root"
101 | ENV LD_LIBRARY_PATH="${HADOOP_HOME}/lib/native:${LD_LIBRARY_PATH}"
102 | ENV HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
103 | ENV HADOOP_LOG_DIR="${HADOOP_HOME}/logs"
104 | COPY conf/hadoop/core-site.xml "${HADOOP_CONF_DIR}"
105 | COPY conf/hadoop/hadoop-env.sh "${HADOOP_CONF_DIR}"
106 | COPY conf/hadoop/hdfs-site.xml "${HADOOP_CONF_DIR}"
107 | COPY conf/hadoop/mapred-site.xml "${HADOOP_CONF_DIR}"
108 | COPY conf/hadoop/workers "${HADOOP_CONF_DIR}"
109 | COPY conf/hadoop/yarn-site.xml "${HADOOP_CONF_DIR}"
110 |
111 | # Needed for S3 access. Without this line you'll get a "Class org.apache.hadoop.fs.s3a.S3AFileSystem not found" exception when accessing S3 from Hadoop.
112 | ENV HADOOP_CLASSPATH="${HADOOP_HOME}/share/hadoop/tools/lib/*"
113 |
114 | # Hadoop JVM crashes on Alpine when it tries to load native libraries.
115 | # Solution? Delete those altogether.
116 | # Alternatively, you can try and compile them
117 | # https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/NativeLibraries.html
118 | RUN mkdir "${HADOOP_LOG_DIR}" \
119 | && rm -rf "${HADOOP_HOME}/lib/native"
120 |
121 | # Hive setup
122 | ENV PATH="${PATH}:${HIVE_HOME}/bin"
123 | ENV HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HIVE_HOME}/lib/*"
124 | COPY conf/hive/hive-site.xml "${HIVE_CONF_DIR}/"
125 |
126 | # Spark setup
127 | ENV PATH="${PATH}:${SPARK_HOME}/bin"
128 | ENV SPARK_CONF_DIR="${SPARK_HOME}/conf"
129 | ENV SPARK_LOG_DIR="${SPARK_HOME}/logs"
130 | ENV SPARK_DIST_CLASSPATH="${HADOOP_CONF_DIR}:${HADOOP_HOME}/share/hadoop/tools/lib/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/hdfs:${HADOOP_HOME}/share/hadoop/hdfs/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/yarn:${HADOOP_HOME}/share/hadoop/yarn/lib/*:${HADOOP_HOME}/share/hadoop/yarn/*"
131 | COPY conf/hadoop/core-site.xml "${SPARK_CONF_DIR}"/
132 | COPY conf/hadoop/hdfs-site.xml "${SPARK_CONF_DIR}"/
133 | COPY conf/spark/spark-defaults.conf "${SPARK_CONF_DIR}"/
134 |
135 | # Spark with Hive
136 | # TODO enable in Spark 3.0
137 | #ENV SPARK_DIST_CLASSPATH=$SPARK_DIST_CLASSPATH:$HIVE_HOME/lib/*
138 | #COPY conf/hive/hive-site.xml $SPARK_CONF_DIR/
139 | #RUN ln -s $SPARK_HOME/jars/scala-library-*.jar $HIVE_HOME/lib \
140 | # && ln -s $SPARK_HOME/jars/spark-core_*.jar $HIVE_HOME/lib \
141 | # && ln -s $SPARK_HOME/jars/spark-network-common_*.jar $HIVE_HOME/lib
142 |
143 | # Clean up
144 | RUN rm -rf "${HIVE_HOME}/examples" \
145 | && rm -rf "${SPARK_HOME}/examples/src"
146 |
147 | # Returns 0 if both the YARN web UI and the Spark master UI are up, 1 otherwise.
148 | HEALTHCHECK CMD curl -f http://host.docker.internal:8080/ \
149 | && curl -f http://host.docker.internal:8088/ || exit 1
150 |
151 | # Multitail for logging
152 | COPY scripts/ /scripts
153 | RUN apk add --no-cache 'linux-headers=~4.19' \
154 | && gcc /scripts/watchdir.c -o /scripts/watchdir \
155 | && chmod +x /scripts/parallel_commands.sh
156 |
157 | # Entry point: start all services and applications.
158 | COPY entrypoint.sh /
159 | RUN chmod +x /entrypoint.sh
160 | ENTRYPOINT ["/entrypoint.sh"]
161 |
--------------------------------------------------------------------------------
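
A rough sketch of building this image and starting the master container by hand. The `hadoop-hive-spark` tag and `master` container name are arbitrary; the environment variables come from entrypoint.sh further down, and a full deployment also needs the `hivemetastore` Postgres database from conf/hive/hive-site.xml plus the `worker1`/`worker2` hosts listed in conf/hadoop/workers (the Docker Compose repo linked in the README wires all of that together):

    docker build -t hadoop-hive-spark .

    # Master: HDFS namenode, YARN resourcemanager, Hive metastore + HiveServer2, Spark master.
    # The hostname must be "master" because the bundled configs point at hdfs://master:9000.
    docker run -d --name master --hostname master \
        -e HADOOP_NODE=namenode \
        -e HIVE_CONFIGURE=yes \
        -p 8088:8088 -p 8080:8080 -p 10000:10000 \
        hadoop-hive-spark
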
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2020 Vadim Panov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Big data playground: Hadoop + Hive + Spark
2 |
3 | [Docker build status](https://cloud.docker.com/repository/docker/panovvv/hadoop-hive-spark/builds)
4 | [Docker Hub](https://hub.docker.com/r/panovvv/hadoop-hive-spark)
5 |
6 |
7 | Base Docker image with just the essentials: Hadoop, Hive and Spark.
8 |
9 | ## Software
10 |
11 | * [Hadoop 3.2.0](http://hadoop.apache.org/docs/r3.2.0/) in Fully Distributed (Multi-node) Mode
12 |
13 | * [Hive 3.1.2](http://hive.apache.org/) with HiveServer2 exposed to host.
14 |
15 | * [Spark 2.4.5](https://spark.apache.org/docs/2.4.5/) in YARN mode (Spark Scala, PySpark and SparkR)
16 |
17 | ## Usage
18 |
19 | Take a look [at this repo](https://github.com/panovvv/bigdata-docker-compose)
20 | to see how I use it as a part of a Docker Compose cluster.
21 |
22 | The Hive JDBC port is exposed to the host:
23 | * URI: `jdbc:hive2://localhost:10000`
24 | * Driver: `org.apache.hive.jdbc.HiveDriver` (org.apache.hive:hive-jdbc:3.1.2)
25 | * User and password: unused.
26 |
27 | ## Version compatibility notes
28 | * Hadoop 3.2.1 and Hive 3.1.2 are incompatible due to Guava version
29 | mismatch (Hadoop: Guava 27.0, Hive: Guava 19.0). Hive fails with
30 | `java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)`
31 | * Spark 2.4.x cannot
32 | [use Hive higher than 1.2.2 as a SparkSQL engine](https://spark.apache.org/docs/2.4.4/sql-data-sources-hive-tables.html)
33 | because of this bug: [Spark need to support reading data from Hive 2.0.0 metastore](https://issues.apache.org/jira/browse/SPARK-13446)
34 | and associated issue [Dealing with TimeVars removed in Hive 2.x](https://issues.apache.org/jira/browse/SPARK-27349).
35 | Trying to make it happen results in this exception:
36 | `java.lang.NoSuchFieldError: HIVE_STATS_JDBC_TIMEOUT`.
37 | Once this is fixed in Spark 3.0, Spark will be able to use Hive as a
38 | backend for SparkSQL. Alternatively you can try to downgrade Hive :)
39 |
40 | ## Maintaining
41 |
42 | * Dockerfile linting: `docker run --rm -i hadolint/hadolint < Dockerfile`
43 | * Use [dive](https://github.com/wagoodman/dive) to trim the fat from the Docker image
44 |
45 | ## TODO
46 | * Upgrade Spark to 3.0
47 | * When upgraded, enable Spark-Hive integration.
--------------------------------------------------------------------------------
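
One quick way to exercise the HiveServer2 endpoint described above is Beeline, which ships with Hive inside the image. The `master` container name here is just an example:

    # From inside the container (or from any machine with a Hive client on the PATH):
    docker exec -it master beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;"
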
/conf/hadoop/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>fs.defaultFS</name>
6 |     <value>hdfs://master:9000</value>
7 |   </property>
8 |   <property>
9 |     <name>fs.default.name</name>
10 |     <value>hdfs://master:9000</value>
11 |   </property>
12 |   <property>
13 |     <name>hadoop.tmp.dir</name>
14 |     <value></value>
15 |   </property>
16 |   <property>
17 |     <name>hadoop.proxyuser.root.groups</name>
18 |     <value>*</value>
19 |   </property>
20 |   <property>
21 |     <name>hadoop.proxyuser.root.hosts</name>
22 |     <value>*</value>
23 |   </property>
24 | </configuration>
--------------------------------------------------------------------------------
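
To confirm that this core-site.xml is actually being picked up, ask Hadoop for the resolved default filesystem and list its root from inside any container of the cluster (a sketch, not part of the repo):

    hdfs getconf -confKey fs.defaultFS   # expected output: hdfs://master:9000
    hdfs dfs -ls /                       # lists the root of the cluster's HDFS
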
/conf/hadoop/hadoop-env.sh:
--------------------------------------------------------------------------------
1 | # The java implementation to use. By default, this environment
2 | # variable is REQUIRED on ALL platforms except OS X!
3 | export JAVA_HOME=/usr/lib/jvm/java-1.8-openjdk
--------------------------------------------------------------------------------
/conf/hadoop/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>dfs.namenode.name.dir</name>
6 |     <value>/dfs/name</value>
7 |   </property>
8 |   <property>
9 |     <name>dfs.datanode.data.dir</name>
10 |     <value>/dfs/data</value>
11 |   </property>
12 |   <property>
13 |     <name>dfs.replication</name>
14 |     <value>2</value>
15 |   </property>
16 | </configuration>
--------------------------------------------------------------------------------
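
With dfs.replication set to 2 and the two datanodes listed in conf/hadoop/workers, a couple of stock HDFS commands verify the setup; `/some/file` below is only a placeholder path:

    hdfs dfsadmin -report | head -n 30   # both datanodes should show up as live
    hdfs dfs -setrep -w 2 /some/file     # set replication factor 2 and wait for it
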
/conf/hadoop/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>mapreduce.framework.name</name>
6 |     <value>yarn</value>
7 |   </property>
8 |   <property>
9 |     <name>yarn.app.mapreduce.am.resource.memory-mb</name>
10 |     <value>512</value>
11 |   </property>
12 |   <property>
13 |     <name>mapreduce.map.resource.memory-mb</name>
14 |     <value>512</value>
15 |   </property>
16 |   <property>
17 |     <name>mapreduce.reduce.resource.memory-mb</name>
18 |     <value>2048</value>
19 |   </property>
20 | </configuration>
--------------------------------------------------------------------------------
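
The memory figures above only matter once a MapReduce job actually runs, so the examples jar bundled with Hadoop makes a convenient smoke test (the wildcard avoids hard-coding the exact jar version):

    yarn jar ${HADOOP_HOME}/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar pi 2 100
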
/conf/hadoop/workers:
--------------------------------------------------------------------------------
1 | worker1
2 | worker2
--------------------------------------------------------------------------------
/conf/hadoop/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>yarn.resourcemanager.hostname</name>
6 |     <value>master</value>
7 |   </property>
8 |   <property>
9 |     <name>yarn.nodemanager.aux-services</name>
10 |     <value>mapreduce_shuffle</value>
11 |   </property>
12 |   <property>
13 |     <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
14 |     <value>org.apache.hadoop.mapred.ShuffleHandler</value>
15 |   </property>
16 |   <property>
17 |     <name>yarn.nodemanager.resource.memory-mb</name>
18 |     <value>8192</value>
19 |   </property>
20 |   <property>
21 |     <name>yarn.scheduler.maximum-allocation-mb</name>
22 |     <value>4096</value>
23 |   </property>
24 |   <property>
25 |     <name>yarn.scheduler.minimum-allocation-mb</name>
26 |     <value>512</value>
27 |   </property>
28 |   <property>
29 |     <name>yarn.scheduler.maximum-allocation-vcores</name>
30 |     <value>4</value>
31 |   </property>
32 |   <property>
33 |     <name>yarn.scheduler.minimum-allocation-vcores</name>
34 |     <value>1</value>
35 |   </property>
36 |   <property>
37 |     <name>yarn.resourcemanager.scheduler.class</name>
38 |     <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
39 |   </property>
40 |   <property>
41 |     <name>yarn.nodemanager.vmem-check-enabled</name>
42 |     <value>false</value>
43 |   </property>
44 |   <property>
45 |     <description>To enable RM web ui2 application.</description>
46 |     <name>yarn.webapp.ui2.enable</name>
47 |     <value>true</value>
48 |   </property>
49 | </configuration>
--------------------------------------------------------------------------------
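
To see how the FairScheduler and the memory/vcore limits above play out at runtime, the stock YARN CLI is enough (run inside the master container); the ResourceManager web UI listens on port 8088, with the ui2 variant enabled by the last property:

    yarn node -list          # nodemanagers and their advertised 8192 MB capacity
    yarn application -list   # running applications and their queues
    yarn top                 # live view (the reason the image installs ncurses)
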
/conf/hive/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>hive.metastore.warehouse.dir</name>
6 |     <value>/usr/hive/warehouse</value>
7 |   </property>
8 |
9 |   <property>
10 |     <name>javax.jdo.option.ConnectionDriverName</name>
11 |     <value>org.postgresql.Driver</value>
12 |   </property>
13 |   <property>
14 |     <name>javax.jdo.option.ConnectionURL</name>
15 |     <value>jdbc:postgresql://hivemetastore:5432/hivemetastoredb</value>
16 |   </property>
17 |   <property>
18 |     <name>javax.jdo.option.ConnectionUserName</name>
19 |     <value>postgres</value>
20 |   </property>
21 |   <property>
22 |     <name>javax.jdo.option.ConnectionPassword</name>
23 |     <value>new_password</value>
24 |   </property>
25 |
26 |   <property>
27 |     <name>hive.execution.engine</name>
28 |     <value>spark</value>
29 |   </property>
30 |   <property>
31 |     <name>spark.yarn.jars</name>
32 |     <value>hdfs://master:9000/spark-jars/*</value>
33 |   </property>
34 |
35 |   <property>
36 |     <name>hive.server2.enable.doAs</name>
37 |     <value>false</value>
38 |     <description>
39 |       Setting this property to true will have HiveServer2 execute
40 |       Hive operations as the user making the calls to it.
41 |     </description>
42 |   </property>
43 | </configuration>
--------------------------------------------------------------------------------
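
Assuming the `hivemetastore` Postgres database referenced above is reachable and the schema has been initialized (the entrypoint runs `schematool -dbType postgres -initSchema`), a short Beeline session shows these settings in action; the `demo` table name is arbitrary and none of the statements launch a Spark job:

    beeline -u "jdbc:hive2://localhost:10000" \
        -e "SET hive.execution.engine; CREATE TABLE IF NOT EXISTS demo (id INT, name STRING); DESCRIBE FORMATTED demo;"
    # DESCRIBE FORMATTED should report a Location under hdfs://master:9000/usr/hive/warehouse
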
/conf/spark/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | # Default system properties included when running spark-submit.
2 | # This is useful for setting default environmental settings.
3 |
4 | # Port settings
5 | spark.ui.port 4040
6 | spark.driver.port 7001
7 | spark.blockManager.port 7002
8 | spark.port.maxRetries 4
9 | spark.history.ui.port 18080
10 |
11 | # Run Spark jobs in YARN
12 | spark.master yarn
13 | spark.yarn.jars hdfs://master:9000/spark-jars/*
14 |
15 | # Spark history server
16 | spark.eventLog.enabled true
17 | spark.eventLog.dir hdfs:///log/spark
18 | spark.history.fs.logDirectory hdfs:///log/spark
19 |
20 | # Spark with Hive
21 | # TODO enable when they remove HIVE_STATS_JDBC_TIMEOUT
22 | # https://github.com/apache/spark/commit/1d95dea30788b9f64c5e304d908b85936aafb238#diff-842e3447fc453de26c706db1cac8f2c4
23 | # https://issues.apache.org/jira/browse/SPARK-13446
24 | #spark.sql.catalogImplementation hive
25 | #spark.sql.hive.metastore.version 2.3.0
26 | #spark.sql.hive.metastore.jars /usr/hive/lib/*:/usr/hadoop/etc/hadoop/*:/usr/hadoop/share/hadoop/common/*:/usr/hadoop/share/hadoop/common/lib/*:/usr/hadoop/share/hadoop/common/sources/*:/usr/hadoop/share/hadoop/hdfs/*:/usr/hadoop/share/hadoop/hdfs/lib/*:/usr/hadoop/share/hadoop/hdfs/sources/*:/usr/hadoop/share/hadoop/mapreduce/*:/usr/hadoop/share/hadoop/mapreduce/lib/*:/usr/hadoop/share/hadoop/mapreduce/sources/*:/usr/hadoop/share/hadoop/yarn/*:/usr/hadoop/share/hadoop/yarn/lib/*:/usr/hadoop/share/hadoop/yarn/sources/*:/usr/hadoop/share/hadoop/yarn/timelineservice/*:/usr/hadoop/share/hadoop/client/*:/usr/hadoop/share/hadoop/tools/lib/*:/usr/hadoop/share/hadoop/tools/sources/*
--------------------------------------------------------------------------------
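
Because spark.master is set to yarn here, a plain spark-submit from inside the image lands on the YARN cluster (provided the nodemanagers are up), and the event log ends up under hdfs:///log/spark for the history server on port 18080. The examples jar survives the Dockerfile clean-up step, which only deletes examples/src; the wildcard avoids hard-coding its exact version:

    spark-submit \
        --class org.apache.spark.examples.SparkPi \
        ${SPARK_HOME}/examples/jars/spark-examples_*.jar 100
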
/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ -n "${HADOOP_DATANODE_UI_PORT}" ]; then
4 | echo "Replacing default datanode UI port 9864 with ${HADOOP_DATANODE_UI_PORT}"
5 | sed -i "$ i\<property><name>dfs.datanode.http.address</name><value>0.0.0.0:${HADOOP_DATANODE_UI_PORT}</value></property>" ${HADOOP_CONF_DIR}/hdfs-site.xml
6 | fi
7 | if [ "${HADOOP_NODE}" == "namenode" ]; then
8 | echo "Starting Hadoop name node..."
9 | yes | hdfs namenode -format
10 | hdfs --daemon start namenode
11 | hdfs --daemon start secondarynamenode
12 | yarn --daemon start resourcemanager
13 | mapred --daemon start historyserver
14 | fi
15 | if [ "${HADOOP_NODE}" == "datanode" ]; then
16 | echo "Starting Hadoop data node..."
17 | hdfs --daemon start datanode
18 | yarn --daemon start nodemanager
19 | fi
20 |
21 | if [ -n "${HIVE_CONFIGURE}" ]; then
22 | echo "Configuring Hive..."
23 | schematool -dbType postgres -initSchema
24 |
25 | # Start metastore service.
26 | hive --service metastore &
27 |
28 | # JDBC Server.
29 | hiveserver2 &
30 | fi
31 |
32 | if [ -z "${SPARK_MASTER_ADDRESS}" ]; then
33 | echo "Starting Spark master node..."
34 | # Create directory for Spark logs
35 | SPARK_LOGS_HDFS_PATH=/log/spark
36 | if ! hadoop fs -test -d "${SPARK_LOGS_HDFS_PATH}"
37 | then
38 | hadoop fs -mkdir -p ${SPARK_LOGS_HDFS_PATH}
39 | hadoop fs -chmod -R 755 ${SPARK_LOGS_HDFS_PATH}
40 | fi
41 |
42 | # Spark on YARN
43 | SPARK_JARS_HDFS_PATH=/spark-jars
44 | if ! hadoop fs -test -d "${SPARK_JARS_HDFS_PATH}"
45 | then
46 | hadoop dfs -copyFromLocal "${SPARK_HOME}/jars" "${SPARK_JARS_HDFS_PATH}"
47 | fi
48 |
49 | "${SPARK_HOME}/sbin/start-master.sh" -h master &
50 | "${SPARK_HOME}/sbin/start-history-server.sh" &
51 | else
52 | echo "Starting Spark slave node..."
53 | "${SPARK_HOME}/sbin/start-slave.sh" "${SPARK_MASTER_ADDRESS}" &
54 | fi
55 |
56 | echo "All initializations finished!"
57 |
58 | # Blocking call to view all logs. This is what keeps the container from exiting right away.
59 | /scripts/parallel_commands.sh "scripts/watchdir ${HADOOP_LOG_DIR}" "scripts/watchdir ${SPARK_LOG_DIR}"
60 |
61 | # Stop all
62 | if [ "${HADOOP_NODE}" == "namenode" ]; then
63 |
64 | hdfs --daemon stop namenode
65 | hdfs --daemon stop secondarynamenode
66 | yarn --daemon stop resourcemanager
67 | mapred --daemon stop historyserver
68 | fi
69 | if [ "${HADOOP_NODE}" == "datanode" ]; then
70 | hdfs --daemon stop datanode
71 | yarn --daemon stop nodemanager
72 | fi
--------------------------------------------------------------------------------
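
Pulling the branches of entrypoint.sh together: HADOOP_NODE selects the namenode or datanode daemons, HIVE_CONFIGURE (any non-empty value) bootstraps the metastore and HiveServer2, SPARK_MASTER_ADDRESS turns the container into a Spark worker instead of a master, and HADOOP_DATANODE_UI_PORT rewrites the datanode UI port in hdfs-site.xml. A hypothetical worker container could therefore be started like this (image tag, container name and network are assumptions; 7077 is Spark's default master port):

    docker run -d --name worker1 --hostname worker1 --network bigdata \
        -e HADOOP_NODE=datanode \
        -e SPARK_MASTER_ADDRESS=spark://master:7077 \
        -e HADOOP_DATANODE_UI_PORT=9864 \
        hadoop-hive-spark
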
/scripts/parallel_commands.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for cmd in "$@"; do {
4 | echo "Process \"$cmd\" started";
5 | $cmd & pid=$!
6 | PID_LIST+=" $pid";
7 | } done
8 |
9 | trap "kill $PID_LIST" SIGINT
10 |
11 | echo "Parallel processes have started";
12 |
13 | wait $PID_LIST
14 |
15 | echo
16 | echo "All processes have completed";
--------------------------------------------------------------------------------
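
The script above simply backgrounds each argument as its own command, collects the PIDs, forwards Ctrl-C to them and waits; entrypoint.sh uses it to tail the Hadoop and Spark log directories at once. A trivial standalone example:

    ./parallel_commands.sh "sleep 2" "echo hello"
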
/scripts/watchdir.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <signal.h>
5 | #include <unistd.h>
6 | #include <limits.h>
7 | #include <dirent.h>
8 | #include <sys/types.h>
9 | #include <sys/stat.h>
10 | #include <sys/select.h>
11 | #include <sys/inotify.h>
12 | #define CHAR_BACK 500
13 |
14 | // * File handler structure
15 | struct file_followed { long last_position; char filename[NAME_MAX]; struct file_followed * next; };
16 | struct file_followed * file_list = NULL;
17 |
18 | // * To quit peacefully
19 | int cycle = 1;
20 | void stopCycle(int u) { cycle = 0; }
21 |
22 | // * Last tailed filename
23 | char last_tailed[NAME_MAX];
24 |
25 | void fileAdd(char * file) {
26 | struct file_followed ** list = &file_list;
27 | struct stat statdesc;
28 |
29 | if(stat(file, &statdesc) || !S_ISREG(statdesc.st_mode)) { return; }
30 | while(*list) { list = &((*list)->next); }
31 | *list = (struct file_followed*)malloc(sizeof(struct file_followed));
32 | (*list)->last_position = -1;
33 | strcpy((*list)->filename, file);
34 | (*list)->next = NULL;
35 | }
36 |
37 | void fileMod(char* fileName, struct file_followed* file_list) {
38 | struct file_followed* item = file_list;
39 | while(item) {
40 | if(strcmp(item->filename, fileName) == 0) {
41 | FILE* fp = fopen(item->filename, "r"); if(!fp) { return; }
42 | fseek(fp, 0, SEEK_END);
43 | long end_position = ftell(fp);
44 | fclose(fp);
45 | if (end_position <= item->last_position) {
46 | printf("\n** %s truncated **\n", fileName);
47 | item->last_position = -1;
48 | }
49 | usleep(100);
50 | return;
51 | }
52 | item = item->next;
53 | }
54 | }
55 |
56 | int fileTail(struct file_followed * item) {
57 | int ret = 0;
58 | FILE * fp = fopen(item->filename, "r"); if(!fp) { return 0; }
59 | fseek(fp, 0, SEEK_END);
60 | long end_position = ftell(fp);
61 |
62 | if( end_position != item->last_position ) {
63 | if(strcmp(item->filename, last_tailed)) { strcpy(last_tailed, item->filename); printf("\n** %s **:\n", item->filename); }
64 |
65 | int start_position = item->last_position == -1 || item->last_position > end_position ? (end_position-CHAR_BACK > 0 ? end_position-CHAR_BACK : 0) : item->last_position;
66 | fseek(fp, start_position, SEEK_SET);
67 |
68 | int len = end_position - start_position;
69 | char * buf = (char*)malloc(len+1);
70 | fread(buf, len, 1, fp);
71 | buf[len] = '\0';
72 | printf("%s%s", len == CHAR_BACK ? "[...]" : "", buf);
73 | free(buf);
74 |
75 | item->last_position = end_position;
76 | ret = 1;
77 | }
78 |
79 | fclose(fp);
80 | return ret;
81 | }
82 |
83 | void fileRem(char * file) {
84 | struct file_followed ** list = &file_list;
85 | while(*list && strcmp((*list)->filename, file)) { list = &((*list)->next); }
86 | if(*list) { struct file_followed * todel = *list; *list = (*list)->next; free(todel); }
87 | }
88 |
89 | int main(int argc, char ** argv) {
90 |
91 | struct dirent **namelist;
92 | struct stat statdesc;
93 | struct timeval tv;
94 | fd_set set;
95 | int fd;
96 | int wd;
97 | int r;
98 |
99 | // * Help
100 | if(argc < 2 || stat(argv[1], &statdesc) || !S_ISDIR(statdesc.st_mode)) { printf("[usage] %s dir-to-monitor\n", argv[0]); exit(EXIT_FAILURE); }
101 |
102 | // * Init
103 | chdir(argv[1]);
104 | memset(last_tailed, 0, sizeof(last_tailed));
105 | signal(SIGINT, stopCycle);
106 | signal(SIGTERM, stopCycle);
107 |
108 | // * Inotify
109 | if( (fd = inotify_init()) < 0) { perror("inotify_init"); }
110 | if( (wd = inotify_add_watch( fd, ".", IN_CREATE | IN_MODIFY | IN_DELETE )) < 0) { perror("inotify_add_watch"); }
111 |
112 | // * File add recursively on dirscan
113 | if( (r = scandir(".", &namelist, 0, alphasort)) < 0) { perror("scandir"); }
114 | while (r--) { fileAdd(namelist[r]->d_name); free(namelist[r]); }
115 | free(namelist);
116 |
117 | // * Neverending cycle
118 | while(cycle) {
119 | // * Select on inotify
120 | FD_ZERO(&set);
121 | FD_SET(fd, &set);
122 | tv.tv_sec = 0;
123 | tv.tv_usec = 1000;
124 | if( (r = select(fd+1, &set, NULL, NULL, &tv)) == -1) { perror("select"); }
125 |
126 | // * New add or del on inotify
127 | if(r) {
128 | struct inotify_event * event;
129 | char buf[1024];
130 | if(read(fd, buf, 1024) <= 0) { perror("read"); }
131 | event = (struct inotify_event *) buf;
132 | if(event->mask & IN_MODIFY) { fileMod(event->name, file_list);}
133 | else if(event->mask & IN_CREATE) { fileAdd(event->name); }
134 | else if(event->mask & IN_DELETE) { fileRem(event->name); }
135 | }
136 |
137 | // * Check for new tails
138 | struct file_followed * list = file_list;
139 | int tailers = 0;
140 | while(list) { tailers += fileTail(list); list = list->next; }
141 | if(!tailers) { usleep(500000); }
142 | }
143 |
144 | // * Stop inotify
145 | inotify_rm_watch( fd, wd );
146 | close(fd);
147 |
148 | return EXIT_SUCCESS;
149 | }
--------------------------------------------------------------------------------