├── .gitignore
├── Dockerfile
├── README.md
├── config
│   ├── core-site.xml
│   ├── hadoop-env.sh
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   ├── ssh_config
│   ├── workers
│   └── yarn-site.xml
├── docker-compose.yml
├── share
│   ├── bigdata-learning-0.0.1.jar
│   ├── my_script.py
│   └── words.txt
└── start-hadoop.sh
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM docker.io/bitnami/spark:3
LABEL maintainer="s1mplecc"
LABEL description="Docker image with Spark (3.1.2) and Hadoop (3.2.0), based on bitnami/spark:3. \
For more information, please visit https://github.com/s1mplecc/spark-hadoop-docker."

USER root

# Hadoop layout and log locations; put the Hadoop sbin and bin directories on the PATH.
ENV HADOOP_HOME="/opt/hadoop"
ENV HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
ENV HADOOP_LOG_DIR="/var/log/hadoop"
ENV PATH="$HADOOP_HOME/sbin:$HADOOP_HOME/bin:$PATH"

WORKDIR /opt

# sshd is required so the Hadoop start scripts can reach the master and worker nodes.
RUN apt-get update && apt-get install -y openssh-server

RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -P '' && \
    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys

# Download and unpack Hadoop 3.2.0 into /opt/hadoop.
RUN curl -OL https://archive.apache.org/dist/hadoop/common/hadoop-3.2.0/hadoop-3.2.0.tar.gz
RUN tar -xzvf hadoop-3.2.0.tar.gz && \
    mv hadoop-3.2.0 hadoop && \
    rm -rf hadoop-3.2.0.tar.gz && \
    mkdir /var/log/hadoop

RUN mkdir -p /root/hdfs/namenode && \
    mkdir -p /root/hdfs/datanode

COPY config/* /tmp/

RUN mv /tmp/ssh_config /root/.ssh/config && \
    mv /tmp/hadoop-env.sh $HADOOP_CONF_DIR/hadoop-env.sh && \
    mv /tmp/hdfs-site.xml $HADOOP_CONF_DIR/hdfs-site.xml && \
    mv /tmp/core-site.xml $HADOOP_CONF_DIR/core-site.xml && \
    mv /tmp/mapred-site.xml $HADOOP_CONF_DIR/mapred-site.xml && \
    mv /tmp/yarn-site.xml $HADOOP_CONF_DIR/yarn-site.xml && \
    mv /tmp/workers $HADOOP_CONF_DIR/workers

COPY start-hadoop.sh /opt/start-hadoop.sh

RUN chmod +x /opt/start-hadoop.sh && \
    chmod +x $HADOOP_HOME/sbin/start-dfs.sh && \
    chmod +x $HADOOP_HOME/sbin/start-yarn.sh

# Format the NameNode at build time, and have the Bitnami entrypoint also start sshd.
RUN hdfs namenode -format
RUN sed -i "1 a /etc/init.d/ssh start > /dev/null &" /opt/bitnami/scripts/spark/entrypoint.sh

ENTRYPOINT [ "/opt/bitnami/scripts/spark/entrypoint.sh" ]
CMD [ "/opt/bitnami/scripts/spark/run.sh" ]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Introduction

To spare you the tedious environment setup, this project provides an out-of-the-box Spark + Hadoop deployment. It builds a Spark Docker cluster on the mature [BitNami](https://github.com/bitnami/bitnami-docker-spark) image and extends that image with a matching Hadoop installation. For the detailed build walkthrough (in Chinese), see: [使用 Docker 快速部署 Spark + Hadoop 大数据集群](https://s1mple.cc/2021/10/12/%E4%BD%BF%E7%94%A8-Docker-%E5%BF%AB%E9%80%9F%E9%83%A8%E7%BD%B2-Spark-Hadoop-%E5%A4%A7%E6%95%B0%E6%8D%AE%E9%9B%86%E7%BE%A4/).

- Spark version: 3.1.2
- Hadoop version: 3.2.0

## How to Run

Pull the image:

```sh
docker pull s1mplecc/spark-hadoop:3
```

Copy docker-compose.yml from this project to your machine and start the cluster:

```sh
docker-compose up -d
```

Attach to the master container (for example with `docker-compose exec spark bash`) and run the Hadoop startup script:

```sh
$ ./start-hadoop.sh
```

## Running the MapReduce Example

```sh
$ hdfs dfs -put share/words.txt /
$ hadoop jar share/bigdata-learning-0.0.1.jar example.mapreduce.WordCount /words.txt /output
```

The word counts are written to the /output directory on HDFS and can be inspected with `hdfs dfs -cat /output/*`.

## Running the Spark Example

TODO
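Until this section is filled in, here is a minimal sketch of submitting the bundled `share/my_script.py`. It assumes the `/opt/share` volume mount and the `master` hostname defined in docker-compose.yml, and is not part of the original instructions:

```sh
# Submit to the standalone Spark master
spark-submit --master spark://master:7077 /opt/share/my_script.py

# Or submit to YARN; HADOOP_CONF_DIR is already set inside the image
spark-submit --master yarn --deploy-mode client /opt/share/my_script.py
```

A YARN submission should then show up in the ResourceManager UI listed in the table below.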
## Web UI Overview

| Web UI | Default URL | Notes |
|:---------------------------:|:----------------------:|:------------------------------------:|
| \* **Spark Application** | http://localhost:4040 | Started by the SparkContext; shows Spark applications running in local or standalone mode |
| Spark Standalone Master | http://localhost:8080 | Shows cluster status and Spark applications submitted in standalone mode |
| \* **HDFS NameNode** | http://localhost:9870 | Browse the HDFS file system |
| \* **YARN ResourceManager** | http://localhost:8088 | Shows Spark applications submitted to YARN |
| YARN NodeManager | http://localhost:8042 | Shows worker node configuration and runtime logs |
| MapReduce Job History | http://localhost:19888 | MapReduce job history |
--------------------------------------------------------------------------------
/config/core-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/config/hadoop-env.sh:
--------------------------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hadoop-specific environment variables here.

##
## THIS FILE ACTS AS THE MASTER FILE FOR ALL HADOOP PROJECTS.
## SETTINGS HERE WILL BE READ BY ALL HADOOP COMMANDS.  THEREFORE,
## ONE CAN USE THIS FILE TO SET YARN, HDFS, AND MAPREDUCE
## CONFIGURATION OPTIONS INSTEAD OF xxx-env.sh.
##
## Precedence rules:
##
## {yarn-env.sh|hdfs-env.sh} > hadoop-env.sh > hard-coded defaults
##
## {YARN_xyz|HDFS_xyz} > HADOOP_xyz > hard-coded defaults
##

# Many of the options here are built from the perspective that users
# may want to provide OVERWRITING values on the command line.
# For example:
#
#  JAVA_HOME=/usr/java/testing hdfs dfs -ls
#
# Therefore, the vast majority (BUT NOT ALL!) of these defaults
# are configured for substitution and not append.  If append
# is preferable, modify this file accordingly.

###
# Generic settings for HADOOP
###

# Technically, the only required environment variable is JAVA_HOME.
# All others are optional.  However, the defaults are probably not
# preferred.  Many sites configure these options outside of Hadoop,
# such as in /etc/profile.d

# The java implementation to use. By default, this environment
# variable is REQUIRED on ALL platforms except OS X!
# export JAVA_HOME=

# Location of Hadoop.
By default, Hadoop will attempt to determine 57 | # this location based upon its execution path. 58 | # export HADOOP_HOME= 59 | 60 | # Location of Hadoop's configuration information. i.e., where this 61 | # file is living. If this is not defined, Hadoop will attempt to 62 | # locate it based upon its execution path. 63 | # 64 | # NOTE: It is recommend that this variable not be set here but in 65 | # /etc/profile.d or equivalent. Some options (such as 66 | # --config) may react strangely otherwise. 67 | # 68 | # export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop 69 | 70 | # The maximum amount of heap to use (Java -Xmx). If no unit 71 | # is provided, it will be converted to MB. Daemons will 72 | # prefer any Xmx setting in their respective _OPT variable. 73 | # There is no default; the JVM will autoscale based upon machine 74 | # memory size. 75 | # export HADOOP_HEAPSIZE_MAX= 76 | 77 | # The minimum amount of heap to use (Java -Xms). If no unit 78 | # is provided, it will be converted to MB. Daemons will 79 | # prefer any Xms setting in their respective _OPT variable. 80 | # There is no default; the JVM will autoscale based upon machine 81 | # memory size. 82 | # export HADOOP_HEAPSIZE_MIN= 83 | 84 | # Enable extra debugging of Hadoop's JAAS binding, used to set up 85 | # Kerberos security. 86 | # export HADOOP_JAAS_DEBUG=true 87 | 88 | # Extra Java runtime options for all Hadoop commands. We don't support 89 | # IPv6 yet/still, so by default the preference is set to IPv4. 90 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true" 91 | # For Kerberos debugging, an extended option set logs more information 92 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug" 93 | 94 | # Some parts of the shell code may do special things dependent upon 95 | # the operating system. We have to set this here. See the next 96 | # section as to why.... 97 | # export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)} 98 | 99 | 100 | # Under certain conditions, Java on OS X will throw SCDynamicStore errors 101 | # in the system logs. 102 | # See HADOOP-8719 for more information. If one needs Kerberos 103 | # support on OS X, one will want to change/remove this extra bit. 104 | # case ${HADOOP_OS_TYPE} in 105 | # Darwin*) 106 | # export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.realm= " 107 | # export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.kdc= " 108 | # export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.conf= " 109 | # ;; 110 | # esac 111 | 112 | # Extra Java runtime options for some Hadoop commands 113 | # and clients (i.e., hdfs dfs -blah). These get appended to HADOOP_OPTS for 114 | # such commands. In most cases, # this should be left empty and 115 | # let users supply it on the command line. 116 | # export HADOOP_CLIENT_OPTS="" 117 | 118 | # 119 | # A note about classpaths. 120 | # 121 | # By default, Apache Hadoop overrides Java's CLASSPATH 122 | # environment variable. It is configured such 123 | # that it starts out blank with new entries added after passing 124 | # a series of checks (file/dir exists, not already listed aka 125 | # de-deduplication). During de-deduplication, wildcards and/or 126 | # directories are *NOT* expanded to keep it simple. Therefore, 127 | # if the computed classpath has two specific mentions of 128 | # awesome-methods-1.0.jar, only the first one added will be seen. 
129 | # If two directories are in the classpath that both contain 130 | # awesome-methods-1.0.jar, then Java will pick up both versions. 131 | 132 | # An additional, custom CLASSPATH. Site-wide configs should be 133 | # handled via the shellprofile functionality, utilizing the 134 | # hadoop_add_classpath function for greater control and much 135 | # harder for apps/end-users to accidentally override. 136 | # Similarly, end users should utilize ${HOME}/.hadooprc . 137 | # This variable should ideally only be used as a short-cut, 138 | # interactive way for temporary additions on the command line. 139 | # export HADOOP_CLASSPATH="/some/cool/path/on/your/machine" 140 | 141 | # Should HADOOP_CLASSPATH be first in the official CLASSPATH? 142 | # export HADOOP_USER_CLASSPATH_FIRST="yes" 143 | 144 | # If HADOOP_USE_CLIENT_CLASSLOADER is set, the classpath along 145 | # with the main jar are handled by a separate isolated 146 | # client classloader when 'hadoop jar', 'yarn jar', or 'mapred job' 147 | # is utilized. If it is set, HADOOP_CLASSPATH and 148 | # HADOOP_USER_CLASSPATH_FIRST are ignored. 149 | # export HADOOP_USE_CLIENT_CLASSLOADER=true 150 | 151 | # HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES overrides the default definition of 152 | # system classes for the client classloader when HADOOP_USE_CLIENT_CLASSLOADER 153 | # is enabled. Names ending in '.' (period) are treated as package names, and 154 | # names starting with a '-' are treated as negative matches. For example, 155 | # export HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES="-org.apache.hadoop.UserClass,java.,javax.,org.apache.hadoop." 156 | 157 | # Enable optional, bundled Hadoop features 158 | # This is a comma delimited list. It may NOT be overridden via .hadooprc 159 | # Entries may be added/removed as needed. 160 | # export HADOOP_OPTIONAL_TOOLS="hadoop-aliyun,hadoop-aws,hadoop-azure-datalake,hadoop-azure,hadoop-kafka,hadoop-openstack" 161 | 162 | ### 163 | # Options for remote shell connectivity 164 | ### 165 | 166 | # There are some optional components of hadoop that allow for 167 | # command and control of remote hosts. For example, 168 | # start-dfs.sh will attempt to bring up all NNs, DNS, etc. 169 | 170 | # Options to pass to SSH when one of the "log into a host and 171 | # start/stop daemons" scripts is executed 172 | # export HADOOP_SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10s" 173 | 174 | # The built-in ssh handler will limit itself to 10 simultaneous connections. 175 | # For pdsh users, this sets the fanout size ( -f ) 176 | # Change this to increase/decrease as necessary. 177 | # export HADOOP_SSH_PARALLEL=10 178 | 179 | # Filename which contains all of the hosts for any remote execution 180 | # helper scripts # such as workers.sh, start-dfs.sh, etc. 181 | # export HADOOP_WORKERS="${HADOOP_CONF_DIR}/workers" 182 | 183 | ### 184 | # Options for all daemons 185 | ### 186 | # 187 | 188 | # 189 | # Many options may also be specified as Java properties. It is 190 | # very common, and in many cases, desirable, to hard-set these 191 | # in daemon _OPTS variables. Where applicable, the appropriate 192 | # Java property is also identified. Note that many are re-used 193 | # or set differently in certain contexts (e.g., secure vs 194 | # non-secure) 195 | # 196 | 197 | # Where (primarily) daemon log files are stored. 198 | # ${HADOOP_HOME}/logs by default. 
199 | # Java property: hadoop.log.dir 200 | # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs 201 | 202 | # A string representing this instance of hadoop. $USER by default. 203 | # This is used in writing log and pid files, so keep that in mind! 204 | # Java property: hadoop.id.str 205 | # export HADOOP_IDENT_STRING=$USER 206 | 207 | # How many seconds to pause after stopping a daemon 208 | # export HADOOP_STOP_TIMEOUT=5 209 | 210 | # Where pid files are stored. /tmp by default. 211 | # export HADOOP_PID_DIR=/tmp 212 | 213 | # Default log4j setting for interactive commands 214 | # Java property: hadoop.root.logger 215 | # export HADOOP_ROOT_LOGGER=INFO,console 216 | 217 | # Default log4j setting for daemons spawned explicitly by 218 | # --daemon option of hadoop, hdfs, mapred and yarn command. 219 | # Java property: hadoop.root.logger 220 | # export HADOOP_DAEMON_ROOT_LOGGER=INFO,RFA 221 | 222 | # Default log level and output location for security-related messages. 223 | # You will almost certainly want to change this on a per-daemon basis via 224 | # the Java property (i.e., -Dhadoop.security.logger=foo). (Note that the 225 | # defaults for the NN and 2NN override this by default.) 226 | # Java property: hadoop.security.logger 227 | # export HADOOP_SECURITY_LOGGER=INFO,NullAppender 228 | 229 | # Default process priority level 230 | # Note that sub-processes will also run at this level! 231 | # export HADOOP_NICENESS=0 232 | 233 | # Default name for the service level authorization file 234 | # Java property: hadoop.policy.file 235 | # export HADOOP_POLICYFILE="hadoop-policy.xml" 236 | 237 | # 238 | # NOTE: this is not used by default! <----- 239 | # You can define variables right here and then re-use them later on. 240 | # For example, it is common to use the same garbage collection settings 241 | # for all the daemons. So one could define: 242 | # 243 | # export HADOOP_GC_SETTINGS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 244 | # 245 | # .. and then use it as per the b option under the namenode. 246 | 247 | ### 248 | # Secure/privileged execution 249 | ### 250 | 251 | # 252 | # Out of the box, Hadoop uses jsvc from Apache Commons to launch daemons 253 | # on privileged ports. This functionality can be replaced by providing 254 | # custom functions. See hadoop-functions.sh for more information. 255 | # 256 | 257 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 258 | # that bind to privileged ports to provide authentication of data transfer 259 | # protocol. Jsvc is not required if SASL is configured for authentication of 260 | # data transfer protocol using non-privileged ports. 261 | # export JSVC_HOME=/usr/bin 262 | 263 | # 264 | # This directory contains pids for secure and privileged processes. 265 | #export HADOOP_SECURE_PID_DIR=${HADOOP_PID_DIR} 266 | 267 | # 268 | # This directory contains the logs for secure and privileged processes. 269 | # Java property: hadoop.log.dir 270 | # export HADOOP_SECURE_LOG=${HADOOP_LOG_DIR} 271 | 272 | # 273 | # When running a secure daemon, the default value of HADOOP_IDENT_STRING 274 | # ends up being a bit bogus. Therefore, by default, the code will 275 | # replace HADOOP_IDENT_STRING with HADOOP_xx_SECURE_USER. If one wants 276 | # to keep HADOOP_IDENT_STRING untouched, then uncomment this line. 
277 | # export HADOOP_SECURE_IDENT_PRESERVE="true" 278 | 279 | ### 280 | # NameNode specific parameters 281 | ### 282 | 283 | # Default log level and output location for file system related change 284 | # messages. For non-namenode daemons, the Java property must be set in 285 | # the appropriate _OPTS if one wants something other than INFO,NullAppender 286 | # Java property: hdfs.audit.logger 287 | # export HDFS_AUDIT_LOGGER=INFO,NullAppender 288 | 289 | # Specify the JVM options to be used when starting the NameNode. 290 | # These options will be appended to the options specified as HADOOP_OPTS 291 | # and therefore may override any similar flags set in HADOOP_OPTS 292 | # 293 | # a) Set JMX options 294 | # export HDFS_NAMENODE_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=1026" 295 | # 296 | # b) Set garbage collection logs 297 | # export HDFS_NAMENODE_OPTS="${HADOOP_GC_SETTINGS} -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 298 | # 299 | # c) ... or set them directly 300 | # export HDFS_NAMENODE_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 301 | 302 | # this is the default: 303 | # export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 304 | 305 | ### 306 | # SecondaryNameNode specific parameters 307 | ### 308 | # Specify the JVM options to be used when starting the SecondaryNameNode. 309 | # These options will be appended to the options specified as HADOOP_OPTS 310 | # and therefore may override any similar flags set in HADOOP_OPTS 311 | # 312 | # This is the default: 313 | # export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 314 | 315 | ### 316 | # DataNode specific parameters 317 | ### 318 | # Specify the JVM options to be used when starting the DataNode. 319 | # These options will be appended to the options specified as HADOOP_OPTS 320 | # and therefore may override any similar flags set in HADOOP_OPTS 321 | # 322 | # This is the default: 323 | # export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS" 324 | 325 | # On secure datanodes, user to run the datanode as after dropping privileges. 326 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 327 | # to provide authentication of data transfer protocol. This **MUST NOT** be 328 | # defined if SASL is configured for authentication of data transfer protocol 329 | # using non-privileged ports. 330 | # This will replace the hadoop.id.str Java property in secure mode. 331 | # export HDFS_DATANODE_SECURE_USER=hdfs 332 | 333 | # Supplemental options for secure datanodes 334 | # By default, Hadoop uses jsvc which needs to know to launch a 335 | # server jvm. 336 | # export HDFS_DATANODE_SECURE_EXTRA_OPTS="-jvm server" 337 | 338 | ### 339 | # NFS3 Gateway specific parameters 340 | ### 341 | # Specify the JVM options to be used when starting the NFS3 Gateway. 342 | # These options will be appended to the options specified as HADOOP_OPTS 343 | # and therefore may override any similar flags set in HADOOP_OPTS 344 | # 345 | # export HDFS_NFS3_OPTS="" 346 | 347 | # Specify the JVM options to be used when starting the Hadoop portmapper. 
348 | # These options will be appended to the options specified as HADOOP_OPTS 349 | # and therefore may override any similar flags set in HADOOP_OPTS 350 | # 351 | # export HDFS_PORTMAP_OPTS="-Xmx512m" 352 | 353 | # Supplemental options for priviliged gateways 354 | # By default, Hadoop uses jsvc which needs to know to launch a 355 | # server jvm. 356 | # export HDFS_NFS3_SECURE_EXTRA_OPTS="-jvm server" 357 | 358 | # On privileged gateways, user to run the gateway as after dropping privileges 359 | # This will replace the hadoop.id.str Java property in secure mode. 360 | # export HDFS_NFS3_SECURE_USER=nfsserver 361 | 362 | ### 363 | # ZKFailoverController specific parameters 364 | ### 365 | # Specify the JVM options to be used when starting the ZKFailoverController. 366 | # These options will be appended to the options specified as HADOOP_OPTS 367 | # and therefore may override any similar flags set in HADOOP_OPTS 368 | # 369 | # export HDFS_ZKFC_OPTS="" 370 | 371 | ### 372 | # QuorumJournalNode specific parameters 373 | ### 374 | # Specify the JVM options to be used when starting the QuorumJournalNode. 375 | # These options will be appended to the options specified as HADOOP_OPTS 376 | # and therefore may override any similar flags set in HADOOP_OPTS 377 | # 378 | # export HDFS_JOURNALNODE_OPTS="" 379 | 380 | ### 381 | # HDFS Balancer specific parameters 382 | ### 383 | # Specify the JVM options to be used when starting the HDFS Balancer. 384 | # These options will be appended to the options specified as HADOOP_OPTS 385 | # and therefore may override any similar flags set in HADOOP_OPTS 386 | # 387 | # export HDFS_BALANCER_OPTS="" 388 | 389 | ### 390 | # HDFS Mover specific parameters 391 | ### 392 | # Specify the JVM options to be used when starting the HDFS Mover. 393 | # These options will be appended to the options specified as HADOOP_OPTS 394 | # and therefore may override any similar flags set in HADOOP_OPTS 395 | # 396 | # export HDFS_MOVER_OPTS="" 397 | 398 | ### 399 | # Router-based HDFS Federation specific parameters 400 | # Specify the JVM options to be used when starting the RBF Routers. 401 | # These options will be appended to the options specified as HADOOP_OPTS 402 | # and therefore may override any similar flags set in HADOOP_OPTS 403 | # 404 | # export HDFS_DFSROUTER_OPTS="" 405 | 406 | ### 407 | # HDFS StorageContainerManager specific parameters 408 | ### 409 | # Specify the JVM options to be used when starting the HDFS Storage Container Manager. 410 | # These options will be appended to the options specified as HADOOP_OPTS 411 | # and therefore may override any similar flags set in HADOOP_OPTS 412 | # 413 | # export HDFS_STORAGECONTAINERMANAGER_OPTS="" 414 | 415 | ### 416 | # Advanced Users Only! 417 | ### 418 | 419 | # 420 | # When building Hadoop, one can add the class paths to the commands 421 | # via this special env var: 422 | # export HADOOP_ENABLE_BUILD_PATHS="true" 423 | 424 | # 425 | # To prevent accidents, shell commands be (superficially) locked 426 | # to only allow certain users to execute certain subcommands. 427 | # It uses the format of (command)_(subcommand)_USER. 
#
# For example, to limit who can execute the namenode command,
# export HDFS_NAMENODE_USER=hdfs

# Site-specific settings for this image (Bitnami Spark base + Hadoop 3.2.0).
export JAVA_HOME=/opt/bitnami/java
export HADOOP_HOME=/opt/hadoop
export HADOOP_MAPRED_HOME=/opt/hadoop
export HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

# Run the HDFS and YARN daemons as root inside the containers.
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_SECONDARYNAMENODE_USER="root"
export YARN_RESOURCEMANAGER_USER="root"
export YARN_NODEMANAGER_USER="root"
--------------------------------------------------------------------------------
/config/hdfs-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///root/hdfs/namenode</value>
        <description>NameNode directory for namespace and transaction logs storage.</description>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///root/hdfs/datanode</value>
        <description>DataNode directory</description>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/config/mapred-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.application.classpath</name>
        <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/config/ssh_config:
--------------------------------------------------------------------------------
# Disable host-key prompts so the Hadoop start scripts can ssh non-interactively.
Host localhost
    StrictHostKeyChecking no

Host 0.0.0.0
    StrictHostKeyChecking no

Host hadoop-*
    StrictHostKeyChecking no
    UserKnownHostsFile=/dev/null
--------------------------------------------------------------------------------
/config/workers:
--------------------------------------------------------------------------------
worker1
worker2
--------------------------------------------------------------------------------
/config/yarn-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
</configuration>
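Once the containers are up and start-hadoop.sh has been run, the configuration above can be sanity-checked from inside the master container. A minimal sketch using standard Hadoop CLI commands (expected values follow from the files above; actual output will vary):

```sh
hdfs getconf -confKey fs.defaultFS   # expect hdfs://master:9000 (core-site.xml)
hdfs dfsadmin -report                # expect two live DataNodes (dfs.replication = 2)
yarn node -list                      # expect NodeManagers on worker1 and worker2
```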
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2'

services:
  spark:
    image: s1mplecc/spark-hadoop:3
    hostname: master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
    ports:
      - '8080:8080'
      - '4040:4040'
      - '8088:8088'
      - '8042:8042'
      - '9870:9870'
      - '19888:19888'
  spark-worker-1:
    image: s1mplecc/spark-hadoop:3
    hostname: worker1
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
    ports:
      - '8081:8081'
  spark-worker-2:
    image: s1mplecc/spark-hadoop:3
    hostname: worker2
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
    ports:
      - '8082:8081'
--------------------------------------------------------------------------------
/share/bigdata-learning-0.0.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/s1mplecc/spark-hadoop-docker/2b6b5131e3ff7076c4b31f54028c883e0ce8d54b/share/bigdata-learning-0.0.1.jar
--------------------------------------------------------------------------------
/share/my_script.py:
--------------------------------------------------------------------------------
from pyspark import SparkConf, SparkContext

# Build a Spark context for this small example application.
conf = SparkConf().setAppName('My App')
sc = SparkContext(conf=conf)

# Count how many numbers in [1, 100 million) are greater than 100.
count = sc.range(1, 1000 * 1000 * 100).filter(lambda x: x > 100).count()
print('count: ', count)
--------------------------------------------------------------------------------
/share/words.txt:
--------------------------------------------------------------------------------
Apache Spark is a unified analytics engine for large-scale data processing. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Structured Streaming for incremental computation and stream processing.
--------------------------------------------------------------------------------
/start-hadoop.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Start the HDFS and YARN daemons across the nodes listed in config/workers.
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
--------------------------------------------------------------------------------
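Note: docker-compose.yml publishes port 19888 and the README lists the MapReduce Job History UI, but start-hadoop.sh only launches HDFS and YARN. If that UI is wanted, the JobHistory daemon has to be started as well; a minimal sketch (standard Hadoop 3.x command, run inside the master container, not part of the original script):

```sh
# Start the MapReduce JobHistory server so http://localhost:19888 becomes reachable
$HADOOP_HOME/bin/mapred --daemon start historyserver
```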