├── .gitignore
├── Dockerfile
├── README.md
├── config
│   ├── core-site.xml
│   ├── hadoop-env.sh
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   ├── ssh_config
│   ├── workers
│   └── yarn-site.xml
├── docker-compose.yml
├── share
│   ├── bigdata-learning-0.0.1.jar
│   ├── my_script.py
│   └── words.txt
└── start-hadoop.sh
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM docker.io/bitnami/spark:3
LABEL maintainer="s1mplecc"
LABEL description="Docker image with Spark (3.1.2) and Hadoop (3.2.0), based on bitnami/spark:3. \
For more information, please visit https://github.com/s1mplecc/spark-hadoop-docker."

USER root

# Hadoop layout and log locations; put the Hadoop sbin and bin directories on the PATH.
ENV HADOOP_HOME="/opt/hadoop"
ENV HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
ENV HADOOP_LOG_DIR="/var/log/hadoop"
ENV PATH="$HADOOP_HOME/sbin:$HADOOP_HOME/bin:$PATH"

WORKDIR /opt

# sshd is required so the Hadoop start scripts can reach the master and worker nodes.
RUN apt-get update && apt-get install -y openssh-server

RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -P '' && \
    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys

# Download and unpack Hadoop 3.2.0 into /opt/hadoop.
RUN curl -OL https://archive.apache.org/dist/hadoop/common/hadoop-3.2.0/hadoop-3.2.0.tar.gz
RUN tar -xzvf hadoop-3.2.0.tar.gz && \
    mv hadoop-3.2.0 hadoop && \
    rm -rf hadoop-3.2.0.tar.gz && \
    mkdir /var/log/hadoop

RUN mkdir -p /root/hdfs/namenode && \
    mkdir -p /root/hdfs/datanode

COPY config/* /tmp/

RUN mv /tmp/ssh_config /root/.ssh/config && \
    mv /tmp/hadoop-env.sh $HADOOP_CONF_DIR/hadoop-env.sh && \
    mv /tmp/hdfs-site.xml $HADOOP_CONF_DIR/hdfs-site.xml && \
    mv /tmp/core-site.xml $HADOOP_CONF_DIR/core-site.xml && \
    mv /tmp/mapred-site.xml $HADOOP_CONF_DIR/mapred-site.xml && \
    mv /tmp/yarn-site.xml $HADOOP_CONF_DIR/yarn-site.xml && \
    mv /tmp/workers $HADOOP_CONF_DIR/workers

COPY start-hadoop.sh /opt/start-hadoop.sh

RUN chmod +x /opt/start-hadoop.sh && \
    chmod +x $HADOOP_HOME/sbin/start-dfs.sh && \
    chmod +x $HADOOP_HOME/sbin/start-yarn.sh

# Format the NameNode at build time, and have the Bitnami entrypoint also start sshd.
RUN hdfs namenode -format
RUN sed -i "1 a /etc/init.d/ssh start > /dev/null &" /opt/bitnami/scripts/spark/entrypoint.sh

ENTRYPOINT [ "/opt/bitnami/scripts/spark/entrypoint.sh" ]
CMD [ "/opt/bitnami/scripts/spark/run.sh" ]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Introduction

To spare you the tedious environment setup, this project provides an out-of-the-box Spark + Hadoop deployment. It builds a Spark Docker cluster on the mature [BitNami](https://github.com/bitnami/bitnami-docker-spark) image and extends that image with a matching Hadoop installation. For the detailed build walkthrough (in Chinese), see: [使用 Docker 快速部署 Spark + Hadoop 大数据集群](https://s1mple.cc/2021/10/12/%E4%BD%BF%E7%94%A8-Docker-%E5%BF%AB%E9%80%9F%E9%83%A8%E7%BD%B2-Spark-Hadoop-%E5%A4%A7%E6%95%B0%E6%8D%AE%E9%9B%86%E7%BE%A4/).

- Spark version: 3.1.2
- Hadoop version: 3.2.0

## How to Run

Pull the image:

```sh
docker pull s1mplecc/spark-hadoop:3
```

Copy docker-compose.yml from this project to your machine and start the cluster:

```sh
docker-compose up -d
```

Attach to the master container (for example with `docker-compose exec spark bash`) and run the Hadoop startup script:

```sh
$ ./start-hadoop.sh
```

## Running the MapReduce Example

```sh
$ hdfs dfs -put share/words.txt /
$ hadoop jar share/bigdata-learning-0.0.1.jar example.mapreduce.WordCount /words.txt /output
```

The word counts are written to the /output directory on HDFS and can be inspected with `hdfs dfs -cat /output/*`.

## Running the Spark Example

TODO
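Until this section is filled in, here is a minimal sketch of submitting the bundled `share/my_script.py`. It assumes the `/opt/share` volume mount and the `master` hostname defined in docker-compose.yml, and is not part of the original instructions:

```sh
# Submit to the standalone Spark master
spark-submit --master spark://master:7077 /opt/share/my_script.py

# Or submit to YARN; HADOOP_CONF_DIR is already set inside the image
spark-submit --master yarn --deploy-mode client /opt/share/my_script.py
```

A YARN submission should then show up in the ResourceManager UI listed in the table below.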
## Web UI Overview

| Web UI | Default URL | Notes |
|:---------------------------:|:----------------------:|:------------------------------------:|
| \* **Spark Application** | http://localhost:4040 | Started by the SparkContext; shows Spark applications running in local or standalone mode |
| Spark Standalone Master | http://localhost:8080 | Shows cluster status and Spark applications submitted in standalone mode |
| \* **HDFS NameNode** | http://localhost:9870 | Browse the HDFS file system |
| \* **YARN ResourceManager** | http://localhost:8088 | Shows Spark applications submitted to YARN |
| YARN NodeManager | http://localhost:8042 | Shows worker node configuration and runtime logs |
| MapReduce Job History | http://localhost:19888 | MapReduce job history |
--------------------------------------------------------------------------------
/config/core-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/config/hadoop-env.sh:
--------------------------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hadoop-specific environment variables here.

##
## THIS FILE ACTS AS THE MASTER FILE FOR ALL HADOOP PROJECTS.
## SETTINGS HERE WILL BE READ BY ALL HADOOP COMMANDS.  THEREFORE,
## ONE CAN USE THIS FILE TO SET YARN, HDFS, AND MAPREDUCE
## CONFIGURATION OPTIONS INSTEAD OF xxx-env.sh.
##
## Precedence rules:
##
## {yarn-env.sh|hdfs-env.sh} > hadoop-env.sh > hard-coded defaults
##
## {YARN_xyz|HDFS_xyz} > HADOOP_xyz > hard-coded defaults
##

# Many of the options here are built from the perspective that users
# may want to provide OVERWRITING values on the command line.
# For example:
#
#  JAVA_HOME=/usr/java/testing hdfs dfs -ls
#
# Therefore, the vast majority (BUT NOT ALL!) of these defaults
# are configured for substitution and not append.  If append
# is preferable, modify this file accordingly.

###
# Generic settings for HADOOP
###

# Technically, the only required environment variable is JAVA_HOME.
# All others are optional.  However, the defaults are probably not
# preferred.  Many sites configure these options outside of Hadoop,
# such as in /etc/profile.d

# The java implementation to use. By default, this environment
# variable is REQUIRED on ALL platforms except OS X!
# export JAVA_HOME=

# Location of Hadoop.
By default, Hadoop will attempt to determine 57 | # this location based upon its execution path. 58 | # export HADOOP_HOME= 59 | 60 | # Location of Hadoop's configuration information. i.e., where this 61 | # file is living. If this is not defined, Hadoop will attempt to 62 | # locate it based upon its execution path. 63 | # 64 | # NOTE: It is recommend that this variable not be set here but in 65 | # /etc/profile.d or equivalent. Some options (such as 66 | # --config) may react strangely otherwise. 67 | # 68 | # export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop 69 | 70 | # The maximum amount of heap to use (Java -Xmx). If no unit 71 | # is provided, it will be converted to MB. Daemons will 72 | # prefer any Xmx setting in their respective _OPT variable. 73 | # There is no default; the JVM will autoscale based upon machine 74 | # memory size. 75 | # export HADOOP_HEAPSIZE_MAX= 76 | 77 | # The minimum amount of heap to use (Java -Xms). If no unit 78 | # is provided, it will be converted to MB. Daemons will 79 | # prefer any Xms setting in their respective _OPT variable. 80 | # There is no default; the JVM will autoscale based upon machine 81 | # memory size. 82 | # export HADOOP_HEAPSIZE_MIN= 83 | 84 | # Enable extra debugging of Hadoop's JAAS binding, used to set up 85 | # Kerberos security. 86 | # export HADOOP_JAAS_DEBUG=true 87 | 88 | # Extra Java runtime options for all Hadoop commands. We don't support 89 | # IPv6 yet/still, so by default the preference is set to IPv4. 90 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true" 91 | # For Kerberos debugging, an extended option set logs more information 92 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug" 93 | 94 | # Some parts of the shell code may do special things dependent upon 95 | # the operating system. We have to set this here. See the next 96 | # section as to why.... 97 | # export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)} 98 | 99 | 100 | # Under certain conditions, Java on OS X will throw SCDynamicStore errors 101 | # in the system logs. 102 | # See HADOOP-8719 for more information. If one needs Kerberos 103 | # support on OS X, one will want to change/remove this extra bit. 104 | # case ${HADOOP_OS_TYPE} in 105 | # Darwin*) 106 | # export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.realm= " 107 | # export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.kdc= " 108 | # export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.conf= " 109 | # ;; 110 | # esac 111 | 112 | # Extra Java runtime options for some Hadoop commands 113 | # and clients (i.e., hdfs dfs -blah). These get appended to HADOOP_OPTS for 114 | # such commands. In most cases, # this should be left empty and 115 | # let users supply it on the command line. 116 | # export HADOOP_CLIENT_OPTS="" 117 | 118 | # 119 | # A note about classpaths. 120 | # 121 | # By default, Apache Hadoop overrides Java's CLASSPATH 122 | # environment variable. It is configured such 123 | # that it starts out blank with new entries added after passing 124 | # a series of checks (file/dir exists, not already listed aka 125 | # de-deduplication). During de-deduplication, wildcards and/or 126 | # directories are *NOT* expanded to keep it simple. Therefore, 127 | # if the computed classpath has two specific mentions of 128 | # awesome-methods-1.0.jar, only the first one added will be seen. 
129 | # If two directories are in the classpath that both contain 130 | # awesome-methods-1.0.jar, then Java will pick up both versions. 131 | 132 | # An additional, custom CLASSPATH. Site-wide configs should be 133 | # handled via the shellprofile functionality, utilizing the 134 | # hadoop_add_classpath function for greater control and much 135 | # harder for apps/end-users to accidentally override. 136 | # Similarly, end users should utilize ${HOME}/.hadooprc . 137 | # This variable should ideally only be used as a short-cut, 138 | # interactive way for temporary additions on the command line. 139 | # export HADOOP_CLASSPATH="/some/cool/path/on/your/machine" 140 | 141 | # Should HADOOP_CLASSPATH be first in the official CLASSPATH? 142 | # export HADOOP_USER_CLASSPATH_FIRST="yes" 143 | 144 | # If HADOOP_USE_CLIENT_CLASSLOADER is set, the classpath along 145 | # with the main jar are handled by a separate isolated 146 | # client classloader when 'hadoop jar', 'yarn jar', or 'mapred job' 147 | # is utilized. If it is set, HADOOP_CLASSPATH and 148 | # HADOOP_USER_CLASSPATH_FIRST are ignored. 149 | # export HADOOP_USE_CLIENT_CLASSLOADER=true 150 | 151 | # HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES overrides the default definition of 152 | # system classes for the client classloader when HADOOP_USE_CLIENT_CLASSLOADER 153 | # is enabled. Names ending in '.' (period) are treated as package names, and 154 | # names starting with a '-' are treated as negative matches. For example, 155 | # export HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES="-org.apache.hadoop.UserClass,java.,javax.,org.apache.hadoop." 156 | 157 | # Enable optional, bundled Hadoop features 158 | # This is a comma delimited list. It may NOT be overridden via .hadooprc 159 | # Entries may be added/removed as needed. 160 | # export HADOOP_OPTIONAL_TOOLS="hadoop-aliyun,hadoop-aws,hadoop-azure-datalake,hadoop-azure,hadoop-kafka,hadoop-openstack" 161 | 162 | ### 163 | # Options for remote shell connectivity 164 | ### 165 | 166 | # There are some optional components of hadoop that allow for 167 | # command and control of remote hosts. For example, 168 | # start-dfs.sh will attempt to bring up all NNs, DNS, etc. 169 | 170 | # Options to pass to SSH when one of the "log into a host and 171 | # start/stop daemons" scripts is executed 172 | # export HADOOP_SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10s" 173 | 174 | # The built-in ssh handler will limit itself to 10 simultaneous connections. 175 | # For pdsh users, this sets the fanout size ( -f ) 176 | # Change this to increase/decrease as necessary. 177 | # export HADOOP_SSH_PARALLEL=10 178 | 179 | # Filename which contains all of the hosts for any remote execution 180 | # helper scripts # such as workers.sh, start-dfs.sh, etc. 181 | # export HADOOP_WORKERS="${HADOOP_CONF_DIR}/workers" 182 | 183 | ### 184 | # Options for all daemons 185 | ### 186 | # 187 | 188 | # 189 | # Many options may also be specified as Java properties. It is 190 | # very common, and in many cases, desirable, to hard-set these 191 | # in daemon _OPTS variables. Where applicable, the appropriate 192 | # Java property is also identified. Note that many are re-used 193 | # or set differently in certain contexts (e.g., secure vs 194 | # non-secure) 195 | # 196 | 197 | # Where (primarily) daemon log files are stored. 198 | # ${HADOOP_HOME}/logs by default. 
199 | # Java property: hadoop.log.dir 200 | # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs 201 | 202 | # A string representing this instance of hadoop. $USER by default. 203 | # This is used in writing log and pid files, so keep that in mind! 204 | # Java property: hadoop.id.str 205 | # export HADOOP_IDENT_STRING=$USER 206 | 207 | # How many seconds to pause after stopping a daemon 208 | # export HADOOP_STOP_TIMEOUT=5 209 | 210 | # Where pid files are stored. /tmp by default. 211 | # export HADOOP_PID_DIR=/tmp 212 | 213 | # Default log4j setting for interactive commands 214 | # Java property: hadoop.root.logger 215 | # export HADOOP_ROOT_LOGGER=INFO,console 216 | 217 | # Default log4j setting for daemons spawned explicitly by 218 | # --daemon option of hadoop, hdfs, mapred and yarn command. 219 | # Java property: hadoop.root.logger 220 | # export HADOOP_DAEMON_ROOT_LOGGER=INFO,RFA 221 | 222 | # Default log level and output location for security-related messages. 223 | # You will almost certainly want to change this on a per-daemon basis via 224 | # the Java property (i.e., -Dhadoop.security.logger=foo). (Note that the 225 | # defaults for the NN and 2NN override this by default.) 226 | # Java property: hadoop.security.logger 227 | # export HADOOP_SECURITY_LOGGER=INFO,NullAppender 228 | 229 | # Default process priority level 230 | # Note that sub-processes will also run at this level! 231 | # export HADOOP_NICENESS=0 232 | 233 | # Default name for the service level authorization file 234 | # Java property: hadoop.policy.file 235 | # export HADOOP_POLICYFILE="hadoop-policy.xml" 236 | 237 | # 238 | # NOTE: this is not used by default! <----- 239 | # You can define variables right here and then re-use them later on. 240 | # For example, it is common to use the same garbage collection settings 241 | # for all the daemons. So one could define: 242 | # 243 | # export HADOOP_GC_SETTINGS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 244 | # 245 | # .. and then use it as per the b option under the namenode. 246 | 247 | ### 248 | # Secure/privileged execution 249 | ### 250 | 251 | # 252 | # Out of the box, Hadoop uses jsvc from Apache Commons to launch daemons 253 | # on privileged ports. This functionality can be replaced by providing 254 | # custom functions. See hadoop-functions.sh for more information. 255 | # 256 | 257 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 258 | # that bind to privileged ports to provide authentication of data transfer 259 | # protocol. Jsvc is not required if SASL is configured for authentication of 260 | # data transfer protocol using non-privileged ports. 261 | # export JSVC_HOME=/usr/bin 262 | 263 | # 264 | # This directory contains pids for secure and privileged processes. 265 | #export HADOOP_SECURE_PID_DIR=${HADOOP_PID_DIR} 266 | 267 | # 268 | # This directory contains the logs for secure and privileged processes. 269 | # Java property: hadoop.log.dir 270 | # export HADOOP_SECURE_LOG=${HADOOP_LOG_DIR} 271 | 272 | # 273 | # When running a secure daemon, the default value of HADOOP_IDENT_STRING 274 | # ends up being a bit bogus. Therefore, by default, the code will 275 | # replace HADOOP_IDENT_STRING with HADOOP_xx_SECURE_USER. If one wants 276 | # to keep HADOOP_IDENT_STRING untouched, then uncomment this line. 
277 | # export HADOOP_SECURE_IDENT_PRESERVE="true" 278 | 279 | ### 280 | # NameNode specific parameters 281 | ### 282 | 283 | # Default log level and output location for file system related change 284 | # messages. For non-namenode daemons, the Java property must be set in 285 | # the appropriate _OPTS if one wants something other than INFO,NullAppender 286 | # Java property: hdfs.audit.logger 287 | # export HDFS_AUDIT_LOGGER=INFO,NullAppender 288 | 289 | # Specify the JVM options to be used when starting the NameNode. 290 | # These options will be appended to the options specified as HADOOP_OPTS 291 | # and therefore may override any similar flags set in HADOOP_OPTS 292 | # 293 | # a) Set JMX options 294 | # export HDFS_NAMENODE_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=1026" 295 | # 296 | # b) Set garbage collection logs 297 | # export HDFS_NAMENODE_OPTS="${HADOOP_GC_SETTINGS} -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 298 | # 299 | # c) ... or set them directly 300 | # export HDFS_NAMENODE_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 301 | 302 | # this is the default: 303 | # export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 304 | 305 | ### 306 | # SecondaryNameNode specific parameters 307 | ### 308 | # Specify the JVM options to be used when starting the SecondaryNameNode. 309 | # These options will be appended to the options specified as HADOOP_OPTS 310 | # and therefore may override any similar flags set in HADOOP_OPTS 311 | # 312 | # This is the default: 313 | # export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 314 | 315 | ### 316 | # DataNode specific parameters 317 | ### 318 | # Specify the JVM options to be used when starting the DataNode. 319 | # These options will be appended to the options specified as HADOOP_OPTS 320 | # and therefore may override any similar flags set in HADOOP_OPTS 321 | # 322 | # This is the default: 323 | # export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS" 324 | 325 | # On secure datanodes, user to run the datanode as after dropping privileges. 326 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 327 | # to provide authentication of data transfer protocol. This **MUST NOT** be 328 | # defined if SASL is configured for authentication of data transfer protocol 329 | # using non-privileged ports. 330 | # This will replace the hadoop.id.str Java property in secure mode. 331 | # export HDFS_DATANODE_SECURE_USER=hdfs 332 | 333 | # Supplemental options for secure datanodes 334 | # By default, Hadoop uses jsvc which needs to know to launch a 335 | # server jvm. 336 | # export HDFS_DATANODE_SECURE_EXTRA_OPTS="-jvm server" 337 | 338 | ### 339 | # NFS3 Gateway specific parameters 340 | ### 341 | # Specify the JVM options to be used when starting the NFS3 Gateway. 342 | # These options will be appended to the options specified as HADOOP_OPTS 343 | # and therefore may override any similar flags set in HADOOP_OPTS 344 | # 345 | # export HDFS_NFS3_OPTS="" 346 | 347 | # Specify the JVM options to be used when starting the Hadoop portmapper. 
348 | # These options will be appended to the options specified as HADOOP_OPTS 349 | # and therefore may override any similar flags set in HADOOP_OPTS 350 | # 351 | # export HDFS_PORTMAP_OPTS="-Xmx512m" 352 | 353 | # Supplemental options for priviliged gateways 354 | # By default, Hadoop uses jsvc which needs to know to launch a 355 | # server jvm. 356 | # export HDFS_NFS3_SECURE_EXTRA_OPTS="-jvm server" 357 | 358 | # On privileged gateways, user to run the gateway as after dropping privileges 359 | # This will replace the hadoop.id.str Java property in secure mode. 360 | # export HDFS_NFS3_SECURE_USER=nfsserver 361 | 362 | ### 363 | # ZKFailoverController specific parameters 364 | ### 365 | # Specify the JVM options to be used when starting the ZKFailoverController. 366 | # These options will be appended to the options specified as HADOOP_OPTS 367 | # and therefore may override any similar flags set in HADOOP_OPTS 368 | # 369 | # export HDFS_ZKFC_OPTS="" 370 | 371 | ### 372 | # QuorumJournalNode specific parameters 373 | ### 374 | # Specify the JVM options to be used when starting the QuorumJournalNode. 375 | # These options will be appended to the options specified as HADOOP_OPTS 376 | # and therefore may override any similar flags set in HADOOP_OPTS 377 | # 378 | # export HDFS_JOURNALNODE_OPTS="" 379 | 380 | ### 381 | # HDFS Balancer specific parameters 382 | ### 383 | # Specify the JVM options to be used when starting the HDFS Balancer. 384 | # These options will be appended to the options specified as HADOOP_OPTS 385 | # and therefore may override any similar flags set in HADOOP_OPTS 386 | # 387 | # export HDFS_BALANCER_OPTS="" 388 | 389 | ### 390 | # HDFS Mover specific parameters 391 | ### 392 | # Specify the JVM options to be used when starting the HDFS Mover. 393 | # These options will be appended to the options specified as HADOOP_OPTS 394 | # and therefore may override any similar flags set in HADOOP_OPTS 395 | # 396 | # export HDFS_MOVER_OPTS="" 397 | 398 | ### 399 | # Router-based HDFS Federation specific parameters 400 | # Specify the JVM options to be used when starting the RBF Routers. 401 | # These options will be appended to the options specified as HADOOP_OPTS 402 | # and therefore may override any similar flags set in HADOOP_OPTS 403 | # 404 | # export HDFS_DFSROUTER_OPTS="" 405 | 406 | ### 407 | # HDFS StorageContainerManager specific parameters 408 | ### 409 | # Specify the JVM options to be used when starting the HDFS Storage Container Manager. 410 | # These options will be appended to the options specified as HADOOP_OPTS 411 | # and therefore may override any similar flags set in HADOOP_OPTS 412 | # 413 | # export HDFS_STORAGECONTAINERMANAGER_OPTS="" 414 | 415 | ### 416 | # Advanced Users Only! 417 | ### 418 | 419 | # 420 | # When building Hadoop, one can add the class paths to the commands 421 | # via this special env var: 422 | # export HADOOP_ENABLE_BUILD_PATHS="true" 423 | 424 | # 425 | # To prevent accidents, shell commands be (superficially) locked 426 | # to only allow certain users to execute certain subcommands. 427 | # It uses the format of (command)_(subcommand)_USER. 
#
# For example, to limit who can execute the namenode command,
# export HDFS_NAMENODE_USER=hdfs

# Site-specific settings for this image (Bitnami Spark base + Hadoop 3.2.0).
export JAVA_HOME=/opt/bitnami/java
export HADOOP_HOME=/opt/hadoop
export HADOOP_MAPRED_HOME=/opt/hadoop
export HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

# Run the HDFS and YARN daemons as root inside the containers.
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_SECONDARYNAMENODE_USER="root"
export YARN_RESOURCEMANAGER_USER="root"
export YARN_NODEMANAGER_USER="root"
--------------------------------------------------------------------------------
/config/hdfs-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///root/hdfs/namenode</value>
        <description>NameNode directory for namespace and transaction logs storage.</description>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///root/hdfs/datanode</value>
        <description>DataNode directory</description>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/config/mapred-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.application.classpath</name>
        <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/common/*,$HADOOP_MAPRED_HOME/share/hadoop/common/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/*,$HADOOP_MAPRED_HOME/share/hadoop/yarn/lib/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/*,$HADOOP_MAPRED_HOME/share/hadoop/hdfs/lib/*</value>
    </property>
</configuration>
--------------------------------------------------------------------------------
/config/ssh_config:
--------------------------------------------------------------------------------
# Disable host-key prompts so the Hadoop start scripts can ssh non-interactively.
Host localhost
    StrictHostKeyChecking no

Host 0.0.0.0
    StrictHostKeyChecking no

Host hadoop-*
    StrictHostKeyChecking no
    UserKnownHostsFile=/dev/null
--------------------------------------------------------------------------------
/config/workers:
--------------------------------------------------------------------------------
worker1
worker2
--------------------------------------------------------------------------------
/config/yarn-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
</configuration>
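Once the containers are up and start-hadoop.sh has been run, the configuration above can be sanity-checked from inside the master container. A minimal sketch using standard Hadoop CLI commands (expected values follow from the files above; actual output will vary):

```sh
hdfs getconf -confKey fs.defaultFS   # expect hdfs://master:9000 (core-site.xml)
hdfs dfsadmin -report                # expect two live DataNodes (dfs.replication = 2)
yarn node -list                      # expect NodeManagers on worker1 and worker2
```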
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2'

services:
  spark:
    image: s1mplecc/spark-hadoop:3
    hostname: master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
    ports:
      - '8080:8080'
      - '4040:4040'
      - '8088:8088'
      - '8042:8042'
      - '9870:9870'
      - '19888:19888'
  spark-worker-1:
    image: s1mplecc/spark-hadoop:3
    hostname: worker1
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
    ports:
      - '8081:8081'
  spark-worker-2:
    image: s1mplecc/spark-hadoop:3
    hostname: worker2
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    volumes:
      - ~/docker/spark/share:/opt/share
    ports:
      - '8082:8081'
--------------------------------------------------------------------------------
/share/bigdata-learning-0.0.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/s1mplecc/spark-hadoop-docker/2b6b5131e3ff7076c4b31f54028c883e0ce8d54b/share/bigdata-learning-0.0.1.jar
--------------------------------------------------------------------------------
/share/my_script.py:
--------------------------------------------------------------------------------
from pyspark import SparkConf, SparkContext

# Build a Spark context for this small example application.
conf = SparkConf().setAppName('My App')
sc = SparkContext(conf=conf)

# Count how many numbers in [1, 100 million) are greater than 100.
count = sc.range(1, 1000 * 1000 * 100).filter(lambda x: x > 100).count()
print('count: ', count)
--------------------------------------------------------------------------------
/share/words.txt:
--------------------------------------------------------------------------------
Apache Spark is a unified analytics engine for large-scale data processing. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Structured Streaming for incremental computation and stream processing.
--------------------------------------------------------------------------------
/start-hadoop.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Start the HDFS and YARN daemons across the nodes listed in config/workers.
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
--------------------------------------------------------------------------------
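Note: docker-compose.yml publishes port 19888 and the README lists the MapReduce Job History UI, but start-hadoop.sh only launches HDFS and YARN. If that UI is wanted, the JobHistory daemon has to be started as well; a minimal sketch (standard Hadoop 3.x command, run inside the master container, not part of the original script):

```sh
# Start the MapReduce JobHistory server so http://localhost:19888 becomes reachable
$HADOOP_HOME/bin/mapred --daemon start historyserver
```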