├── site-files
│   ├── workers
│   ├── mapred-site.xml
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   └── yarn-site.xml
├── docker-compose.yml
├── mapreduce-example.sh
├── 2.9.0
│   ├── Dockerfile
│   └── docker-entrypoint.sh
├── 5-node-cluster.yml
└── README.md

/site-files/workers:
--------------------------------------------------------------------------------
1 | worker1
2 | worker2
3 | worker3
4 | 
--------------------------------------------------------------------------------
/site-files/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 |   <property>
6 |     <name>mapreduce.framework.name</name>
7 |     <value>yarn</value>
8 |   </property>
9 | </configuration>
10 | 
--------------------------------------------------------------------------------
/site-files/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 |   <property>
6 |     <name>fs.default.name</name>
7 |     <value>hdfs://namenode:9000</value>
8 |   </property>
9 | </configuration>
10 | 
--------------------------------------------------------------------------------
/site-files/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 |   <property>
6 |     <name>dfs.replication</name>
7 |     <value>2</value>
8 |   </property>
9 |   <property>
10 |     <name>dfs.name.dir</name>
11 |     <value>file:///hdfsdata/namenode</value>
12 |   </property>
13 |   <property>
14 |     <name>dfs.data.dir</name>
15 |     <value>file:///hdfsdata/datanode</value>
16 |   </property>
17 | </configuration>
18 | 
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.1'
2 | 
3 | services:
4 |   hadoop:
5 |     image: renci/hadoop:2.9.0
6 |     build:
7 |       context: ./2.9.0
8 |       dockerfile: Dockerfile
9 |     container_name: hadoop
10 |     restart: always
11 |     hostname: localhost
12 |     ports:
13 |       - '8042:8042'
14 |       - '8088:8088'
15 |       - '50070:50070'
16 |       - '50075:50075'
17 |       - '50090:50090'
18 |     environment:
19 |       IS_NODE_MANAGER: 'true'
20 |       IS_NAME_NODE: 'true'
21 |       IS_SECONDARY_NAME_NODE: 'true'
22 |       IS_DATA_NODE: 'true'
23 |       IS_RESOURCE_MANAGER: 'true'
24 |       CLUSTER_NODES: hadoop
25 | 
--------------------------------------------------------------------------------
/site-files/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>yarn.resourcemanager.hostname</name>
6 |     <value>resourcemanager</value>
7 |   </property>
8 |   <property>
9 |     <name>yarn.resourcemanager.bind-host</name>
10 |     <value>0.0.0.0</value>
11 |   </property>
12 |   <property>
13 |     <name>yarn.nodemanager.aux-services</name>
14 |     <value>mapreduce_shuffle</value>
15 |   </property>
16 |   <property>
17 |     <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
18 |     <value>org.apache.hadoop.mapred.ShuffleHandler</value>
19 |   </property>
20 |   <property>
21 |     <name>yarn.resourcemanager.address</name>
22 |     <value>resourcemanager:8032</value>
23 |   </property>
24 | </configuration>
25 | 
--------------------------------------------------------------------------------
/mapreduce-example.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | echo 'INFO: remove input/output HDFS directories if they already exist'
4 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -rm -R input'
5 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -rm -R output'
6 | 
7 | echo 'INFO: hdfs dfs -mkdir -p /user/hadoop/input'
8 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -mkdir -p /user/hadoop/input'
9 | 
10 | echo 'INFO: hdfs dfs -put hadoop/README.txt /user/hadoop/input/'
11 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -put hadoop/README.txt /user/hadoop/input/'
12 | 
13 | echo 'INFO: hadoop jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.0.jar wordcount input output'
14 | docker exec namenode runuser -l hadoop -c $'hadoop jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.0.jar wordcount input output'
15 | 
16 | echo 'INFO: hdfs dfs -ls /user/hadoop/output'
17 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -ls /user/hadoop/output'
18 | 
19 | echo 'INFO: cat hadoop/README.txt'
20 | docker exec namenode runuser -l hadoop -c $'cat hadoop/README.txt'
21 | 
22 | echo 'INFO: hdfs dfs -cat /user/hadoop/output/part-r-00000'
23 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -cat /user/hadoop/output/part-r-00000'
24 | 
25 | echo 'HDFS directories at: http://localhost:50070/explorer.html#/user/hadoop'
26 | 
27 | exit 0;
28 | 
--------------------------------------------------------------------------------
/2.9.0/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM krallin/centos-tini:centos7
2 | MAINTAINER Michael J. Stealey
3 | 
4 | ARG HADOOP_VERSION=2.9.0
5 | 
6 | # Set correct environment variables.
7 | ENV HOME=/root \
8 |     LANG=en_US.UTF-8 \
9 |     LC_ALL=en_US.UTF-8
10 | 
11 | # java 8: https://github.com/binarybabel/docker-jdk/blob/master/src/centos.Dockerfile
12 | ENV JAVA_DOWNLOAD=http://download.oracle.com/otn-pub/java/jdk/8u161-b12/2f38c3b165be4555a1fa6e98c45e0808/jdk-8u161-linux-x64.rpm
13 | RUN cd /tmp \
14 |     && curl -o jdk.rpm -jfksSLH "Cookie: oraclelicense=accept-securebackup-cookie" \
15 |     "${JAVA_DOWNLOAD:-$(curl -s https://lv.binarybabel.org/catalog-api/java/jdk8.txt?p=downloads.rpm)}" \
16 |     && rpm -Uvh jdk.rpm && rm jdk.rpm \
17 |     && echo "export JAVA_HOME=/usr/java/default/" > /etc/profile.d/java_home.sh
18 | 
19 | # apache hadoop
20 | ARG HADOOP_INSTALL_DIR=/home/hadoop
21 | RUN yum install -y \
22 |     openssh-server \
23 |     openssh-clients \
24 |     which
25 | RUN adduser -m -d $HADOOP_INSTALL_DIR hadoop
26 | WORKDIR $HADOOP_INSTALL_DIR
27 | USER hadoop
28 | RUN curl -o hadoop-$HADOOP_VERSION.tar.gz "https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" \
29 |     && tar xzf hadoop-$HADOOP_VERSION.tar.gz \
30 |     && mv hadoop-$HADOOP_VERSION hadoop \
31 |     && rm -f hadoop-$HADOOP_VERSION.tar.gz \
32 |     && yum clean all
33 | 
34 | WORKDIR /root/
35 | USER root
36 | RUN ssh-keygen -q -N '' -t rsa -f /root/.ssh/id_rsa \
37 |     && ssh-keygen -q -N '' -t dsa -f /etc/ssh/ssh_host_dsa_key \
38 |     && ssh-keygen -q -N '' -t rsa -f /etc/ssh/ssh_host_rsa_key \
39 |     && ssh-keygen -q -N '' -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key \
40 |     && ssh-keygen -q -N '' -t ed25519 -f /etc/ssh/ssh_host_ed25519_key \
41 |     && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
42 |     && chmod 0600 /root/.ssh/authorized_keys
43 | 
44 | ENV HADOOP_USER_HOME=${HADOOP_INSTALL_DIR} \
45 |     HADOOP_PREFIX=${HADOOP_INSTALL_DIR}/hadoop \
46 |     HADOOP_INSTALL=${HADOOP_PREFIX} \
47 |     HADOOP_MAPRED_HOME=${HADOOP_PREFIX} \
48 |     HADOOP_COMMON_HOME=${HADOOP_PREFIX} \
49 |     HADOOP_HDFS_HOME=${HADOOP_PREFIX} \
50 |     YARN_HOME=${HADOOP_PREFIX} \
51 |     HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_PREFIX}/lib/native \
52 |     HADOOP_CONF_DIR=${HADOOP_PREFIX}/etc/hadoop \
53 |     PATH=$PATH:${HADOOP_PREFIX}/sbin:${HADOOP_PREFIX}/bin
54 | ENV IS_NODE_MANAGER=true \
55 |     IS_NAME_NODE=true \
56 |     IS_SECONDARY_NAME_NODE=true \
57 |     IS_DATA_NODE=true \
58 |     IS_RESOURCE_MANAGER=true \
59 |     CLUSTER_NODES=localhost
60 | 
61 | VOLUME ["/site-files", "/home/hadoop/public"]
62 | 
63 | COPY docker-entrypoint.sh /docker-entrypoint.sh
64 | 
65 | EXPOSE 22
66 | 
67 | ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"]
68 | 
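# Illustrative usage (a sketch, not part of the file above): because HADOOP_VERSION
# is declared as a build argument, the image can in principle be rebuilt against
# another Hadoop release published under archive.apache.org/dist/hadoop/common/:
#   docker build --build-arg HADOOP_VERSION=2.9.0 -t renci/hadoop:2.9.0 ./2.9.0/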
-------------------------------------------------------------------------------- /5-node-cluster.yml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | 3 | services: 4 | namenode: 5 | image: renci/hadoop:2.9.0 6 | container_name: namenode 7 | volumes: 8 | - hadoop-public:/home/hadoop/public 9 | - ./site-files:/site-files 10 | restart: always 11 | hostname: namenode 12 | networks: 13 | - hadoop 14 | ports: 15 | - '50070:50070' 16 | environment: 17 | IS_NODE_MANAGER: 'false' 18 | IS_NAME_NODE: 'true' 19 | IS_SECONDARY_NAME_NODE: 'false' 20 | IS_DATA_NODE: 'false' 21 | IS_RESOURCE_MANAGER: 'false' 22 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 23 | 24 | resourcemanager: 25 | image: renci/hadoop:2.9.0 26 | depends_on: 27 | - namenode 28 | container_name: resourcemanager 29 | volumes: 30 | - hadoop-public:/home/hadoop/public 31 | - ./site-files:/site-files 32 | restart: always 33 | hostname: resourcemanager 34 | networks: 35 | - hadoop 36 | ports: 37 | - '8088:8088' 38 | environment: 39 | IS_NODE_MANAGER: 'false' 40 | IS_NAME_NODE: 'false' 41 | IS_SECONDARY_NAME_NODE: 'false' 42 | IS_DATA_NODE: 'false' 43 | IS_RESOURCE_MANAGER: 'true' 44 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 45 | 46 | worker1: 47 | image: renci/hadoop:2.9.0 48 | depends_on: 49 | - namenode 50 | container_name: worker1 51 | volumes: 52 | - hadoop-public:/home/hadoop/public 53 | - ./site-files:/site-files 54 | restart: always 55 | hostname: worker1 56 | networks: 57 | - hadoop 58 | ports: 59 | - '8042:8042' 60 | - '50075:50075' 61 | environment: 62 | IS_NODE_MANAGER: 'true' 63 | IS_NAME_NODE: 'false' 64 | IS_SECONDARY_NAME_NODE: 'false' 65 | IS_DATA_NODE: 'true' 66 | IS_RESOURCE_MANAGER: 'false' 67 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 68 | 69 | worker2: 70 | image: renci/hadoop:2.9.0 71 | depends_on: 72 | - namenode 73 | container_name: worker2 74 | volumes: 75 | - hadoop-public:/home/hadoop/public 76 | - ./site-files:/site-files 77 | restart: always 78 | hostname: worker2 79 | networks: 80 | - hadoop 81 | ports: 82 | - '8043:8042' 83 | - '50076:50075' 84 | environment: 85 | IS_NODE_MANAGER: 'true' 86 | IS_NAME_NODE: 'false' 87 | IS_SECONDARY_NAME_NODE: 'false' 88 | IS_DATA_NODE: 'true' 89 | IS_RESOURCE_MANAGER: 'false' 90 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 91 | 92 | worker3: 93 | image: renci/hadoop:2.9.0 94 | depends_on: 95 | - namenode 96 | container_name: worker3 97 | volumes: 98 | - hadoop-public:/home/hadoop/public 99 | - ./site-files:/site-files 100 | restart: always 101 | hostname: worker3 102 | networks: 103 | - hadoop 104 | ports: 105 | - '8044:8042' 106 | - '50077:50075' 107 | environment: 108 | IS_NODE_MANAGER: 'true' 109 | IS_NAME_NODE: 'false' 110 | IS_SECONDARY_NAME_NODE: 'false' 111 | IS_DATA_NODE: 'true' 112 | IS_RESOURCE_MANAGER: 'false' 113 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 114 | 115 | volumes: 116 | hadoop-public: 117 | 118 | networks: 119 | hadoop: 120 | -------------------------------------------------------------------------------- /2.9.0/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | HADOOP_CONF_DIR=${HADOOP_PREFIX}/etc/hadoop 5 | CORE_SITE_FILE=${HADOOP_CONF_DIR}/core-site.xml 6 | HDFS_SITE_FILE=${HADOOP_CONF_DIR}/hdfs-site.xml 7 | MAPRED_SITE_FILE=${HADOOP_CONF_DIR}/mapred-site.xml 8 | 
YARN_SITE_FILE=${HADOOP_CONF_DIR}/yarn-site.xml
9 | WORKERS_FILE=${HADOOP_CONF_DIR}/slaves
10 | 
11 | _core_site_xml () {
12 |   if [ -f /site-files/core-site.xml ]; then
13 |     echo "USE: /site-files/core-site.xml"
14 |     cat /site-files/core-site.xml > $CORE_SITE_FILE
15 |   else
16 |     cat > $CORE_SITE_FILE << EOF
17 | <?xml version="1.0"?>
18 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
19 | 
20 | <configuration>
21 |   <property>
22 |     <name>fs.default.name</name>
23 |     <value>hdfs://localhost:9000</value>
24 |   </property>
25 | </configuration>
26 | EOF
27 |   fi
28 |   chown hadoop:hadoop $CORE_SITE_FILE
29 | }
30 | 
31 | _hdfs_site_xml () {
32 |   if [ -f /site-files/hdfs-site.xml ]; then
33 |     cat /site-files/hdfs-site.xml > $HDFS_SITE_FILE
34 |   else
35 |     cat > $HDFS_SITE_FILE << EOF
36 | <?xml version="1.0"?>
37 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
38 | 
39 | <configuration>
40 |   <property>
41 |     <name>dfs.replication</name>
42 |     <value>1</value>
43 |   </property>
44 |   <property>
45 |     <name>dfs.name.dir</name>
46 |     <value>file:///home/hadoop/hadoopdata/hdfs/namenode</value>
47 |   </property>
48 |   <property>
49 |     <name>dfs.data.dir</name>
50 |     <value>file:///home/hadoop/hadoopdata/hdfs/datanode</value>
51 |   </property>
52 | </configuration>
53 | EOF
54 |   fi
55 |   chown hadoop:hadoop $HDFS_SITE_FILE
56 | }
57 | 
58 | _mapred_site_xml() {
59 |   if [ -f /site-files/mapred-site.xml ]; then
60 |     cat /site-files/mapred-site.xml > $MAPRED_SITE_FILE
61 |   else
62 |     cat > $MAPRED_SITE_FILE << EOF
63 | <?xml version="1.0"?>
64 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
65 | 
66 | <configuration>
67 |   <property>
68 |     <name>mapreduce.framework.name</name>
69 |     <value>yarn</value>
70 |   </property>
71 | </configuration>
72 | EOF
73 |   fi
74 |   chown hadoop:hadoop $MAPRED_SITE_FILE
75 | }
76 | 
77 | _yarn_site_xml() {
78 |   if [ -f /site-files/yarn-site.xml ]; then
79 |     cat /site-files/yarn-site.xml > $YARN_SITE_FILE
80 |   else
81 |     cat > $YARN_SITE_FILE << EOF
82 | <?xml version="1.0"?>
83 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
84 | <configuration>
85 |   <property>
86 |     <name>yarn.nodemanager.aux-services</name>
87 |     <value>mapreduce_shuffle</value>
88 |   </property>
89 | </configuration>
90 | EOF
91 |   fi
92 |   chown hadoop:hadoop $YARN_SITE_FILE
93 | }
94 | 
95 | _workers() {
96 |   if [ -f /site-files/workers ]; then
97 |     cat /site-files/workers > $WORKERS_FILE
98 |   else
99 |     cat > $WORKERS_FILE << EOF
100 | localhost
101 | EOF
102 |   fi
103 |   chown hadoop:hadoop $WORKERS_FILE
104 | }
105 | 
106 | _hadoop_profile() {
107 |   cat > /etc/profile.d/hadoop.sh << EOF
108 | export HADOOP_USER_HOME=${HADOOP_USER_HOME}
109 | export HADOOP_HOME=${HADOOP_USER_HOME}/hadoop
110 | export HADOOP_PREFIX=${HADOOP_USER_HOME}/hadoop
111 | export HADOOP_INSTALL=${HADOOP_PREFIX}
112 | export HADOOP_MAPRED_HOME=${HADOOP_PREFIX}
113 | export HADOOP_COMMON_HOME=${HADOOP_PREFIX}
114 | export HADOOP_HDFS_HOME=${HADOOP_PREFIX}
115 | export JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_PREFIX}/lib/native
116 | export YARN_HOME=${HADOOP_PREFIX}
117 | export HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_PREFIX}/lib/native
118 | export HADOOP_CONF_DIR=${HADOOP_PREFIX}/etc/hadoop
119 | export CORE_SITE_FILE=${HADOOP_CONF_DIR}/core-site.xml
120 | export HDFS_SITE_FILE=${HADOOP_CONF_DIR}/hdfs-site.xml
121 | export MAPRED_SITE_FILE=${HADOOP_CONF_DIR}/mapred-site.xml
122 | export YARN_SITE_FILE=${HADOOP_CONF_DIR}/yarn-site.xml
123 | export WORKERS_FILE=${HADOOP_CONF_DIR}/slaves
124 | export PATH=$PATH:${HADOOP_PREFIX}/sbin:${HADOOP_PREFIX}/bin
125 | EOF
126 | }
127 | 
128 | _generate_ssh_keys() {
129 |   mkdir -p $HADOOP_USER_HOME/.ssh
130 |   ssh-keygen -t rsa -N '' -f $HADOOP_USER_HOME/.ssh/id_rsa
131 |   cat $HADOOP_USER_HOME/.ssh/id_rsa.pub >> $HADOOP_USER_HOME/.ssh/authorized_keys
132 |   chmod 0600 $HADOOP_USER_HOME/.ssh/authorized_keys
133 |   chown -R hadoop:hadoop $HADOOP_USER_HOME/.ssh
134 | }
135 | 
136 | _hadoop_profile
137 | # runuser -l hadoop -c $'env' # debug hadoop env
138 | 
139 | if $IS_NAME_NODE; then
140 |   mkdir -p /hdfsdata/namenode
141 |   chown -R hadoop:hadoop /hdfsdata/namenode
142 | fi
143 | if $IS_DATA_NODE; then
144 |   mkdir -p /hdfsdata/datanode
145 |   chown -R hadoop:hadoop /hdfsdata/datanode
146 | fi
147 | 
148 | chown -R hadoop:hadoop /home/hadoop/public
149 | 
150 | /usr/sbin/sshd -D &
151 | 
152 | runuser -l hadoop -c $'sed -i \'s:export JAVA_HOME=.*:export JAVA_HOME=/usr/java/jdk1.8.0_161/jre:\' /home/hadoop/hadoop/etc/hadoop/hadoop-env.sh'
153 | 
154 | _core_site_xml
155 | _hdfs_site_xml
156 | _mapred_site_xml
157 | _yarn_site_xml
158 | _workers
159 | 
160 | IS_FIRST_RUN=$(if [ ! -f "/home/hadoop/.ssh/id_rsa.pub" ]; then echo 'true'; else echo 'false'; fi)
161 | 
162 | if $IS_NAME_NODE; then
163 |   if $IS_FIRST_RUN; then
164 |     echo "NameNode copy ssh"
165 |     _generate_ssh_keys
166 |     cp -r /home/hadoop/.ssh /home/hadoop/public/
167 |   fi
168 | else
169 |   while [ ! -d /home/hadoop/public/.ssh ]; do
170 |     echo "waiting for /home/hadoop/public/.ssh"
171 |     sleep 2
172 |   done
173 |   if $IS_FIRST_RUN; then
174 |     echo "COPY: .ssh from namenode to $(hostname)"
175 |     cp -rf /home/hadoop/public/.ssh /home/hadoop/
176 |     cat /home/hadoop/.ssh/id_rsa.pub >> /home/hadoop/.ssh/authorized_keys
177 |     chown -R hadoop:hadoop /home/hadoop/.ssh
178 |   fi
179 | fi
180 | 
181 | if $IS_FIRST_RUN; then
182 |   while read node; do
183 |     echo "node = $node"
184 |     until runuser -l hadoop -c "ssh-keyscan $node >> /home/hadoop/.ssh/known_hosts"; do sleep 2; done
185 |   done < <(echo "$CLUSTER_NODES" | tr ' ' '\n')
186 | fi
187 | 
188 | if $IS_NAME_NODE; then
189 |   echo "Starting NameNode"
190 |   if $IS_FIRST_RUN; then
191 |     runuser -l hadoop -c $'$HADOOP_PREFIX/bin/hdfs namenode -format'
192 |   fi
193 |   runuser -l hadoop -c $'$HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode'
194 | fi
195 | 
196 | if $IS_SECONDARY_NAME_NODE; then
197 |   echo "Starting SecondaryNameNode"
198 |   runuser -l hadoop -c $'$HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start secondarynamenode'
199 | fi
200 | 
201 | if $IS_DATA_NODE; then
202 |   echo "Starting DataNode"
203 |   runuser -l hadoop -c $'$HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode'
204 | fi
205 | 
206 | if $IS_RESOURCE_MANAGER; then
207 |   echo "Starting ResourceManager"
208 |   runuser -l hadoop -c $'$YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager'
209 | fi
210 | 
211 | if $IS_NODE_MANAGER; then
212 |   echo "Starting NodeManager"
213 |   runuser -l hadoop -c $'$YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager'
214 | fi
215 | 
216 | tail -f /dev/null
217 | 
218 | exec "$@"
219 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Apache Hadoop in Docker
2 | 
3 | This work has been inspired by:
4 | 
5 | - tecadmin.net: [Setup Hadoop cluster on CentOS](https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/)
6 | - Oracle Java 8: [binarybabel/docker-jdk](https://github.com/binarybabel/docker-jdk/blob/master/src/centos.Dockerfile)
7 | - CentOS 7 base image: [krallin/tini-images](https://github.com/krallin/tini-images)
8 | - ExoGENI Recipes: [RENCI-NRIG/exogeni-recipes/hadoop](https://github.com/RENCI-NRIG/exogeni-recipes/tree/master/hadoop/hadoop-2)
9 | 
10 | ### What Is Apache Hadoop?
11 | 
12 | The Apache Hadoop project develops open-source software for reliable, scalable, distributed computing.
13 | 
14 | The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It is designed to scale up from single servers to thousands of machines, each offering local computation and storage. Rather than rely on hardware to deliver high availability, the library itself is designed to detect and handle failures at the application layer, thus delivering a highly available service on top of a cluster of computers, each of which may be prone to failures.
15 | 
16 | See the [official documentation](http://hadoop.apache.org) for more information.
17 | 
18 | ## How to use this image
19 | 
20 | ### Build locally
21 | 
22 | 
23 | ```
24 | $ docker build -t renci/hadoop:2.9.0 ./2.9.0/
25 | ...
26 | $ docker images
27 | REPOSITORY          TAG                 IMAGE ID            CREATED             SIZE
28 | renci/hadoop        2.9.0               4a4de8ed48b2        3 minutes ago       1.92GB
29 | ...
30 | ```
31 | 
32 | An example `docker-compose.yml` file is included that builds from the local repository and deploys a single-node cluster based on [[1](https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/)].
33 | 
34 | ```
35 | $ docker-compose build
36 | ...
37 | $ docker-compose up -d
38 | ...
39 | $ docker-compose ps
40 |  Name               Command               State                                           Ports
41 | ----------------------------------------------------------------------------------------------------------------------------------------------
42 | hadoop   /usr/local/bin/tini -- /do ...   Up      22/tcp, 0.0.0.0:50070->50070/tcp, 0.0.0.0:50075->50075/tcp, 0.0.0.0:50090->50090/tcp,
43 |                                                  0.0.0.0:8042->8042/tcp, 0.0.0.0:8088->8088/tcp
44 | ```
45 | 
46 | - Port mappings from above:
47 | 
48 | ```
49 |     ports:
50 |       - '8042:8042'     # NodeManager web ui
51 |       - '8088:8088'     # ResourceManager web ui
52 |       - '50070:50070'   # NameNode web ui
53 |       - '50075:50075'   # DataNode web ui
54 |       - '50090:50090'   # Secondary NameNode web ui
55 | ```
56 | 
57 | ### From Docker Hub
58 | 
59 | Automated builds are generated at [https://hub.docker.com/u/renci](https://hub.docker.com/u/renci/dashboard/) and can be pulled as follows.
60 | 
61 | ```
62 | $ docker pull renci/hadoop:2.9.0
63 | ```
64 | 
65 | ## Example: Five node cluster
66 | 
67 | Use the provided [`5-node-cluster.yml`](5-node-cluster.yml) file to stand up a five-node Hadoop cluster consisting of a `namenode`, a `resourcemanager` and three workers (`worker1`, `worker2` and `worker3`).
68 | 
69 | Hadoop docker network and port mappings (specific network values will vary by system):
70 | 
71 | Hadoop docker network
72 | 
73 | The nodes use the definitions found in the [site-files](site-files) directory to configure the cluster. These files can be modified to configure your cluster as needed at runtime.
74 | 
75 | A docker volume named `hadoop-public` is also created to allow the nodes to exchange SSH key information between themselves on startup.
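As a quick sanity check (a sketch; it assumes the default container names from the compose file below and a cluster that has finished starting up), the shared SSH material and the mounted site files can be inspected with `docker exec`:

```
$ # SSH keys published by the namenode through the hadoop-public volume
$ docker exec worker1 ls /home/hadoop/public/.ssh
$ # cluster configuration mounted from ./site-files
$ docker exec worker1 cat /site-files/core-site.xml
```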
76 | 77 | ```yaml 78 | version: '3.1' 79 | 80 | services: 81 | namenode: 82 | image: renci/hadoop:2.9.0 83 | container_name: namenode 84 | volumes: 85 | - hadoop-public:/home/hadoop/public 86 | - ./site-files:/site-files 87 | restart: always 88 | hostname: namenode 89 | networks: 90 | - hadoop 91 | ports: 92 | - '50070:50070' 93 | environment: 94 | IS_NODE_MANAGER: 'false' 95 | IS_NAME_NODE: 'true' 96 | IS_SECONDARY_NAME_NODE: 'false' 97 | IS_DATA_NODE: 'false' 98 | IS_RESOURCE_MANAGER: 'false' 99 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 100 | 101 | resourcemanager: 102 | image: renci/hadoop:2.9.0 103 | depends_on: 104 | - namenode 105 | container_name: resourcemanager 106 | volumes: 107 | - hadoop-public:/home/hadoop/public 108 | - ./site-files:/site-files 109 | restart: always 110 | hostname: resourcemanager 111 | networks: 112 | - hadoop 113 | ports: 114 | - '8088:8088' 115 | environment: 116 | IS_NODE_MANAGER: 'false' 117 | IS_NAME_NODE: 'false' 118 | IS_SECONDARY_NAME_NODE: 'false' 119 | IS_DATA_NODE: 'false' 120 | IS_RESOURCE_MANAGER: 'true' 121 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 122 | 123 | worker1: 124 | image: renci/hadoop:2.9.0 125 | depends_on: 126 | - namenode 127 | container_name: worker1 128 | volumes: 129 | - hadoop-public:/home/hadoop/public 130 | - ./site-files:/site-files 131 | restart: always 132 | hostname: worker1 133 | networks: 134 | - hadoop 135 | ports: 136 | - '8042:8042' 137 | - '50075:50075' 138 | environment: 139 | IS_NODE_MANAGER: 'true' 140 | IS_NAME_NODE: 'false' 141 | IS_SECONDARY_NAME_NODE: 'false' 142 | IS_DATA_NODE: 'true' 143 | IS_RESOURCE_MANAGER: 'false' 144 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 145 | 146 | worker2: 147 | image: renci/hadoop:2.9.0 148 | depends_on: 149 | - namenode 150 | container_name: worker2 151 | volumes: 152 | - hadoop-public:/home/hadoop/public 153 | - ./site-files:/site-files 154 | restart: always 155 | hostname: worker2 156 | networks: 157 | - hadoop 158 | ports: 159 | - '8043:8042' 160 | - '50076:50075' 161 | environment: 162 | IS_NODE_MANAGER: 'true' 163 | IS_NAME_NODE: 'false' 164 | IS_SECONDARY_NAME_NODE: 'false' 165 | IS_DATA_NODE: 'true' 166 | IS_RESOURCE_MANAGER: 'false' 167 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 168 | 169 | worker3: 170 | image: renci/hadoop:2.9.0 171 | depends_on: 172 | - namenode 173 | container_name: worker3 174 | volumes: 175 | - hadoop-public:/home/hadoop/public 176 | - ./site-files:/site-files 177 | restart: always 178 | hostname: worker3 179 | networks: 180 | - hadoop 181 | ports: 182 | - '8044:8042' 183 | - '50077:50075' 184 | environment: 185 | IS_NODE_MANAGER: 'true' 186 | IS_NAME_NODE: 'false' 187 | IS_SECONDARY_NAME_NODE: 'false' 188 | IS_DATA_NODE: 'true' 189 | IS_RESOURCE_MANAGER: 'false' 190 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3 191 | 192 | volumes: 193 | hadoop-public: 194 | 195 | networks: 196 | hadoop: 197 | ``` 198 | 199 | ### Start the cluster 200 | 201 | Using `docker-compose` 202 | 203 | ``` 204 | $ docker-compose -f 5-node-cluster.yml up -d 205 | ``` 206 | 207 | After a few moments all containers will be running and should display in a `ps` call. 208 | 209 | ``` 210 | $ docker-compose -f 5-node-cluster.yml ps 211 | Name Command State Ports 212 | ------------------------------------------------------------------------------------------------------------------- 213 | namenode /usr/local/bin/tini -- /do ... 
Up      22/tcp, 0.0.0.0:50070->50070/tcp
214 | resourcemanager   /usr/local/bin/tini -- /do ...   Up      22/tcp, 0.0.0.0:8088->8088/tcp
215 | worker1           /usr/local/bin/tini -- /do ...   Up      22/tcp, 0.0.0.0:50075->50075/tcp, 0.0.0.0:8042->8042/tcp
216 | worker2           /usr/local/bin/tini -- /do ...   Up      22/tcp, 0.0.0.0:50076->50075/tcp, 0.0.0.0:8043->8042/tcp
217 | worker3           /usr/local/bin/tini -- /do ...   Up      22/tcp, 0.0.0.0:50077->50075/tcp, 0.0.0.0:8044->8042/tcp
218 | ```
219 | 
220 | Since the containers' ports are mapped to the host, the various web UIs can be viewed with a local browser.
221 | 
222 | **namenode container**: NameNode Web UI on port 50070
223 | 
224 | NameNode: [http://localhost:50070/dfshealth.html#tab-datanode](http://localhost:50070/dfshealth.html#tab-datanode)
225 | 
226 | NameNode
227 | 
228 | **resourcemanager container**: ResourceManager Web UI on port 8088
229 | 
230 | ResourceManager: [http://localhost:8088/cluster](http://localhost:8088/cluster)
231 | 
232 | ResourceManager
233 | 
234 | 
235 | **worker1, worker2 and worker3 containers**: DataNode Web UI on ports 50075, 50076 and 50077; NodeManager Web UI on ports 8042, 8043 and 8044.
236 | 
237 | DataNode (worker1): [http://localhost:50075/datanode.html](http://localhost:50075/datanode.html)
238 | 
239 | Worker1 DataManager
240 | 
241 | NodeManager (worker1): [http://localhost:8042/node](http://localhost:8042/node)
242 | 
243 | NodeManager
244 | 
245 | Worker2 DataNode: [http://localhost:50076/datanode.html](http://localhost:50076/datanode.html)
246 | 
247 | Worker2 DataManager
248 | 
249 | Worker3 DataNode: [http://localhost:50077/datanode.html](http://localhost:50077/datanode.html)
250 | 
251 | Worker3 DataManager
252 | 
253 | ### Stop the cluster
254 | 
255 | The cluster can be stopped by issuing a `stop` call.
256 | 
257 | ```
258 | $ docker-compose -f 5-node-cluster.yml stop
259 | Stopping worker2 ... done
260 | Stopping resourcemanager ... done
261 | Stopping worker1 ... done
262 | Stopping worker3 ... done
263 | Stopping namenode ... done
264 | ```
265 | 
266 | ### Restart the cluster
267 | 
268 | So long as the container definitions have not been removed, the cluster can be restarted by using a `start` call.
269 | 
270 | ```
271 | $ docker-compose -f 5-node-cluster.yml start
272 | Starting namenode ... done
273 | Starting worker1 ... done
274 | Starting worker3 ... done
275 | Starting worker2 ... done
276 | Starting resourcemanager ... done
277 | ```
278 | 
279 | After a few moments all cluster activity should be back to normal.
280 | 
281 | ### Remove the cluster
282 | 
283 | The entire cluster can be removed by first stopping it, and then removing the containers from the local machine.
284 | 
285 | ```
286 | $ docker-compose -f 5-node-cluster.yml stop && docker-compose -f 5-node-cluster.yml rm -f
287 | Stopping worker2 ... done
288 | Stopping resourcemanager ... done
289 | Stopping worker1 ... done
290 | Stopping worker3 ... done
291 | Stopping namenode ... done
292 | Going to remove worker2, resourcemanager, worker1, worker3, namenode
293 | Removing worker2 ... done
294 | Removing resourcemanager ... done
295 | Removing worker1 ... done
296 | Removing worker3 ... done
297 | Removing namenode ... done
298 | ```
299 | 
300 | ## Example: Map Reduce
301 | 
302 | **NOTE**: Assumes the existence of the five node cluster from the previous example.
303 | 
304 | A simple map reduce example has been provided in the [mapreduce-example.sh](mapreduce-example.sh) script.
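The same `docker exec` pattern used throughout the script also works for running individual commands by hand; for example (an illustrative one-liner using the `namenode` container and the `hadoop` user, as in the script):

```
$ # list the hadoop user's HDFS home directory on the namenode
$ docker exec namenode runuser -l hadoop -c 'hdfs dfs -ls /user/hadoop'
```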
305 | 306 | The script is meant to be run from the host machine and uses `docker exec` to relay commands to the docker `namenode` container as the `hadoop` user. 307 | 308 | 309 | ``` 310 | $ ./mapreduce-example.sh 311 | INFO: remove input/output HDFS directories if they already exist 312 | rm: `input': No such file or directory 313 | rm: `output': No such file or directory 314 | INFO: hdfs dfs -mkdir -p /user/hadoop/input 315 | INFO: hdfs dfs -put hadoop/README.txt /user/hadoop/input/ 316 | INFO: hadoop jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.0.jar wordcount input output 317 | 18/02/17 19:42:38 INFO client.RMProxy: Connecting to ResourceManager at resourcemanager/172.19.0.5:8032 318 | 18/02/17 19:42:39 INFO input.FileInputFormat: Total input files to process : 1 319 | 18/02/17 19:42:39 INFO mapreduce.JobSubmitter: number of splits:1 320 | 18/02/17 19:42:39 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled 321 | 18/02/17 19:42:39 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1518896527275_0001 322 | 18/02/17 19:42:40 INFO impl.YarnClientImpl: Submitted application application_1518896527275_0001 323 | 18/02/17 19:42:40 INFO mapreduce.Job: The url to track the job: http://resourcemanager:8088/proxy/application_1518896527275_0001/ 324 | 18/02/17 19:42:40 INFO mapreduce.Job: Running job: job_1518896527275_0001 325 | 18/02/17 19:42:51 INFO mapreduce.Job: Job job_1518896527275_0001 running in uber mode : false 326 | 18/02/17 19:42:51 INFO mapreduce.Job: map 0% reduce 0% 327 | 18/02/17 19:42:58 INFO mapreduce.Job: map 100% reduce 0% 328 | 18/02/17 19:43:05 INFO mapreduce.Job: map 100% reduce 100% 329 | 18/02/17 19:43:05 INFO mapreduce.Job: Job job_1518896527275_0001 completed successfully 330 | 18/02/17 19:43:05 INFO mapreduce.Job: Counters: 49 331 | File System Counters 332 | FILE: Number of bytes read=1836 333 | FILE: Number of bytes written=407057 334 | FILE: Number of read operations=0 335 | FILE: Number of large read operations=0 336 | FILE: Number of write operations=0 337 | HDFS: Number of bytes read=1480 338 | HDFS: Number of bytes written=1306 339 | HDFS: Number of read operations=6 340 | HDFS: Number of large read operations=0 341 | HDFS: Number of write operations=2 342 | Job Counters 343 | Launched map tasks=1 344 | Launched reduce tasks=1 345 | Rack-local map tasks=1 346 | Total time spent by all maps in occupied slots (ms)=3851 347 | Total time spent by all reduces in occupied slots (ms)=3718 348 | Total time spent by all map tasks (ms)=3851 349 | Total time spent by all reduce tasks (ms)=3718 350 | Total vcore-milliseconds taken by all map tasks=3851 351 | Total vcore-milliseconds taken by all reduce tasks=3718 352 | Total megabyte-milliseconds taken by all map tasks=3943424 353 | Total megabyte-milliseconds taken by all reduce tasks=3807232 354 | Map-Reduce Framework 355 | Map input records=31 356 | Map output records=179 357 | Map output bytes=2055 358 | Map output materialized bytes=1836 359 | Input split bytes=114 360 | Combine input records=179 361 | Combine output records=131 362 | Reduce input groups=131 363 | Reduce shuffle bytes=1836 364 | Reduce input records=131 365 | Reduce output records=131 366 | Spilled Records=262 367 | Shuffled Maps =1 368 | Failed Shuffles=0 369 | Merged Map outputs=1 370 | GC time elapsed (ms)=114 371 | CPU time spent (ms)=1330 372 | Physical memory (bytes) snapshot=482201600 373 | Virtual memory (bytes) 
snapshot=3950104576 374 | Total committed heap usage (bytes)=281018368 375 | Shuffle Errors 376 | BAD_ID=0 377 | CONNECTION=0 378 | IO_ERROR=0 379 | WRONG_LENGTH=0 380 | WRONG_MAP=0 381 | WRONG_REDUCE=0 382 | File Input Format Counters 383 | Bytes Read=1366 384 | File Output Format Counters 385 | Bytes Written=1306 386 | INFO: hdfs dfs -ls /user/hadoop/output 387 | Found 2 items 388 | -rw-r--r-- 2 hadoop supergroup 0 2018-02-17 19:43 /user/hadoop/output/_SUCCESS 389 | -rw-r--r-- 2 hadoop supergroup 1306 2018-02-17 19:43 /user/hadoop/output/part-r-00000 390 | INFO: cat hadoop/README.txt 391 | For the latest information about Hadoop, please visit our website at: 392 | 393 | http://hadoop.apache.org/core/ 394 | 395 | and our wiki, at: 396 | 397 | http://wiki.apache.org/hadoop/ 398 | 399 | This distribution includes cryptographic software. The country in 400 | which you currently reside may have restrictions on the import, 401 | possession, use, and/or re-export to another country, of 402 | encryption software. BEFORE using any encryption software, please 403 | check your country's laws, regulations and policies concerning the 404 | import, possession, or use, and re-export of encryption software, to 405 | see if this is permitted. See for more 406 | information. 407 | 408 | The U.S. Government Department of Commerce, Bureau of Industry and 409 | Security (BIS), has classified this software as Export Commodity 410 | Control Number (ECCN) 5D002.C.1, which includes information security 411 | software using or performing cryptographic functions with asymmetric 412 | algorithms. The form and manner of this Apache Software Foundation 413 | distribution makes it eligible for export under the License Exception 414 | ENC Technology Software Unrestricted (TSU) exception (see the BIS 415 | Export Administration Regulations, Section 740.13) for both object 416 | code and source code. 417 | 418 | The following provides more details on the included cryptographic 419 | software: 420 | Hadoop Core uses the SSL libraries from the Jetty project written 421 | by mortbay.org. 422 | INFO: hdfs dfs -cat /user/hadoop/output/part-r-00000 423 | (BIS), 1 424 | (ECCN) 1 425 | (TSU) 1 426 | (see 1 427 | 5D002.C.1, 1 428 | 740.13) 1 429 | 1 430 | Administration 1 431 | Apache 1 432 | BEFORE 1 433 | BIS 1 434 | Bureau 1 435 | Commerce, 1 436 | Commodity 1 437 | Control 1 438 | Core 1 439 | Department 1 440 | ENC 1 441 | Exception 1 442 | Export 2 443 | For 1 444 | Foundation 1 445 | Government 1 446 | Hadoop 1 447 | Hadoop, 1 448 | Industry 1 449 | Jetty 1 450 | License 1 451 | Number 1 452 | Regulations, 1 453 | SSL 1 454 | Section 1 455 | Security 1 456 | See 1 457 | Software 2 458 | Technology 1 459 | The 4 460 | This 1 461 | U.S. 1 462 | Unrestricted 1 463 | about 1 464 | algorithms. 1 465 | and 6 466 | and/or 1 467 | another 1 468 | any 1 469 | as 1 470 | asymmetric 1 471 | at: 2 472 | both 1 473 | by 1 474 | check 1 475 | classified 1 476 | code 1 477 | code. 1 478 | concerning 1 479 | country 1 480 | country's 1 481 | country, 1 482 | cryptographic 3 483 | currently 1 484 | details 1 485 | distribution 2 486 | eligible 1 487 | encryption 3 488 | exception 1 489 | export 1 490 | following 1 491 | for 3 492 | form 1 493 | from 1 494 | functions 1 495 | has 1 496 | have 1 497 | http://hadoop.apache.org/core/ 1 498 | http://wiki.apache.org/hadoop/ 1 499 | if 1 500 | import, 2 501 | in 1 502 | included 1 503 | includes 2 504 | information 2 505 | information. 
1 506 | is 1 507 | it 1 508 | latest 1 509 | laws, 1 510 | libraries 1 511 | makes 1 512 | manner 1 513 | may 1 514 | more 2 515 | mortbay.org. 1 516 | object 1 517 | of 5 518 | on 2 519 | or 2 520 | our 2 521 | performing 1 522 | permitted. 1 523 | please 2 524 | policies 1 525 | possession, 2 526 | project 1 527 | provides 1 528 | re-export 2 529 | regulations 1 530 | reside 1 531 | restrictions 1 532 | security 1 533 | see 1 534 | software 2 535 | software, 2 536 | software. 2 537 | software: 1 538 | source 1 539 | the 8 540 | this 3 541 | to 2 542 | under 1 543 | use, 2 544 | uses 1 545 | using 2 546 | visit 1 547 | website 1 548 | which 2 549 | wiki, 1 550 | with 1 551 | written 1 552 | you 1 553 | your 1 554 | HDFS directories at: http://localhost:50070/explorer.html#/user/hadoop 555 | ``` 556 | 557 | NameNode: [http://localhost:50070/explorer.html#/user/hadoop](http://localhost:50070/explorer.html#/user/hadoop) 558 | 559 | MapReduce Example 560 | 561 | ### References 562 | 563 | 1. [https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/](https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/) 564 | 2. [https://github.com/RENCI-NRIG/exogeni-recipes/hadoop/hadoop-2/hadoop\_exogeni\_postboot.sh](https://github.com/RENCI-NRIG/exogeni-recipes/blob/master/hadoop/hadoop-2/hadoop_exogeni_postboot.sh) 565 | 3. Hadoop configuration files 566 | - Common: [hadoop-common/core-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-common/core-default.xml) 567 | - HDFS: [hadoop-hdfs/hdfs-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml) 568 | - MapReduce: [hadoop-mapreduce-client-core/mapred-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml) 569 | - Yarn: [hadoop-yarn-common/yarn-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-yarn/hadoop-yarn-common/yarn-default.xml) 570 | - Deprecated Properties: [hadoop-common/DeprecatedProperties.html](http://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-common/DeprecatedProperties.html) 571 | 4. Example MapReduce: [https://tecadmin.net/hadoop-running-a-wordcount-mapreduce-example/](https://tecadmin.net/hadoop-running-a-wordcount-mapreduce-example/) 572 | --------------------------------------------------------------------------------