├── site-files
│   ├── workers
│   ├── mapred-site.xml
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   └── yarn-site.xml
├── docker-compose.yml
├── mapreduce-example.sh
├── 2.9.0
│   ├── Dockerfile
│   └── docker-entrypoint.sh
├── 5-node-cluster.yml
└── README.md
/site-files/workers:
--------------------------------------------------------------------------------
1 | worker1
2 | worker2
3 | worker3
4 |
--------------------------------------------------------------------------------
/site-files/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <configuration>
5 |   <property>
6 |     <name>mapreduce.framework.name</name>
7 |     <value>yarn</value>
8 |   </property>
9 | </configuration>
10 |
--------------------------------------------------------------------------------
/site-files/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <configuration>
5 |   <property>
6 |     <name>fs.default.name</name>
7 |     <value>hdfs://namenode:9000</value>
8 |   </property>
9 | </configuration>
10 |
--------------------------------------------------------------------------------
/site-files/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <configuration>
5 |   <property>
6 |     <name>dfs.replication</name>
7 |     <value>2</value>
8 |   </property>
9 |   <property>
10 |     <name>dfs.name.dir</name>
11 |     <value>file:///hdfsdata/namenode</value>
12 |   </property>
13 |   <property>
14 |     <name>dfs.data.dir</name>
15 |     <value>file:///hdfsdata/datanode</value>
16 |   </property>
17 | </configuration>
18 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.1'
2 |
3 | services:
4 | hadoop:
5 | image: renci/hadoop:2.9.0
6 | build:
7 | context: ./2.9.0
8 | dockerfile: Dockerfile
9 | container_name: hadoop
10 | restart: always
11 | hostname: localhost
12 | ports:
13 | - '8042:8042'
14 | - '8088:8088'
15 | - '50070:50070'
16 | - '50075:50075'
17 | - '50090:50090'
18 | environment:
19 | IS_NODE_MANAGER: 'true'
20 | IS_NAME_NODE: 'true'
21 | IS_SECONDARY_NAME_NODE: 'true'
22 | IS_DATA_NODE: 'true'
23 | IS_RESOURCE_MANAGER: 'true'
24 | CLUSTER_NODES: hadoop
25 |
--------------------------------------------------------------------------------
/site-files/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>yarn.resourcemanager.hostname</name>
6 |     <value>resourcemanager</value>
7 |   </property>
8 |   <property>
9 |     <name>yarn.resourcemanager.bind-host</name>
10 |     <value>0.0.0.0</value>
11 |   </property>
12 |   <property>
13 |     <name>yarn.nodemanager.aux-services</name>
14 |     <value>mapreduce_shuffle</value>
15 |   </property>
16 |   <property>
17 |     <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
18 |     <value>org.apache.hadoop.mapred.ShuffleHandler</value>
19 |   </property>
20 |   <property>
21 |     <name>yarn.resourcemanager.address</name>
22 |     <value>resourcemanager:8032</value>
23 |   </property>
24 | </configuration>
25 |
--------------------------------------------------------------------------------
/mapreduce-example.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | echo 'INFO: remove input/output HDFS directories if they already exist'
4 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -rm -R input'
5 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -rm -R output'
6 |
7 | echo 'INFO: hdfs dfs -mkdir -p /user/hadoop/input'
8 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -mkdir -p /user/hadoop/input'
9 |
10 | echo 'INFO: hdfs dfs -put hadoop/README.txt /user/hadoop/input/'
11 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -put hadoop/README.txt /user/hadoop/input/'
12 |
13 | echo 'INFO: hadoop jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.0.jar wordcount input output'
14 | docker exec namenode runuser -l hadoop -c $'hadoop jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.0.jar wordcount input output'
15 |
16 | echo 'INFO: hdfs dfs -ls /user/hadoop/output'
17 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -ls /user/hadoop/output'
18 |
19 | echo 'INFO: cat hadoop/README.txt'
20 | docker exec namenode runuser -l hadoop -c $'cat hadoop/README.txt'
21 |
22 | echo 'INFO: hdfs dfs -cat /user/hadoop/output/part-r-00000'
23 | docker exec namenode runuser -l hadoop -c $'hdfs dfs -cat /user/hadoop/output/part-r-00000'
24 |
25 | echo 'HDFS directories at: http://localhost:50070/explorer.html#/user/hadoop'
26 |
27 | exit 0;
28 |
--------------------------------------------------------------------------------
/2.9.0/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM krallin/centos-tini:centos7
2 | MAINTAINER Michael J. Stealey
3 |
4 | ARG HADOOP_VERSION=2.9.0
5 |
6 | # Set correct environment variables.
7 | ENV HOME=/root \
8 | LANG=en_US.UTF-8 \
9 | LC_ALL=en_US.UTF-8
10 |
11 | # java 8: https://github.com/binarybabel/docker-jdk/blob/master/src/centos.Dockerfile
12 | ENV JAVA_DOWNLOAD=http://download.oracle.com/otn-pub/java/jdk/8u161-b12/2f38c3b165be4555a1fa6e98c45e0808/jdk-8u161-linux-x64.rpm
13 | RUN cd /tmp \
14 | && curl -o jdk.rpm -jfksSLH "Cookie: oraclelicense=accept-securebackup-cookie" \
15 | "${JAVA_DOWNLOAD:-$(curl -s https://lv.binarybabel.org/catalog-api/java/jdk8.txt?p=downloads.rpm)}" \
16 | && rpm -Uvh jdk.rpm && rm jdk.rpm \
17 | && echo "export JAVA_HOME=/usr/java/default/" > /etc/profile.d/java_home.sh
18 |
19 | # apache hadoop
20 | ARG HADOOP_INSTALL_DIR=/home/hadoop
21 | RUN yum install -y \
22 | openssh-server \
23 | openssh-clients \
24 | which
25 | RUN adduser -m -d $HADOOP_INSTALL_DIR hadoop
26 | WORKDIR $HADOOP_INSTALL_DIR
27 | USER hadoop
28 | RUN curl -o hadoop-$HADOOP_VERSION.tar.gz "https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" \
29 | && tar xzf hadoop-$HADOOP_VERSION.tar.gz \
30 | && mv hadoop-$HADOOP_VERSION hadoop \
31 |     && rm -f hadoop-$HADOOP_VERSION.tar.gz \
32 | && yum clean all
33 |
34 | WORKDIR /root/
35 | USER root
36 | RUN ssh-keygen -q -N '' -t rsa -f /root/.ssh/id_rsa \
37 | && ssh-keygen -q -N '' -t dsa -f /etc/ssh/ssh_host_dsa_key \
38 | && ssh-keygen -q -N '' -t rsa -f /etc/ssh/ssh_host_rsa_key \
39 | && ssh-keygen -q -N '' -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key \
40 | && ssh-keygen -q -N '' -t ed25519 -f /etc/ssh/ssh_host_ed25519_key \
41 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
42 | && chmod 0600 /root/.ssh/authorized_keys
43 |
44 | ENV HADOOP_USER_HOME=${HADOOP_INSTALL_DIR} \
45 | HADOOP_PREFIX=${HADOOP_INSTALL_DIR}/hadoop \
46 | HADOOP_INSTALL=${HADOOP_PREFIX} \
47 | HADOOP_MAPRED_HOME=${HADOOP_PREFIX} \
48 | HADOOP_COMMON_HOME=${HADOOP_PREFIX} \
49 | HADOOP_HDFS_HOME=${HADOOP_PREFIX} \
50 | YARN_HOME=${HADOOP_PREFIX} \
51 | HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_PREFIX}/lib/native \
52 |     HADOOP_CONF_DIR=${HADOOP_PREFIX}/etc/hadoop PATH=$PATH:${HADOOP_PREFIX}/sbin:${HADOOP_PREFIX}/bin
53 |
54 | ENV IS_NODE_MANAGER=true \
55 | IS_NAME_NODE=true \
56 | IS_SECONDARY_NAME_NODE=true \
57 | IS_DATA_NODE=true \
58 | IS_RESOURCE_MANAGER=true \
59 | CLUSTER_NODES=localhost
60 |
61 | VOLUME ["/site-files", "/home/hadoop/public"]
62 |
63 | COPY docker-entrypoint.sh /docker-entrypoint.sh
64 |
65 | EXPOSE 22
66 |
67 | ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"]
68 |
--------------------------------------------------------------------------------
/5-node-cluster.yml:
--------------------------------------------------------------------------------
1 | version: '3.1'
2 |
3 | services:
4 | namenode:
5 | image: renci/hadoop:2.9.0
6 | container_name: namenode
7 | volumes:
8 | - hadoop-public:/home/hadoop/public
9 | - ./site-files:/site-files
10 | restart: always
11 | hostname: namenode
12 | networks:
13 | - hadoop
14 | ports:
15 | - '50070:50070'
16 | environment:
17 | IS_NODE_MANAGER: 'false'
18 | IS_NAME_NODE: 'true'
19 | IS_SECONDARY_NAME_NODE: 'false'
20 | IS_DATA_NODE: 'false'
21 | IS_RESOURCE_MANAGER: 'false'
22 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
23 |
24 | resourcemanager:
25 | image: renci/hadoop:2.9.0
26 | depends_on:
27 | - namenode
28 | container_name: resourcemanager
29 | volumes:
30 | - hadoop-public:/home/hadoop/public
31 | - ./site-files:/site-files
32 | restart: always
33 | hostname: resourcemanager
34 | networks:
35 | - hadoop
36 | ports:
37 | - '8088:8088'
38 | environment:
39 | IS_NODE_MANAGER: 'false'
40 | IS_NAME_NODE: 'false'
41 | IS_SECONDARY_NAME_NODE: 'false'
42 | IS_DATA_NODE: 'false'
43 | IS_RESOURCE_MANAGER: 'true'
44 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
45 |
46 | worker1:
47 | image: renci/hadoop:2.9.0
48 | depends_on:
49 | - namenode
50 | container_name: worker1
51 | volumes:
52 | - hadoop-public:/home/hadoop/public
53 | - ./site-files:/site-files
54 | restart: always
55 | hostname: worker1
56 | networks:
57 | - hadoop
58 | ports:
59 | - '8042:8042'
60 | - '50075:50075'
61 | environment:
62 | IS_NODE_MANAGER: 'true'
63 | IS_NAME_NODE: 'false'
64 | IS_SECONDARY_NAME_NODE: 'false'
65 | IS_DATA_NODE: 'true'
66 | IS_RESOURCE_MANAGER: 'false'
67 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
68 |
69 | worker2:
70 | image: renci/hadoop:2.9.0
71 | depends_on:
72 | - namenode
73 | container_name: worker2
74 | volumes:
75 | - hadoop-public:/home/hadoop/public
76 | - ./site-files:/site-files
77 | restart: always
78 | hostname: worker2
79 | networks:
80 | - hadoop
81 | ports:
82 | - '8043:8042'
83 | - '50076:50075'
84 | environment:
85 | IS_NODE_MANAGER: 'true'
86 | IS_NAME_NODE: 'false'
87 | IS_SECONDARY_NAME_NODE: 'false'
88 | IS_DATA_NODE: 'true'
89 | IS_RESOURCE_MANAGER: 'false'
90 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
91 |
92 | worker3:
93 | image: renci/hadoop:2.9.0
94 | depends_on:
95 | - namenode
96 | container_name: worker3
97 | volumes:
98 | - hadoop-public:/home/hadoop/public
99 | - ./site-files:/site-files
100 | restart: always
101 | hostname: worker3
102 | networks:
103 | - hadoop
104 | ports:
105 | - '8044:8042'
106 | - '50077:50075'
107 | environment:
108 | IS_NODE_MANAGER: 'true'
109 | IS_NAME_NODE: 'false'
110 | IS_SECONDARY_NAME_NODE: 'false'
111 | IS_DATA_NODE: 'true'
112 | IS_RESOURCE_MANAGER: 'false'
113 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
114 |
115 | volumes:
116 | hadoop-public:
117 |
118 | networks:
119 | hadoop:
120 |
--------------------------------------------------------------------------------
/2.9.0/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | HADOOP_CONF_DIR=${HADOOP_PREFIX}/etc/hadoop
5 | CORE_SITE_FILE=${HADOOP_CONF_DIR}/core-site.xml
6 | HDFS_SITE_FILE=${HADOOP_CONF_DIR}/hdfs-site.xml
7 | MAPRED_SITE_FILE=${HADOOP_CONF_DIR}/mapred-site.xml
8 | YARN_SITE_FILE=${HADOOP_CONF_DIR}/yarn-site.xml
9 | WORKERS_FILE=${HADOOP_CONF_DIR}/slaves
10 |
11 | _core_site_xml () {
12 | if [ -f /site-files/core-site.xml ]; then
13 | echo "USE: /site-files/core-site.xml"
14 | cat /site-files/core-site.xml > $CORE_SITE_FILE
15 | else
16 | cat > $CORE_SITE_FILE << EOF
17 | <?xml version="1.0"?>
18 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
19 |
20 | <configuration>
21 |   <property>
22 |     <name>fs.default.name</name>
23 |     <value>hdfs://localhost:9000</value>
24 |   </property>
25 | </configuration>
26 | EOF
27 | fi
28 | chown hadoop:hadoop $CORE_SITE_FILE
29 | }
30 |
31 | _hdfs_site_xml () {
32 | if [ -f /site-files/hdfs-site.xml ]; then
33 | cat /site-files/hdfs-site.xml > $HDFS_SITE_FILE
34 | else
35 | cat > $HDFS_SITE_FILE << EOF
36 | <?xml version="1.0"?>
37 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
38 |
39 | <configuration>
40 |   <property>
41 |     <name>dfs.replication</name>
42 |     <value>1</value>
43 |   </property>
44 |   <property>
45 |     <name>dfs.name.dir</name>
46 |     <value>file:///home/hadoop/hadoopdata/hdfs/namenode</value>
47 |   </property>
48 |   <property>
49 |     <name>dfs.data.dir</name>
50 |     <value>file:///home/hadoop/hadoopdata/hdfs/datanode</value>
51 |   </property>
52 | </configuration>
53 | EOF
54 | fi
55 | chown hadoop:hadoop $HDFS_SITE_FILE
56 | }
57 |
58 | _mapred_site_xml() {
59 | if [ -f /site-files/mapred-site.xml ]; then
60 | cat /site-files/mapred-site.xml > $MAPRED_SITE_FILE
61 | else
62 | cat > $MAPRED_SITE_FILE << EOF
63 | <?xml version="1.0"?>
64 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
65 |
66 | <configuration>
67 |   <property>
68 |     <name>mapreduce.framework.name</name>
69 |     <value>yarn</value>
70 |   </property>
71 | </configuration>
72 | EOF
73 | fi
74 | chown hadoop:hadoop $MAPRED_SITE_FILE
75 | }
76 |
77 | _yarn_site_xml() {
78 | if [ -f /site-files/yarn-site.xml ]; then
79 | cat /site-files/yarn-site.xml > $YARN_SITE_FILE
80 | else
81 | cat > $YARN_SITE_FILE << EOF
82 | <?xml version="1.0"?>
83 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
84 | <configuration>
85 |   <property>
86 |     <name>yarn.nodemanager.aux-services</name>
87 |     <value>mapreduce_shuffle</value>
88 |   </property>
89 | </configuration>
90 | EOF
91 | fi
92 | chown hadoop:hadoop $YARN_SITE_FILE
93 | }
94 |
95 | _workers() {
96 | if [ -f /site-files/workers ]; then
97 | cat /site-files/workers > $WORKERS_FILE
98 | else
99 | cat > $WORKERS_FILE << EOF
100 | localhost
101 | EOF
102 | fi
103 | chown hadoop:hadoop $WORKERS_FILE
104 | }
105 |
106 | _hadoop_profile() {
107 | cat > /etc/profile.d/hadoop.sh << EOF
108 | export HADOOP_USER_HOME=${HADOOP_USER_HOME}
109 | export HADOOP_HOME=${HADOOP_USER_HOME}/hadoop
110 | export HADOOP_PREFIX=${HADOOP_USER_HOME}/hadoop
111 | export HADOOP_INSTALL=${HADOOP_PREFIX}
112 | export HADOOP_MAPRED_HOME=${HADOOP_PREFIX}
113 | export HADOOP_COMMON_HOME=${HADOOP_PREFIX}
114 | export HADOOP_HDFS_HOME=${HADOOP_PREFIX}
115 | export JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_PREFIX}/lib/native
116 | export YARN_HOME=${HADOOP_PREFIX}
117 | export HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_PREFIX}/lib/native
118 | export HADOOP_CONF_DIR=${HADOOP_PREFIX}/etc/hadoop
119 | export CORE_SITE_FILE=${HADOOP_CONF_DIR}/core-site.xml
120 | export HDFS_SITE_FILE=${HADOOP_CONF_DIR}/hdfs-site.xml
121 | export MAPRED_SITE_FILE=${HADOOP_CONF_DIR}/mapred-site.xml
122 | export YARN_SITE_FILE=${HADOOP_CONF_DIR}/yarn-site.xml
123 | export WORKERS_FILE=${HADOOP_CONF_DIR}/slaves
124 | export PATH=$PATH:${HADOOP_PREFIX}/sbin:${HADOOP_PREFIX}/bin
125 | EOF
126 | }
127 |
128 | _generate_ssh_keys() {
129 | mkdir -p $HADOOP_USER_HOME/.ssh
130 | ssh-keygen -t rsa -N '' -f $HADOOP_USER_HOME/.ssh/id_rsa
131 | cat $HADOOP_USER_HOME/.ssh/id_rsa.pub >> $HADOOP_USER_HOME/.ssh/authorized_keys
132 | chmod 0600 $HADOOP_USER_HOME/.ssh/authorized_keys
133 | chown -R hadoop:hadoop $HADOOP_USER_HOME/.ssh
134 | }
135 |
136 | _hadoop_profile
137 | # runuser -l hadoop -c $'env' # debug hadoop env
138 |
139 | if $IS_NAME_NODE; then
140 | mkdir -p /hdfsdata/namenode
141 | chown -R hadoop:hadoop /hdfsdata/namenode
142 | fi
143 | if $IS_DATA_NODE; then
144 | mkdir -p /hdfsdata/datanode
145 | chown -R hadoop:hadoop /hdfsdata/datanode
146 | fi
147 |
148 | chown -R hadoop:hadoop /home/hadoop/public
149 |
150 | /usr/sbin/sshd -D &
151 |
152 | runuser -l hadoop -c $'sed -i \'s:export JAVA_HOME=.*:export JAVA_HOME=/usr/java/jdk1.8.0_161/jre:\' /home/hadoop/hadoop/etc/hadoop/hadoop-env.sh'
153 |
154 | _core_site_xml
155 | _hdfs_site_xml
156 | _mapred_site_xml
157 | _yarn_site_xml
158 | _workers
159 |
160 | IS_FIRST_RUN=$(if [ ! -f "/home/hadoop/.ssh/id_rsa.pub" ]; then echo 'true'; else echo 'false'; fi)
161 |
162 | if $IS_NAME_NODE; then
163 | if $IS_FIRST_RUN; then
164 | echo "NameNode copy ssh"
165 | _generate_ssh_keys
166 | cp -r /home/hadoop/.ssh /home/hadoop/public/
167 | fi
168 | else
169 | while [ ! -d /home/hadoop/public/.ssh ]; do
170 | echo "waiting for /home/hadoop/public/.ssh"
171 | sleep 2
172 | done
173 | if $IS_FIRST_RUN; then
174 | echo "COPY: .ssh from namenode to $(hostname)"
175 | cp -rf /home/hadoop/public/.ssh /home/hadoop/
176 | cat /home/hadoop/.ssh/id_rsa.pub >> /home/hadoop/.ssh/authorized_keys
177 | chown -R hadoop:hadoop /home/hadoop/.ssh
178 | fi
179 | fi
180 |
181 | if $IS_FIRST_RUN; then
182 | while read node; do
183 | echo "node = $node"
184 |     until runuser -l hadoop -c "ssh-keyscan $node >> /home/hadoop/.ssh/known_hosts"; do sleep 2; done
185 |   done < <(echo $CLUSTER_NODES | tr ' ' '\n')
186 | fi
187 |
188 | if $IS_NAME_NODE; then
189 |   echo "Starting NameNode"
190 | if $IS_FIRST_RUN; then
191 | runuser -l hadoop -c $'$HADOOP_PREFIX/bin/hdfs namenode -format'
192 | fi
193 | runuser -l hadoop -c $'$HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode'
194 | fi
195 |
196 | if $IS_SECONDARY_NAME_NODE; then
197 |   echo "Starting SecondaryNameNode"
198 | runuser -l hadoop -c $'$HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start secondarynamenode'
199 | fi
200 |
201 | if $IS_DATA_NODE; then
202 |   echo "Starting DataNode"
203 | runuser -l hadoop -c $'$HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode'
204 | fi
205 |
206 | if $IS_RESOURCE_MANAGER; then
207 |   echo "Starting ResourceManager"
208 | runuser -l hadoop -c $'$YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager'
209 | fi
210 |
211 | if $IS_NODE_MANAGER; then
212 |   echo "Starting NodeManager"
213 | runuser -l hadoop -c $'$YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager'
214 | fi
215 |
216 | tail -f /dev/null
217 |
218 | exec "$@"
219 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Apache Hadoop in Docker
2 |
3 | This work has been inspired by:
4 |
5 | - tecadmin.net: [Setup Hadoop cluster on CentOS](https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/)
6 | - Oracle Java 8: [binarybabel/docker-jdk](https://github.com/binarybabel/docker-jdk/blob/master/src/centos.Dockerfile)
7 | - CentOS 7 base image: [krallin/tini-images](https://github.com/krallin/tini-images)
8 | - ExoGENI Recipes: [RENCI-NRIG/exogeni-recipes/hadoop](https://github.com/RENCI-NRIG/exogeni-recipes/tree/master/hadoop/hadoop-2)
9 |
10 | ### What Is Apache Hadoop?
11 |
12 | The Apache Hadoop project develops open-source software for reliable, scalable, distributed computing.
13 |
14 | The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It is designed to scale up from single servers to thousands of machines, each offering local computation and storage. Rather than rely on hardware to deliver high-availability, the library itself is designed to detect and handle failures at the application layer, so delivering a highly-available service on top of a cluster of computers, each of which may be prone to failures.
15 |
16 | See [official documentation](http://hadoop.apache.org) for more information.
17 |
18 | ## How to use this image
19 |
20 | ### Build locally
21 |
22 |
23 | ```
24 | $ docker build -t renci/hadoop:2.9.0 ./2.9.0/
25 | ...
26 | $ docker images
27 | REPOSITORY TAG IMAGE ID CREATED SIZE
28 | renci/hadoop 2.9.0 4a4de8ed48b2 3 minutes ago 1.92GB
29 | ...
30 | ```
31 |
32 | An example `docker-compose.yml` file is included that builds from the local repository and deploys a single-node cluster based on [[1](https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/)].
33 |
34 | ```
35 | $ docker-compose build
36 | ...
37 | $ docker-compose up -d
38 | ...
39 | $ docker-compose ps
40 | Name Command State Ports
41 | --------------------------------------------------------------------------------------------------------------------------------------------
42 | hadoop /usr/local/bin/tini -- /do ... Up 22/tcp, 0.0.0.0:50070->50070/tcp, 0.0.0.0:50075->50075/tcp, 0.0.0.0:50090->50090/tcp,
43 | 0.0.0.0:8042->8042/tcp, 0.0.0.0:8088->8088/tcp
44 | ```
45 |
46 | - Port mappings from above (each mapped UI can be spot-checked from the host, as shown below):
47 |
48 | ```
49 | ports:
50 | - '8042:8042' # NodeManager web ui
51 | - '8088:8088' # ResourceManager web ui
52 | - '50070:50070' # NameNode web ui
53 | - '50075:50075' # DataNode web ui
54 | - '50090:50090' # Secondary NameNode web ui
55 | ```
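
Once the container reports `Up`, the mapped web UIs can be probed with `curl` (a quick sketch; the Hadoop daemons may take a few seconds to come up after the container starts):

```
$ for p in 8042 8088 50070 50075 50090; do curl -s -o /dev/null -w "port $p: HTTP %{http_code}\n" http://localhost:$p/; done
```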
56 |
57 | ### From Docker Hub
58 |
59 | Automated builds are generated at [https://hub.docker.com/u/renci](https://hub.docker.com/u/renci/dashboard/) and the image can be pulled as follows.
60 |
61 | ```
62 | $ docker pull renci/hadoop:2.9.0
63 | ```
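
For a quick single-node trial without Compose, the pulled image can also be started directly with `docker run` (a sketch only; the image's environment defaults already enable every role with `CLUSTER_NODES=localhost`, so only the web UI ports need to be published):

```
$ docker run -d --name hadoop \
    -p 8042:8042 -p 8088:8088 -p 50070:50070 -p 50075:50075 -p 50090:50090 \
    renci/hadoop:2.9.0
```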
64 |
65 | ## Example: Five node cluster
66 |
67 | Use the provided [`5-node-cluster.yml`](5-node-cluster.yml) file to stand up a five-node Hadoop cluster that includes a `namenode`, a `resourcemanager` and three workers (`worker1`, `worker2` and `worker3`).
68 |
69 | Hadoop docker network and port mappings (specific network values subject to change based on system):
70 |
71 |
72 |
73 | The nodes use the definitions found in the [site-files](site-files) directory to configure the cluster. These files can be modified as needed to configure your cluster at runtime.
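
For example, the HDFS replication factor could be raised from 2 to 3 before (re)creating the containers (a hypothetical edit; any of the site files can be adjusted the same way, and the entrypoint copies them back into each container on startup):

```
$ sed -i 's|<value>2</value>|<value>3</value>|' site-files/hdfs-site.xml
$ docker-compose -f 5-node-cluster.yml up -d --force-recreate
```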
74 |
75 | A docker volume named `hadoop-public` is also created to allow the nodes to exchange SSH key information between themselves on startup.
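
Once the containers are up, the key exchange can be verified by opening a password-less SSH session from the `namenode` to a worker (a sketch using the same `docker exec`/`runuser` pattern as `mapreduce-example.sh`):

```
$ docker exec namenode runuser -l hadoop -c 'ssh -o BatchMode=yes worker1 hostname'
```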
76 |
77 | ```yaml
78 | version: '3.1'
79 |
80 | services:
81 | namenode:
82 | image: renci/hadoop:2.9.0
83 | container_name: namenode
84 | volumes:
85 | - hadoop-public:/home/hadoop/public
86 | - ./site-files:/site-files
87 | restart: always
88 | hostname: namenode
89 | networks:
90 | - hadoop
91 | ports:
92 | - '50070:50070'
93 | environment:
94 | IS_NODE_MANAGER: 'false'
95 | IS_NAME_NODE: 'true'
96 | IS_SECONDARY_NAME_NODE: 'false'
97 | IS_DATA_NODE: 'false'
98 | IS_RESOURCE_MANAGER: 'false'
99 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
100 |
101 | resourcemanager:
102 | image: renci/hadoop:2.9.0
103 | depends_on:
104 | - namenode
105 | container_name: resourcemanager
106 | volumes:
107 | - hadoop-public:/home/hadoop/public
108 | - ./site-files:/site-files
109 | restart: always
110 | hostname: resourcemanager
111 | networks:
112 | - hadoop
113 | ports:
114 | - '8088:8088'
115 | environment:
116 | IS_NODE_MANAGER: 'false'
117 | IS_NAME_NODE: 'false'
118 | IS_SECONDARY_NAME_NODE: 'false'
119 | IS_DATA_NODE: 'false'
120 | IS_RESOURCE_MANAGER: 'true'
121 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
122 |
123 | worker1:
124 | image: renci/hadoop:2.9.0
125 | depends_on:
126 | - namenode
127 | container_name: worker1
128 | volumes:
129 | - hadoop-public:/home/hadoop/public
130 | - ./site-files:/site-files
131 | restart: always
132 | hostname: worker1
133 | networks:
134 | - hadoop
135 | ports:
136 | - '8042:8042'
137 | - '50075:50075'
138 | environment:
139 | IS_NODE_MANAGER: 'true'
140 | IS_NAME_NODE: 'false'
141 | IS_SECONDARY_NAME_NODE: 'false'
142 | IS_DATA_NODE: 'true'
143 | IS_RESOURCE_MANAGER: 'false'
144 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
145 |
146 | worker2:
147 | image: renci/hadoop:2.9.0
148 | depends_on:
149 | - namenode
150 | container_name: worker2
151 | volumes:
152 | - hadoop-public:/home/hadoop/public
153 | - ./site-files:/site-files
154 | restart: always
155 | hostname: worker2
156 | networks:
157 | - hadoop
158 | ports:
159 | - '8043:8042'
160 | - '50076:50075'
161 | environment:
162 | IS_NODE_MANAGER: 'true'
163 | IS_NAME_NODE: 'false'
164 | IS_SECONDARY_NAME_NODE: 'false'
165 | IS_DATA_NODE: 'true'
166 | IS_RESOURCE_MANAGER: 'false'
167 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
168 |
169 | worker3:
170 | image: renci/hadoop:2.9.0
171 | depends_on:
172 | - namenode
173 | container_name: worker3
174 | volumes:
175 | - hadoop-public:/home/hadoop/public
176 | - ./site-files:/site-files
177 | restart: always
178 | hostname: worker3
179 | networks:
180 | - hadoop
181 | ports:
182 | - '8044:8042'
183 | - '50077:50075'
184 | environment:
185 | IS_NODE_MANAGER: 'true'
186 | IS_NAME_NODE: 'false'
187 | IS_SECONDARY_NAME_NODE: 'false'
188 | IS_DATA_NODE: 'true'
189 | IS_RESOURCE_MANAGER: 'false'
190 | CLUSTER_NODES: namenode resourcemanager worker1 worker2 worker3
191 |
192 | volumes:
193 | hadoop-public:
194 |
195 | networks:
196 | hadoop:
197 | ```
198 |
199 | ### Start the cluster
200 |
201 | Using `docker-compose`
202 |
203 | ```
204 | $ docker-compose -f 5-node-cluster.yml up -d
205 | ```
206 |
207 | After a few moments all containers will be running and should show as `Up` in a `ps` call.
208 |
209 | ```
210 | $ docker-compose -f 5-node-cluster.yml ps
211 | Name Command State Ports
212 | -------------------------------------------------------------------------------------------------------------------
213 | namenode /usr/local/bin/tini -- /do ... Up 22/tcp, 0.0.0.0:50070->50070/tcp
214 | resourcemanager /usr/local/bin/tini -- /do ... Up 22/tcp, 0.0.0.0:8088->8088/tcp
215 | worker1 /usr/local/bin/tini -- /do ... Up 22/tcp, 0.0.0.0:50075->50075/tcp, 0.0.0.0:8042->8042/tcp
216 | worker2 /usr/local/bin/tini -- /do ... Up 22/tcp, 0.0.0.0:50076->50075/tcp, 0.0.0.0:8043->8042/tcp
217 | worker3 /usr/local/bin/tini -- /do ... Up 22/tcp, 0.0.0.0:50077->50075/tcp, 0.0.0.0:8044->8042/tcp
218 | ```
219 |
220 | Since the containers' ports are mapped to the host, the various web UIs can be viewed in a local browser.
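
The same information is available from the command line; for example, the registered DataNodes and NodeManagers can be listed with the `docker exec`/`runuser` pattern used throughout this repository (assuming the cluster is up):

```
$ docker exec namenode runuser -l hadoop -c 'hdfs dfsadmin -report'
$ docker exec resourcemanager runuser -l hadoop -c 'yarn node -list'
```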
221 |
222 | **namenode container**: NameNode Web UI on port 50070
223 |
224 | NameNode: [http://localhost:50070/dfshealth.html#tab-datanode](http://localhost:50070/dfshealth.html#tab-datanode)
225 |
226 |
227 |
228 | **resourcemanager container**: ResourceManager Web UI on port 8088
229 |
230 | ResourceManager: [http://localhost:8088/cluster](http://localhost:8088/cluster)
231 |
232 |
233 |
234 |
235 | **worker1, worker2 and worker3 containers**: DataNode Web UI on ports 50075, 50076 and 50077; NodeManager Web UI on ports 8042, 8043 and 8044.
236 |
237 | DataNode (worker1): [http://localhost:50075/datanode.html](http://localhost:50075/datanode.html)
238 |
239 |
240 |
241 | NodeManager (worker1): [http://localhost:8042/node](http://localhost:8042/node)
242 |
243 |
244 |
245 | Worker2 DataNode: [http://localhost:50076/datanode.html](http://localhost:50076/datanode.html)
246 |
247 |
248 |
249 | Worker3 DataNode: [http://localhost:50077/datanode.html](http://localhost:50077/datanode.html)
250 |
251 |
252 |
253 | ### Stop the cluster
254 |
255 | The cluster can be stopped by issuing a `stop` call.
256 |
257 | ```
258 | $ docker-compose -f 5-node-cluster.yml stop
259 | Stopping worker2 ... done
260 | Stopping resourcemanager ... done
261 | Stopping worker1 ... done
262 | Stopping worker3 ... done
263 | Stopping namenode ... done
264 | ```
265 |
266 | ### Restart the cluster
267 |
268 | So long as the container definitions have not been removed, the cluster can be restarted by using a `start` call.
269 |
270 | ```
271 | $ docker-compose -f 5-node-cluster.yml start
272 | Starting namenode ... done
273 | Starting worker1 ... done
274 | Starting worker3 ... done
275 | Starting worker2 ... done
276 | Starting resourcemanager ... done
277 | ```
278 |
279 | After a few moments all cluster activity should be back to normal.
280 |
281 | ### Remove the cluster
282 |
283 | The entire cluster can be removed by first stopping it, and then removing the containers from the local machine.
284 |
285 | ```
286 | $ docker-compose -f 5-node-cluster.yml stop && docker-compose -f 5-node-cluster.yml rm -f
287 | Stopping worker2 ... done
288 | Stopping resourcemanager ... done
289 | Stopping worker1 ... done
290 | Stopping worker3 ... done
291 | Stopping namenode ... done
292 | Going to remove worker2, resourcemanager, worker1, worker3, namenode
293 | Removing worker2 ... done
294 | Removing resourcemanager ... done
295 | Removing worker1 ... done
296 | Removing worker3 ... done
297 | Removing namenode ... done
298 | ```
299 |
300 | ## Example: MapReduce
301 |
302 | **NOTE**: Assumes the existence of the five-node cluster from the previous example.
303 |
304 | A simple MapReduce example is provided in the [mapreduce-example.sh](mapreduce-example.sh) script.
305 |
306 | The script is meant to be run from the host machine and uses `docker exec` to relay commands to the `namenode` container as the `hadoop` user.
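
Individual commands can be relayed the same way; for example, to list the example's HDFS working directory by hand (assuming the cluster is up):

```
$ docker exec namenode runuser -l hadoop -c 'hdfs dfs -ls -R /user/hadoop'
```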
307 |
308 |
309 | ```
310 | $ ./mapreduce-example.sh
311 | INFO: remove input/output HDFS directories if they already exist
312 | rm: `input': No such file or directory
313 | rm: `output': No such file or directory
314 | INFO: hdfs dfs -mkdir -p /user/hadoop/input
315 | INFO: hdfs dfs -put hadoop/README.txt /user/hadoop/input/
316 | INFO: hadoop jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.0.jar wordcount input output
317 | 18/02/17 19:42:38 INFO client.RMProxy: Connecting to ResourceManager at resourcemanager/172.19.0.5:8032
318 | 18/02/17 19:42:39 INFO input.FileInputFormat: Total input files to process : 1
319 | 18/02/17 19:42:39 INFO mapreduce.JobSubmitter: number of splits:1
320 | 18/02/17 19:42:39 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
321 | 18/02/17 19:42:39 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1518896527275_0001
322 | 18/02/17 19:42:40 INFO impl.YarnClientImpl: Submitted application application_1518896527275_0001
323 | 18/02/17 19:42:40 INFO mapreduce.Job: The url to track the job: http://resourcemanager:8088/proxy/application_1518896527275_0001/
324 | 18/02/17 19:42:40 INFO mapreduce.Job: Running job: job_1518896527275_0001
325 | 18/02/17 19:42:51 INFO mapreduce.Job: Job job_1518896527275_0001 running in uber mode : false
326 | 18/02/17 19:42:51 INFO mapreduce.Job: map 0% reduce 0%
327 | 18/02/17 19:42:58 INFO mapreduce.Job: map 100% reduce 0%
328 | 18/02/17 19:43:05 INFO mapreduce.Job: map 100% reduce 100%
329 | 18/02/17 19:43:05 INFO mapreduce.Job: Job job_1518896527275_0001 completed successfully
330 | 18/02/17 19:43:05 INFO mapreduce.Job: Counters: 49
331 | File System Counters
332 | FILE: Number of bytes read=1836
333 | FILE: Number of bytes written=407057
334 | FILE: Number of read operations=0
335 | FILE: Number of large read operations=0
336 | FILE: Number of write operations=0
337 | HDFS: Number of bytes read=1480
338 | HDFS: Number of bytes written=1306
339 | HDFS: Number of read operations=6
340 | HDFS: Number of large read operations=0
341 | HDFS: Number of write operations=2
342 | Job Counters
343 | Launched map tasks=1
344 | Launched reduce tasks=1
345 | Rack-local map tasks=1
346 | Total time spent by all maps in occupied slots (ms)=3851
347 | Total time spent by all reduces in occupied slots (ms)=3718
348 | Total time spent by all map tasks (ms)=3851
349 | Total time spent by all reduce tasks (ms)=3718
350 | Total vcore-milliseconds taken by all map tasks=3851
351 | Total vcore-milliseconds taken by all reduce tasks=3718
352 | Total megabyte-milliseconds taken by all map tasks=3943424
353 | Total megabyte-milliseconds taken by all reduce tasks=3807232
354 | Map-Reduce Framework
355 | Map input records=31
356 | Map output records=179
357 | Map output bytes=2055
358 | Map output materialized bytes=1836
359 | Input split bytes=114
360 | Combine input records=179
361 | Combine output records=131
362 | Reduce input groups=131
363 | Reduce shuffle bytes=1836
364 | Reduce input records=131
365 | Reduce output records=131
366 | Spilled Records=262
367 | Shuffled Maps =1
368 | Failed Shuffles=0
369 | Merged Map outputs=1
370 | GC time elapsed (ms)=114
371 | CPU time spent (ms)=1330
372 | Physical memory (bytes) snapshot=482201600
373 | Virtual memory (bytes) snapshot=3950104576
374 | Total committed heap usage (bytes)=281018368
375 | Shuffle Errors
376 | BAD_ID=0
377 | CONNECTION=0
378 | IO_ERROR=0
379 | WRONG_LENGTH=0
380 | WRONG_MAP=0
381 | WRONG_REDUCE=0
382 | File Input Format Counters
383 | Bytes Read=1366
384 | File Output Format Counters
385 | Bytes Written=1306
386 | INFO: hdfs dfs -ls /user/hadoop/output
387 | Found 2 items
388 | -rw-r--r-- 2 hadoop supergroup 0 2018-02-17 19:43 /user/hadoop/output/_SUCCESS
389 | -rw-r--r-- 2 hadoop supergroup 1306 2018-02-17 19:43 /user/hadoop/output/part-r-00000
390 | INFO: cat hadoop/README.txt
391 | For the latest information about Hadoop, please visit our website at:
392 |
393 | http://hadoop.apache.org/core/
394 |
395 | and our wiki, at:
396 |
397 | http://wiki.apache.org/hadoop/
398 |
399 | This distribution includes cryptographic software. The country in
400 | which you currently reside may have restrictions on the import,
401 | possession, use, and/or re-export to another country, of
402 | encryption software. BEFORE using any encryption software, please
403 | check your country's laws, regulations and policies concerning the
404 | import, possession, or use, and re-export of encryption software, to
405 | see if this is permitted. See for more
406 | information.
407 |
408 | The U.S. Government Department of Commerce, Bureau of Industry and
409 | Security (BIS), has classified this software as Export Commodity
410 | Control Number (ECCN) 5D002.C.1, which includes information security
411 | software using or performing cryptographic functions with asymmetric
412 | algorithms. The form and manner of this Apache Software Foundation
413 | distribution makes it eligible for export under the License Exception
414 | ENC Technology Software Unrestricted (TSU) exception (see the BIS
415 | Export Administration Regulations, Section 740.13) for both object
416 | code and source code.
417 |
418 | The following provides more details on the included cryptographic
419 | software:
420 | Hadoop Core uses the SSL libraries from the Jetty project written
421 | by mortbay.org.
422 | INFO: hdfs dfs -cat /user/hadoop/output/part-r-00000
423 | (BIS), 1
424 | (ECCN) 1
425 | (TSU) 1
426 | (see 1
427 | 5D002.C.1, 1
428 | 740.13) 1
429 | 1
430 | Administration 1
431 | Apache 1
432 | BEFORE 1
433 | BIS 1
434 | Bureau 1
435 | Commerce, 1
436 | Commodity 1
437 | Control 1
438 | Core 1
439 | Department 1
440 | ENC 1
441 | Exception 1
442 | Export 2
443 | For 1
444 | Foundation 1
445 | Government 1
446 | Hadoop 1
447 | Hadoop, 1
448 | Industry 1
449 | Jetty 1
450 | License 1
451 | Number 1
452 | Regulations, 1
453 | SSL 1
454 | Section 1
455 | Security 1
456 | See 1
457 | Software 2
458 | Technology 1
459 | The 4
460 | This 1
461 | U.S. 1
462 | Unrestricted 1
463 | about 1
464 | algorithms. 1
465 | and 6
466 | and/or 1
467 | another 1
468 | any 1
469 | as 1
470 | asymmetric 1
471 | at: 2
472 | both 1
473 | by 1
474 | check 1
475 | classified 1
476 | code 1
477 | code. 1
478 | concerning 1
479 | country 1
480 | country's 1
481 | country, 1
482 | cryptographic 3
483 | currently 1
484 | details 1
485 | distribution 2
486 | eligible 1
487 | encryption 3
488 | exception 1
489 | export 1
490 | following 1
491 | for 3
492 | form 1
493 | from 1
494 | functions 1
495 | has 1
496 | have 1
497 | http://hadoop.apache.org/core/ 1
498 | http://wiki.apache.org/hadoop/ 1
499 | if 1
500 | import, 2
501 | in 1
502 | included 1
503 | includes 2
504 | information 2
505 | information. 1
506 | is 1
507 | it 1
508 | latest 1
509 | laws, 1
510 | libraries 1
511 | makes 1
512 | manner 1
513 | may 1
514 | more 2
515 | mortbay.org. 1
516 | object 1
517 | of 5
518 | on 2
519 | or 2
520 | our 2
521 | performing 1
522 | permitted. 1
523 | please 2
524 | policies 1
525 | possession, 2
526 | project 1
527 | provides 1
528 | re-export 2
529 | regulations 1
530 | reside 1
531 | restrictions 1
532 | security 1
533 | see 1
534 | software 2
535 | software, 2
536 | software. 2
537 | software: 1
538 | source 1
539 | the 8
540 | this 3
541 | to 2
542 | under 1
543 | use, 2
544 | uses 1
545 | using 2
546 | visit 1
547 | website 1
548 | which 2
549 | wiki, 1
550 | with 1
551 | written 1
552 | you 1
553 | your 1
554 | HDFS directories at: http://localhost:50070/explorer.html#/user/hadoop
555 | ```
556 |
557 | NameNode: [http://localhost:50070/explorer.html#/user/hadoop](http://localhost:50070/explorer.html#/user/hadoop)
558 |
559 |
560 |
561 | ### References
562 |
563 | 1. [https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/](https://tecadmin.net/setup-hadoop-single-node-cluster-on-centos-redhat/)
564 | 2. [https://github.com/RENCI-NRIG/exogeni-recipes/hadoop/hadoop-2/hadoop\_exogeni\_postboot.sh](https://github.com/RENCI-NRIG/exogeni-recipes/blob/master/hadoop/hadoop-2/hadoop_exogeni_postboot.sh)
565 | 3. Hadoop configuration files
566 | - Common: [hadoop-common/core-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-common/core-default.xml)
567 | - HDFS: [hadoop-hdfs/hdfs-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml)
568 | - MapReduce: [hadoop-mapreduce-client-core/mapred-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml)
569 | - Yarn: [hadoop-yarn-common/yarn-default.xml](http://hadoop.apache.org/docs/r2.9.0/hadoop-yarn/hadoop-yarn-common/yarn-default.xml)
570 | - Deprecated Properties: [hadoop-common/DeprecatedProperties.html](http://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-common/DeprecatedProperties.html)
571 | 4. Example MapReduce: [https://tecadmin.net/hadoop-running-a-wordcount-mapreduce-example/](https://tecadmin.net/hadoop-running-a-wordcount-mapreduce-example/)
572 |
--------------------------------------------------------------------------------