├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── build-image.sh
├── config
│   ├── .DS_Store
│   ├── core-site.xml
│   ├── hadoop-env.sh
│   ├── hbase-env.sh
│   ├── hbase-site.xml
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   ├── run-wordcount.sh
│   ├── spark-defaults.conf
│   ├── ssh_config
│   ├── start-hadoop.sh
│   ├── start-kafka-zookeeper.sh
│   ├── workers
│   └── yarn-site.xml
├── docker-compose.yml
├── hadoop-cluster-docker.png
└── scripts
    ├── build-image.sh
    ├── resize-cluster.sh
    └── start-container.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:latest
2 |
3 | WORKDIR /root
4 |
5 | # install prerequisites
6 | RUN apt-get update && apt-get install -y openssh-server openjdk-8-jdk ssh wget curl vim python3 && \
7 | rm -rf /var/lib/apt/lists/*
8 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
9 | python3 get-pip.py && \
10 | rm get-pip.py && \
11 | python3 -m pip install --upgrade pip setuptools
12 |
13 | # Install Hadoop
14 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz && \
15 | tar -xzf hadoop-3.3.6.tar.gz && \
16 | mv hadoop-3.3.6 /usr/local/hadoop && \
17 | rm hadoop-3.3.6.tar.gz
18 |
19 | # Install Spark
20 | RUN wget https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz && \
21 | tar -xzf spark-3.5.0-bin-hadoop3.tgz && \
22 | mv spark-3.5.0-bin-hadoop3 /usr/local/spark && \
23 | rm spark-3.5.0-bin-hadoop3.tgz
24 |
25 |
26 | # Install pyspark
27 | RUN pip install pyspark
28 |
29 | # Install Kafka
30 | RUN wget https://archive.apache.org/dist/kafka/3.6.1/kafka_2.13-3.6.1.tgz && \
31 | tar -xzf kafka_2.13-3.6.1.tgz && \
32 | mv kafka_2.13-3.6.1 /usr/local/kafka && \
33 | rm kafka_2.13-3.6.1.tgz
34 |
35 | # Install HBase
36 | RUN wget https://archive.apache.org/dist/hbase/2.5.8/hbase-2.5.8-hadoop3-bin.tar.gz && \
37 | tar -xzf hbase-2.5.8-hadoop3-bin.tar.gz && \
38 | mv hbase-2.5.8-hadoop3 /usr/local/hbase && \
39 | rm hbase-2.5.8-hadoop3-bin.tar.gz
40 |
41 |
42 | # set environment variables
43 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
44 | ENV HADOOP_HOME=/usr/local/hadoop
45 | ENV YARN_HOME=/usr/local/hadoop
46 | ENV SPARK_HOME=/usr/local/spark
47 | ENV KAFKA_HOME=/usr/local/kafka
48 | ENV HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
49 | ENV YARN_CONF_DIR=/usr/local/hadoop/etc/hadoop
50 | ENV LD_LIBRARY_PATH=/usr/local/hadoop/lib/native:$LD_LIBRARY_PATH
51 | ENV HBASE_HOME=/usr/local/hbase
52 | ENV CLASSPATH=$CLASSPATH:/usr/local/hbase/lib/*
53 | ENV PATH=$PATH:/usr/local/hadoop/bin:/usr/local/hadoop/sbin:/usr/local/spark/bin:/usr/local/kafka/bin:/usr/local/hbase/bin
54 |
55 | # ssh without key
56 | RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \
57 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
58 | chmod 0600 ~/.ssh/authorized_keys
59 |
60 | RUN mkdir -p ~/hdfs/namenode && \
61 | mkdir -p ~/hdfs/datanode && \
62 | mkdir $HADOOP_HOME/logs
63 |
64 | COPY config/* /tmp/
65 |
66 | RUN mv /tmp/ssh_config ~/.ssh/config && \
67 | mv /tmp/hadoop-env.sh /usr/local/hadoop/etc/hadoop/hadoop-env.sh && \
68 | mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml && \
69 | mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml && \
70 | mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml && \
71 | mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml && \
72 | mv /tmp/workers $HADOOP_HOME/etc/hadoop/workers && \
73 | mv /tmp/start-kafka-zookeeper.sh ~/start-kafka-zookeeper.sh && \
74 | mv /tmp/start-hadoop.sh ~/start-hadoop.sh && \
75 | mv /tmp/run-wordcount.sh ~/run-wordcount.sh && \
76 | mv /tmp/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf && \
77 | mv /tmp/hbase-env.sh $HBASE_HOME/conf/hbase-env.sh && \
78 |     mv /tmp/hbase-site.xml $HBASE_HOME/conf/hbase-site.xml && \
79 | mv /tmp/purchases.txt /root/purchases.txt && \
80 | mv /tmp/purchases2.txt /root/purchases2.txt
81 |
82 | RUN chmod +x ~/start-hadoop.sh && \
83 | chmod +x ~/start-kafka-zookeeper.sh && \
84 | chmod +x ~/run-wordcount.sh && \
85 | chmod +x $HADOOP_HOME/sbin/start-dfs.sh && \
86 | chmod +x $HADOOP_HOME/sbin/start-yarn.sh
87 |
88 | # format namenode
89 | RUN /usr/local/hadoop/bin/hdfs namenode -format
90 |
91 | CMD [ "sh", "-c", "service ssh start; bash"]
92 |
93 |
--------------------------------------------------------------------------------
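
A quick way to build the image and confirm that the main tools ended up on the PATH (a minimal sketch; it assumes the `spark-hadoop:latest` tag used by `build-image.sh`):

```sh
docker build -t spark-hadoop:latest .

# run a throwaway container and print the installed versions
docker run --rm spark-hadoop:latest bash -c "hadoop version && spark-submit --version && hbase version"
```
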
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | 
3 | These containers were originally derived from https://github.com/kiwenlau/hadoop-cluster-docker
4 | 
5 | A cluster of three containers is created, with the following platforms installed:
6 | 
7 | * [Apache Hadoop](http://hadoop.apache.org/) Version: 3.3.6
8 | * [Apache Spark](https://spark.apache.org/) Version: 3.5.0
9 | * [Apache Kafka](https://kafka.apache.org/) Version: 3.6.1 (Scala 2.13)
10 | * [Apache HBase](https://hbase.apache.org/) Version: 2.5.8
11 | 
12 | 
13 | ## Launching the containers
14 | 
15 | - You can either use the scripts in the `scripts` directory:
16 | 
17 | ```sh
18 | scripts/
19 | |- build-image.sh     # build the Docker image
20 | |- resize-cluster.sh  # set the number of worker nodes
21 | |- start-container.sh # start the containers
22 | ```
23 | - Or you can use Docker Compose to start the services declared in the `docker-compose.yml` file:
24 | 
25 | ```sh
26 | docker compose up
27 | ```
28 | 
29 | For detailed tutorials on how to use these containers, visit:
30 | https://insatunisia.github.io/TP-BigData
31 | 
32 | Happy reading!
--------------------------------------------------------------------------------
/build-image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo ""
4 |
5 | echo -e "\nbuild docker hadoop & spark image\n"
6 | sudo docker build -t spark-hadoop:latest .
7 |
8 | echo ""
9 |
--------------------------------------------------------------------------------
/config/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liliasfaxi/hadoop-cluster-docker/41daeb01112718e320569338505d876e44b52416/config/.DS_Store
--------------------------------------------------------------------------------
/config/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 |     <property>
6 |         <name>fs.defaultFS</name>
7 |         <value>hdfs://hadoop-master:9000/</value>
8 |     </property>
9 | </configuration>
--------------------------------------------------------------------------------
/config/hadoop-env.sh:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Set Hadoop-specific environment variables here.
18 |
19 | # The only required environment variable is JAVA_HOME. All others are
20 | # optional. When running a distributed configuration it is best to
21 | # set JAVA_HOME in this file, so that it is correctly defined on
22 | # remote nodes.
23 |
24 | # The java implementation to use.
25 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
26 |
27 | # The jsvc implementation to use. Jsvc is required to run secure datanodes
28 | # that bind to privileged ports to provide authentication of data transfer
29 | # protocol. Jsvc is not required if SASL is configured for authentication of
30 | # data transfer protocol using non-privileged ports.
31 | #export JSVC_HOME=${JSVC_HOME}
32 |
33 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
34 |
35 | # Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
36 | for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
37 | if [ "$HADOOP_CLASSPATH" ]; then
38 | export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
39 | else
40 | export HADOOP_CLASSPATH=$f
41 | fi
42 | done
43 |
44 | # The maximum amount of heap to use, in MB. Default is 1000.
45 | #export HADOOP_HEAPSIZE=
46 | #export HADOOP_NAMENODE_INIT_HEAPSIZE=""
47 |
48 | # Extra Java runtime options. Empty by default.
49 | export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
50 |
51 | # Command specific options appended to HADOOP_OPTS when specified
52 | export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
53 | export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
54 |
55 | export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
56 |
57 | export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
58 | export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
59 |
60 | # The following applies to multiple commands (fs, dfs, fsck, distcp etc)
61 | export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
62 | #HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
63 |
64 | # On secure datanodes, user to run the datanode as after dropping privileges.
65 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports
66 | # to provide authentication of data transfer protocol. This **MUST NOT** be
67 | # defined if SASL is configured for authentication of data transfer protocol
68 | # using non-privileged ports.
69 | export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
70 |
71 | # Where log files are stored. $HADOOP_HOME/logs by default.
72 | #export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
73 |
74 | # Where log files are stored in the secure data environment.
75 | export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
76 |
77 | ###
78 | # HDFS Mover specific parameters
79 | ###
80 | # Specify the JVM options to be used when starting the HDFS Mover.
81 | # These options will be appended to the options specified as HADOOP_OPTS
82 | # and therefore may override any similar flags set in HADOOP_OPTS
83 | #
84 | # export HADOOP_MOVER_OPTS=""
85 |
86 | ###
87 | # Advanced Users Only!
88 | ###
89 |
90 | # The directory where pid files are stored. /tmp by default.
91 | # NOTE: this should be set to a directory that can only be written to by
92 | # the user that will run the hadoop daemons. Otherwise there is the
93 | # potential for a symlink attack.
94 | export HADOOP_PID_DIR=${HADOOP_PID_DIR}
95 | export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
96 |
97 | # A string representing this instance of hadoop. $USER by default.
98 | export HADOOP_IDENT_STRING=$USER
99 |
100 |
101 | export HDFS_NAMENODE_USER="root"
102 | export HDFS_DATANODE_USER="root"
103 | export HDFS_SECONDARYNAMENODE_USER="root"
104 | export YARN_RESOURCEMANAGER_USER="root"
105 | export YARN_NODEMANAGER_USER="root"
--------------------------------------------------------------------------------
/config/hbase-env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | #/**
4 | # * Licensed to the Apache Software Foundation (ASF) under one
5 | # * or more contributor license agreements. See the NOTICE file
6 | # * distributed with this work for additional information
7 | # * regarding copyright ownership. The ASF licenses this file
8 | # * to you under the Apache License, Version 2.0 (the
9 | # * "License"); you may not use this file except in compliance
10 | # * with the License. You may obtain a copy of the License at
11 | # *
12 | # * http://www.apache.org/licenses/LICENSE-2.0
13 | # *
14 | # * Unless required by applicable law or agreed to in writing, software
15 | # * distributed under the License is distributed on an "AS IS" BASIS,
16 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # * See the License for the specific language governing permissions and
18 | # * limitations under the License.
19 | # */
20 |
21 | # Set environment variables here.
22 |
23 | # This script sets variables multiple times over the course of starting an hbase process,
24 | # so try to keep things idempotent unless you want to take an even deeper look
25 | # into the startup scripts (bin/hbase, etc.)
26 |
27 | # The java implementation to use. Java 1.8+ required.
28 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
29 |
30 | # Extra Java CLASSPATH elements. Optional.
31 | # export HBASE_CLASSPATH=
32 |
33 | # The maximum amount of heap to use. Default is left to JVM default.
34 | # export HBASE_HEAPSIZE=1G
35 |
36 | # Uncomment below if you intend to use off heap cache. For example, to allocate 8G of
37 | # offheap, set the value to "8G".
38 | # export HBASE_OFFHEAPSIZE=1G
39 |
40 | # Extra Java runtime options.
41 | # Default settings are applied according to the detected JVM version. Override these default
42 | # settings by specifying a value here. For more details on possible settings,
43 | # see http://hbase.apache.org/book.html#_jvm_tuning
44 | export HBASE_OPTS="-XX:+UseConcMarkSweepGC"
45 |
46 | # Uncomment one of the below three options to enable java garbage collection logging for the server-side processes.
47 |
48 | # This enables basic gc logging to the .out file.
49 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps"
50 |
51 | # This enables basic gc logging to its own file.
52 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
53 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:"
54 |
55 | # This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+.
56 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
57 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc: -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M"
58 |
59 | # Uncomment one of the below three options to enable java garbage collection logging for the client processes.
60 |
61 | # This enables basic gc logging to the .out file.
62 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps"
63 |
64 | # This enables basic gc logging to its own file.
65 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
66 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:"
67 |
68 | # This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+.
69 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR .
70 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc: -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M"
71 |
72 | # See the package documentation for org.apache.hadoop.hbase.io.hfile for other configurations
73 | # needed setting up off-heap block caching.
74 |
75 | # Uncomment and adjust to enable JMX exporting
76 | # See jmxremote.password and jmxremote.access in $JRE_HOME/lib/management to configure remote password access.
77 | # More details at: http://java.sun.com/javase/6/docs/technotes/guides/management/agent.html
78 | # NOTE: HBase provides an alternative JMX implementation to fix the random ports issue, please see JMX
79 | # section in HBase Reference Guide for instructions.
80 |
81 | # export HBASE_JMX_BASE="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false"
82 | # export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10101"
83 | # export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10102"
84 | # export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10103"
85 | # export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10104"
86 | # export HBASE_REST_OPTS="$HBASE_REST_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10105"
87 |
88 | # File naming hosts on which HRegionServers will run. $HBASE_HOME/conf/regionservers by default.
89 | # export HBASE_REGIONSERVERS=${HBASE_HOME}/conf/regionservers
90 |
91 | # Uncomment and adjust to keep all the Region Server pages mapped to be memory resident
92 | #HBASE_REGIONSERVER_MLOCK=true
93 | #HBASE_REGIONSERVER_UID="hbase"
94 |
95 | # File naming hosts on which backup HMaster will run. $HBASE_HOME/conf/backup-masters by default.
96 | # export HBASE_BACKUP_MASTERS=${HBASE_HOME}/conf/backup-masters
97 |
98 | # Extra ssh options. Empty by default.
99 | # export HBASE_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HBASE_CONF_DIR"
100 |
101 | # Where log files are stored. $HBASE_HOME/logs by default.
102 | # export HBASE_LOG_DIR=${HBASE_HOME}/logs
103 |
104 | # Enable remote JDWP debugging of major HBase processes. Meant for Core Developers
105 | # export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8070"
106 | # export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8071"
107 | # export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8072"
108 | # export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8073"
109 | # export HBASE_REST_OPTS="$HBASE_REST_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8074"
110 |
111 | # A string representing this instance of hbase. $USER by default.
112 | # export HBASE_IDENT_STRING=$USER
113 |
114 | # The scheduling priority for daemon processes. See 'man nice'.
115 | # export HBASE_NICENESS=10
116 |
117 | # The directory where pid files are stored. /tmp by default.
118 | # export HBASE_PID_DIR=/var/hadoop/pids
119 |
120 | # Seconds to sleep between slave commands. Unset by default. This
121 | # can be useful in large clusters, where, e.g., slave rsyncs can
122 | # otherwise arrive faster than the master can service them.
123 | # export HBASE_SLAVE_SLEEP=0.1
124 |
125 | # Tell HBase whether it should manage its own instance of ZooKeeper or not.
126 | # export HBASE_MANAGES_ZK=true
127 |
128 | # The default log rolling policy is RFA, where the log file is rolled as per the size defined for the
129 | # RFA appender. Please refer to the log4j.properties file to see more details on this appender.
130 | # In case one needs to do log rolling on a date change, one should set the environment property
131 | # HBASE_ROOT_LOGGER to ",DRFA".
132 | # For example:
133 | # HBASE_ROOT_LOGGER=INFO,DRFA
134 | # The reason for changing default to RFA is to avoid the boundary case of filling out disk space as
135 | # DRFA doesn't put any cap on the log size. Please refer to HBase-5655 for more context.
136 |
137 | # Tell HBase whether it should include Hadoop's lib when starting up;
138 | # the default value is false, which means Hadoop's lib is included.
139 | # export HBASE_DISABLE_HADOOP_CLASSPATH_LOOKUP="true"
140 |
141 | # Override text processing tools for use by these launch scripts.
142 | # export GREP="${GREP-grep}"
143 | # export SED="${SED-sed}"
144 | export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m -XX:ReservedCodeCacheSize=256m"
145 | export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m -XX:ReservedCodeCacheSize=256m"
--------------------------------------------------------------------------------
/config/hbase-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 |     <property>
6 |         <name>hbase.zookeeper.property.dataDir</name>
7 |         <value>/usr/local/zookeeper</value>
8 |     </property>
9 |     <property>
10 |         <name>hbase.zookeeper.quorum</name>
11 |         <value>localhost</value>
12 |     </property>
13 |     <property>
14 |         <name>hbase.zookeeper.property.clientPort</name>
15 |         <value>2181</value>
16 |     </property>
17 |     <property>
18 |         <name>hbase.cluster.distributed</name>
19 |         <value>true</value>
20 |     </property>
21 |     <property>
22 |         <name>hbase.rootdir</name>
23 |         <value>hdfs://hadoop-master:9000/hbase</value>
24 |     </property>
25 |     <property>
26 |         <name>hbase.wal.provider</name>
27 |         <value>filesystem</value>
28 |     </property>
29 | </configuration>
--------------------------------------------------------------------------------
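
Since `hbase.rootdir` points at HDFS, HBase should only be started once HDFS is up. A minimal smoke test from inside the master container could then look like this (the table and column family names are illustrative):

```sh
start-hbase.sh   # $HBASE_HOME/bin is on the PATH set in the Dockerfile

# create a table, write one cell and read it back through the HBase shell
echo "create 'test', 'cf'
put 'test', 'row1', 'cf:a', 'value1'
scan 'test'" | hbase shell
```
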
/config/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 |     <property>
6 |         <name>dfs.namenode.name.dir</name>
7 |         <value>file:///root/hdfs/namenode</value>
8 |         <description>NameNode directory for namespace and transaction logs storage.</description>
9 |     </property>
10 |     <property>
11 |         <name>dfs.datanode.data.dir</name>
12 |         <value>file:///root/hdfs/datanode</value>
13 |         <description>DataNode directory</description>
14 |     </property>
15 |     <property>
16 |         <name>dfs.replication</name>
17 |         <value>2</value>
18 |     </property>
19 | </configuration>
--------------------------------------------------------------------------------
/config/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | 
4 | <configuration>
5 |     <property>
6 |         <name>mapreduce.framework.name</name>
7 |         <value>yarn</value>
8 |     </property>
9 |     <property>
10 |         <name>yarn.app.mapreduce.am.env</name>
11 |         <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
12 |     </property>
13 |     <property>
14 |         <name>mapreduce.map.env</name>
15 |         <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
16 |     </property>
17 |     <property>
18 |         <name>mapreduce.reduce.env</name>
19 |         <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
20 |     </property>
21 | </configuration>
--------------------------------------------------------------------------------
/config/run-wordcount.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # test the hadoop cluster by running wordcount
4 |
5 | # create input files
6 | mkdir input
7 | echo "Hello Docker" >input/file2.txt
8 | echo "Hello Hadoop" >input/file1.txt
9 |
10 | # create input directory on HDFS
11 | hadoop fs -mkdir -p input
12 |
13 | # put input files to HDFS
14 | hdfs dfs -put ./input/* input
15 |
16 | # run wordcount
17 | hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar wordcount input output
18 |
19 | # print the input files
20 | echo -e "\ninput file1.txt:"
21 | hdfs dfs -cat input/file1.txt
22 |
23 | echo -e "\ninput file2.txt:"
24 | hdfs dfs -cat input/file2.txt
25 |
26 | # print the output of wordcount
27 | echo -e "\nwordcount output:"
28 | hdfs dfs -cat output/part-r-00000
29 |
30 |
--------------------------------------------------------------------------------
/config/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.master yarn
2 | spark.driver.memory 4g
3 | spark.yarn.am.memory 1g
4 | spark.executor.memory 2g
5 | spark.executor.cores 1
6 |
--------------------------------------------------------------------------------
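
With `spark.master yarn`, anything launched with `spark-submit` from the master container runs on the YARN cluster (HDFS and YARN must already be started). For example, the SparkPi job bundled with Spark (the jar glob assumes the stock Spark 3.5.0 layout under `$SPARK_HOME/examples/jars`):

```sh
spark-submit --class org.apache.spark.examples.SparkPi \
  $SPARK_HOME/examples/jars/spark-examples_*.jar 100
```
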
/config/ssh_config:
--------------------------------------------------------------------------------
1 | Host localhost
2 | StrictHostKeyChecking no
3 |
4 | Host 0.0.0.0
5 | StrictHostKeyChecking no
6 |
7 | Host hadoop-*
8 | StrictHostKeyChecking no
9 | UserKnownHostsFile=/dev/null
--------------------------------------------------------------------------------
/config/start-hadoop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo -e "\n"
4 |
5 | $HADOOP_HOME/sbin/start-dfs.sh
6 |
7 | echo -e "\n"
8 |
9 | $HADOOP_HOME/sbin/start-yarn.sh
10 |
11 | echo -e "\n"
12 |
13 |
--------------------------------------------------------------------------------
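
This script is meant to be run inside the master container once the cluster is up, for example (container name as declared in `docker-compose.yml`):

```sh
docker exec -it hadoop-master bash
./start-hadoop.sh
jps   # on the master this should list at least NameNode and ResourceManager
```
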
/config/start-kafka-zookeeper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo -e "\n"
4 |
5 | $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties &
6 |
7 | echo -e "\n"
8 |
9 | $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties &
10 |
11 | echo -e "\n"
12 |
13 |
--------------------------------------------------------------------------------
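
Once ZooKeeper and the broker are running, a minimal Kafka round trip from inside the container might look like this (the topic name is illustrative; the Kafka scripts are on the PATH set in the Dockerfile):

```sh
kafka-topics.sh --create --topic test --bootstrap-server localhost:9092
echo "hello kafka" | kafka-console-producer.sh --topic test --bootstrap-server localhost:9092
kafka-console-consumer.sh --topic test --from-beginning --max-messages 1 --bootstrap-server localhost:9092
```
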
/config/workers:
--------------------------------------------------------------------------------
1 | hadoop-worker1
2 | hadoop-worker2
3 |
--------------------------------------------------------------------------------
/config/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>yarn.nodemanager.pmem-check-enabled</name>
5 |         <value>false</value>
6 |     </property>
7 |     <property>
8 |         <name>yarn.nodemanager.vmem-check-enabled</name>
9 |         <value>false</value>
10 |     </property>
11 |     <property>
12 |         <name>yarn.nodemanager.aux-services</name>
13 |         <value>mapreduce_shuffle</value>
14 |     </property>
15 |     <property>
16 |         <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
17 |         <value>org.apache.hadoop.mapred.ShuffleHandler</value>
18 |     </property>
19 |     <property>
20 |         <name>yarn.resourcemanager.hostname</name>
21 |         <value>hadoop-master</value>
22 |     </property>
23 |     <property>
24 |         <name>yarn.application.classpath</name>
25 |         <value>/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*:/usr/local/hadoop/contrib/capacity-scheduler/*.jar</value>
26 |     </property>
27 | </configuration>
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | networks:
3 | default:
4 | name: hadoop
5 | driver: bridge
6 | services:
7 | hadoop-master:
8 | image: liliasfaxi/hadoop-cluster:latest
9 | container_name: hadoop-master
10 | hostname: hadoop-master
11 | ports:
12 |
13 | - "9870:9870" # NameNode WebUI
14 | - "8088:8088" # Viewing jobs infos and so on
15 | - "7077:7077" # Spark master port
16 | - "16010:16010" # HBase Web UI
17 | - "9092:9092" # Kafka producer uses port 9092 to communicate with Kafka broker
18 | - "2181:2181" # Allow Kafka producer running on the host to connect to ZooKeeper running inside the container
19 | - "9000:9000" # HDFS port
20 | - "9090:9090" # to connect with REST service
21 | networks:
22 | - default
23 |
24 | hadoop-worker1:
25 | image: liliasfaxi/hadoop-cluster:latest
26 | container_name: hadoop-worker1
27 | hostname: hadoop-worker1
28 | ports:
29 | - "8040:8042"
30 | networks:
31 | - default
32 |
33 | hadoop-worker2:
34 | image: liliasfaxi/hadoop-cluster:latest
35 | container_name: hadoop-worker2
36 | hostname: hadoop-worker2
37 | ports:
38 | - "8041:8042"
39 | networks:
40 | - default
41 |
42 |
--------------------------------------------------------------------------------
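
Once the stack is up and the services have been started inside the containers, the port mappings above expose the main web UIs on the host, for example:

```sh
docker compose up -d

# per the mappings in docker-compose.yml:
#   http://localhost:9870    HDFS NameNode UI
#   http://localhost:8088    YARN ResourceManager UI
#   http://localhost:16010   HBase Master UI
#   http://localhost:8040    NodeManager UI of hadoop-worker1 (8041 for hadoop-worker2)
```
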
/hadoop-cluster-docker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liliasfaxi/hadoop-cluster-docker/41daeb01112718e320569338505d876e44b52416/hadoop-cluster-docker.png
--------------------------------------------------------------------------------
/scripts/build-image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo ""
4 |
5 | echo -e "\nbuild docker hadoop & spark image\n"
6 | sudo docker build -t spark-hadoop:latest .
7 |
8 | echo ""
9 |
--------------------------------------------------------------------------------
/scripts/resize-cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # N is the node number of hadoop cluster
4 | N=$1
5 |
6 | if [ $# = 0 ]
7 | then
8 | echo "Please specify the node number of hadoop cluster!"
9 | exit 1
10 | fi
11 |
12 | # regenerate the workers file
13 | i=1
14 | rm -f config/workers
15 | while [ $i -lt $N ]
16 | do
17 | 	echo "hadoop-worker$i" >> config/workers
18 | 	((i++))
19 | done
20 |
21 | echo ""
22 |
23 | echo -e "\nbuild docker hadoop image\n"
24 |
25 | # rebuild the spark-hadoop image so it picks up the new workers file
26 | sudo docker build -t spark-hadoop:latest .
27 |
28 | echo ""
29 |
--------------------------------------------------------------------------------
/scripts/start-container.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo docker network create --driver=bridge hadoop
4 |
5 | # the default node number is 3
6 | N=${1:-3}
7 |
8 |
9 | # start hadoop master container
10 | sudo docker rm -f hadoop-master &> /dev/null
11 | echo "start hadoop-master container..."
12 | sudo docker run -itd \
13 | --net=hadoop \
14 |                 -p 9870:9870 \
15 | -p 8088:8088 \
16 | -p 7077:7077 \
17 | -p 16010:16010 \
18 | --name hadoop-master \
19 | --hostname hadoop-master \
20 | spark-hadoop:latest &> /dev/null
21 |
22 |
23 | # start hadoop worker containers (names must match config/workers)
24 | i=1
25 | while [ $i -lt $N ]
26 | do
27 | 	sudo docker rm -f hadoop-worker$i &> /dev/null
28 | 	echo "start hadoop-worker$i container..."
29 | 	port=$(( 8040 + $i ))
30 | 	sudo docker run -itd \
31 | 	                -p $port:8042 \
32 | 	                --net=hadoop \
33 | 	                --name hadoop-worker$i \
34 | 	                --hostname hadoop-worker$i \
35 | spark-hadoop:latest &> /dev/null
36 | i=$(( $i + 1 ))
37 | done
38 |
39 | # get into hadoop master container
40 | sudo docker exec -it hadoop-master bash
41 |
--------------------------------------------------------------------------------