├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── build-image.sh
├── config
│   ├── .DS_Store
│   ├── core-site.xml
│   ├── hadoop-env.sh
│   ├── hbase-env.sh
│   ├── hbase-site.xml
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   ├── run-wordcount.sh
│   ├── spark-defaults.conf
│   ├── ssh_config
│   ├── start-hadoop.sh
│   ├── start-kafka-zookeeper.sh
│   ├── workers
│   └── yarn-site.xml
├── docker-compose.yml
├── hadoop-cluster-docker.png
└── scripts
    ├── build-image.sh
    ├── resize-cluster.sh
    └── start-container.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:latest
2 | 
3 | WORKDIR /root
4 | 
5 | # install requisites
6 | RUN apt-get update && apt-get install -y openssh-server openjdk-8-jdk ssh wget curl vim python3 && \
7 |     rm -rf /var/lib/apt/lists/*
8 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
9 |     python3 get-pip.py && \
10 |     rm get-pip.py && \
11 |     python3 -m pip install --upgrade pip setuptools
12 | 
13 | # Install Hadoop
14 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz && \
15 |     tar -xzf hadoop-3.3.6.tar.gz && \
16 |     mv hadoop-3.3.6 /usr/local/hadoop && \
17 |     rm hadoop-3.3.6.tar.gz
18 | 
19 | # Install Spark (download from the Apache archive, which retains all releases)
20 | RUN wget https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz && \
21 |     tar -xzf spark-3.5.0-bin-hadoop3.tgz && \
22 |     mv spark-3.5.0-bin-hadoop3 /usr/local/spark && \
23 |     rm spark-3.5.0-bin-hadoop3.tgz
24 | 
25 | 
26 | # Install pyspark
27 | RUN pip install pyspark
28 | 
29 | # Install Kafka
30 | RUN wget https://archive.apache.org/dist/kafka/3.6.1/kafka_2.13-3.6.1.tgz && \
31 |     tar -xzf kafka_2.13-3.6.1.tgz && \
32 |     mv kafka_2.13-3.6.1 /usr/local/kafka && \
33 |     rm kafka_2.13-3.6.1.tgz
34 | 
35 | # Install HBase
36 | RUN wget https://archive.apache.org/dist/hbase/2.5.8/hbase-2.5.8-hadoop3-bin.tar.gz && \
37 |     tar -xzf hbase-2.5.8-hadoop3-bin.tar.gz && \
38 |     mv hbase-2.5.8-hadoop3 /usr/local/hbase && \
39 |     rm hbase-2.5.8-hadoop3-bin.tar.gz
40 | 
41 | 
42 | # set environment variables
43 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
44 | ENV HADOOP_HOME=/usr/local/hadoop
45 | ENV YARN_HOME=/usr/local/hadoop
46 | ENV SPARK_HOME=/usr/local/spark
47 | ENV KAFKA_HOME=/usr/local/kafka
48 | ENV HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
49 | ENV YARN_CONF_DIR=/usr/local/hadoop/etc/hadoop
50 | ENV LD_LIBRARY_PATH=/usr/local/hadoop/lib/native:$LD_LIBRARY_PATH
51 | ENV HBASE_HOME=/usr/local/hbase
52 | ENV CLASSPATH=$CLASSPATH:/usr/local/hbase/lib/*
53 | ENV PATH=$PATH:/usr/local/hadoop/bin:/usr/local/hadoop/sbin:/usr/local/spark/bin:/usr/local/kafka/bin:/usr/local/hbase/bin
54 | 
55 | # ssh without key
56 | RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \
57 |     cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
58 |     chmod 0600 ~/.ssh/authorized_keys
59 | 
60 | RUN mkdir -p ~/hdfs/namenode && \
61 |     mkdir -p ~/hdfs/datanode && \
62 |     mkdir $HADOOP_HOME/logs
63 | 
64 | COPY config/* /tmp/
65 | 
66 | RUN mv /tmp/ssh_config ~/.ssh/config && \
67 |     mv /tmp/hadoop-env.sh /usr/local/hadoop/etc/hadoop/hadoop-env.sh && \
68 |     mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml && \
69 |     mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml && \
70 |     mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml && \
71 |     mv /tmp/yarn-site.xml 
$HADOOP_HOME/etc/hadoop/yarn-site.xml && \ 72 | mv /tmp/workers $HADOOP_HOME/etc/hadoop/workers && \ 73 | mv /tmp/start-kafka-zookeeper.sh ~/start-kafka-zookeeper.sh && \ 74 | mv /tmp/start-hadoop.sh ~/start-hadoop.sh && \ 75 | mv /tmp/run-wordcount.sh ~/run-wordcount.sh && \ 76 | mv /tmp/spark-defaults.conf $SPARK_HOME/conf/spark-defaults.conf && \ 77 | mv /tmp/hbase-env.sh $HBASE_HOME/conf/hbase-env.sh && \ 78 | mv /tmp/hbase-site.xml $HBASE_HOME/conf/hbase-site.xml &&\ 79 | mv /tmp/purchases.txt /root/purchases.txt && \ 80 | mv /tmp/purchases2.txt /root/purchases2.txt 81 | 82 | RUN chmod +x ~/start-hadoop.sh && \ 83 | chmod +x ~/start-kafka-zookeeper.sh && \ 84 | chmod +x ~/run-wordcount.sh && \ 85 | chmod +x $HADOOP_HOME/sbin/start-dfs.sh && \ 86 | chmod +x $HADOOP_HOME/sbin/start-yarn.sh 87 | 88 | # format namenode 89 | RUN /usr/local/hadoop/bin/hdfs namenode -format 90 | 91 | CMD [ "sh", "-c", "service ssh start; bash"] 92 | 93 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | 
3 | These containers were originally derived from https://github.com/kiwenlau/hadoop-cluster-docker
4 | 
5 | A cluster of three containers is created, with the following platforms installed:
6 | 
7 | * [Apache Hadoop](http://hadoop.apache.org/) Version: 3.3.6
8 | * [Apache Spark](https://spark.apache.org/) Version: 3.5.0
9 | * [Apache Kafka](https://kafka.apache.org/) Version: 3.6.1 (Scala 2.13)
10 | * [Apache HBase](https://hbase.apache.org/) Version: 2.5.8
11 | 
12 | 
13 | ## Starting the containers
14 | 
15 | - You can use the scripts in the `scripts` directory:
16 | 
17 | ```
18 | scripts/
19 | |- build-image.sh      # build the Docker image
20 | |- resize-cluster.sh   # set the number of worker nodes
21 | |- start-container.sh  # start the containers
22 | ```
23 | - Or you can use Docker Compose to start the services declared in the `docker-compose.yml` file at the root of the repository:
24 | 
25 | ```sh
26 | docker compose up
27 | ```
28 | 
29 | For detailed tutorials on how to use these containers, visit:
30 | https://insatunisia.github.io/TP-BigData
31 | 
32 | Enjoy!
--------------------------------------------------------------------------------
/build-image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo ""
4 | 
5 | echo -e "\nbuild docker hadoop & spark image\n"
6 | sudo docker build -t spark-hadoop:latest .
7 | 
8 | echo ""
9 | 
--------------------------------------------------------------------------------
/config/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liliasfaxi/hadoop-cluster-docker/41daeb01112718e320569338505d876e44b52416/config/.DS_Store
--------------------------------------------------------------------------------
/config/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>fs.defaultFS</name>
6 |     <value>hdfs://hadoop-master:9000/</value>
7 |   </property>
8 | </configuration>
--------------------------------------------------------------------------------
/config/hadoop-env.sh:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements.  See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership.  The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License.  You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # Set Hadoop-specific environment variables here.
18 | 
19 | # The only required environment variable is JAVA_HOME.  All others are
20 | # optional.  
When running a distributed configuration it is best to 21 | # set JAVA_HOME in this file, so that it is correctly defined on 22 | # remote nodes. 23 | 24 | # The java implementation to use. 25 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 26 | 27 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 28 | # that bind to privileged ports to provide authentication of data transfer 29 | # protocol. Jsvc is not required if SASL is configured for authentication of 30 | # data transfer protocol using non-privileged ports. 31 | #export JSVC_HOME=${JSVC_HOME} 32 | 33 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"} 34 | 35 | # Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. 36 | for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do 37 | if [ "$HADOOP_CLASSPATH" ]; then 38 | export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f 39 | else 40 | export HADOOP_CLASSPATH=$f 41 | fi 42 | done 43 | 44 | # The maximum amount of heap to use, in MB. Default is 1000. 45 | #export HADOOP_HEAPSIZE= 46 | #export HADOOP_NAMENODE_INIT_HEAPSIZE="" 47 | 48 | # Extra Java runtime options. Empty by default. 49 | export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true" 50 | 51 | # Command specific options appended to HADOOP_OPTS when specified 52 | export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS" 53 | export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" 54 | 55 | export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS" 56 | 57 | export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS" 58 | export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS" 59 | 60 | # The following applies to multiple commands (fs, dfs, fsck, distcp etc) 61 | export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS" 62 | #HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS" 63 | 64 | # On secure datanodes, user to run the datanode as after dropping privileges. 65 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 66 | # to provide authentication of data transfer protocol. This **MUST NOT** be 67 | # defined if SASL is configured for authentication of data transfer protocol 68 | # using non-privileged ports. 69 | export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} 70 | 71 | # Where log files are stored. $HADOOP_HOME/logs by default. 72 | #export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER 73 | 74 | # Where log files are stored in the secure data environment. 75 | export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} 76 | 77 | ### 78 | # HDFS Mover specific parameters 79 | ### 80 | # Specify the JVM options to be used when starting the HDFS Mover. 81 | # These options will be appended to the options specified as HADOOP_OPTS 82 | # and therefore may override any similar flags set in HADOOP_OPTS 83 | # 84 | # export HADOOP_MOVER_OPTS="" 85 | 86 | ### 87 | # Advanced Users Only! 88 | ### 89 | 90 | # The directory where pid files are stored. /tmp by default. 91 | # NOTE: this should be set to a directory that can only be written to by 92 | # the user that will run the hadoop daemons. Otherwise there is the 93 | # potential for a symlink attack. 
94 | export HADOOP_PID_DIR=${HADOOP_PID_DIR} 95 | export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} 96 | 97 | # A string representing this instance of hadoop. $USER by default. 98 | export HADOOP_IDENT_STRING=$USER 99 | 100 | 101 | export HDFS_NAMENODE_USER="root" 102 | export HDFS_DATANODE_USER="root" 103 | export HDFS_SECONDARYNAMENODE_USER="root" 104 | export YARN_RESOURCEMANAGER_USER="root" 105 | export YARN_NODEMANAGER_USER="root" -------------------------------------------------------------------------------- /config/hbase-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | #/** 4 | # * Licensed to the Apache Software Foundation (ASF) under one 5 | # * or more contributor license agreements. See the NOTICE file 6 | # * distributed with this work for additional information 7 | # * regarding copyright ownership. The ASF licenses this file 8 | # * to you under the Apache License, Version 2.0 (the 9 | # * "License"); you may not use this file except in compliance 10 | # * with the License. You may obtain a copy of the License at 11 | # * 12 | # * http://www.apache.org/licenses/LICENSE-2.0 13 | # * 14 | # * Unless required by applicable law or agreed to in writing, software 15 | # * distributed under the License is distributed on an "AS IS" BASIS, 16 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # * See the License for the specific language governing permissions and 18 | # * limitations under the License. 19 | # */ 20 | 21 | # Set environment variables here. 22 | 23 | # This script sets variables multiple times over the course of starting an hbase process, 24 | # so try to keep things idempotent unless you want to take an even deeper look 25 | # into the startup scripts (bin/hbase, etc.) 26 | 27 | # The java implementation to use. Java 1.8+ required. 28 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ 29 | 30 | # Extra Java CLASSPATH elements. Optional. 31 | # export HBASE_CLASSPATH= 32 | 33 | # The maximum amount of heap to use. Default is left to JVM default. 34 | # export HBASE_HEAPSIZE=1G 35 | 36 | # Uncomment below if you intend to use off heap cache. For example, to allocate 8G of 37 | # offheap, set the value to "8G". 38 | # export HBASE_OFFHEAPSIZE=1G 39 | 40 | # Extra Java runtime options. 41 | # Default settings are applied according to the detected JVM version. Override these default 42 | # settings by specifying a value here. For more details on possible settings, 43 | # see http://hbase.apache.org/book.html#_jvm_tuning 44 | export HBASE_OPTS="-XX:+UseConcMarkSweepGC" 45 | 46 | # Uncomment one of the below three options to enable java garbage collection logging for the server-side processes. 47 | 48 | # This enables basic gc logging to the .out file. 49 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps" 50 | 51 | # This enables basic gc logging to its own file. 52 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 53 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:" 54 | 55 | # This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+. 56 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 
57 | # export SERVER_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc: -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M" 58 | 59 | # Uncomment one of the below three options to enable java garbage collection logging for the client processes. 60 | 61 | # This enables basic gc logging to the .out file. 62 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps" 63 | 64 | # This enables basic gc logging to its own file. 65 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 66 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:" 67 | 68 | # This enables basic GC logging to its own file with automatic log rolling. Only applies to jdk 1.6.0_34+ and 1.7.0_2+. 69 | # If FILE-PATH is not replaced, the log file(.gc) would still be generated in the HBASE_LOG_DIR . 70 | # export CLIENT_GC_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc: -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=1 -XX:GCLogFileSize=512M" 71 | 72 | # See the package documentation for org.apache.hadoop.hbase.io.hfile for other configurations 73 | # needed setting up off-heap block caching. 74 | 75 | # Uncomment and adjust to enable JMX exporting 76 | # See jmxremote.password and jmxremote.access in $JRE_HOME/lib/management to configure remote password access. 77 | # More details at: http://java.sun.com/javase/6/docs/technotes/guides/management/agent.html 78 | # NOTE: HBase provides an alternative JMX implementation to fix the random ports issue, please see JMX 79 | # section in HBase Reference Guide for instructions. 80 | 81 | # export HBASE_JMX_BASE="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false" 82 | # export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10101" 83 | # export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10102" 84 | # export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10103" 85 | # export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10104" 86 | # export HBASE_REST_OPTS="$HBASE_REST_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.port=10105" 87 | 88 | # File naming hosts on which HRegionServers will run. $HBASE_HOME/conf/regionservers by default. 89 | # export HBASE_REGIONSERVERS=${HBASE_HOME}/conf/regionservers 90 | 91 | # Uncomment and adjust to keep all the Region Server pages mapped to be memory resident 92 | #HBASE_REGIONSERVER_MLOCK=true 93 | #HBASE_REGIONSERVER_UID="hbase" 94 | 95 | # File naming hosts on which backup HMaster will run. $HBASE_HOME/conf/backup-masters by default. 96 | # export HBASE_BACKUP_MASTERS=${HBASE_HOME}/conf/backup-masters 97 | 98 | # Extra ssh options. Empty by default. 99 | # export HBASE_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HBASE_CONF_DIR" 100 | 101 | # Where log files are stored. $HBASE_HOME/logs by default. 102 | # export HBASE_LOG_DIR=${HBASE_HOME}/logs 103 | 104 | # Enable remote JDWP debugging of major HBase processes. 
Meant for Core Developers
105 | # export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8070"
106 | # export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8071"
107 | # export HBASE_THRIFT_OPTS="$HBASE_THRIFT_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8072"
108 | # export HBASE_ZOOKEEPER_OPTS="$HBASE_ZOOKEEPER_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8073"
109 | # export HBASE_REST_OPTS="$HBASE_REST_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8074"
110 | 
111 | # A string representing this instance of hbase. $USER by default.
112 | # export HBASE_IDENT_STRING=$USER
113 | 
114 | # The scheduling priority for daemon processes. See 'man nice'.
115 | # export HBASE_NICENESS=10
116 | 
117 | # The directory where pid files are stored. /tmp by default.
118 | # export HBASE_PID_DIR=/var/hadoop/pids
119 | 
120 | # Seconds to sleep between slave commands. Unset by default. This
121 | # can be useful in large clusters, where, e.g., slave rsyncs can
122 | # otherwise arrive faster than the master can service them.
123 | # export HBASE_SLAVE_SLEEP=0.1
124 | 
125 | # Tell HBase whether it should manage its own instance of ZooKeeper or not.
126 | # export HBASE_MANAGES_ZK=true
127 | 
128 | # The default log rolling policy is RFA, where the log file is rolled as per the size defined for the
129 | # RFA appender. Please refer to the log4j.properties file to see more details on this appender.
130 | # In case one needs to do log rolling on a date change, one should set the environment property
131 | # HBASE_ROOT_LOGGER to ",DRFA".
132 | # For example:
133 | # HBASE_ROOT_LOGGER=INFO,DRFA
134 | # The reason for changing default to RFA is to avoid the boundary case of filling out disk space as
135 | # DRFA doesn't put any cap on the log size. Please refer to HBase-5655 for more context.
136 | 
137 | # Tell HBase whether it should include Hadoop's lib when starting up;
138 | # the default value is false, which means Hadoop's lib is included.
139 | # export HBASE_DISABLE_HADOOP_CLASSPATH_LOOKUP="true"
140 | 
141 | # Override text processing tools for use by these launch scripts.
142 | # export GREP="${GREP-grep}"
143 | # export SED="${SED-sed}"
144 | export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m -XX:ReservedCodeCacheSize=256m"
145 | export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m -XX:ReservedCodeCacheSize=256m"
--------------------------------------------------------------------------------
/config/hbase-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>hbase.zookeeper.property.dataDir</name>
6 |     <value>/usr/local/zookeeper</value>
7 |   </property>
8 |   <property>
9 |     <name>hbase.zookeeper.quorum</name>
10 |     <value>localhost</value>
11 |   </property>
12 |   <property>
13 |     <name>hbase.zookeeper.property.clientPort</name>
14 |     <value>2181</value>
15 |   </property>
16 |   <property>
17 |     <name>hbase.cluster.distributed</name>
18 |     <value>true</value>
19 |   </property>
20 |   <property>
21 |     <name>hbase.rootdir</name>
22 |     <value>hdfs://hadoop-master:9000/hbase</value>
23 |   </property>
24 |   <property>
25 |     <name>hbase.wal.provider</name>
26 |     <value>filesystem</value>
27 |   </property>
28 | </configuration>
--------------------------------------------------------------------------------
/config/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>dfs.namenode.name.dir</name>
6 |     <value>file:///root/hdfs/namenode</value>
7 |     <description>NameNode directory for namespace and transaction logs storage.</description>
8 |   </property>
9 |   <property>
10 |     <name>dfs.datanode.data.dir</name>
11 |     <value>file:///root/hdfs/datanode</value>
12 |     <description>DataNode directory</description>
13 |   </property>
14 |   <property>
15 |     <name>dfs.replication</name>
16 |     <value>2</value>
17 |   </property>
18 | </configuration>
--------------------------------------------------------------------------------
/config/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>mapreduce.framework.name</name>
6 |     <value>yarn</value>
7 |   </property>
8 |   <property>
9 |     <name>yarn.app.mapreduce.am.env</name>
10 |     <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
11 |   </property>
12 |   <property>
13 |     <name>mapreduce.map.env</name>
14 |     <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
15 |   </property>
16 |   <property>
17 |     <name>mapreduce.reduce.env</name>
18 |     <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
19 |   </property>
20 | </configuration>
--------------------------------------------------------------------------------
/config/run-wordcount.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # test the hadoop cluster by running wordcount
4 | 
5 | # create input files
6 | mkdir input
7 | echo "Hello Docker" >input/file2.txt
8 | echo "Hello Hadoop" >input/file1.txt
9 | 
10 | # create input directory on HDFS
11 | hadoop fs -mkdir -p input
12 | 
13 | # put input files to HDFS
14 | hdfs dfs -put ./input/* input
15 | 
16 | # run wordcount (use the examples jar shipped with the installed Hadoop 3.3.6)
17 | hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar wordcount input output
18 | 
19 | # print the input files
20 | echo -e "\ninput file1.txt:"
21 | hdfs dfs -cat input/file1.txt
22 | 
23 | echo -e "\ninput file2.txt:"
24 | hdfs dfs -cat input/file2.txt
25 | 
26 | # print the output of wordcount
27 | echo -e "\nwordcount output:"
28 | hdfs dfs -cat output/part-r-00000
29 | 
30 | 
--------------------------------------------------------------------------------
/config/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.master yarn
2 | spark.driver.memory 4g
3 | spark.yarn.am.memory 1g
4 | spark.executor.memory 2g
5 | spark.executor.cores 1
6 | 
--------------------------------------------------------------------------------
/config/ssh_config:
--------------------------------------------------------------------------------
1 | Host localhost
2 |   StrictHostKeyChecking no
3 | 
4 | Host 0.0.0.0
5 |   StrictHostKeyChecking no
6 | 
7 | Host hadoop-*
8 |   StrictHostKeyChecking no
9 |   UserKnownHostsFile=/dev/null
--------------------------------------------------------------------------------
/config/start-hadoop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo -e "\n"
4 | 
5 | $HADOOP_HOME/sbin/start-dfs.sh
6 | 
7 | echo -e "\n"
8 | 
9 | $HADOOP_HOME/sbin/start-yarn.sh
10 | 
11 | echo -e "\n"
12 | 
13 | 
--------------------------------------------------------------------------------
/config/start-kafka-zookeeper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo -e "\n"
4 | 
5 | $KAFKA_HOME/bin/zookeeper-server-start.sh $KAFKA_HOME/config/zookeeper.properties &
6 | 
7 | echo -e "\n"
8 | 
9 | $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties &
10 | 
11 | echo -e "\n"
12 | 
13 | 
--------------------------------------------------------------------------------
/config/workers:
--------------------------------------------------------------------------------
1 | hadoop-worker1
2 | hadoop-worker2
3 | 
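The `workers` file above is what `start-dfs.sh` and `start-yarn.sh` consult to decide which hosts should run the DataNode and NodeManager daemons; `scripts/resize-cluster.sh` regenerates it before the image is rebuilt. As a rough, hypothetical sketch only (it assumes a `hadoop-worker3` container has already been created on the same Docker network from the same image), registering an extra worker from inside `hadoop-master` would look something like this:

```sh
# Hypothetical sketch: add a third worker to the running cluster.
# Assumes a hadoop-worker3 container already exists on the "hadoop" network.
echo "hadoop-worker3" >> $HADOOP_HOME/etc/hadoop/workers

# Restart the daemons so the new hostname is picked up
$HADOOP_HOME/sbin/stop-yarn.sh && $HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/sbin/stop-dfs.sh  && $HADOOP_HOME/sbin/start-dfs.sh

# Check that the new DataNode registered with the NameNode
hdfs dfsadmin -report
```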
--------------------------------------------------------------------------------
/config/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>yarn.nodemanager.pmem-check-enabled</name>
5 |     <value>false</value>
6 |   </property>
7 |   <property>
8 |     <name>yarn.nodemanager.vmem-check-enabled</name>
9 |     <value>false</value>
10 |   </property>
11 |   <property>
12 |     <name>yarn.nodemanager.aux-services</name>
13 |     <value>mapreduce_shuffle</value>
14 |   </property>
15 |   <property>
16 |     <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
17 |     <value>org.apache.hadoop.mapred.ShuffleHandler</value>
18 |   </property>
19 |   <property>
20 |     <name>yarn.resourcemanager.hostname</name>
21 |     <value>hadoop-master</value>
22 |   </property>
23 |   <property>
24 |     <name>yarn.application.classpath</name>
25 |     <value>/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*:/usr/local/hadoop/contrib/capacity-scheduler/*.jar</value>
26 |   </property>
27 | </configuration>
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | networks:
3 |   default:
4 |     name: hadoop
5 |     driver: bridge
6 | services:
7 |   hadoop-master:
8 |     image: liliasfaxi/hadoop-cluster:latest
9 |     container_name: hadoop-master
10 |     hostname: hadoop-master
11 |     ports:
12 | 
13 |       - "9870:9870"   # NameNode Web UI
14 |       - "8088:8088"   # YARN ResourceManager Web UI (job information)
15 |       - "7077:7077"   # Spark master port
16 |       - "16010:16010" # HBase Web UI
17 |       - "9092:9092"   # Kafka producers use port 9092 to communicate with the Kafka broker
18 |       - "2181:2181"   # Allow a Kafka producer running on the host to connect to ZooKeeper inside the container
19 |       - "9000:9000"   # HDFS port
20 |       - "9090:9090"   # to connect with the REST service
21 |     networks:
22 |       - default
23 | 
24 |   hadoop-worker1:
25 |     image: liliasfaxi/hadoop-cluster:latest
26 |     container_name: hadoop-worker1
27 |     hostname: hadoop-worker1
28 |     ports:
29 |       - "8040:8042"
30 |     networks:
31 |       - default
32 | 
33 |   hadoop-worker2:
34 |     image: liliasfaxi/hadoop-cluster:latest
35 |     container_name: hadoop-worker2
36 |     hostname: hadoop-worker2
37 |     ports:
38 |       - "8041:8042"
39 |     networks:
40 |       - default
41 | 
42 | 
--------------------------------------------------------------------------------
/hadoop-cluster-docker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liliasfaxi/hadoop-cluster-docker/41daeb01112718e320569338505d876e44b52416/hadoop-cluster-docker.png
--------------------------------------------------------------------------------
/scripts/build-image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo ""
4 | 
5 | echo -e "\nbuild docker hadoop & spark image\n"
6 | sudo docker build -t spark-hadoop:latest .
7 | 
8 | echo ""
9 | 
--------------------------------------------------------------------------------
/scripts/resize-cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # N is the node number of the hadoop cluster (1 master + N-1 workers)
4 | N=$1
5 | 
6 | if [ $# = 0 ]
7 | then
8 |     echo "Please specify the node number of the hadoop cluster!"
9 |     exit 1
10 | fi
11 | 
12 | # regenerate the workers file
13 | i=1
14 | rm config/workers
15 | while [ $i -lt $N ]
16 | do
17 |     echo "hadoop-worker$i" >> config/workers
18 |     ((i++))
19 | done
20 | 
21 | echo ""
22 | 
23 | echo -e "\nbuild docker hadoop image\n"
24 | 
25 | # rebuild the spark-hadoop image with the new workers file
26 | sudo docker build -t spark-hadoop:latest .
27 | 
28 | echo ""
29 | 
--------------------------------------------------------------------------------
/scripts/start-container.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | sudo docker network create --driver=bridge hadoop
4 | 
5 | # the default node number is 3
6 | N=${1:-3}
7 | 
8 | 
9 | # start hadoop master container
10 | sudo docker rm -f hadoop-master &> /dev/null
11 | echo "start hadoop-master container..."
12 | sudo docker run -itd \
13 |                 --net=hadoop \
14 |                 -p 9870:9870 \
15 |                 -p 8088:8088 \
16 |                 -p 7077:7077 \
17 |                 -p 16010:16010 \
18 |                 --name hadoop-master \
19 |                 --hostname hadoop-master \
20 |                 spark-hadoop:latest &> /dev/null
21 | 
22 | 
23 | # start hadoop worker containers
24 | i=1
25 | while [ $i -lt $N ]
26 | do
27 |     sudo docker rm -f hadoop-worker$i &> /dev/null
28 |     echo "start hadoop-worker$i container..."
29 |     port=$(( 8040 + $i ))
30 |     sudo docker run -itd \
31 |                     -p $port:8042 \
32 |                     --net=hadoop \
33 |                     --name hadoop-worker$i \
34 |                     --hostname hadoop-worker$i \
35 |                     spark-hadoop:latest &> /dev/null
36 |     i=$(( $i + 1 ))
37 | done
38 | 
39 | # get into hadoop master container
40 | sudo docker exec -it hadoop-master bash
41 | 
--------------------------------------------------------------------------------
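Taken together, a typical first run looks roughly like the sketch below; it is only an illustration of how the pieces above fit together, not a prescribed workflow. Note that `build-image.sh` tags the image as `spark-hadoop:latest`, while `docker-compose.yml` references `liliasfaxi/hadoop-cluster:latest`, so either pull that image or retag your local build before using Compose. The helper scripts are the ones the Dockerfile copies into `/root` of each container.

```sh
# Build the image locally (tagged spark-hadoop:latest)...
./build-image.sh

# ...then either start a 3-node cluster with the scripts...
./scripts/start-container.sh 3

# ...or retag for Compose and bring up the services defined in docker-compose.yml
docker tag spark-hadoop:latest liliasfaxi/hadoop-cluster:latest
docker compose up -d

# Inside the master container: start HDFS/YARN, run the wordcount smoke test,
# and optionally start ZooKeeper + a Kafka broker
docker exec -it hadoop-master bash
./start-hadoop.sh
./run-wordcount.sh
./start-kafka-zookeeper.sh
```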