├── .gitignore
├── README.md
├── base
│   ├── README.md
│   ├── dockerfile
│   ├── pip.conf
│   ├── requirements.txt
│   └── sources.list
├── data
│   └── README.md
├── flume
│   ├── README.md
│   ├── conf
│   │   ├── flume-avro.conf
│   │   ├── flume-exec.conf
│   │   └── flume-netcat.conf
│   └── dockerfile
├── hadoop
│   ├── README.md
│   ├── conf
│   │   ├── core-site.xml
│   │   ├── hdfs-site.xml
│   │   ├── mapred-site.xml
│   │   ├── masters
│   │   ├── slaves
│   │   └── yarn-site.xml
│   ├── docker-compose.yml
│   ├── dockerfile
│   └── hadoop-entrypoint.sh
├── hbase
│   ├── README.md
│   ├── conf
│   │   ├── hbase-site.xml
│   │   └── regionservers
│   ├── docker-compose.yml
│   └── dockerfile
├── hive
│   ├── README.md
│   ├── conf
│   │   └── hive-site.xml
│   └── dockerfile
├── kafka
│   ├── README.md
│   ├── conf
│   │   └── server.propertie
│   └── dockerfile
├── mysql
│   ├── README.md
│   └── dockerfile
├── redis
│   ├── README.md
│   └── redis.dockerfile
├── spark
│   ├── README.md
│   ├── compose.sh
│   ├── conf
│   │   ├── slaves
│   │   └── spark-env.sh
│   ├── docker-compose.yml
│   └── dockerfile
├── storm
│   ├── README.md
│   ├── conf
│   │   └── storm.yaml
│   └── dockerfile
└── zookeeper
    ├── README.md
    ├── conf
    │   └── zoo.cfg
    ├── docker-compose.yml
    └── dockerfile
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | **/.DS_Store
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # YangDocker
2 |
3 | A hand-built hadoop + spark + kafka + zookeeper + storm + hbase + hive + flume + mysql + redis cluster, with one master and two slaves.
4 |
5 | ## Usage
6 |
7 | First clone the project locally, change into the directory, and run:
8 |
9 | ```bash
10 | bash ./image.sh
11 | ```
12 |
13 | Then wait while all of the related images are built.
14 |
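15 | `image.sh` itself is not part of this listing. Given the `FROM` line in each dockerfile and the image names used by the compose files, a minimal sketch of the build order it would need could look like the following (image tags that never appear in a compose file, such as `cluster-hive` or `cluster-flume`, are assumptions):
16 |
17 | ```bash
18 | #!/bin/bash
19 | # Hypothetical build script: the order follows the FROM lines in this repo;
20 | # image tags not referenced by any compose file are assumed.
21 | set -e
22 | docker build -t cluster-base      -f ./base/dockerfile      ./base       # everything else builds on this
23 | docker build -t cluster-hadoop    -f ./hadoop/dockerfile    ./hadoop     # FROM cluster-base
24 | docker build -t cluster-spark     -f ./spark/dockerfile     ./spark      # FROM cluster-hadoop
25 | docker build -t cluster-hbase     -f ./hbase/dockerfile     ./hbase      # FROM cluster-spark
26 | docker build -t cluster-kafka     -f ./kafka/dockerfile     ./kafka      # FROM cluster-hadoop
27 | docker build -t cluster-mysql     -f ./mysql/dockerfile     ./mysql      # FROM cluster-base
28 | docker build -t cluster-hive      -f ./hive/dockerfile      ./hive       # FROM cluster-mysql
29 | docker build -t cluster-zookeeper -f ./zookeeper/dockerfile ./zookeeper  # FROM ubuntu:16.04
30 | docker build -t cluster-storm     -f ./storm/dockerfile     ./storm      # FROM ubuntu:16.04
31 | docker build -t cluster-flume     -f ./flume/dockerfile     ./flume      # FROM ubuntu:16.04
32 | docker build -t cluster-redis     -f ./redis/redis.dockerfile ./redis    # FROM ubuntu:16.04
33 | ```
34 |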
35 | ## Operating System
36 |
37 | ubuntu:16.04
38 |
39 | ## Toolkits
40 |
41 | Hadoop 2.6: [download](http://archive.apache.org/dist/hadoop/common/hadoop-2.6.0/hadoop-2.6.0.tar.gz)
42 |
43 | Hive 1.2.2: [download](http://mirror.bit.edu.cn/apache/hive/hive-1.2.2/apache-hive-1.2.2-bin.tar.gz)
44 |
45 | Kafka 0.10.2.1 (Scala 2.11): [download](http://mirrors.tuna.tsinghua.edu.cn/apache/kafka/0.10.2.1/kafka_2.11-0.10.2.1.tgz)
46 |
47 | JDK 1.8: [download](http://download.oracle.com/otn-pub/java/jdk/8u172-b11/a58eab1ec242421181065cdc37240b08/jdk-8u172-linux-x64.tar.gz)
48 |
49 | Scala 2.11.4: [download](https://downloads.lightbend.com/scala/2.11.4/scala-2.11.4.tgz)
50 |
51 | Spark 1.6.0: [download](https://archive.apache.org/dist/spark/spark-1.6.0/spark-1.6.0-bin-hadoop2.6.tgz)
52 |
53 | ZooKeeper 3.4.5: [download](http://archive.apache.org/dist/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz)
54 |
55 | HBase 0.98.6: [download](http://archive.apache.org/dist/hbase/hbase-0.98.6/hbase-0.98.6-hadoop2-bin.tar.gz)
56 |
57 | Flume 1.6.0: [download](http://archive.apache.org/dist/flume/1.6.0/apache-flume-1.6.0-bin.tar.gz)
58 |
59 | MySQL 5.1.41: [download](http://dev.mysql.com/get/Downloads/MySQL-5.1/mysql-5.1.41.tar.gz)
60 |
61 | Redis 2.8.3: [download](http://download.redis.io/releases/redis-2.8.3.tar.gz)
62 |
63 | Storm 0.9.3: [download](https://archive.apache.org/dist/storm/apache-storm-0.9.3/apache-storm-0.9.3.tar.gz)
64 |
65 | ## Notes
66 |
67 | Because every environment/package is downloaded at build time, the build is slow and the resulting images are large.
68 |
69 | ## Miscellaneous
70 |
71 | Issues are welcome.
72 |
73 | ## References
74 |
75 | 1. [利用Docker搭建大数据处理集群](https://blog.csdn.net/iigeoxiaoyang/article/details/53020066)
76 |
--------------------------------------------------------------------------------
/base/README.md:
--------------------------------------------------------------------------------
1 | # base
2 |
3 | ## apt sources
4 |
5 | [sources.list](./sources.list): use the Aliyun mirror to speed up downloads.
6 |
7 | - Install java, scala, and python3.
8 | - Set up passwordless SSH login between the nodes.
9 | - zsh as the shell, with the ys theme.
10 |
11 | ## pip sources
12 |
13 | [pip.conf](./pip.conf): use the Tsinghua mirror to speed up python package downloads.
14 |
15 | ## python packages
16 |
17 | [requirements.txt](./requirements.txt)
18 |
19 | ## Build
20 |
21 | ```shell
22 | docker build -t cluster-base ./
23 | ```
24 |
25 | ## Start the cluster
26 |
27 | ```bash
28 | docker network create --subnet=172.20.0.0/16 cluster-network
29 | docker run -itd --name master --hostname=master --net cluster-network --ip 172.20.0.2 ubuntu:16.04 bash
30 | docker run -itd --name slave1 --hostname=slave1 --net cluster-network --ip 172.20.0.3 ubuntu:16.04 bash
31 | docker run -itd --name slave2 --hostname=slave2 --net cluster-network --ip 172.20.0.4 ubuntu:16.04 bash
32 | ```
33 |
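34 | Once the three containers are up, passwordless SSH between them can be checked as follows. This is only a sketch: it assumes the containers were started from the `cluster-base` image built above with its default `sshd` command (rather than the stock `ubuntu:16.04` image), so that every node shares the SSH server and key pair baked in by the dockerfile.
35 |
36 | ```bash
37 | # Attach to the master container and hop to a slave over SSH.
38 | docker exec -it master bash
39 | ssh slave1 hostname   # should print "slave1" without prompting for a password
40 | ```
41 |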
--------------------------------------------------------------------------------
/base/dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | # Replace the apt sources with a faster mirror.
4 | ADD ./sources.list /etc/apt/
5 |
6 | # Replace the pip sources to speed up downloads.
7 | RUN mkdir /root/.pip/
8 | ADD ./pip.conf /root/.pip/
9 |
10 | # install apt & pip3 packages.
11 | RUN apt-get update && apt-get upgrade -y \
12 |     && apt-get install -y apt-utils wget git zsh openssh-server openssh-client python3 python3-pip \
13 |     # install java env path:/usr/bin/java
14 |     && apt-get install -y software-properties-common python-software-properties \
15 |     && add-apt-repository ppa:webupd8team/java && apt-get update \
16 |     && apt-get install -y oracle-java8-installer && update-java-alternatives -s java-8-oracle \
17 |     # install scala env path:/usr/bin/scala
18 |     && apt-get install -y scala \
19 |     # install python3 packages
20 |     && pip3 install ipython -i https://pypi.douban.com/simple/ \
21 |     && rm -rf /var/lib/apt/lists/* && apt-get clean \
22 |     && rm -rf ~/.cache/pip/
23 |
24 | # Install oh-my-zsh, switch the theme to ys, and make zsh the default shell instead of bash.
25 | RUN git clone https://github.com/robbyrussell/oh-my-zsh.git ~/.oh-my-zsh \
26 |     && cp ~/.oh-my-zsh/templates/zshrc.zsh-template ~/.zshrc \
27 |     && cp ~/.zshrc ~/.zshrc.orig \
28 |     && chsh -s /bin/zsh \
29 |     && sed -ri 's/^ZSH_THEME="robbyrussell"/ZSH_THEME="ys"/' /root/.zshrc
30 |
31 | RUN mkdir /var/run/sshd
32 | RUN echo 'root:root' | chpasswd
33 |
34 | # ssh: allow root login and skip strict host key checking.
35 | RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
36 | RUN sed -i 's/.*StrictHostKeyChecking ask/StrictHostKeyChecking no/' /etc/ssh/ssh_config
37 |
38 | # Set up passwordless SSH here so later steps can log in between nodes directly.
39 | RUN mkdir -p ~/.ssh && \
40 |     ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \
41 |     cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
42 |     chmod 700 ~/.ssh && \
43 |     chmod 600 ~/.ssh/id_rsa && \
44 |     chmod 644 ~/.ssh/authorized_keys
45 |
46 | EXPOSE 22
47 |
48 | CMD ["/usr/sbin/sshd", "-D"]
49 |
--------------------------------------------------------------------------------
/base/pip.conf:
--------------------------------------------------------------------------------
1 | [global]
2 | index-url=https://pypi.tuna.tsinghua.edu.cn/simple
3 | [install]
4 | trusted-host=pypi.tuna.tsinghua.edu.cn
5 |
--------------------------------------------------------------------------------
/base/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | ipython
3 | scipy
4 | pandas
5 |
--------------------------------------------------------------------------------
/base/sources.list:
--------------------------------------------------------------------------------
1 | # deb cdrom:[Ubuntu 16.04 LTS _Xenial Xerus_ - Release amd64 (20160420.1)]/ xenial main restricted
2 | deb-src http://archive.ubuntu.com/ubuntu xenial main restricted #Added by software-properties
3 | deb http://mirrors.aliyun.com/ubuntu/ xenial main restricted
4 | deb-src http://mirrors.aliyun.com/ubuntu/ xenial main restricted multiverse universe #Added by software-properties
5 | deb http://mirrors.aliyun.com/ubuntu/ xenial-updates main restricted
6 | deb-src http://mirrors.aliyun.com/ubuntu/ xenial-updates main restricted multiverse universe #Added by software-properties
7 | deb http://mirrors.aliyun.com/ubuntu/ xenial universe
8 | deb http://mirrors.aliyun.com/ubuntu/ xenial-updates universe
9 | deb http://mirrors.aliyun.com/ubuntu/ xenial multiverse
10 | deb http://mirrors.aliyun.com/ubuntu/ xenial-updates multiverse
11 | deb http://mirrors.aliyun.com/ubuntu/ xenial-backports main restricted universe multiverse
12 | deb-src http://mirrors.aliyun.com/ubuntu/ xenial-backports main restricted universe multiverse #Added by software-properties
13 | deb http://archive.canonical.com/ubuntu xenial partner
14 | deb-src http://archive.canonical.com/ubuntu xenial partner
15 | deb http://mirrors.aliyun.com/ubuntu/ xenial-security main restricted
16 | deb-src http://mirrors.aliyun.com/ubuntu/ xenial-security main restricted multiverse universe #Added by software-properties
17 | deb http://mirrors.aliyun.com/ubuntu/ xenial-security universe
18 | deb http://mirrors.aliyun.com/ubuntu/ xenial-security multiverse
19 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | # DATA
2 |
--------------------------------------------------------------------------------
/flume/README.md:
--------------------------------------------------------------------------------
1 | # flume
2 |
--------------------------------------------------------------------------------
/flume/conf/flume-avro.conf:
--------------------------------------------------------------------------------
1 | # Define a memory channel called c1 on agent
2 | agent.channels.c1.type=memory
3 |
4 | # Define an avro source called r1 on agent and tell it where to listen
5 | agent.sources.r1.channels=c1
6 | agent.sources.r1.type=avro
7 | agent.sources.r1.bind=127.0.0.1
8 | agent.sources.r1.port=44444
9 |
10 | # Describe/configure the HDFS sink
11 | agent.sinks.k1.type=hdfs
12 | agent.sinks.k1.channel=c1
13 | agent.sinks.k1.hdfs.path=hdfs://master:9000/flume_data_pool
14 | agent.sinks.k1.hdfs.filePrefix=events-
15 | agent.sinks.k1.hdfs.fileType=DataStream
16 | agent.sinks.k1.hdfs.writeFormat=Text
17 | agent.sinks.k1.hdfs.rollSize=0
18 | agent.sinks.k1.hdfs.rollCount= 600000
19 | agent.sinks.k1.hdfs.rollInterval=600
20 |
21 | agent.channels=c1
22 | agent.sources=r1
23 | agent.sinks=k1
24 |
--------------------------------------------------------------------------------
/flume/conf/flume-exec.conf:
--------------------------------------------------------------------------------
1 | # Name the components on this agent
2 | agent.sources=r1
3 | agent.sinks=k1
4 | agent.channels=c1
5 |
6 | # Describe/configure the source
7 | agent.sources.r1.type=exec
8 | agent.sources.r1.command=tail -f /data/hadoop/flume/test.txt
9 |
10 | # Describe the sink
11 | agent.sinks.k1.type=logger
12 |
13 | # Use a channel which buffers events in memory
14 | agent.channels.c1.type=memory
15 | agent.channels.c1.capacity=1000
16 | agent.channels.c1.transactionCapacity=100
17 |
18 | # Bind the source and sink to the channel
19 | agent.sources.r1.channels=c1
20 | agent.sinks.k1.channel=c1
21 |
--------------------------------------------------------------------------------
/flume/conf/flume-netcat.conf:
--------------------------------------------------------------------------------
1 | # Name the components on this agent
2 | agent.sources=r1
3 | agent.sinks=k1
4 | agent.channels=c1
5 |
6 | # Describe/configure the source
7 | agent.sources.r1.type=netcat
8 | agent.sources.r1.bind=127.0.0.1
9 | agent.sources.r1.port=44444
10 |
11 | # Describe the sink
12 | agent.sinks.k1.type=logger
13 |
14 | # Use a channel which buffers events in memory
15 | agent.channels.c1.type=memory
16 | agent.channels.c1.capacity=1000
17 | agent.channels.c1.transactionCapacity=100
18 |
19 | # Bind the source and sink to the channel
20 | agent.sources.r1.channels=c1
21 | agent.sinks.k1.channel=c1
22 |
--------------------------------------------------------------------------------
/flume/dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | # ubuntu:16.04 does not ship wget, so install it before downloading flume.
4 | RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
5 |
6 | RUN wget http://archive.apache.org/dist/flume/1.6.0/apache-flume-1.6.0-bin.tar.gz \
7 |     && tar -zxvf /apache-flume-1.6.0-bin.tar.gz -C /usr/local/ \
8 |     && mv /usr/local/apache-flume-1.6.0-bin /usr/local/flume \
9 |     && rm /apache-flume-1.6.0-bin.tar.gz
10 |
--------------------------------------------------------------------------------
/hadoop/README.md:
--------------------------------------------------------------------------------
1 | # hadoop
2 |
3 | ## Network
4 |
5 | - master: 172.17.0.2
6 | - slave1: 172.17.0.3
7 | - slave2: 172.17.0.4
8 |
9 | ## cluster-hadoop
10 |
11 | A hadoop cluster on a single machine, built on top of cluster-base.
12 |
13 | ## Start the containers
14 |
15 | ```bash
16 | docker network create --subnet=172.18.0.0/16 hadoop-network
17 | docker run -itd --name hadoop-master --hostname=hadoop-master --net hadoop-network --ip 172.18.0.2 cluster-hadoop zsh
18 | docker run -itd --name hadoop-slave1 --hostname=hadoop-slave1 --net hadoop-network --ip 172.18.0.3 cluster-hadoop zsh
19 | docker run -itd --name hadoop-slave2 --hostname=hadoop-slave2 --net hadoop-network --ip 172.18.0.4 cluster-hadoop zsh
20 | ```
21 |
22 | ```bash
23 | docker run -ti --name master --hostname=master ubuntu:16.04 zsh
24 | docker run -ti --name slave1 --hostname=slave1 ubuntu:16.04 zsh
25 | docker run -ti --name slave2 --hostname=slave2 ubuntu:16.04 zsh
26 | ```
/hadoop/conf/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>fs.defaultFS</name>
5 |         <value>hdfs://master:9000</value>
6 |     </property>
7 |     <property>
8 |         <name>hadoop.tmp.dir</name>
9 |         <value>file:/usr/local/hadoop/tmp</value>
10 |     </property>
11 | </configuration>
12 |
--------------------------------------------------------------------------------
/hadoop/conf/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>dfs.namenode.secondary.http-address</name>
5 |         <value>master:9001</value>
6 |     </property>
7 |     <property>
8 |         <name>dfs.namenode.name.dir</name>
9 |         <value>file:/usr/local/hadoop/name/</value>
10 |     </property>
11 |     <property>
12 |         <name>dfs.datanode.data.dir</name>
13 |         <value>file:/usr/local/hadoop/data/</value>
14 |     </property>
15 |     <property>
16 |         <name>dfs.replication</name>
17 |         <value>3</value>
18 |     </property>
19 | </configuration>
20 |
--------------------------------------------------------------------------------
/hadoop/conf/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>mapreduce.framework.name</name>
5 |         <value>yarn</value>
6 |     </property>
7 | </configuration>
8 |
--------------------------------------------------------------------------------
/hadoop/conf/masters:
--------------------------------------------------------------------------------
1 | master
--------------------------------------------------------------------------------
/hadoop/conf/slaves:
--------------------------------------------------------------------------------
1 | slave1
2 | slave2
--------------------------------------------------------------------------------
/hadoop/conf/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>yarn.nodemanager.aux-services</name>
5 |         <value>mapreduce_shuffle</value>
6 |     </property>
7 |     <property>
8 |         <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
9 |         <value>org.apache.hadoop.mapred.ShuffleHandler</value>
10 |     </property>
11 |     <property>
12 |         <name>yarn.resourcemanager.address</name>
13 |         <value>master:8032</value>
14 |     </property>
15 |     <property>
16 |         <name>yarn.resourcemanager.scheduler.address</name>
17 |         <value>master:8030</value>
18 |     </property>
19 |     <property>
20 |         <name>yarn.resourcemanager.resource-tracker.address</name>
21 |         <value>master:8035</value>
22 |     </property>
23 |     <property>
24 |         <name>yarn.resourcemanager.admin.address</name>
25 |         <value>master:8033</value>
26 |     </property>
27 |     <property>
28 |         <name>yarn.resourcemanager.webapp.address</name>
29 |         <value>master:8088</value>
30 |     </property>
31 | </configuration>
32 |
--------------------------------------------------------------------------------
/hadoop/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | master:
4 | image: cluster-hadoop
5 | container_name: hadoop-master
6 | networks:
7 | cluster-network:
8 | ipv4_address: 172.20.0.2
9 | aliases:
10 | - hadoop-master
11 | volumes:
12 | - ../data/:/data
13 | - ./:/hadoop
14 | extra_hosts:
15 | - "slave1:172.20.0.3"
16 | - "slave2:172.20.0.4"
17 | hostname: master
18 | environment:
19 | ROLE: master
20 |
21 | slave1:
22 | image: cluster-hadoop
23 | container_name: hadoop-slave1
24 | networks:
25 | cluster-network:
26 | ipv4_address: 172.20.0.3
27 | aliases:
28 | - hadoop-slave1
29 | volumes:
30 | - ../data/:/data
31 | - ./:/hadoop
32 | extra_hosts:
33 | - "master:172.20.0.2"
34 | - "slave2:172.20.0.4"
35 | hostname: slave1
36 | environment:
37 | ROLE: slave
38 |
39 | slave2:
40 | image: cluster-hadoop
41 | container_name: hadoop-slave2
42 | networks:
43 | cluster-network:
44 | ipv4_address: 172.20.0.4
45 | aliases:
46 | - hadoop-slave2
47 | volumes:
48 | - ../data/:/data
49 | - ./:/hadoop
50 | extra_hosts:
51 | - "master:172.20.0.2"
52 | - "slave1:172.20.0.3"
53 | hostname: slave2
54 | environment:
55 | ROLE: slave
56 |
57 | networks:
58 | cluster-network:
59 | driver: bridge
60 | driver_opts:
61 | com.docker.network.enable_ipv6: "false"
62 | ipam:
63 | driver: default
64 | config:
65 | - subnet: 172.20.0.0/16
66 |
--------------------------------------------------------------------------------
/hadoop/dockerfile:
--------------------------------------------------------------------------------
1 | FROM cluster-base
2 |
3 | # Install hadoop:
4 | RUN wget http://archive.apache.org/dist/hadoop/common/hadoop-2.6.0/hadoop-2.6.0.tar.gz \
5 | && tar -zxvf /hadoop-2.6.0.tar.gz -C /usr/local/ \
6 | && mv /usr/local/hadoop-2.6.0 /usr/local/hadoop \
7 | && mkdir /usr/local/hadoop/tmp/ \
8 | && mkdir /usr/local/hadoop/data/ \
9 | && mkdir /usr/local/hadoop/name/ \
10 | && rm /hadoop-2.6.0.tar.gz
11 |
12 | # java8 environment.
13 | ENV JAVA_HOME=/usr/local/jdk1.8
14 | ENV PATH $JAVA_HOME/bin:$PATH
15 |
16 | # hadoop environment.
17 | ENV HADOOP_HOME=/usr/local/hadoop
18 | ENV PATH $HADOOP_HOME/bin:$PATH
19 | ENV HADOOP_PREFIX=$HADOOP_HOME
20 |
21 | RUN cd $HADOOP_HOME \
22 | && echo "export JAVA_HOME=$JAVA_HOME" >> etc/hadoop/hadoop-env.sh \
23 | && echo "export HADOOP_PREFIX=$HADOOP_PREFIX" >> etc/hadoop/hadoop-env.sh \
24 | && echo "export JAVA_HOME=$JAVA_HOME" >> etc/hadoop/yarn-env.sh
25 |
26 | COPY ./conf/core-site.xml $HADOOP_HOME/etc/hadoop
27 | COPY ./conf/hdfs-site.xml $HADOOP_HOME/etc/hadoop
28 | COPY ./conf/mapred-site.xml $HADOOP_HOME/etc/hadoop
29 | COPY ./conf/yarn-site.xml $HADOOP_HOME/etc/hadoop
30 | COPY ./conf/masters $HADOOP_HOME/etc/hadoop
31 | COPY ./conf/slaves $HADOOP_HOME/etc/hadoop
32 |
33 | WORKDIR /hadoop
34 |
35 | EXPOSE 22
36 |
37 | CMD ["/usr/sbin/sshd", "-D"]
38 |
--------------------------------------------------------------------------------
/hadoop/hadoop-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | startMaster() {
4 | /usr/local/hadoop/sbin/start-all.sh
5 | }
6 |
7 | stopMaster() {
8 | /usr/local/hadoop/sbin/stop-all.sh
9 | }
10 |
11 | main() {
12 | service ssh restart
13 | service sshd restart
14 |
15 | sleep 5
16 |
17 |     if [ "${ROLE}" == "master" ]
18 | then
19 | hdfs namenode -format
20 | startMaster
21 | fi
22 | }
23 |
24 | main
25 |
--------------------------------------------------------------------------------
/hbase/README.md:
--------------------------------------------------------------------------------
1 | # hbase
2 |
--------------------------------------------------------------------------------
/hbase/conf/hbase-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>hbase.tmp.dir</name>
5 |         <value>/usr/local/hbase/tmp</value>
6 |     </property>
7 |     <property>
8 |         <name>hbase.rootdir</name>
9 |         <value>hdfs://master:9000/hbase</value>
10 |     </property>
11 |     <property>
12 |         <name>hbase.cluster.distributed</name>
13 |         <value>true</value>
14 |     </property>
15 |     <property>
16 |         <name>hbase.zookeeper.quorum</name>
17 |         <value>master, slave1, slave2</value>
18 |     </property>
19 |     <property>
20 |         <name>hbase.zookeeper.property.dataDir</name>
21 |         <value>/usr/local/hbase/zookeeper</value>
22 |     </property>
23 |     <property>
24 |         <name>hbase.master.info.port</name>
25 |         <value>60010</value>
26 |     </property>
27 | </configuration>
28 |
--------------------------------------------------------------------------------
/hbase/conf/regionservers:
--------------------------------------------------------------------------------
1 | master
2 | slave1
3 | slave2
4 |
--------------------------------------------------------------------------------
/hbase/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | master:
4 | image: cluster-hbase
5 | container_name: hbase-master
6 | networks:
7 | default:
8 | ipv4_address: 172.17.0.2
9 | extra_hosts:
10 | - "slave1:172.17.0.3"
11 | - "slave2:172.17.0.4"
12 | hostname: master
13 | environment:
14 | ZK_ID: 1
15 | ROLE: master
16 | tty: true
17 | stdin_open: true
18 |
19 | slave1:
20 | image: cluster-hbase
21 | container_name: hbase-slave1
22 | networks:
23 | default:
24 | ipv4_address: 172.17.0.3
25 | extra_hosts:
26 | - "master:172.17.0.2"
27 | - "slave2:172.17.0.4"
28 | hostname: slave1
29 | environment:
30 | ZK_ID: 2
31 | ROLE: slave
32 | tty: true
33 | stdin_open: true
34 |
35 | slave2:
36 | image: cluster-hbase
37 | container_name: hbase-slave2
38 | networks:
39 | default:
40 | ipv4_address: 172.17.0.4
41 | extra_hosts:
42 | - "master:172.17.0.2"
43 | - "slave1:172.17.0.3"
44 | hostname: slave2
45 | environment:
46 | ZK_ID: 3
47 | ROLE: slave
48 | tty: true
49 | stdin_open: true
50 |
51 | networks:
52 | default:
53 | driver: bridge
54 | driver_opts:
55 | com.docker.network.enable_ipv6: "false"
56 | ipam:
57 | driver: default
58 | config:
59 | - subnet: 172.17.0.0/17
60 |
--------------------------------------------------------------------------------
/hbase/dockerfile:
--------------------------------------------------------------------------------
1 | FROM cluster-spark
2 |
3 | RUN wget http://archive.apache.org/dist/hbase/hbase-0.98.6/hbase-0.98.6-hadoop2-bin.tar.gz \
4 | && tar -zxvf /hbase-0.98.6-hadoop2-bin.tar.gz -C /usr/local/ \
5 | && mv /usr/local/hbase-0.98.6-hadoop2 /usr/local/hbase \
6 | && rm /hbase-0.98.6-hadoop2-bin.tar.gz \
7 | && mkdir /usr/local/hbase/tmp
8 |
9 | ENV JAVA_HOME=/usr/local/jdk1.8
10 | ENV CLASSPATH=.:$CLASSPATH:$JAVA_HOME/lib
11 |
12 | ENV HBASE_HOME=/usr/local/hbase
13 | ENV HBASE_CLASSPATH=$HBASE_HOME/conf
14 | ENV HBASE_LOG_DIR=$HBASE_HOME/logs
15 | ENV PATH=$PATH:$HBASE_HOME/bin
16 |
--------------------------------------------------------------------------------
/hive/README.md:
--------------------------------------------------------------------------------
1 | # hive
2 |
--------------------------------------------------------------------------------
/hive/conf/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>javax.jdo.option.ConnectionURL</name>
5 |         <value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true</value>
6 |     </property>
7 |     <property>
8 |         <name>javax.jdo.option.ConnectionDriverName</name>
9 |         <value>com.mysql.jdbc.Driver</value>
10 |     </property>
11 |     <property>
12 |         <name>javax.jdo.option.ConnectionUserName</name>
13 |         <value>root</value>
14 |     </property>
15 |     <property>
16 |         <name>javax.jdo.option.ConnectionPassword</name>
17 |         <value>hadoop</value>
18 |     </property>
19 | </configuration>
20 |
--------------------------------------------------------------------------------
/hive/dockerfile:
--------------------------------------------------------------------------------
1 | FROM cluster-mysql
2 |
3 | RUN wget http://mirror.bit.edu.cn/apache/hive/hive-1.2.2/apache-hive-1.2.2-bin.tar.gz \
4 | && tar -zxvf /apache-hive-1.2.2-bin.tar.gz -C /usr/local/ \
5 | && mv /usr/local/apache-hive-1.2.2-bin /usr/local/hive \
6 | && rm /apache-hive-1.2.2-bin.tar.gz
7 |
8 | ENV HIVE_HOME=/usr/local/hive
9 | ENV PATH=$HIVE_HOME/bin:$PATH
10 |
--------------------------------------------------------------------------------
/kafka/README.md:
--------------------------------------------------------------------------------
1 | # Kafka
2 |
3 | If you want to run kafka as a cluster, run the following in the current folder:
4 |
5 | ```bash
6 | docker-compose up
7 | ```
8 |
9 | After that, run:
10 |
11 | ```bash
12 | docker ps
13 | ```
14 |
15 | You will see three running containers named kafka-master, kafka-slave1, and kafka-slave2.
16 |
17 | Then go inside the master node, which is kafka-master, and check the running JVM processes:
18 |
19 | ```bash
20 | docker exec -it kafka-master zsh
21 | jps
22 | ```
23 |
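24 | To check that a broker actually accepts traffic, a quick produce/consume round trip can be run from inside kafka-master. This is only a sketch: it assumes the containers above are running, that ZooKeeper is reachable at master:2181 (as configured in conf/server.propertie), and `test` is a hypothetical topic name.
25 |
26 | ```bash
27 | # Start a broker from the shipped config, then push one message through a topic.
28 | docker exec -it kafka-master bash -c '
29 |   /usr/local/kafka/bin/kafka-server-start.sh -daemon /usr/local/kafka/config/server.properties
30 |   sleep 5
31 |   /usr/local/kafka/bin/kafka-topics.sh --create --zookeeper master:2181 \
32 |       --replication-factor 1 --partitions 1 --topic test
33 |   echo "hello" | /usr/local/kafka/bin/kafka-console-producer.sh \
34 |       --broker-list master:9092 --topic test
35 |   /usr/local/kafka/bin/kafka-console-consumer.sh --bootstrap-server master:9092 \
36 |       --topic test --from-beginning --max-messages 1
37 | '
38 | ```
39 |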
40 | If you want to stop kafka, run the following in the current folder:
41 |
42 | ```bash
43 | docker-compose down
44 | ```
45 |
--------------------------------------------------------------------------------
/kafka/conf/server.propertie:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # see kafka.server.KafkaConfig for additional details and defaults
17 |
18 | ############################# Server Basics #############################
19 |
20 |
21 | # Switch to enable topic deletion or not, default value is false
22 | #delete.topic.enable=true
23 |
24 | ############################# Socket Server Settings #############################
25 |
26 | # The address the socket server listens on. It will get the value returned from
27 | # java.net.InetAddress.getCanonicalHostName() if not configured.
28 | # FORMAT:
29 | # listeners = listener_name://host_name:port
30 | # EXAMPLE:
31 | # listeners = PLAINTEXT://your.host.name:9092
32 | #listeners=PLAINTEXT://:9092
33 |
34 | # Hostname and port the broker will advertise to producers and consumers. If not set,
35 | # it uses the value for "listeners" if configured. Otherwise, it will use the value
36 | # returned from java.net.InetAddress.getCanonicalHostName().
37 | #advertised.listeners=PLAINTEXT://your.host.name:9092
38 |
39 | # Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details
40 | #listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL
41 |
42 | # The number of threads handling network requests
43 | num.network.threads=3
44 |
45 | # The number of threads doing disk I/O
46 | num.io.threads=8
47 |
48 | # The send buffer (SO_SNDBUF) used by the socket server
49 | socket.send.buffer.bytes=102400
50 |
51 | # The receive buffer (SO_RCVBUF) used by the socket server
52 | socket.receive.buffer.bytes=102400
53 |
54 | # The maximum size of a request that the socket server will accept (protection against OOM)
55 | socket.request.max.bytes=104857600
56 |
57 |
58 | ############################# Log Basics #############################
59 |
60 | # A comma separated list of directories under which to store log files
61 | log.dirs=/usr/local/kafka/kafka-logs
62 | zookeeper.connect=master:2181, slave1:2181, slave2:2181
63 |
64 | # The default number of log partitions per topic. More partitions allow greater
65 | # parallelism for consumption, but this will also result in more files across
66 | # the brokers.
67 | num.partitions=1
68 |
69 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown.
70 | # This value is recommended to be increased for installations with data dirs located in RAID array.
71 | num.recovery.threads.per.data.dir=1
72 |
73 | ############################# Log Flush Policy #############################
74 |
75 | # Messages are immediately written to the filesystem but by default we only fsync() to sync
76 | # the OS cache lazily. The following configurations control the flush of data to disk.
77 | # There are a few important trade-offs here:
78 | # 1. Durability: Unflushed data may be lost if you are not using replication.
79 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush.
80 | #    3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks.
81 | # The settings below allow one to configure the flush policy to flush data after a period of time or
82 | # every N messages (or both). This can be done globally and overridden on a per-topic basis.
83 |
84 | # The number of messages to accept before forcing a flush of data to disk
85 | #log.flush.interval.messages=10000
86 |
87 | # The maximum amount of time a message can sit in a log before we force a flush
88 | #log.flush.interval.ms=1000
89 |
90 | ############################# Log Retention Policy #############################
91 |
92 | # The following configurations control the disposal of log segments. The policy can
93 | # be set to delete segments after a period of time, or after a given size has accumulated.
94 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens
95 | # from the end of the log.
96 |
97 | # The minimum age of a log file to be eligible for deletion due to age
98 | log.retention.hours=168
99 |
100 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
101 | # segments don't drop below log.retention.bytes. Functions independently of log.retention.hours.
102 | #log.retention.bytes=1073741824
103 |
104 | # The maximum size of a log segment file. When this size is reached a new log segment will be created.
105 | log.segment.bytes=1073741824
106 |
107 | # The interval at which log segments are checked to see if they can be deleted according
108 | # to the retention policies
109 | log.retention.check.interval.ms=300000
110 |
111 | ############################# Zookeeper #############################
112 |
113 | # Zookeeper connection string (see zookeeper docs for details).
114 | # This is a comma separated host:port pairs, each corresponding to a zk
115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
116 | # You can also append an optional chroot string to the urls to specify the
117 | # root directory for all kafka znodes.
118 |
119 | # Timeout in ms for connecting to zookeeper
120 | zookeeper.connection.timeout.ms=6000
121 |
--------------------------------------------------------------------------------
/kafka/dockerfile:
--------------------------------------------------------------------------------
1 | FROM cluster-hadoop
2 |
3 | RUN wget http://mirrors.tuna.tsinghua.edu.cn/apache/kafka/0.10.2.1/kafka_2.11-0.10.2.1.tgz \
4 | && tar -zxvf /kafka_2.11-0.10.2.1.tgz -C /usr/local/ \
5 | && mv /usr/local/kafka_2.11-0.10.2.1 /usr/local/kafka \
6 | && rm /kafka_2.11-0.10.2.1.tgz \
7 | && mkdir -p /usr/local/kafka/kafka-logs
8 |
9 | COPY ./conf/server.propertie /usr/local/kafka/config/server.properties
10 |
11 | ENV KAFKA_HOME=/usr/local/kafka
12 | ENV PATH $KAFKA_HOME/bin:$PATH
13 |
--------------------------------------------------------------------------------
/mysql/README.md:
--------------------------------------------------------------------------------
1 | # mysql
--------------------------------------------------------------------------------
/mysql/dockerfile:
--------------------------------------------------------------------------------
1 | FROM cluster-base
2 |
3 | RUN wget --no-check-certificate --no-cookies http://dev.mysql.com/get/Downloads/MySQL-5.1/mysql-5.1.41.tar.gz \
4 | && tar -zxvf /mysql-5.1.41.tar.gz -C /usr/local \
5 | && mv /usr/local/mysql-5.1.41 /usr/local/mysql \
6 | && rm /mysql-5.1.41.tar.gz
7 |
--------------------------------------------------------------------------------
/redis/README.md:
--------------------------------------------------------------------------------
1 | # redis
2 |
--------------------------------------------------------------------------------
/redis/redis.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | # ubuntu:16.04 does not ship wget, so install it before downloading redis.
4 | RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
5 |
6 | RUN wget http://download.redis.io/releases/redis-2.8.3.tar.gz \
7 |     && tar -zxvf /redis-2.8.3.tar.gz -C /usr/local/ \
8 |     && mv /usr/local/redis-2.8.3 /usr/local/redis \
9 |     && rm /redis-2.8.3.tar.gz
10 |
--------------------------------------------------------------------------------
/spark/README.md:
--------------------------------------------------------------------------------
1 | # spark
2 |
3 | This project uses Docker images to build a distributed cluster: one master node and two slave nodes.
4 |
5 | ## Using spark
6 |
7 | > All of the functionality is wrapped in the scripts below.
8 |
9 | ### spark scripts
10 |
11 | - `bash ./initial-spark.sh`: initialize the spark environment. It can only be initialized once; delete the spark environment before re-initializing.
12 | - `bash ./start-spark.sh`: start the spark environment in the background (no output is expected).
13 | - `bash ./stop-spark.sh`: stop the spark environment.
14 | - `bash ./delete-spark.sh`: delete the spark environment.
15 | - `bash ./pyspark-shell.sh`: once the spark environment has been initialized and started, run this to open an interactive pyspark shell.
16 |
17 | ### Interactive pyspark
18 |
19 | 1. In a terminal, change into this directory.
20 | 2. Run `bash ./initial-spark.sh` to initialize the environment **(the first run downloads packages and takes a while)**, then `bash ./start-spark.sh` to start the spark environment, and finally `bash ./pyspark-shell.sh` to drop straight into the interactive pyspark shell. It is configured for `python3` and uses the `ipython` interface.
21 | 3. When you are done, run `bash ./stop-spark.sh` to stop the spark environment, or `bash ./delete-spark.sh` to delete it.
22 |
23 | Because `pyspark` is an interactive shell, a `SparkContext` session is already created for you at startup, so `sc` can be used directly.
24 |
25 | In `ipython`, `tab` completion is available.
26 |
27 | ### Data files
28 |
29 | Put the data files you need under the project's `/data` directory; inside the `pyspark` session they are available under `/data` as well.
--------------------------------------------------------------------------------
/spark/compose.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function main() {
4 |     service ssh restart
5 |     service sshd restart
6 |     if [ 'master' = "${ROLE}" ]
7 |     then
8 |         hdfs namenode -format
9 |     fi
10 | }
11 |
12 | main
13 |
14 | # Keep the container in the foreground, matching the image's default CMD,
15 | # so docker-compose does not see the master exit immediately.
16 | exec /usr/sbin/sshd -D
17 |
--------------------------------------------------------------------------------
/spark/conf/slaves:
--------------------------------------------------------------------------------
1 | slave1
2 | slave2
3 |
--------------------------------------------------------------------------------
/spark/conf/spark-env.sh:
--------------------------------------------------------------------------------
1 | export SCALA_HOME=/usr/local/scala
2 | export JAVA_HOME=/usr/local/jdk1.8
3 | export HADOOP_HOME=/usr/local/hadoop
4 | export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
5 |
6 | SPARK_MASTER_IP=master
7 | SPARK_LOCAL_DIRS=/usr/local/spark
8 | SPARK_DRIVER_MEMORY=1G            # memory for the driver process; adjust to your machine
9 | export SPARK_WORKER_INSTANCES=1   # number of worker instances started on each machine
10 | export SPARK_MASTER_PORT=7077
11 | export SPARK_WORKER_CORES=2       # number of CPU cores each worker may use
12 | export SPARK_WORKER_MEMORY=512m   # maximum memory each worker can hand out to its executors
13 |
--------------------------------------------------------------------------------
/spark/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | master:
4 | image: cluster-spark
5 | container_name: spark-master
6 | volumes:
7 | - ../data/:/data
8 | - ./:/spark
9 | networks:
10 | cluster-spark-network:
11 | ipv4_address: 172.88.0.2
12 | extra_hosts:
13 | - "slave1:172.88.0.3"
14 | - "slave2:172.88.0.4"
15 | hostname: master
16 | environment:
17 | ROLE: master
18 | command: ['bash','/spark/compose.sh']
19 |
20 | slave1:
21 | image: cluster-spark
22 | container_name: spark-slave1
23 | volumes:
24 | - ../data/:/data
25 | - ./:/spark
26 | depends_on:
27 | - master
28 | networks:
29 | cluster-spark-network:
30 | ipv4_address: 172.88.0.3
31 | extra_hosts:
32 | - "master:172.88.0.2"
33 | - "slave2:172.88.0.4"
34 | hostname: slave1
35 | environment:
36 | ROLE: slave
37 |
38 | slave2:
39 | image: cluster-spark
40 | container_name: spark-slave2
41 | volumes:
42 | - ../data/:/data
43 | - ./:/spark
44 | networks:
45 | cluster-spark-network:
46 | ipv4_address: 172.88.0.4
47 | extra_hosts:
48 | - "master:172.88.0.2"
49 | - "slave1:172.88.0.3"
50 | depends_on:
51 | - master
52 | hostname: slave2
53 | environment:
54 | ROLE: slave
55 |
56 | networks:
57 | cluster-spark-network:
58 | driver: bridge
59 | driver_opts:
60 | com.docker.network.enable_ipv6: "false"
61 | ipam:
62 | driver: default
63 | config:
64 | - subnet: 172.88.0.0/16
65 |
--------------------------------------------------------------------------------
/spark/dockerfile:
--------------------------------------------------------------------------------
1 | FROM cluster-hadoop
2 |
3 | RUN wget https://archive.apache.org/dist/spark/spark-1.6.0/spark-1.6.0-bin-hadoop2.6.tgz \
4 | && tar -zxvf /spark-1.6.0-bin-hadoop2.6.tgz -C /usr/local/ \
5 | && mv /usr/local/spark-1.6.0-bin-hadoop2.6 /usr/local/spark \
6 | && rm /spark-1.6.0-bin-hadoop2.6.tgz
7 |
8 | COPY ./conf/slaves /usr/local/spark/conf/
9 | COPY ./conf/spark-env.sh /usr/local/spark/conf/
10 |
11 | # Configure pyspark to use python3, with ipython as the interactive interface.
12 | ENV PYSPARK_PYTHON=python3
13 | ENV PYSPARK_DRIVER_PYTHON=ipython
14 | ENV SPARK_HOME=/usr/local/spark
15 |
16 | WORKDIR /spark
17 |
18 | EXPOSE 22
19 |
20 | CMD ["/usr/sbin/sshd", "-D"]
21 |
--------------------------------------------------------------------------------
/storm/README.md:
--------------------------------------------------------------------------------
1 | # storm
--------------------------------------------------------------------------------
/storm/conf/storm.yaml:
--------------------------------------------------------------------------------
1 | storm.zookeeper.servers:
2 | - "master"
3 | - "slave1"
4 | - "slave2"
5 |
6 | nimbus.host: "master"
7 | supervisor.slots.ports:
8 | - 6700
9 | - 6701
10 | - 6702
11 | - 6703
12 | - 6704
13 | - 6705
14 |
--------------------------------------------------------------------------------
/storm/dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | # ubuntu:16.04 does not ship wget, so install it before downloading storm.
4 | RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
5 |
6 | RUN wget https://archive.apache.org/dist/storm/apache-storm-0.9.3/apache-storm-0.9.3.tar.gz \
7 |     && tar -zxvf /apache-storm-0.9.3.tar.gz -C /usr/local \
8 |     && mv /usr/local/apache-storm-0.9.3 /usr/local/storm \
9 |     && rm /apache-storm-0.9.3.tar.gz
10 |
11 | ENV STORM_HOME=/usr/local/storm
12 | ENV PATH=$PATH:$STORM_HOME/bin
13 |
14 | RUN chmod +x /usr/local/storm/bin/stop-storm.sh
15 |
--------------------------------------------------------------------------------
/zookeeper/README.md:
--------------------------------------------------------------------------------
1 | # zookeeper
2 |
--------------------------------------------------------------------------------
/zookeeper/conf/zoo.cfg:
--------------------------------------------------------------------------------
1 | dataDir=/usr/local/zookeeper/data
2 | dataLogDir=/usr/local/zookeeper/log
3 | server.1=master:2888:3888
4 | server.2=slave1:2888:3888
5 | server.3=slave2:2888:3888
6 | tickTime=2000
7 | clientPort=2181
8 | initLimit=5
9 | syncLimit=2
10 |
--------------------------------------------------------------------------------
/zookeeper/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | master:
4 | image: cluster-zookeeper
5 | container_name: zk-master
6 | networks:
7 | default:
8 | ipv4_address: 172.17.0.2
9 | extra_hosts:
10 | - "slave1:172.17.0.3"
11 | - "slave2:172.17.0.4"
12 | hostname: master
13 | environment:
14 | ZK_ID: 1
15 | tty: true
16 |
17 | slave1:
18 | image: cluster-zookeeper
19 | container_name: zk-slave1
20 | networks:
21 | default:
22 | ipv4_address: 172.17.0.3
23 | extra_hosts:
24 | - "master:172.17.0.2"
25 | - "slave2:172.17.0.4"
26 | hostname: slave1
27 | environment:
28 | ZK_ID: 2
29 | tty: true
30 |
31 | slave2:
32 | image: cluster-zookeeper
33 | container_name: zk-slave2
34 | networks:
35 | default:
36 | ipv4_address: 172.17.0.4
37 | extra_hosts:
38 | - "master:172.17.0.2"
39 | - "slave1:172.17.0.3"
40 | hostname: slave2
41 | environment:
42 | ZK_ID: 3
43 | tty: true
44 |
45 | networks:
46 | default:
47 | driver: bridge
48 | driver_opts:
49 | com.docker.network.enable_ipv6: "false"
50 | ipam:
51 | driver: default
52 | config:
53 | - subnet: 172.17.0.0/17
54 |
--------------------------------------------------------------------------------
/zookeeper/dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | # Install the zookeeper environment. ubuntu:16.04 does not ship wget, so install it first.
4 | RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
5 |
6 | RUN wget http://mirror.bit.edu.cn/apache/zookeeper/zookeeper-3.4.10/zookeeper-3.4.10.tar.gz \
7 |     && tar -zxvf /zookeeper-3.4.10.tar.gz -C /usr/local/ \
8 |     && mv /usr/local/zookeeper-3.4.10 /usr/local/zookeeper \
9 |     && rm /zookeeper-3.4.10.tar.gz \
10 |     && mkdir /usr/local/zookeeper/data \
11 |     && mkdir /usr/local/zookeeper/log
12 |
13 | COPY ./conf/zoo.cfg /usr/local/zookeeper/conf
14 |
15 | ENV ZOOKEEPER_HOME=/usr/local/zookeeper
16 | ENV PATH $ZOOKEEPER_HOME/bin:$PATH
17 |
--------------------------------------------------------------------------------