├── .gitignore
├── conf
│   ├── config
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   ├── core-site.xml
│   └── yarn-site.xml
├── start-hadoop.sh
├── LICENSE
├── Dockerfile
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | *.tar.gz
2 | *.tgz
3 | *.zip
4 |
--------------------------------------------------------------------------------
/conf/config:
--------------------------------------------------------------------------------
1 | Host *
2 | UserKnownHostsFile /dev/null
3 | StrictHostKeyChecking no
4 |
--------------------------------------------------------------------------------
/conf/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <property>
3 |         <name>dfs.replication</name>
4 |         <value>1</value>
5 |     </property>
6 | </configuration>
7 |
--------------------------------------------------------------------------------
/conf/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <property>
3 |         <name>mapreduce.framework.name</name>
4 |         <value>yarn</value>
5 |     </property>
6 | </configuration>
7 |
--------------------------------------------------------------------------------
/conf/core-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <property>
3 |         <name>fs.defaultFS</name>
4 |         <value>hdfs://localhost:9000</value>
5 |     </property>
6 | </configuration>
7 |
--------------------------------------------------------------------------------
/conf/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <property>
3 |         <name>yarn.nodemanager.aux-services</name>
4 |         <value>mapreduce_shuffle</value>
5 |     </property>
6 |     <property>
7 |         <name>yarn.resourcemanager.address</name>
8 |         <value>127.0.0.1:8032</value>
9 |     </property>
10 | </configuration>
11 |
--------------------------------------------------------------------------------
/start-hadoop.sh:
--------------------------------------------------------------------------------
1 |
2 |
3 | # start ssh server
4 | /etc/init.d/ssh start
5 |
6 | # format namenode
7 | $HADOOP_HOME/bin/hdfs namenode -format
8 |
9 | # start hadoop
10 | $HADOOP_HOME/sbin/start-dfs.sh
11 | $HADOOP_HOME/sbin/start-yarn.sh
12 | $HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
13 |
14 | # keep container running
15 | tail -f /dev/null
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Carneirão
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 |
3 | # set environment vars
4 | ENV HADOOP_BASE /opt/hadoop
5 | ENV HADOOP_HOME /opt/hadoop/current
6 | ENV HADOOP_VERSION=2.8.5
7 | ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
8 | ENV SPARK_BASE /opt/spark
9 | ENV SPARK_HOME /opt/spark/current
10 | ENV SPARK_VERSION=2.4.4
11 |
12 | # configure tz to avoid interactive prompts from the tzdata package
13 | ENV TZ=America/Sao_Paulo
14 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
15 |
16 | # Install packages
17 | RUN \
18 | apt-get update && apt-get install -y \
19 | net-tools \
20 | sudo \
21 | curl \
22 | ssh \
23 | rsync \
24 | vim \
25 | openjdk-8-jdk \
26 | maven \
27 | python3-pip \
28 | jupyter-notebook
29 |
30 |
31 | # download and extract hadoop, set JAVA_HOME in hadoop-env.sh, update path
32 | RUN curl -L \
33 | --progress-bar 'https://www-us.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz' \
34 | -o "hadoop-$HADOOP_VERSION.tar.gz"
35 |
36 | COPY hadoop-$HADOOP_VERSION.tar.gz .
37 | RUN mkdir -p $HADOOP_BASE \
38 | && tar -xzvmf hadoop-$HADOOP_VERSION.tar.gz -C $HADOOP_BASE/ \
39 | && cd $HADOOP_BASE \
40 | && ln -s hadoop-$HADOOP_VERSION current \
41 | && cd / \
42 | && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
43 | && echo "PATH=$PATH:$HADOOP_HOME/bin" >> ~/.bashrc
44 |
45 | # create ssh keys
46 | RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
47 | RUN cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
48 | RUN chmod 0600 ~/.ssh/authorized_keys
49 |
50 | # copy hadoop configs
51 | COPY conf/*xml $HADOOP_HOME/etc/hadoop/
52 |
53 | # copy ssh config
54 | COPY conf/config /root/.ssh/config
55 |
56 | # create hduser user
57 | RUN useradd -m -s /bin/bash hduser \
58 | && groupadd hdfs \
59 | && usermod -aG hdfs hduser \
60 | && usermod -aG sudo hduser \
61 | && mkdir ~hduser/.ssh
62 |
63 | # create ssh keys
64 | RUN ssh-keygen -t rsa -P '' -f ~hduser/.ssh/id_rsa \
65 | && cat ~/.ssh/id_rsa.pub >> ~hduser/.ssh/authorized_keys \
66 | && chmod 0600 ~hduser/.ssh/authorized_keys
67 |
68 | # download and build spark with maven, with Hive and Hive Thrift Server support
69 | ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
70 | RUN curl -L \
71 | --progress-bar 'https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz' \
72 | -o "spark-$SPARK_VERSION.tgz"
73 |
74 | COPY spark-$SPARK_VERSION.tgz .
75 | ENV SPARK_PART_VERSION=2.4
76 | ENV HADOOP_PART_VERSION=2.8
77 |
78 | RUN mkdir -p $SPARK_BASE && tar -xzmvf spark-$SPARK_VERSION.tgz \
79 | && cd spark-$SPARK_VERSION \
80 | && ./build/mvn \
81 | -Pyarn -Phadoop-$HADOOP_PART_VERSION -Dhadoop.version=$HADOOP_VERSION \
82 | -Phive -Phive-thriftserver \
83 | -DskipTests clean package
84 |
85 | # Moving the Spark build dir straight into $SPARK_HOME proved impossible, so repack it and extract it under $SPARK_BASE instead
86 | RUN cd /
87 | RUN tar -cBpvzf spark-$SPARK_VERSION.tar.gz spark-$SPARK_VERSION
88 | RUN tar -xzvmf spark-$SPARK_VERSION.tar.gz -C $SPARK_BASE/
89 | RUN ln -s spark-$SPARK_VERSION $SPARK_HOME \
90 | && cd /
91 |
92 | # Install pyspark
93 | RUN pip3 install pyspark
94 |
95 | # Configuring ~hduser/.bashrc
96 | RUN echo "export JAVA_HOME=$JAVA_HOME" >> ~hduser/.bashrc \
97 | && echo "export HADOOP_HOME=$HADOOP_HOME" >> ~hduser/.bashrc \
98 | && echo "alias python='python3.6'" >> ~hduser/.bashrc \
99 | && echo "alias pip='pip3'" >> ~hduser/.bashrc \
100 | && echo "export PYSPARK_PYTHON='python3.6'" >> ~hduser/.bashrc \
101 | && echo "export SPARK_HOME=$SPARK_HOME" >> ~hduser/.bashrc \
102 | && echo "export SPARK_MAJOR_VERSION=2" >> ~hduser/.bashrc \
103 | && echo "export PATH=$PATH:$HADOOP_HOME/bin:$SPARK_HOME/bin" >> ~hduser/.bashrc
104 |
105 | # copy script to start hadoop
106 | COPY start-hadoop.sh /start-hadoop.sh
107 | RUN bash start-hadoop.sh &
108 |
109 | # Preparing HDFS for hduser
110 | RUN $HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hduser
111 | RUN $HADOOP_HOME/bin/hdfs dfs -chown hduser /user/hduser
112 |
113 | # Cleanup
114 | RUN rm -f *.tar.gz *.tgz *.sh
115 |
116 | # expose various ports
117 | EXPOSE 8088 8888 5000 50070 50075 50030 50060
118 |
119 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bigram-hadoop
2 |
3 | A "bootstrap" image for pySpark developers.
4 |
5 |
6 | ## Version
7 |
8 | 0.0.8
9 |
10 |
11 |
12 | ## Introduction
13 |
14 | This repository holds the Dockerfile and the components needed to build an image that provides a minimal environment for working with Hadoop and pySpark.
15 |
16 |
17 |
18 | ## Features
19 |
20 | * Hadoop 2.8.5 (MapReduce + YARN + HDFS)
21 | * Spark 2.4.4 (built with Maven, since this combination of Hadoop and Spark versions has to be built from source)
22 | * Python 3.6
23 | * jupyter-notebook
24 |
25 |
26 |
27 | ## Requirements
28 |
29 | * Some Linux distro (I can't say whether this works on Windows or macOS; probably yes on macOS!)
30 |
31 | * docker 19.03.5
32 |
33 | * Dockerfile
34 |
35 | * 16GB of RAM
36 |
37 | * Intel Core i5 is OK, but an i7 is recommended
38 |
39 |
40 |
41 | ## Files, directories
42 |
43 | ```bash
44 | .
45 | ├── conf
46 | │ ├── config
47 | │ ├── core-site.xml
48 | │ ├── hdfs-site.xml
49 | │ ├── mapred-site.xml
50 | │ └── yarn-site.xml
51 | ├── Dockerfile
52 | ├── LICENSE
53 | ├── README.md
54 | └── start-hadoop.sh
55 |
56 | ```
57 |
58 | * conf/config: ssh configuration file
59 |
60 | * conf/*-site.xml: hadoop basic configuration files
61 |
62 | * Dockerfile: recipe for building the image/container
63 |
64 | * start-hadoop.sh: script that starts the Hadoop environment (ssh, HDFS, YARN, and the job history server)
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 | ## Getting started
73 |
74 | First of all, **install docker!**
75 |
76 | * [How to install docker on Ubuntu/Mint](https://docs.docker.com/install/linux/docker-ce/ubuntu/)
77 |
78 |
79 |
80 | Then, choose your "destiny"!
81 |
82 |
83 |
84 |
85 |
86 | ### Dockerhub way
87 |
88 | Faster, not as fun, and **probably out of date**, but it works! Just run *docker pull* as below:
89 |
90 | `docker pull carneiro/bigram-hadoop`
91 |
92 |
93 |
94 | Dockerhub image site: [carneiro/bigram-hadoop](https://hub.docker.com/repository/docker/carneiro/bigram-hadoop)
95 |
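If you go this way, the image tag is `carneiro/bigram-hadoop` instead of the locally built `bigram-hadoop`, so starting a container would look roughly like the run command from the Dockerfile section below, just with the Dockerhub image name:

```bash
docker run \
    --network host \
    --cpus=".5" \
    --memory="8g" \
    --name bigram-hadoop-container \
    -d carneiro/bigram-hadoop
```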
96 |
97 |
98 | ### Dockerfile way
99 |
100 | The Dockerfile is certainly up to date, but the build is very slow!
101 |
102 |
103 |
104 | What will this do?
105 |
106 | 1. Start from a basic Linux image (Ubuntu 18.04)
107 | 2. Install the Hadoop 2.8.5 basic stack (HDFS, YARN, MapReduce, etc.)
108 | 3. Build Spark 2.4.4 using Maven and configure it
109 |
110 |
111 |
112 | All the code is on [Github](https://github.com/bang/bigram-hadoop)
113 |
114 |
115 |
116 | 1. Dockerfile:
117 |
118 | ```dockerfile
119 | FROM ubuntu:18.04
120 |
121 | # set environment vars
122 | ENV HADOOP_BASE /opt/hadoop
123 | ENV HADOOP_HOME /opt/hadoop/current
124 | ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
125 | ENV SPARK_BASE /opt/spark
126 | ENV SPARK_HOME /opt/spark/current
127 |
128 | # configure tz to avoid interactive prompts from the tzdata package
129 | ENV TZ=America/Sao_Paulo
130 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
131 |
132 | # Install packages
133 | RUN \
134 | apt-get update && apt-get install -y \
135 | net-tools \
136 | sudo \
137 | curl \
138 | ssh \
139 | rsync \
140 | vim \
141 | openjdk-8-jdk \
142 | maven \
143 | python3-pip \
144 | jupyter-notebook
145 |
146 |
147 | # download and extract hadoop, set JAVA_HOME in hadoop-env.sh, update path
148 | #RUN curl -L \
149 | # --progress-bar 'https://www-us.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz' \
150 | # -o "hadoop-2.8.5.tar.gz"
151 | ENV HADOOP_VERSION=2.8.5
152 | COPY hadoop-$HADOOP_VERSION.tar.gz .
153 | RUN mkdir -p $HADOOP_BASE \
154 | && tar -xzvmf hadoop-$HADOOP_VERSION.tar.gz -C $HADOOP_BASE/ \
155 | && cd $HADOOP_BASE \
156 | && ln -s hadoop-$HADOOP_VERSION current \
157 | && cd / \
158 | && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
159 | && echo "PATH=$PATH:$HADOOP_HOME/bin" >> ~/.bashrc
160 |
161 | # create ssh keys
162 | RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
163 | RUN cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
164 | RUN chmod 0600 ~/.ssh/authorized_keys
165 |
166 | # copy hadoop configs
167 | COPY conf/*xml $HADOOP_HOME/etc/hadoop/
168 |
169 | # copy ssh config
170 | COPY conf/config /root/.ssh/config
171 |
172 | # create hduser user
173 | RUN useradd -m -s /bin/bash hduser \
174 | && groupadd hdfs \
175 | && usermod -aG hdfs hduser \
176 | && usermod -aG sudo hduser \
177 | && mkdir ~hduser/.ssh
178 |
179 | # create ssh keys
180 | RUN ssh-keygen -t rsa -P '' -f ~hduser/.ssh/id_rsa \
181 | && cat ~/.ssh/id_rsa.pub >> ~hduser/.ssh/authorized_keys \
182 | && chmod 0600 ~hduser/.ssh/authorized_keys
183 |
184 | # download and build spark with maven, with Hive and Hive Thrift Server support
185 | ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
186 | # RUN curl -L \
187 | # # --progress-bar 'https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz' \
188 | # # -o "spark-2.4.4.tgz"
189 |
190 | ENV SPARK_VERSION=2.4.4
191 |
192 | COPY spark-$SPARK_VERSION.tgz .
193 | ENV SPARK_PART_VERSION=2.4
194 | ENV HADOOP_PART_VERSION=2.8
195 |
196 | RUN mkdir -p $SPARK_BASE && tar -xzmvf spark-$SPARK_VERSION.tgz \
197 | && cd spark-$SPARK_VERSION \
198 | && ./build/mvn \
199 | -Pyarn -Phadoop-$HADOOP_PART_VERSION -Dhadoop.version=$HADOOP_VERSION \
200 | -Phive -Phive-thriftserver \
201 | -DskipTests clean package
202 |
203 | # Moving the Spark build dir straight into $SPARK_HOME proved impossible, so repack it and extract it under $SPARK_BASE instead
204 | # ENV SPARK_VERSION=2.4.4
205 | # ENV SPARK_BASE=/opt/spark
206 | # ENV SPARK_HOME=$SPARK_BASE/current
207 | RUN cd /
208 | RUN tar -cBpvzf spark-$SPARK_VERSION.tar.gz spark-$SPARK_VERSION
209 | #RUN rm -f spark-$SPARK_BASE/$SPARK_VERSION
210 | RUN tar -xzvmf spark-$SPARK_VERSION.tar.gz -C $SPARK_BASE/
211 | RUN ln -s spark-$SPARK_VERSION $SPARK_HOME \
212 | && cd /
213 |
214 | # Install pyspark
215 | RUN pip3 install pyspark
216 |
217 | # Configuring ~hduser/.bashrc
218 | RUN echo "export JAVA_HOME=$JAVA_HOME" >> ~hduser/.bashrc \
219 | && echo "export HADOOP_HOME=$HADOOP_HOME" >> ~hduser/.bashrc \
220 | && echo "alias python='python3.6'" >> ~hduser/.bashrc \
221 | && echo "alias pip='pip3'" >> ~hduser/.bashrc \
222 | && echo "export PYSPARK_PYTHON='python3.6'" >> ~hduser/.bashrc \
223 | && echo "export SPARK_HOME=$SPARK_HOME" >> ~hduser/.bashrc \
224 | && echo "export SPARK_MAJOR_VERSION=2" >> ~hduser/.bashrc \
225 | && echo "export PATH=$PATH:$HADOOP_HOME/bin:$SPARK_HOME/bin" >> ~hduser/.bashrc
226 |
227 | # copy script to start hadoop
228 | COPY start-hadoop.sh /start-hadoop.sh
229 | RUN bash start-hadoop.sh &
230 |
231 | # Preparing HDFS for hduser
232 | RUN $HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hduser
233 | RUN $HADOOP_HOME/bin/hdfs dfs -chown hduser /user/hduser
234 |
235 | # Cleanup
236 | RUN rm -f *.tar.gz *.tgz *.sh
237 |
238 | # RUNNING jupyter-notebook
239 | # as hduser ??????
240 |
241 |
242 | # expose various ports
243 | EXPOSE 8088 8888 5000 50070 50075 50030 50060
244 |
245 |
246 |
247 |
248 | ```
249 |
250 |
251 |
252 | 2. Building the image
253 |
254 | `docker build -t bigram-hadoop .`
255 |
256 |
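One thing to watch: the Dockerfile above `COPY`s the Hadoop and Spark source archives into the image (and `.gitignore` excludes `*.tar.gz`/`*.tgz` from the repo), so both tarballs have to be sitting in the build context before the build starts. Something along these lines should work; the `archive.apache.org` URLs are my assumption, since the `www-us.apache.org` mirror hard-coded in the Dockerfile may no longer resolve:

```bash
# fetch the archives the Dockerfile expects to COPY (mirror URLs assumed)
curl -L -o hadoop-2.8.5.tar.gz \
    https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz
curl -L -o spark-2.4.4.tgz \
    https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz

# then build as above
docker build -t bigram-hadoop .
```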
257 |
258 | 3. Creating the container
259 |
260 | ```bash
261 | docker run \
262 | --network host \
263 | --cpus=".5" \
264 | --memory="8g" \
265 | --name bigram-hadoop-container \
266 | -d bigram-hadoop
267 | ```
268 |
269 |
270 |
271 |
272 | * **--network host**: in short, the container shares the host's network stack (same IP, all ports reachable). **Never use this in production! It's only for developing on YOUR machine!**
273 |
274 | * **--cpus**=".5": limits the container to half of one CPU
275 |
276 | * **--memory**="8g": limits the container to a maximum of 8 GB of RAM
277 |
278 | * **--name**: sets a name for the container
279 |
280 | * **-d**: runs the container in the background
281 |
282 | * **bigram-hadoop**: name of the image
283 |
284 |
285 |
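Before moving on, it is worth checking that the container is actually up and that HDFS answers. A minimal check, assuming the container name used above:

```bash
# the container should show up as running
docker ps --filter name=bigram-hadoop-container

# open a shell inside it...
docker exec -it bigram-hadoop-container bash

# ...and, inside the container, poke at HDFS
$HADOOP_HOME/bin/hdfs dfs -ls /
```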
286 |
287 | 4. Running the container
288 |
289 | `docker run --network host -p 8888 -p 8088 --user hduser -it bigram-hadoop jupyter-notebook`
290 |
291 |
292 |
293 | You'll see something like this:
294 |
295 | ```bash
296 | WARNING: Published ports are discarded when using host network mode
297 | [I 15:36:47.100 NotebookApp] Writing notebook server cookie secret to /home/hduser/.local/share/jupyter/runtime/notebook_cookie_secret
298 | [I 15:36:47.277 NotebookApp] Serving notebooks from local directory: /
299 | [I 15:36:47.277 NotebookApp] 0 active kernels
300 | [I 15:36:47.277 NotebookApp] The Jupyter Notebook is running at:
301 | [I 15:36:47.277 NotebookApp] http://localhost:8888/?token=d940ac2eff1330843681bb360ffec84f604a3c43643723a1
302 | [I 15:36:47.277 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
303 | [W 15:36:47.277 NotebookApp] No web browser found: could not locate runnable browser.
304 | [C 15:36:47.277 NotebookApp]
305 |
306 | Copy/paste this URL into your browser when you connect for the first time,
307 | to login with a token:
308 | http://localhost:8888/?token=d940ac2eff1330843681bb360ffec84f604a3c43643723a1
309 | ```
310 |
311 |
312 |
313 | Now, open the 'http' address in your browser, create a new Python 3 notebook, paste the code below, and run it!
314 |
315 | ```python
316 | import pyspark
317 | import pyspark.sql.functions as F
318 | from pyspark.sql import SparkSession
319 | from pyspark.sql.types import *
320 |
321 | # start session
322 | spark = SparkSession.builder.appName('test').enableHiveSupport().getOrCreate()
323 |
324 | # Setting some data
325 | data = [["Spark","is","awesome!"]]
326 |
327 | # Declaring schema
328 | schema = StructType(fields = [
329 | StructField("col1",StringType(),True)
330 | ,StructField("col2",StringType(),True)
331 | ,StructField("col3",StringType(),True)
332 | ])
333 |
334 | # Getting a dataframe from all of this
335 | df = spark.createDataFrame(data,schema)
336 |
337 | ```
338 |
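If everything is wired up, inspecting the dataframe in the next cell should print the schema and the single row defined above:

```python
# quick sanity check on the dataframe created above
df.printSchema()
df.show()
```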
339 |
340 |
341 |
342 |
343 |
--------------------------------------------------------------------------------