├── .gitignore
├── conf
│   ├── config
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   ├── core-site.xml
│   └── yarn-site.xml
├── start-hadoop.sh
├── LICENSE
├── Dockerfile
└── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.tar.gz
*.tgz
*.zip

--------------------------------------------------------------------------------
/conf/config:
--------------------------------------------------------------------------------
Host *
UserKnownHostsFile /dev/null
StrictHostKeyChecking no

--------------------------------------------------------------------------------
/conf/hdfs-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/conf/mapred-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/conf/core-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/conf/yarn-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>127.0.0.1:8032</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/start-hadoop.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# start ssh server
/etc/init.d/ssh start

# format namenode
$HADOOP_HOME/bin/hdfs namenode -format

# start hadoop
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver

# keep container running
tail -f /dev/null

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Carneirão

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:18.04

# set environment vars
ENV HADOOP_BASE /opt/hadoop
ENV HADOOP_HOME /opt/hadoop/current
ENV HADOOP_VERSION=2.8.5
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
ENV SPARK_BASE /opt/spark
ENV SPARK_HOME /opt/spark/current
ENV SPARK_VERSION=2.4.4

# configure the timezone up front to avoid interactive prompts from the tzdata package
ENV TZ=America/Sao_Paulo
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install packages
RUN \
    apt-get update && apt-get install -y \
    net-tools \
    sudo \
    curl \
    ssh \
    rsync \
    vim \
    openjdk-8-jdk \
    maven \
    python3-pip \
    jupyter-notebook


# download and extract hadoop, set JAVA_HOME in hadoop-env.sh, update path
RUN curl -L \
    --progress-bar 'https://www-us.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz' \
    -o "hadoop-$HADOOP_VERSION.tar.gz"

COPY hadoop-$HADOOP_VERSION.tar.gz .
RUN mkdir -p $HADOOP_BASE \
    && tar -xzvmf hadoop-$HADOOP_VERSION.tar.gz -C $HADOOP_BASE/ \
    && cd $HADOOP_BASE \
    && ln -s hadoop-$HADOOP_VERSION current \
    && cd / \
    && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
    && echo "PATH=$PATH:$HADOOP_HOME/bin" >> ~/.bashrc

# create ssh keys for root (the Hadoop start scripts ssh to localhost)
RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
RUN cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
RUN chmod 0600 ~/.ssh/authorized_keys

# copy hadoop configs
COPY conf/*xml $HADOOP_HOME/etc/hadoop/

# copy ssh config
COPY conf/config /root/.ssh/config

# create hduser user
RUN useradd -m -s /bin/bash hduser \
    && groupadd hdfs \
    && usermod -aG hdfs hduser \
    && usermod -aG sudo hduser \
    && mkdir ~hduser/.ssh

# create ssh keys for hduser and authorize root's key
RUN ssh-keygen -t rsa -P '' -f ~hduser/.ssh/id_rsa \
    && cat ~/.ssh/id_rsa.pub >> ~hduser/.ssh/authorized_keys \
    && chmod 0600 ~hduser/.ssh/authorized_keys

# download and build Spark with Maven, with Hive and Hive Thrift Server support
ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
RUN curl -L \
    --progress-bar 'https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz' \
    -o "spark-$SPARK_VERSION.tgz"

COPY spark-$SPARK_VERSION.tgz .
ENV SPARK_PART_VERSION=2.4
ENV HADOOP_PART_VERSION=2.8

RUN mkdir -p $SPARK_BASE && tar -xzmvf spark-$SPARK_VERSION.tgz \
    && cd spark-$SPARK_VERSION \
    && ./build/mvn \
    -Pyarn -Phadoop-$HADOOP_PART_VERSION -Dhadoop.version=$HADOOP_VERSION \
    -Phive -Phive-thriftserver \
    -DskipTests clean package

# Moving the built Spark tree straight into $SPARK_HOME proved impractical,
# so repack it as a tarball and extract it under $SPARK_BASE instead
RUN cd /
RUN tar -cBpvzf spark-$SPARK_VERSION.tar.gz spark-$SPARK_VERSION
RUN tar -xzvmf spark-$SPARK_VERSION.tar.gz -C $SPARK_BASE/
RUN ln -s spark-$SPARK_VERSION $SPARK_HOME \
    && cd /

# Install pyspark
RUN pip3 install pyspark

# Configuring ~hduser/.bashrc
RUN echo "export JAVA_HOME=$JAVA_HOME" >> ~hduser/.bashrc \
    && echo "export HADOOP_HOME=$HADOOP_HOME" >> ~hduser/.bashrc \
    && echo "alias python='python3.6'" >> ~hduser/.bashrc \
    && echo "alias pip='pip3'" >> ~hduser/.bashrc \
    && echo "export PYSPARK_PYTHON='python3.6'" >> ~hduser/.bashrc \
    && echo "export SPARK_HOME=$SPARK_HOME" >> ~hduser/.bashrc \
    && echo "export SPARK_MAJOR_VERSION=2" >> ~hduser/.bashrc \
    && echo "export PATH=$PATH:$HADOOP_HOME/bin:$SPARK_HOME/bin" >> ~hduser/.bashrc

# copy script to start hadoop
COPY start-hadoop.sh /start-hadoop.sh
RUN bash start-hadoop.sh &

# Preparing HDFS for hduser
RUN $HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hduser
RUN $HADOOP_HOME/bin/hdfs dfs -chown hduser /user/hduser

# Cleanup
RUN rm -f *.tar.gz *.tgz *.sh

# expose various ports
EXPOSE 8088 8888 5000 50070 50075 50030 50060

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# bigram-hadoop

A "bootstrap" image for pySpark developers.


## Version

0.0.8


## Introduction

This repository holds a Dockerfile and the supporting files needed to build an image that provides a minimal environment for working with Hadoop and pySpark.


## Features

* Hadoop 2.8.5 (MapReduce + YARN + HDFS)
* Spark 2.4.4 (built from source with Maven, since this combination of Hadoop and Spark versions has no prebuilt package)
* Python 3.6
* jupyter-notebook


## Requirements

* Some Linux distro (untested on Windows and macOS, though it will probably work on macOS)

* docker 19.03.5

* Dockerfile

* 16GB of RAM

* An Intel Core i5 is OK, but an i7 is recommended


## Files, directories

```bash
.
├── conf
│   ├── config
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   └── yarn-site.xml
├── Dockerfile
├── LICENSE
├── README.md
└── start-hadoop.sh

```

* conf/config: ssh configuration file

* conf/*-site.xml: basic Hadoop configuration files

* Dockerfile: the Dockerfile used to build the image/container

* start-hadoop.sh: script that starts the Hadoop environment (HDFS, YARN, job history server)


## Getting started

First of all, **install docker!**

* [How to install docker on Ubuntu/Mint](https://docs.docker.com/install/linux/docker-ce/ubuntu/)


Then, choose your "destiny"!


### Dockerhub way

Faster, not as much fun and **probably out of date**, but it works! Just run *docker pull* as below:

`docker pull carneiro/bigram-hadoop`

Dockerhub image site: [carneiro/bigram-hadoop](https://hub.docker.com/repository/docker/carneiro/bigram-hadoop)


### Dockerfile way

The Dockerfile is certainly up to date, but it is very slow to build!
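
Note: the build expects the Hadoop and Spark source tarballs to already be in the repository root, since the Dockerfile `COPY`s `hadoop-2.8.5.tar.gz` and `spark-2.4.4.tgz` into the image (and the `www-us.apache.org` mirror it curls from may no longer resolve). A minimal sketch for fetching them beforehand, assuming the usual Apache archive layout:

```bash
# download the tarballs the Dockerfile COPYs from the build context
curl -L -o hadoop-2.8.5.tar.gz \
    https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz
curl -L -o spark-2.4.4.tgz \
    https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz
```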

What will this do?

1. Start from a basic Linux image (Ubuntu 18.04)
2. Install the Hadoop 2.8.5 basic stack (HDFS, YARN, MapReduce, etc.)
3. Build Spark 2.4.4 with Maven and configure it


Everything is on [Github](https://github.com/bang/bigram-hadoop)


1. Dockerfile:

```dockerfile
FROM ubuntu:18.04

# set environment vars
ENV HADOOP_BASE /opt/hadoop
ENV HADOOP_HOME /opt/hadoop/current
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
ENV SPARK_BASE /opt/spark
ENV SPARK_HOME /opt/spark/current

# configure the timezone up front to avoid interactive prompts from the tzdata package
ENV TZ=America/Sao_Paulo
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install packages
RUN \
    apt-get update && apt-get install -y \
    net-tools \
    sudo \
    curl \
    ssh \
    rsync \
    vim \
    openjdk-8-jdk \
    maven \
    python3-pip \
    jupyter-notebook


# download and extract hadoop, set JAVA_HOME in hadoop-env.sh, update path
#RUN curl -L \
#    --progress-bar 'https://www-us.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz' \
#    -o "hadoop-2.8.5.tar.gz"
ENV HADOOP_VERSION=2.8.5
COPY hadoop-$HADOOP_VERSION.tar.gz .
RUN mkdir -p $HADOOP_BASE \
    && tar -xzvmf hadoop-$HADOOP_VERSION.tar.gz -C $HADOOP_BASE/ \
    && cd $HADOOP_BASE \
    && ln -s hadoop-$HADOOP_VERSION current \
    && cd / \
    && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
    && echo "PATH=$PATH:$HADOOP_HOME/bin" >> ~/.bashrc

# create ssh keys for root (the Hadoop start scripts ssh to localhost)
RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
RUN cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
RUN chmod 0600 ~/.ssh/authorized_keys

# copy hadoop configs
COPY conf/*xml $HADOOP_HOME/etc/hadoop/

# copy ssh config
COPY conf/config /root/.ssh/config

# create hduser user
RUN useradd -m -s /bin/bash hduser \
    && groupadd hdfs \
    && usermod -aG hdfs hduser \
    && usermod -aG sudo hduser \
    && mkdir ~hduser/.ssh

# create ssh keys for hduser and authorize root's key
RUN ssh-keygen -t rsa -P '' -f ~hduser/.ssh/id_rsa \
    && cat ~/.ssh/id_rsa.pub >> ~hduser/.ssh/authorized_keys \
    && chmod 0600 ~hduser/.ssh/authorized_keys

# download and build Spark with Maven, with Hive and Hive Thrift Server support
ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
# RUN curl -L \
#    --progress-bar 'https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz' \
#    -o "spark-2.4.4.tgz"

ENV SPARK_VERSION=2.4.4

COPY spark-$SPARK_VERSION.tgz .
ENV SPARK_PART_VERSION=2.4
ENV HADOOP_PART_VERSION=2.8

RUN mkdir -p $SPARK_BASE && tar -xzmvf spark-$SPARK_VERSION.tgz \
    && cd spark-$SPARK_VERSION \
    && ./build/mvn \
    -Pyarn -Phadoop-$HADOOP_PART_VERSION -Dhadoop.version=$HADOOP_VERSION \
    -Phive -Phive-thriftserver \
    -DskipTests clean package

# Moving the built Spark tree straight into $SPARK_HOME proved impractical,
# so repack it as a tarball and extract it under $SPARK_BASE instead
# ENV SPARK_VERSION=2.4.4
# ENV SPARK_BASE=/opt/spark
# ENV SPARK_HOME=$SPARK_BASE/current
RUN cd /
RUN tar -cBpvzf spark-$SPARK_VERSION.tar.gz spark-$SPARK_VERSION
#RUN rm -f spark-$SPARK_BASE/$SPARK_VERSION
RUN tar -xzvmf spark-$SPARK_VERSION.tar.gz -C $SPARK_BASE/
RUN ln -s spark-$SPARK_VERSION $SPARK_HOME \
    && cd /

# Install pyspark
RUN pip3 install pyspark

# Configuring ~hduser/.bashrc
RUN echo "export JAVA_HOME=$JAVA_HOME" >> ~hduser/.bashrc \
    && echo "export HADOOP_HOME=$HADOOP_HOME" >> ~hduser/.bashrc \
    && echo "alias python='python3.6'" >> ~hduser/.bashrc \
    && echo "alias pip='pip3'" >> ~hduser/.bashrc \
    && echo "export PYSPARK_PYTHON='python3.6'" >> ~hduser/.bashrc \
    && echo "export SPARK_HOME=$SPARK_HOME" >> ~hduser/.bashrc \
    && echo "export SPARK_MAJOR_VERSION=2" >> ~hduser/.bashrc \
    && echo "export PATH=$PATH:$HADOOP_HOME/bin:$SPARK_HOME/bin" >> ~hduser/.bashrc

# copy script to start hadoop
COPY start-hadoop.sh /start-hadoop.sh
RUN bash start-hadoop.sh &

# Preparing HDFS for hduser
RUN $HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hduser
RUN $HADOOP_HOME/bin/hdfs dfs -chown hduser /user/hduser

# Cleanup
RUN rm -f *.tar.gz *.tgz *.sh

# TODO: run jupyter-notebook as hduser


# expose various ports
EXPOSE 8088 8888 5000 50070 50075 50030 50060

```


2. Building the image

`docker build -t bigram-hadoop .`


3. Creating the container

```bash
docker run \
    --network host \
    --cpus=".5" \
    --memory="8g" \
    --name bigram-hadoop-container \
    -d bigram-hadoop
```

* **--network host**: the container shares the host's network stack, so it uses the host's IP and its ports are reachable directly. **Never use this in production! It is only for development on YOUR machine!**

* **--cpus=".5"**: limit the container to half of one CPU

* **--memory="8g"**: cap the container at 8GB of RAM

* **--name**: gives the container a name

* **-d**: run the container in the background (detached)

* **bigram-hadoop**: name of the image


4. Running jupyter-notebook

`docker run --network host -p 8888 -p 8088 --user hduser -it bigram-hadoop jupyter-notebook`


You'll see something like this:

```bash
WARNING: Published ports are discarded when using host network mode
[I 15:36:47.100 NotebookApp] Writing notebook server cookie secret to /home/hduser/.local/share/jupyter/runtime/notebook_cookie_secret
[I 15:36:47.277 NotebookApp] Serving notebooks from local directory: /
[I 15:36:47.277 NotebookApp] 0 active kernels
[I 15:36:47.277 NotebookApp] The Jupyter Notebook is running at:
[I 15:36:47.277 NotebookApp] http://localhost:8888/?token=d940ac2eff1330843681bb360ffec84f604a3c43643723a1
[I 15:36:47.277 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[W 15:36:47.277 NotebookApp] No web browser found: could not locate runnable browser.
[C 15:36:47.277 NotebookApp]

    Copy/paste this URL into your browser when you connect for the first time,
    to login with a token:
        http://localhost:8888/?token=d940ac2eff1330843681bb360ffec84f604a3c43643723a1
```


Now open the 'http' address in your browser, create a new Python 3 notebook, paste in the code below and run it!

```python
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# start session
spark = SparkSession.builder.appName('test').enableHiveSupport().getOrCreate()

# Setting some data
data = [["Spark", "is", "awesome!"]]

# Declaring schema
schema = StructType(fields=[
    StructField("col1", StringType(), True),
    StructField("col2", StringType(), True),
    StructField("col3", StringType(), True)
])

# Getting a dataframe from all of this
df = spark.createDataFrame(data, schema)
```

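If the session comes up, you can inspect the DataFrame and, since `fs.defaultFS` points at `hdfs://localhost:9000` and the image already created `/user/hduser` on HDFS, write it out and read it back. A minimal follow-up sketch (the `test_df` path is just an example name):

```python
# display the single-row DataFrame built above
df.show()

# write it to HDFS under hduser's home directory (the path name is arbitrary)
df.write.mode("overwrite").parquet("hdfs://localhost:9000/user/hduser/test_df")

# read it back to confirm the round trip through HDFS
spark.read.parquet("hdfs://localhost:9000/user/hduser/test_df").show()
```

--------------------------------------------------------------------------------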