├── .gitignore
├── README.md
├── python2
│   ├── spark1.6
│   │   └── Dockerfile
│   └── spark2.1
│       └── Dockerfile
└── python3
    ├── spark1.6
    │   └── Dockerfile
    └── spark2.1
        └── Dockerfile
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.py[cod]
*$py.class

.cache/
.idea/

.ipynb_checkpoints/
*.ipynb
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# python-spark

This image is based on the [`python:2.7`](https://hub.docker.com/_/python/) image (the Python 3.6 variants are based on `python:3.6`) and
contains Hadoop, Sqoop and Spark binaries, along with OpenJDK 7.

This is used as a base image for [`airflow-pipeline`](https://github.com/datagovsg/airflow-pipeline), a simplified setup for Airflow to launch Hadoop and Spark jobs.

Useful packages included for Spark and Sqoop:
- spark-csv
- spark-avro
- graphframes
- MySQL JDBC driver
- PostgreSQL JDBC driver
- MS SQL JDBC driver

## Kerberos support
Kerberos support has been added, but derivative images need to update the Kerberos configuration file at `/etc/krb5.conf`. To use Spark with Kerberos, the
environment variable `PYSPARK_SUBMIT_ARGS` should also be modified to specify the Kerberos principal and the location of the corresponding keytab. Derivative images
are responsible for providing the keytab, typically by mounting the volume containing it into the container (suggested location: `/etc/kerberos`). For example, if
the principal is `kerbuser` and the keytab is `kerbuser.keytab` in the suggested location, then `PYSPARK_SUBMIT_ARGS` would be

`[other spark parameters] --principal kerbuser --keytab /etc/kerberos/kerbuser.keytab pyspark-shell`
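A minimal sketch of launching a container with Kerberos enabled, assuming the `krb5.conf` and keytab live on the host and that `${HADOOP_CONF_DIR}` has been pointed at a kerberised YARN cluster (the host paths shown are illustrative, not part of this repository):

```bash
docker run -it \
  -v /path/to/krb5.conf:/etc/krb5.conf:ro \
  -v /path/to/keytabs:/etc/kerberos:ro \
  datagovsg/python-spark:2.7-2.1 \
  bash -c 'export PYSPARK_SUBMIT_ARGS="--master yarn --principal kerbuser --keytab /etc/kerberos/kerbuser.keytab ${PYSPARK_SUBMIT_ARGS}" && pyspark'
```

Prepending the Kerberos flags keeps the image's default `--packages` list and the trailing `pyspark-shell` intact.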
## Logging in to Docker Hub Container Registry
Credentials are managed by the Docker CLI: log in with `docker login` before pushing images.

## Building and pushing new images
Assuming that the Python version we want to build is `2.7`, the Hadoop version is `2.7.5` and the Spark version is `2.1.2` (the defaults in `python2/spark2.1/Dockerfile`), we can build and push the image as follows:

```bash
docker build -t datagovsg/python-spark:2.7-2.1 -f python2/spark2.1/Dockerfile .
docker push datagovsg/python-spark:2.7-2.1
```
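`HADOOP_VERSION` and `SPARK_VERSION` are declared as `ARG`s in each Dockerfile, so they can in principle be overridden at build time; a sketch (whether a given version combination is still available on the Apache mirrors is not guaranteed, and the pinned `--packages` and downloaded jars are not changed by these arguments):

```bash
docker build \
  --build-arg HADOOP_VERSION=2.7.3 \
  --build-arg SPARK_VERSION=2.1.0 \
  -t datagovsg/python-spark:2.7-2.1 \
  -f python2/spark2.1/Dockerfile .
```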
| echo "Downloading Hadoop ${HADOOP_VERSION}" && \ 58 | wget -qO - http://www-us.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ 59 | tar -xz -C /opt/ && \ 60 | mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \ 61 | echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \ 62 | wget -qO - http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3}.tgz |\ 63 | tar -xz -C /opt/ && \ 64 | mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3} /opt/spark-${SPARK_VERSION} && \ 65 | echo "Downloading Spark packages" && \ 66 | wget -q http://repo1.maven.org/maven2/com/databricks/spark-avro_2.10/2.0.1/spark-avro_2.10-2.0.1.jar -P ${SPARK_HOME}/lib && \ 67 | wget -q http://repo1.maven.org/maven2/com/databricks/spark-csv_2.10/1.5.0/spark-csv_2.10-1.5.0.jar -P ${SPARK_HOME}/lib && \ 68 | echo "Downloading Sqoop" && \ 69 | wget -qO - http://www.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | tar -xz -C /opt && \ 70 | cd /opt && ln -s ./sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop && \ 71 | echo "Downloading the JDBC drivers for Postgresql" && \ 72 | wget -qP /opt/sqoop/lib/ https://jdbc.postgresql.org/download/postgresql-9.4-1201.jdbc4.jar && \ 73 | echo "Downloading the JDBC drivers for MySQL" && \ 74 | wget -qP /tmp/ http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.37.tar.gz && \ 75 | tar -C /tmp/ -xzf /tmp/mysql-connector-java-5.1.37.tar.gz && \ 76 | cp /tmp/mysql-connector-java-5.1.37/mysql-connector-java-5.1.37-bin.jar /opt/sqoop/lib/ && \ 77 | echo "Downloading the JDBC drivers for MS SQL" && \ 78 | wget -qO - https://download.microsoft.com/download/F/0/F/F0FF3F95-D42A-46AF-B0F9-8887987A2C4B/sqljdbc_4.2.8112.100_enu.tar.gz | \ 79 | tar xz -C /tmp && \ 80 | mv /tmp/sqljdbc_4.2/enu/jre7/sqljdbc41.jar ${SQOOP_HOME}/lib && \ 81 | rm -r /tmp/sqljdbc_4.2 && \ 82 | echo "Cleaning up" && \ 83 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ${HADOOP_HOME}/share/doc/* ${SPARK_HOME}/lib/spark-examples*.jar' 84 | 85 | EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006 86 | 87 | # Install kerberos client support 88 | RUN apt-get update && \ 89 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --force-yes krb5-user && \ 90 | apt-get clean && \ 91 | apt-get autoremove && \ 92 | rm -rf /var/cache/apk/* /var/lib/apt/lists/* 93 | 94 | CMD '/bin/bash' 95 | -------------------------------------------------------------------------------- /python2/spark2.1/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7 2 | 3 | # Setup Java 4 | RUN set -x && \ 5 | apt-get update && \ 6 | apt-get install -y --no-install-recommends --force-yes openjdk-7-jdk && \ 7 | apt-get clean && \ 8 | apt-get autoremove && \ 9 | rm -rf /var/cache/apk/* /var/lib/apt/lists/* && \ 10 | update-java-alternatives -s java-1.7.0-openjdk-amd64 && \ 11 | (find /usr/share/doc -type f -and -not -name copyright -print0 | xargs -0 rm) && \ 12 | java -version 13 | ENV JAVA_HOME /usr/lib/jvm/java-7-openjdk-amd64 14 | 15 | ARG HADOOP_VERSION=2.7.5 16 | ARG SPARK_VERSION=2.1.2 17 | 18 | # Setup Hadoop variables 19 | ENV HADOOP_HOME /opt/hadoop 20 | ENV PATH ${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin 21 | ENV HADOOP_MAPRED_HOME ${HADOOP_HOME} 22 | ENV HADOOP_COMMON_HOME ${HADOOP_HOME} 23 | ENV HADOOP_HDFS_HOME ${HADOOP_HOME} 24 | ENV YARN_HOME ${HADOOP_HOME} 25 | ENV 
--------------------------------------------------------------------------------
/python2/spark1.6/Dockerfile:
--------------------------------------------------------------------------------
FROM python:2.7

# Setup Java
RUN set -x && \
    apt-get update && \
    apt-get install -y --no-install-recommends --force-yes openjdk-7-jdk && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/* && \
    update-java-alternatives -s java-1.7.0-openjdk-amd64 && \
    (find /usr/share/doc -type f -and -not -name copyright -print0 | xargs -0 rm) && \
    java -version
ENV JAVA_HOME /usr/lib/jvm/java-7-openjdk-amd64

ARG HADOOP_VERSION=2.6.5
ARG SPARK_VERSION=1.6.1

# Setup Hadoop variables
ENV HADOOP_HOME /opt/hadoop
ENV PATH ${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
ENV YARN_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_LIB_NATIVE_DIR ${HADOOP_HOME}/lib/native
ENV HADOOP_OPTS "-Djava.library.path=${HADOOP_HOME}/lib"
ENV HDFS_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV YARN_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop

# Setup Hive
ENV HIVE_CONF_DIR ${HADOOP_CONF_DIR}

# Setup Spark
ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}
ENV PYSPARK_PYTHON=python
ENV PATH=$PATH:${SPARK_HOME}/bin

# Set Python Spark 1.6 specific settings
ENV PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-csv_2.10:1.5.0,com.databricks:spark-avro_2.10:2.0.1,graphframes:graphframes:0.5.0-spark1.6-s_2.10 pyspark-shell"
ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.9-src.zip

# Exposes the relevant ports and setup the port settings
ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
ENV SPARK_WORKER_PORT 8888
ENV SPARK_WORKER_WEBUI_PORT 8081

# Set up Sqoop
ENV SQOOP_HOME /opt/sqoop
ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin

# Download binaries
RUN /bin/bash -c 'set -x && \
    echo "Downloading Hadoop ${HADOOP_VERSION}" && \
    wget -qO - http://www-us.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
    tar -xz -C /opt/ && \
    mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
    echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
    wget -qO - http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3}.tgz |\
    tar -xz -C /opt/ && \
    mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3} /opt/spark-${SPARK_VERSION} && \
    echo "Downloading Spark packages" && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-avro_2.10/2.0.1/spark-avro_2.10-2.0.1.jar -P ${SPARK_HOME}/lib && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-csv_2.10/1.5.0/spark-csv_2.10-1.5.0.jar -P ${SPARK_HOME}/lib && \
    echo "Downloading Sqoop" && \
    wget -qO - http://www.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | tar -xz -C /opt && \
    cd /opt && ln -s ./sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop && \
    echo "Downloading the JDBC drivers for Postgresql" && \
    wget -qP /opt/sqoop/lib/ https://jdbc.postgresql.org/download/postgresql-9.4-1201.jdbc4.jar && \
    echo "Downloading the JDBC drivers for MySQL" && \
    wget -qP /tmp/ http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.37.tar.gz && \
    tar -C /tmp/ -xzf /tmp/mysql-connector-java-5.1.37.tar.gz && \
    cp /tmp/mysql-connector-java-5.1.37/mysql-connector-java-5.1.37-bin.jar /opt/sqoop/lib/ && \
    echo "Downloading the JDBC drivers for MS SQL" && \
    wget -qO - https://download.microsoft.com/download/F/0/F/F0FF3F95-D42A-46AF-B0F9-8887987A2C4B/sqljdbc_4.2.8112.100_enu.tar.gz | \
    tar xz -C /tmp && \
    mv /tmp/sqljdbc_4.2/enu/jre7/sqljdbc41.jar ${SQOOP_HOME}/lib && \
    rm -r /tmp/sqljdbc_4.2 && \
    echo "Cleaning up" && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ${HADOOP_HOME}/share/doc/* ${SPARK_HOME}/lib/spark-examples*.jar'

EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006

# Install kerberos client support
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --force-yes krb5-user && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/*

CMD '/bin/bash'
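The built images put `${SPARK_HOME}/bin` on the `PATH` and preconfigure `PYSPARK_SUBMIT_ARGS`, so an interactive PySpark shell can be started straight from the image (a sketch; with no master specified, PySpark runs in local mode, and the `--packages` are resolved into the Ivy cache on first start, so network access is needed unless a derivative image pre-populates the cache):

```bash
docker run -it datagovsg/python-spark:2.7-1.6 pyspark
```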
--------------------------------------------------------------------------------
/python2/spark2.1/Dockerfile:
--------------------------------------------------------------------------------
FROM python:2.7

# Setup Java
RUN set -x && \
    apt-get update && \
    apt-get install -y --no-install-recommends --force-yes openjdk-7-jdk && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/* && \
    update-java-alternatives -s java-1.7.0-openjdk-amd64 && \
    (find /usr/share/doc -type f -and -not -name copyright -print0 | xargs -0 rm) && \
    java -version
ENV JAVA_HOME /usr/lib/jvm/java-7-openjdk-amd64

ARG HADOOP_VERSION=2.7.5
ARG SPARK_VERSION=2.1.2

# Setup Hadoop variables
ENV HADOOP_HOME /opt/hadoop
ENV PATH ${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
ENV YARN_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_LIB_NATIVE_DIR ${HADOOP_HOME}/lib/native
ENV HADOOP_OPTS "-Djava.library.path=${HADOOP_HOME}/lib"
ENV HDFS_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV YARN_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop

# Setup Hive
ENV HIVE_CONF_DIR ${HADOOP_CONF_DIR}

# Setup Spark
ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}
ENV PYSPARK_PYTHON=python
ENV PATH=$PATH:${SPARK_HOME}/bin

# Set Python Spark 2 specific settings
ENV PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-csv_2.11:1.5.0,com.databricks:spark-avro_2.11:3.1.0,graphframes:graphframes:0.5.0-spark2.0-s_2.11 pyspark-shell"
ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip

# Exposes the relevant ports and setup the port settings
ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
ENV SPARK_WORKER_PORT 8888
ENV SPARK_WORKER_WEBUI_PORT 8081

# Set up Sqoop
ENV SQOOP_HOME /opt/sqoop
ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin

# Download binaries
RUN /bin/bash -c 'set -x && \
    echo "Downloading Hadoop ${HADOOP_VERSION}" && \
    wget -qO - http://www-us.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
    tar -xz -C /opt/ && \
    mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
    echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
    wget -qO - http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3}.tgz |\
    tar -xz -C /opt/ && \
    mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3} /opt/spark-${SPARK_VERSION} && \
    echo "Downloading Spark packages" && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-avro_2.11/3.1.0/spark-avro_2.11-3.1.0.jar -P ${SPARK_HOME}/jars && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-csv_2.11/1.5.0/spark-csv_2.11-1.5.0.jar -P ${SPARK_HOME}/jars && \
    echo "Downloading Sqoop" && \
    wget -qO - http://www.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | tar -xz -C /opt && \
    cd /opt && ln -s ./sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop && \
    echo "Downloading the JDBC drivers for Postgresql" && \
    wget -qP /opt/sqoop/lib/ https://jdbc.postgresql.org/download/postgresql-9.4-1201.jdbc4.jar && \
    echo "Downloading the JDBC drivers for MySQL" && \
    wget -qP /tmp/ http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.37.tar.gz && \
    tar -C /tmp/ -xzf /tmp/mysql-connector-java-5.1.37.tar.gz && \
    cp /tmp/mysql-connector-java-5.1.37/mysql-connector-java-5.1.37-bin.jar /opt/sqoop/lib/ && \
    echo "Downloading the JDBC drivers for MS SQL" && \
    wget -qO - https://download.microsoft.com/download/F/0/F/F0FF3F95-D42A-46AF-B0F9-8887987A2C4B/sqljdbc_4.2.8112.100_enu.tar.gz | \
    tar xz -C /tmp && \
    mv /tmp/sqljdbc_4.2/enu/jre7/sqljdbc41.jar ${SQOOP_HOME}/lib && \
    rm -r /tmp/sqljdbc_4.2 && \
    echo "Cleaning up" && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ${HADOOP_HOME}/share/doc/* ${SPARK_HOME}/lib/spark-examples*.jar'

EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006

# Install kerberos client support
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --force-yes krb5-user && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/*

CMD '/bin/bash'
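The `SPARK_MASTER_*`/`SPARK_WORKER_*` variables and the `EXPOSE` line above fix the ports used when Spark standalone daemons run inside the container. A minimal sketch of starting a standalone master in the foreground with this image, publishing the master and web UI ports (the tag and `--host` value are illustrative):

```bash
docker run -d -p 7077:7077 -p 8080:8080 datagovsg/python-spark:2.7-2.1 \
  spark-class org.apache.spark.deploy.master.Master --host 0.0.0.0
```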
--------------------------------------------------------------------------------
/python3/spark1.6/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.6

# Setup Java
RUN set -x && \
    apt-get update && \
    apt-get install -y --no-install-recommends --force-yes openjdk-7-jdk && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/* && \
    update-java-alternatives -s java-1.7.0-openjdk-amd64 && \
    (find /usr/share/doc -type f -and -not -name copyright -print0 | xargs -0 rm) && \
    java -version
ENV JAVA_HOME /usr/lib/jvm/java-7-openjdk-amd64

ARG HADOOP_VERSION=2.6.5
ARG SPARK_VERSION=1.6.1

# Setup Hadoop variables
ENV HADOOP_HOME /opt/hadoop
ENV PATH ${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
ENV YARN_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_LIB_NATIVE_DIR ${HADOOP_HOME}/lib/native
ENV HADOOP_OPTS "-Djava.library.path=${HADOOP_HOME}/lib"
ENV HDFS_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV YARN_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop

# Setup Hive
ENV HIVE_CONF_DIR ${HADOOP_CONF_DIR}

# Setup Spark
ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}
ENV PYSPARK_PYTHON=python
ENV PATH=$PATH:${SPARK_HOME}/bin

# Set Python Spark 1.6 specific settings
ENV PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-csv_2.10:1.5.0,com.databricks:spark-avro_2.10:2.0.1,graphframes:graphframes:0.5.0-spark1.6-s_2.10 pyspark-shell"
ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.9-src.zip

# Exposes the relevant ports and setup the port settings
ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
ENV SPARK_WORKER_PORT 8888
ENV SPARK_WORKER_WEBUI_PORT 8081

# Set up Sqoop
ENV SQOOP_HOME /opt/sqoop
ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin

# Download binaries
RUN /bin/bash -c 'set -x && \
    echo "Downloading Hadoop ${HADOOP_VERSION}" && \
    wget -qO - http://www-us.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
    tar -xz -C /opt/ && \
    mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
    echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
    wget -qO - http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3}.tgz |\
    tar -xz -C /opt/ && \
    mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3} /opt/spark-${SPARK_VERSION} && \
    echo "Downloading Spark packages" && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-avro_2.10/2.0.1/spark-avro_2.10-2.0.1.jar -P ${SPARK_HOME}/lib && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-csv_2.10/1.5.0/spark-csv_2.10-1.5.0.jar -P ${SPARK_HOME}/lib && \
    echo "Downloading Sqoop" && \
    wget -qO - http://www.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | tar -xz -C /opt && \
    cd /opt && ln -s ./sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop && \
    echo "Downloading the JDBC drivers for Postgresql" && \
    wget -qP /opt/sqoop/lib/ https://jdbc.postgresql.org/download/postgresql-9.4-1201.jdbc4.jar && \
    echo "Downloading the JDBC drivers for MySQL" && \
    wget -qP /tmp/ http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.37.tar.gz && \
    tar -C /tmp/ -xzf /tmp/mysql-connector-java-5.1.37.tar.gz && \
    cp /tmp/mysql-connector-java-5.1.37/mysql-connector-java-5.1.37-bin.jar /opt/sqoop/lib/ && \
    echo "Downloading the JDBC drivers for MS SQL" && \
    wget -qO - https://download.microsoft.com/download/F/0/F/F0FF3F95-D42A-46AF-B0F9-8887987A2C4B/sqljdbc_4.2.8112.100_enu.tar.gz | \
    tar xz -C /tmp && \
    mv /tmp/sqljdbc_4.2/enu/jre7/sqljdbc41.jar ${SQOOP_HOME}/lib && \
    rm -r /tmp/sqljdbc_4.2 && \
    echo "Cleaning up" && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ${HADOOP_HOME}/share/doc/* ${SPARK_HOME}/lib/spark-examples*.jar'

EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006

# Install kerberos client support
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --force-yes krb5-user && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/*

CMD '/bin/bash'
--------------------------------------------------------------------------------
/python3/spark2.1/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.6

# Setup Java
RUN set -x && \
    apt-get update && \
    apt-get install -y --no-install-recommends --force-yes openjdk-7-jdk && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/* && \
    update-java-alternatives -s java-1.7.0-openjdk-amd64 && \
    (find /usr/share/doc -type f -and -not -name copyright -print0 | xargs -0 rm) && \
    java -version
ENV JAVA_HOME /usr/lib/jvm/java-7-openjdk-amd64

ARG HADOOP_VERSION=2.7.5
ARG SPARK_VERSION=2.1.2

# Setup Hadoop variables
ENV HADOOP_HOME /opt/hadoop
ENV PATH ${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
ENV YARN_HOME ${HADOOP_HOME}
ENV HADOOP_COMMON_LIB_NATIVE_DIR ${HADOOP_HOME}/lib/native
ENV HADOOP_OPTS "-Djava.library.path=${HADOOP_HOME}/lib"
ENV HDFS_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV YARN_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop

# Setup Hive
ENV HIVE_CONF_DIR ${HADOOP_CONF_DIR}

# Setup Spark
ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}
ENV PYSPARK_PYTHON=python
ENV PATH=$PATH:${SPARK_HOME}/bin

# Set Python Spark 2 specific settings
ENV PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-csv_2.11:1.5.0,com.databricks:spark-avro_2.11:3.1.0,graphframes:graphframes:0.5.0-spark2.0-s_2.11 pyspark-shell"
ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip

# Exposes the relevant ports and setup the port settings
ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
ENV SPARK_WORKER_PORT 8888
ENV SPARK_WORKER_WEBUI_PORT 8081

# Set up Sqoop
ENV SQOOP_HOME /opt/sqoop
ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin

# Download binaries
RUN /bin/bash -c 'set -x && \
    echo "Downloading Hadoop ${HADOOP_VERSION}" && \
    wget -qO - http://www-us.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
    tar -xz -C /opt/ && \
    mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
    echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
    wget -qO - http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3}.tgz |\
    tar -xz -C /opt/ && \
    mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3} /opt/spark-${SPARK_VERSION} && \
    echo "Downloading Spark packages" && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-avro_2.11/3.1.0/spark-avro_2.11-3.1.0.jar -P ${SPARK_HOME}/jars && \
    wget -q http://repo1.maven.org/maven2/com/databricks/spark-csv_2.11/1.5.0/spark-csv_2.11-1.5.0.jar -P ${SPARK_HOME}/jars && \
    echo "Downloading Sqoop" && \
    wget -qO - http://www.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | tar -xz -C /opt && \
    cd /opt && ln -s ./sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop && \
    echo "Downloading the JDBC drivers for Postgresql" && \
    wget -qP /opt/sqoop/lib/ https://jdbc.postgresql.org/download/postgresql-9.4-1201.jdbc4.jar && \
    echo "Downloading the JDBC drivers for MySQL" && \
    wget -qP /tmp/ http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.37.tar.gz && \
    tar -C /tmp/ -xzf /tmp/mysql-connector-java-5.1.37.tar.gz && \
    cp /tmp/mysql-connector-java-5.1.37/mysql-connector-java-5.1.37-bin.jar /opt/sqoop/lib/ && \
    echo "Downloading the JDBC drivers for MS SQL" && \
    wget -qO - https://download.microsoft.com/download/F/0/F/F0FF3F95-D42A-46AF-B0F9-8887987A2C4B/sqljdbc_4.2.8112.100_enu.tar.gz | \
    tar xz -C /tmp && \
    mv /tmp/sqljdbc_4.2/enu/jre7/sqljdbc41.jar ${SQOOP_HOME}/lib && \
    rm -r /tmp/sqljdbc_4.2 && \
    echo "Cleaning up" && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ${HADOOP_HOME}/share/doc/* ${SPARK_HOME}/lib/spark-examples*.jar'

EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006

# Install kerberos client support
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --force-yes krb5-user && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/cache/apk/* /var/lib/apt/lists/*

CMD '/bin/bash'
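The `krb5-user` package installed above provides the standard Kerberos client tools, so a ticket can also be obtained manually inside a container for debugging; a sketch, assuming the keytab location suggested in the README:

```bash
kinit -kt /etc/kerberos/kerbuser.keytab kerbuser
klist
```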
--------------------------------------------------------------------------------