├── .gitattributes
├── Dockerfile
├── README.md
└── requirements.txt


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Container image for running Spark (spark-submit) on Kubernetes:
2 | # Alpine + Python 3.6 + glibc compatibility layer + OpenJDK 8 + Spark files
3 | # + the Python packages listed in requirements.txt.
4 | FROM python:3.6-alpine
5 | 
6 | ENV PYTHONUNBUFFERED=1
7 | ENV CHROME_BIN=/usr/bin/chromium-browser
8 | ENV CHROME_PATH=/usr/lib/chromium/
9 | 
10 | # Tini is now available at /sbin/tini
11 | RUN apk add --no-cache tini
12 | 
13 | # Java
14 | 
15 | ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'
16 | 
17 | # Install the sgerrand glibc packages plus libgcc/libstdc++/zlib into
18 | # /usr/glibc-compat/lib (presumably needed by the glibc-linked JDK below).
19 | # Every download uses `curl -f` so that an HTTP error fails the build, and the
20 | # signing key and the Arch Linux archives are checked against pinned SHA-256 sums.
21 | # `localedef` is best-effort: it is wrapped in ( ... || true ) so that only its
22 | # own failure is ignored. Without the parentheses, `|| true` would also swallow
23 | # a failure of any earlier step in the && chain (including the checksum check).
24 | RUN apk add --no-cache --virtual .build-deps curl binutils \
25 |     && GLIBC_VER="2.29-r0" \
26 |     && ALPINE_GLIBC_REPO="https://github.com/sgerrand/alpine-pkg-glibc/releases/download" \
27 |     && GCC_LIBS_URL="https://archive.archlinux.org/packages/g/gcc-libs/gcc-libs-9.1.0-2-x86_64.pkg.tar.xz" \
28 |     && GCC_LIBS_SHA256="91dba90f3c20d32fcf7f1dbe91523653018aa0b8d2230b00f822f6722804cf08" \
29 |     && ZLIB_URL="https://archive.archlinux.org/packages/z/zlib/zlib-1%3A1.2.11-3-x86_64.pkg.tar.xz" \
30 |     && ZLIB_SHA256=17aede0b9f8baa789c5aa3f358fbf8c68a5f1228c5e6cba1a5dd34102ef4d4e5 \
31 |     && curl -LfsS https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub -o /etc/apk/keys/sgerrand.rsa.pub \
32 |     && SGERRAND_RSA_SHA256="823b54589c93b02497f1ba4dc622eaef9c813e6b0f0ebbb2f771e32adf9f4ef2" \
33 |     && echo "${SGERRAND_RSA_SHA256} */etc/apk/keys/sgerrand.rsa.pub" | sha256sum -c - \
34 |     && curl -LfsS ${ALPINE_GLIBC_REPO}/${GLIBC_VER}/glibc-${GLIBC_VER}.apk > /tmp/glibc-${GLIBC_VER}.apk \
35 |     && apk add /tmp/glibc-${GLIBC_VER}.apk \
36 |     && curl -LfsS ${ALPINE_GLIBC_REPO}/${GLIBC_VER}/glibc-bin-${GLIBC_VER}.apk > /tmp/glibc-bin-${GLIBC_VER}.apk \
37 |     && apk add /tmp/glibc-bin-${GLIBC_VER}.apk \
38 |     && curl -LfsS ${ALPINE_GLIBC_REPO}/${GLIBC_VER}/glibc-i18n-${GLIBC_VER}.apk > /tmp/glibc-i18n-${GLIBC_VER}.apk \
39 |     && apk add /tmp/glibc-i18n-${GLIBC_VER}.apk \
40 |     && ( /usr/glibc-compat/bin/localedef --force --inputfile POSIX --charmap UTF-8 "$LANG" || true ) \
41 |     && echo "export LANG=$LANG" > /etc/profile.d/locale.sh \
42 |     && curl -LfsS ${GCC_LIBS_URL} -o /tmp/gcc-libs.tar.xz \
43 |     && echo "${GCC_LIBS_SHA256} */tmp/gcc-libs.tar.xz" | sha256sum -c - \
44 |     && mkdir /tmp/gcc \
45 |     && tar -xf /tmp/gcc-libs.tar.xz -C /tmp/gcc \
46 |     && mv /tmp/gcc/usr/lib/libgcc* /tmp/gcc/usr/lib/libstdc++* /usr/glibc-compat/lib \
47 |     && strip /usr/glibc-compat/lib/libgcc_s.so.* /usr/glibc-compat/lib/libstdc++.so* \
48 |     && curl -LfsS ${ZLIB_URL} -o /tmp/libz.tar.xz \
49 |     && echo "${ZLIB_SHA256} */tmp/libz.tar.xz" | sha256sum -c - \
50 |     && mkdir /tmp/libz \
51 |     && tar -xf /tmp/libz.tar.xz -C /tmp/libz \
52 |     && mv /tmp/libz/usr/lib/libz.so* /usr/glibc-compat/lib \
53 |     && apk del --purge .build-deps glibc-i18n \
54 |     && rm -rf /tmp/*.apk /tmp/gcc /tmp/gcc-libs.tar.xz /tmp/libz /tmp/libz.tar.xz /var/cache/apk/*
55 | 
56 | ENV JAVA_VERSION=jdk8u
57 | 
58 | # --no-cache fetches the package index on demand and leaves nothing in /var/cache/apk.
59 | RUN apk add --no-cache openssl wget bash
60 | 
61 | # Download the AdoptOpenJDK 8 build matching the image architecture, verify its
62 | # SHA-256 sum, and unpack it into /opt/java/openjdk. curl is only needed for the
63 | # download, so it is installed as the virtual package .fetch-deps and removed again.
64 | RUN set -eux; \
65 |     apk add --no-cache --virtual .fetch-deps curl; \
66 |     ARCH="$(apk --print-arch)"; \
67 |     case "${ARCH}" in \
68 |        aarch64|arm64) \
69 |          ESUM='32b5f06fdaf7183a5b55b37ff4a88734c00e16b3e2c7ff42daecb96583e0841f'; \
70 |          BINARY_URL='https://github.com/AdoptOpenJDK/openjdk8-binaries/releases/download/jdk8u-2019-07-16-20-20/OpenJDK8U-jdk_aarch64_linux_hotspot_2019-07-16-20-20.tar.gz'; \
71 |          ;; \
72 |        ppc64el|ppc64le) \
73 |          ESUM='a24e6e143a8eaf0b3a8477cac7555177d17667bf5e702db8adea7df8f9af837b'; \
74 |          BINARY_URL='https://github.com/AdoptOpenJDK/openjdk8-binaries/releases/download/jdk8u-2019-07-16-20-20/OpenJDK8U-jdk_ppc64le_linux_hotspot_2019-07-16-20-20.tar.gz'; \
75 |          ;; \
76 |        s390x) \
77 |          ESUM='f8656a806527dfb4ca7e6590cb0de5da679cf4eb2cdbe92834c7e5b5ff3ac66d'; \
78 |          BINARY_URL='https://github.com/AdoptOpenJDK/openjdk8-binaries/releases/download/jdk8u-2019-07-16-20-20/OpenJDK8U-jdk_s390x_linux_hotspot_2019-07-16-20-20.tar.gz'; \
79 |          ;; \
80 |        amd64|x86_64) \
81 |          ESUM='9356e89f321cdfac35813875f93213c656016ea0c2c72994d60d9729472309fe'; \
82 |          BINARY_URL='https://github.com/AdoptOpenJDK/openjdk8-binaries/releases/download/jdk8u-2019-07-16-20-20/OpenJDK8U-jdk_x64_linux_hotspot_2019-07-16-20-20.tar.gz'; \
83 |          ;; \
84 |        *) \
85 |          echo "Unsupported arch: ${ARCH}"; \
86 |          exit 1; \
87 |          ;; \
88 |     esac; \
89 |     curl -LfsSo /tmp/openjdk.tar.gz ${BINARY_URL}; \
90 |     echo "${ESUM} */tmp/openjdk.tar.gz" | sha256sum -c -; \
91 |     mkdir -p /opt/java/openjdk; \
92 |     cd /opt/java/openjdk; \
93 |     tar -xf /tmp/openjdk.tar.gz --strip-components=1; \
94 |     apk del --purge .fetch-deps; \
95 |     rm -rf /var/cache/apk/*; \
96 |     rm -rf /tmp/openjdk.tar.gz;
97 | 
98 | ENV JAVA_HOME=/opt/java/openjdk \
99 |     PATH="/opt/java/openjdk/bin:/opt/java/openjdk/jre/bin:$PATH"
100 | 
101 | # Before building, download Spark from the official site
102 | # (https://www.apache.org/dyn/closer.lua/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz);
103 | # 2.4.3 was the latest release when this file was written.
104 | # The source paths below are relative to the build context, so the Spark
105 | # directories (jars, bin, sbin, ...) and requirements.txt must be present in it.
106 | 
107 | # Copy the Spark directories to their place in the image.
108 | # COPY is used instead of ADD: these are plain local files, no archive
109 | # extraction or URL fetching is wanted.
110 | COPY jars /opt/spark/jars
111 | COPY bin /opt/spark/bin
112 | COPY sbin /opt/spark/sbin
113 | COPY kubernetes/dockerfiles/spark/entrypoint.sh /opt/
114 | COPY examples /opt/spark/examples
115 | COPY kubernetes/tests /opt/spark/tests
116 | COPY data /opt/spark/data
117 | 
118 | # Python requirements
119 | COPY requirements.txt /requirements.txt
120 | 
121 | # Install runtime packages, then the build dependencies needed for numpy/pandas
122 | # and the other Python packages. The build dependencies go into the virtual
123 | # package .build-deps and are removed after the pip install.
124 | # `update-ca-certificates` is best-effort and wrapped in ( ... || true ) so that
125 | # a failure of the preceding apk steps is not silently ignored.
126 | RUN set -ex \
127 |     && apk update \
128 |     && apk upgrade \
129 |     && apk add --no-cache \
130 |         libstdc++ \
131 |         python3-dev \
132 |         fontconfig \
133 |         chromium \
134 |         chromium-chromedriver \
135 |     && apk add --no-cache --virtual .build-deps \
136 |         g++ \
137 |         gcc \
138 |         make \
139 |         libc-dev \
140 |         libffi-dev \
141 |         openssl-dev \
142 |         ca-certificates \
143 |         libxml2-dev \
144 |         libxslt-dev \
145 |         libjpeg-turbo-dev \
146 |         zlib-dev \
147 |         musl-dev \
148 |         linux-headers \
149 |         pcre-dev \
150 |         curl \
151 |         git \
152 |     && ( update-ca-certificates 2>/dev/null || true ) \
153 |     && export PATH=$PATH:/usr/lib/chromium-browser \
154 |     && pip3.6 install -U pip==9.0.3 \
155 |     && pip3.6 install --no-cache-dir -r /requirements.txt \
156 |     && apk del .build-deps \
157 |     && rm -rf /var/cache/apk/*
158 | 
159 | # Setting environments
160 | ENV SPARK_HOME=/opt/spark
161 | 
162 | WORKDIR /opt/spark/work-dir
163 | 
164 | ENV PATH="/opt/spark/bin:${PATH}"
165 | 
166 | # Most important file: if it does not work properly you will get "driver-py not found in $PATH".
167 | # Its default location in a Spark installation is kubernetes/dockerfiles/spark/entrypoint.sh.
168 | 
169 | ENTRYPOINT [ "/opt/entrypoint.sh" ]
170 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # alpine-python3-numpy-pandas-sparkContainer-spark-submit #
2 | 
3 | ### If you are interested in working with Big Data(spark,pyspark,Java),ML(numpy,pandas),Data Engineering (kafka,pymongo,flatbuffers)on Docker - Kubernetes ###
4 | 
5 | ![you are expected to know everything possible with data in it](https://i.imgur.com/el8l3nS.jpg)
6 | 
7 | 
8 | Using python3.6 alpine base image adds java,pandas, numpy,pyspark,spark,kafka,pymongo,flatbuffers as rundeps.
9 | 
10 | 
11 | This image can be used as container image when you run spark-submit on k8.
12 | 
13 | 
14 | When you spark-submit you have to specify "spark.kubernetes.container.image="in conf options .
15 | 
16 | That image will be applied on the drivers and executors.
So if you need any dependencies such as numpy or pandas in your code,
17 | then you need to make sure that these dependencies are present in that image. This image must also contain Spark and its own dependencies (Java, pyspark).
18 | 
19 | 
20 | This image has all of these dependencies (check requirements.txt).
21 | 
22 | 
23 | If you need more dependencies, you can add them to requirements.txt.
24 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile
3 | # To update, run:
4 | #
5 | # pip-compile --output-file requirements.txt requirements.in
6 | #
7 | atomicwrites==1.1.5 # via pytest
8 | attrs==18.1.0 # via pytest
9 | funcsigs==1.0.2 # via pytest
10 | more-itertools==4.3.0 # via pytest
11 | numpy==1.15.0
12 | pandas==0.23.3
13 | pathlib2==2.3.2 # via pytest
14 | pluggy==0.7.1 # via pytest
15 | py==1.5.4 # via pytest
16 | pytest==3.7.0
17 | python-dateutil==2.6.1 # via pandas
18 | pytz==2018.5 # via pandas
19 | scandir==1.7 # via pathlib2
20 | six==1.11.0 # via more-itertools, pathlib2, pytest, python-dateutil
21 | pyspark==2.4.3
22 | boto3
23 | requests
24 | kafka-python==1.4.4
25 | jsonschema==2.6.0
26 | ujson==1.35
27 | pymongo==3.8.0
28 | grpcio==1.20.1
29 | grpcio-tools==1.20.1
30 | setuptools
31 | configparser==3.5.0
32 | google-api-python-client
33 | gcloud
34 | protoc_gen_swagger
35 | flatbuffers
36 | 
--------------------------------------------------------------------------------