├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── README.rst ├── docker-demo ├── README.rst ├── docker-compose.yaml └── image │ ├── Dockerfile │ ├── cloudera-cdh6.repo │ └── files │ ├── etc │ ├── dask │ │ └── config.yaml │ ├── edge.supervisord.conf │ ├── edge.supervisord.d │ │ └── jupyterhub.conf │ ├── hadoop │ │ ├── conf.kerberos │ │ │ ├── capacity-scheduler.xml │ │ │ ├── container-executor.cfg │ │ │ ├── core-site.xml │ │ │ ├── hdfs-site.xml │ │ │ ├── mapred-site.xml │ │ │ ├── resource-types.xml │ │ │ └── yarn-site.xml │ │ └── conf.temp │ │ │ ├── core-site.xml │ │ │ └── hdfs-site.xml │ ├── jupyter │ │ └── jupyter_config.py │ ├── jupyterhub │ │ └── jupyterhub_config.py │ ├── krb5.conf │ ├── master.supervisord.conf │ ├── master.supervisord.d │ │ ├── hdfs-namenode.conf │ │ ├── kerberos.conf │ │ └── yarn-resourcemanager.conf │ ├── spark │ │ └── conf │ │ │ └── spark-defaults.conf │ ├── worker.supervisord.conf │ └── worker.supervisord.d │ │ ├── hdfs-datanode.conf │ │ └── yarn-nodemanager.conf │ ├── opt │ └── jupyterhub │ │ └── start.sh │ ├── root │ ├── init-hdfs.sh │ ├── setup-jupyterhub.sh │ └── setup.sh │ └── var │ └── kerberos │ └── krb5kdc │ ├── kadm5.acl │ └── kdc.conf └── docs ├── Makefile └── source ├── _images ├── architecture.svg ├── dask-usage.gif ├── jupyterlab_interface.png └── login-page-hello-world.png ├── _static └── custom.css ├── _templates └── help.html ├── conf.py ├── contents-managers.rst ├── customization.rst ├── dask.rst ├── demo.rst ├── enable-https.rst ├── index.rst ├── installation.rst ├── jupyterlab.rst ├── manual-authentication.rst ├── manual-installation.rst ├── manual-setup.rst ├── manual-spawner.rst └── spark.rst /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | docs/build/ 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 
2 2 | 3 | sphinx: 4 | configuration: docs/source/conf.py 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, Jim Crist 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | jupyterhub-on-hadoop 2 | ==================== 3 | 4 | |Doc Status| 5 | 6 | Documentation and resources for deploying and managing JupyterHub_ on a 7 | `Hadoop Cluster`_. 8 | 9 | See `the documentation`_. 10 | 11 | LICENSE 12 | ------- 13 | 14 | New BSD. See the `License File`_. 15 | 16 | .. _JupyterHub: https://jupyterhub.readthedocs.io/ 17 | .. _Hadoop Cluster: https://hadoop.apache.org/ 18 | .. _the documentation: https://jupyterhub-on-hadoop.readthedocs.io 19 | .. _License File: https://github.com/jupyterhub/jupyterhub-on-hadoop/blob/master/LICENSE 20 | 21 | .. |Doc Status| image:: https://readthedocs.org/projects/jupyterhub-on-hadoop/badge/?version=latest 22 | :target: https://jupyterhub-on-hadoop.readthedocs.io 23 | :alt: Documentation Status 24 | -------------------------------------------------------------------------------- /docker-demo/README.rst: -------------------------------------------------------------------------------- 1 | JupyterHub on Hadoop Docker Demo 2 | ================================ 3 | 4 | This is a demo setup of JupyterHub on Hadoop deployed via Docker Compose. 5 | 6 | Startup 7 | ------- 8 | 9 | From this directory: 10 | 11 | .. code-block:: shell 12 | 13 | $ docker-compose up -d 14 | 15 | Usage 16 | ----- 17 | 18 | Three user accounts have been created: 19 | 20 | - ``alice`` 21 | - ``bob`` 22 | - ``carl`` 23 | 24 | All have the same password ``testpass``. 25 | 26 | After logging in you should be dropped into a Jupyter Notebook with common 27 | Python libraries like Pandas, NumPy, and Dask installed. 28 | 29 | Shutdown 30 | -------- 31 | 32 | From this directory 33 | 34 | .. 
code-block:: shell 35 | 36 | $ docker-compose down 37 | -------------------------------------------------------------------------------- /docker-demo/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | networks: 4 | default: 5 | name: example.com 6 | 7 | services: 8 | master: 9 | image: jcrist/jupyterhub-hadoop-demo 10 | user: root 11 | command: supervisord -c /etc/master.supervisord.conf 12 | container_name: master 13 | hostname: master 14 | domainname: example.com 15 | ports: 16 | - 8020:8020 # NN 17 | - 9000:9000 # NN 18 | - 50070:50070 # NN webui 19 | - 8088:8088 # RM webui 20 | - 88:88/udp # Kerberos 21 | - 749:749 # Kerberos 22 | tmpfs: 23 | - /tmp:noexec 24 | 25 | worker: 26 | image: jcrist/jupyterhub-hadoop-demo 27 | user: root 28 | command: supervisord -c /etc/worker.supervisord.conf 29 | container_name: worker 30 | hostname: worker 31 | domainname: example.com 32 | ports: 33 | - 50075:50075 # DN webui 34 | - 8042:8042 # NM webui 35 | tmpfs: 36 | - /tmp:noexec 37 | 38 | edge: 39 | image: jcrist/jupyterhub-hadoop-demo 40 | user: root 41 | environment: 42 | - JHUB_AUTHENTICATOR=${JHUB_AUTHENTICATOR:-dummy} 43 | command: supervisord -c /etc/edge.supervisord.conf 44 | container_name: edge 45 | hostname: edge 46 | domainname: example.com 47 | ports: 48 | - 8888:8888 # jupyterhub 49 | -------------------------------------------------------------------------------- /docker-demo/image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:centos7 2 | MAINTAINER jcrist 3 | 4 | # Install common utilities and kerberos 5 | RUN yum install -y \ 6 | sudo \ 7 | bzip2 \ 8 | java-1.8.0-openjdk \ 9 | krb5-libs \ 10 | krb5-server \ 11 | krb5-workstation \ 12 | && yum clean all \ 13 | && rm -rf /var/cache/yum 14 | 15 | # Install supervisord 16 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \ 17 | && python get-pip.py \ 18 | && pip 
install supervisor \ 19 | && rm get-pip.py 20 | 21 | # Install CDH 22 | # Note: We force remove a couple unnecessary packages to shrink the docker 23 | # image slightly 24 | ADD cloudera-cdh6.repo /etc/yum.repos.d/ 25 | RUN rpm --import https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/RPM-GPG-KEY-cloudera \ 26 | && yum install -y \ 27 | hadoop-yarn-resourcemanager \ 28 | hadoop-hdfs-namenode \ 29 | hadoop-yarn-nodemanager \ 30 | hadoop-hdfs-datanode \ 31 | hadoop-client \ 32 | hadoop-libhdfs \ 33 | spark-core \ 34 | spark-python \ 35 | && yum clean all \ 36 | && rm -rf /var/cache/yum \ 37 | && rm -rf /usr/lib/kite \ 38 | && rm -rf /usr/lib/hive \ 39 | && rm -rf /usr/lib/solr \ 40 | && rm -rf /usr/lib/sentry \ 41 | && rm -rf /usr/lib/flume-ng 42 | 43 | # Copy over files 44 | COPY ./files / 45 | 46 | # Setup hadoop and kerberos 47 | RUN /root/setup.sh 48 | 49 | # Setup jupyterhub 50 | RUN /root/setup-jupyterhub.sh 51 | 52 | ENV HADOOP_TESTING_VERSION=$HADOOP_TESTING_VERSION 53 | ENV LIBHDFS3_CONF /etc/hadoop/conf/hdfs-site.xml 54 | ENV HADOOP_CONF_DIR /etc/hadoop/conf 55 | ENV HADOOP_HOME /usr/lib/hadoop 56 | ENV HADOOP_COMMON_HOME /usr/lib/hadoop 57 | ENV HADOOP_YARN_HOME /usr/lib/hadoop-yarn 58 | ENV HADOOP_HDFS_HOME /usr/lib/hadoop-hdfs 59 | ENV SPARK_HOME /usr/lib/spark 60 | ENV JAVA_HOME /usr/lib/jvm/jre 61 | -------------------------------------------------------------------------------- /docker-demo/image/cloudera-cdh6.repo: -------------------------------------------------------------------------------- 1 | [cloudera-cdh6] 2 | # Packages for Cloudera CDH, Version 6, on RedHat or CentOS 7 x86_64 3 | name=Cloudera CDH 4 | baseurl=https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/ 5 | gpgkey=https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/RPM-GPG-KEY-cloudera 6 | gpgcheck = 1 7 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/dask/config.yaml: 
-------------------------------------------------------------------------------- 1 | yarn: 2 | environment: venv:///opt/jupyterhub/miniconda 3 | 4 | deploy-mode: local 5 | 6 | scheduler: 7 | vcores: 1 8 | memory: 512 MiB 9 | 10 | worker: 11 | vcores: 1 12 | memory: 512 MiB 13 | 14 | distributed: 15 | dashboard: 16 | link: /user/{JUPYTERHUB_USER}/proxy/{port}/status 17 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/edge.supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | strip_ansi = true 3 | nodaemon = true 4 | logfile = /var/log/supervisord.log 5 | pidfile = /var/run/supervisord.pid 6 | 7 | [unix_http_server] 8 | file = /tmp/supervisor.sock 9 | 10 | [rpcinterface:supervisor] 11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 12 | 13 | [supervisorctl] 14 | serverurl = unix:///tmp/supervisor.sock 15 | prompt = edge 16 | 17 | [include] 18 | files = /etc/edge.supervisord.d/*.conf 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/edge.supervisord.d/jupyterhub.conf: -------------------------------------------------------------------------------- 1 | [program:jupyterhub] 2 | command=/opt/jupyterhub/start.sh 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=jupyterhub 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/jupyterhub/jupyterhub.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | yarn.scheduler.capacity.root.queues 10 | jupyterhub,default 11 | 12 | 13 | 14 | yarn.scheduler.capacity.maximum-am-resource-percent 15 | 0.75 16 | 17 | 18 | 19 | 20 | 
yarn.scheduler.capacity.root.jupyterhub.capacity 21 | 50.0 22 | 23 | 24 | 25 | yarn.scheduler.capacity.root.jupyterhub.maximum-capacity 26 | 100.0 27 | 28 | 29 | 30 | 31 | yarn.scheduler.capacity.root.default.capacity 32 | 50.0 33 | 34 | 35 | 36 | yarn.scheduler.capacity.root.default.maximum-capacity 37 | 100.0 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/container-executor.cfg: -------------------------------------------------------------------------------- 1 | yarn.nodemanager.local-dirs=/var/lib/hadoop-yarn/cache/yarn/nm-local-dir 2 | yarn.nodemanager.linux-container-executor.group=yarn 3 | yarn.nodemanager.log-dirs=/var/log/hadoop-yarn/containers 4 | banned.users=hdfs,yarn,mapred,bin 5 | 6 | min.user.id=500 7 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | hadoop.tmp.dir 5 | /var/tmp/ 6 | 7 | 8 | 9 | fs.defaultFS 10 | hdfs://master.example.com:9000 11 | 12 | 13 | 14 | hadoop.proxyuser.mapred.hosts 15 | * 16 | 17 | 18 | 19 | hadoop.proxyuser.mapred.groups 20 | * 21 | 22 | 23 | 24 | hadoop.proxyuser.jupyterhub.hosts 25 | * 26 | 27 | 28 | 29 | hadoop.proxyuser.jupyterhub.users 30 | * 31 | 32 | 33 | 34 | hadoop.security.authentication 35 | kerberos 36 | 37 | 38 | 39 | hadoop.security.authorization 40 | true 41 | 42 | 43 | 44 | hadoop.http.filter.initializers 45 | org.apache.hadoop.security.AuthenticationFilterInitializer 46 | 47 | 48 | 49 | hadoop.http.authentication.type 50 | kerberos 51 | 52 | 53 | 54 | hadoop.http.authentication.signature.secret.file 55 | /etc/hadoop/conf/http-secret-file 56 | 57 | 58 | 59 | hadoop.http.authentication.cookie.domain 60 | .example.com 61 | 62 | 63 | 64 | hadoop.http.authentication.simple.anonymous.allowed 65 | false 66 | 67 | 68 | 69 
| hadoop.http.authentication.kerberos.principal 70 | HTTP/master.example.com@EXAMPLE.COM 71 | 72 | 73 | 74 | hadoop.http.authentication.kerberos.keytab 75 | /etc/hadoop/conf/master-keytabs/HTTP.keytab 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.replication 5 | 1 6 | 7 | 8 | 9 | dfs.permissions.enabled 10 | true 11 | 12 | 13 | 14 | dfs.webhdfs.enabled 15 | true 16 | 17 | 18 | 19 | dfs.block.access.token.enable 20 | true 21 | 22 | 23 | 24 | dfs.namenode.keytab.file 25 | /etc/hadoop/conf/master-keytabs/hdfs.keytab 26 | 27 | 28 | 29 | dfs.namenode.kerberos.principal 30 | hdfs/master.example.com@EXAMPLE.COM 31 | 32 | 33 | 34 | dfs.namenode.kerberos.internal.spnego.principal 35 | HTTP/master.example.com@EXAMPLE.COM 36 | 37 | 38 | 39 | dfs.datanode.keytab.file 40 | /etc/hadoop/conf/worker-keytabs/hdfs.keytab 41 | 42 | 43 | 44 | dfs.datanode.kerberos.principal 45 | hdfs/worker.example.com@EXAMPLE.COM 46 | 47 | 48 | 49 | dfs.web.authentication.kerberos.principal 50 | HTTP/master.example.com@EXAMPLE.COM 51 | 52 | 53 | 54 | dfs.web.authentication.kerberos.keytab 55 | /etc/hadoop/conf/master-keytabs/HTTP.keytab 56 | 57 | 58 | 59 | ignore.secure.ports.for.testing 60 | true 61 | 62 | 63 | 64 | dfs.http.policy 65 | HTTP_ONLY 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | mapreduce.framework.name 5 | yarn 6 | 7 | 8 | 9 | mapreduce.jobhistory.address 10 | master.example.com:10020 11 | 12 | 13 | 14 | mapreduce.jobhistory.webapp.address 15 | master.example.com:19888 16 | 17 | 18 | 19 | mapreduce.jobhistory.keytab 20 | 
/etc/hadoop/conf/master-keytabs/mapred.keytab 21 | 22 | 23 | 24 | mapreduce.jobhistory.principal 25 | mapred/master.example.com@EXAMPLE.COM 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/resource-types.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | yarn.resource-types 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.resourcemanager.hostname 5 | master.example.com 6 | 7 | 8 | 9 | yarn.nodemanager.aux-services 10 | mapreduce_shuffle 11 | 12 | 13 | 14 | yarn.application.classpath 15 | 16 | $HADOOP_CONF_DIR, 17 | $HADOOP_COMMON_HOME/*, 18 | $HADOOP_COMMON_HOME/lib/*, 19 | $HADOOP_HDFS_HOME/*, 20 | $HADOOP_HDFS_HOME/lib/*, 21 | $HADOOP_MAPRED_HOME/*, 22 | $HADOOP_MAPRED_HOME/lib/*, 23 | $HADOOP_YARN_HOME/*, 24 | $HADOOP_YARN_HOME/lib/* 25 | 26 | 27 | 28 | 29 | yarn.nodemanager.local-dirs 30 | file:///var/tmp/hadoop-yarn/local 31 | 32 | 33 | 34 | yarn.nodemanager.log-dirs 35 | file:///var/tmp/hadoop-yarn/logs 36 | 37 | 38 | 39 | yarn.log-aggregation-enable 40 | true 41 | 42 | 43 | 44 | yarn.nodemanager.remote-app-log-dir 45 | hdfs://master.example.com:9000/var/log/hadoop-yarn/apps 46 | 47 | 48 | 49 | yarn.scheduler.minimum-allocation-mb 50 | 32 51 | 52 | 53 | 54 | yarn.resource-types.memory-mb.increment-allocation 55 | ${yarn.scheduler.minimum-allocation-mb} 56 | 57 | 58 | 59 | yarn.scheduler.increment-allocation-mb 60 | ${yarn.scheduler.minimum-allocation-mb} 61 | 62 | 63 | 64 | yarn.nodemanager.resource.cpu-vcores 65 | 16 66 | 67 | 68 | 69 | yarn.resourcemanager.scheduler.class 70 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 71 | 72 | 73 | 74 | 
yarn.resourcemanager.keytab 75 | /etc/hadoop/conf/master-keytabs/yarn.keytab 76 | 77 | 78 | 79 | yarn.resourcemanager.principal 80 | yarn/master.example.com@EXAMPLE.COM 81 | 82 | 83 | 84 | yarn.nodemanager.keytab 85 | /etc/hadoop/conf/worker-keytabs/yarn.keytab 86 | 87 | 88 | 89 | yarn.nodemanager.principal 90 | yarn/worker.example.com@EXAMPLE.COM 91 | 92 | 93 | 94 | yarn.nodemanager.container-executor.class 95 | org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor 96 | 97 | 98 | 99 | yarn.nodemanager.linux-container-executor.path 100 | /usr/lib/hadoop-yarn/bin/container-executor 101 | 102 | 103 | 104 | yarn.nodemanager.linux-container-executor.group 105 | yarn 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.temp/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | hadoop.tmp.dir 5 | /var/tmp/ 6 | 7 | 8 | 9 | fs.defaultFS 10 | hdfs://master.example.com:9000 11 | 12 | 13 | 14 | hadoop.security.authentication 15 | simple 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.temp/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.replication 5 | 1 6 | 7 | 8 | 9 | dfs.permissions.enabled 10 | true 11 | 12 | 13 | 14 | dfs.webhdfs.enabled 15 | true 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/jupyter/jupyter_config.py: -------------------------------------------------------------------------------- 1 | c.NotebookApp.contents_manager_class = "hdfscm.HDFSContentsManager" 2 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/jupyterhub/jupyterhub_config.py: 
-------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | # Basic setup 5 | c.JupyterHub.bind_url = 'http://:8888' 6 | c.JupyterHub.cookie_secret_file = '/etc/jupyterhub/jupyterhub_cookie_secret' 7 | c.JupyterHub.db_url = 'sqlite:////var/jupyterhub/jupyterhub.sqlite' 8 | 9 | # Make the JupyterHub internal communication accessible from other machines 10 | # in the cluster 11 | c.JupyterHub.hub_ip = '' 12 | 13 | # Enable yarnspawner 14 | c.JupyterHub.spawner_class = 'yarnspawner.YarnSpawner' 15 | 16 | # Configuration for kerberos security 17 | c.YarnSpawner.principal = 'jupyterhub' 18 | c.YarnSpawner.keytab = '/etc/jupyterhub/jupyterhub.keytab' 19 | 20 | # Resource limits per-user 21 | c.YarnSpawner.mem_limit = '2 G' 22 | c.YarnSpawner.cpu_limit = 1 23 | 24 | ## Configure environment variables in user notebook sessions 25 | # Find pyspark modules to add to python path, so they can be used as regular 26 | # libraries 27 | pyspark = '/usr/lib/spark/python/' 28 | py4j = glob.glob(os.path.join(pyspark, 'lib', 'py4j-*.zip'))[0] 29 | pythonpath = ':'.join([pyspark, py4j]) 30 | c.YarnSpawner.environment = { 31 | 'PYTHONPATH': pythonpath, 32 | 'PYSPARK_PYTHON': '/opt/jupyterhub/miniconda/bin/python', 33 | 'PYSPARK_DRIVER_PYTHON': '/opt/jupyterhub/miniconda/bin/python', 34 | } 35 | 36 | # The YARN queue to use 37 | c.YarnSpawner.queue = 'jupyterhub' 38 | 39 | # Activate the JupyterHub conda environment 40 | c.YarnSpawner.prologue = 'source /opt/jupyterhub/miniconda/bin/activate' 41 | 42 | authenticator = os.environ.get('JHUB_AUTHENTICATOR', 'dummy').lower() 43 | if authenticator == 'kerberos': 44 | c.JupyterHub.authenticator_class = 'kerberosauthenticator.KerberosAuthenticator' 45 | c.KerberosAuthenticator.keytab = '/etc/jupyterhub/jupyterhub.keytab' 46 | else: 47 | c.JupyterHub.authenticator_class = 'dummyauthenticator.DummyAuthenticator' 48 | c.DummyAuthenticator.password = "testpass" 49 | # A whitelist of valid 
usernames. The kerberosauthenticator will enforce 50 | # only valid users are logged in, but the dummyauthenticator has no such 51 | # mechanism so we need to rely on a whitelist 52 | c.DummyAuthenticator.whitelist = [ 53 | 'alice', 54 | 'bob', 55 | 'carl' 56 | ] 57 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/krb5.conf: -------------------------------------------------------------------------------- 1 | [logging] 2 | default = FILE:/var/log/krb5libs.log 3 | kdc = FILE:/var/log/krb5kdc.log 4 | admin_server = FILE:/var/log/kadmind.log 5 | 6 | [libdefaults] 7 | default_realm = EXAMPLE.COM 8 | dns_lookup_realm = false 9 | dns_lookup_kdc = false 10 | ticket_lifetime = 24h 11 | renew_lifetime = 7d 12 | forwardable = true 13 | 14 | [realms] 15 | EXAMPLE.COM = { 16 | kdc = master.example.com 17 | admin_server = master.example.com 18 | } 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | strip_ansi = true 3 | nodaemon = true 4 | logfile = /var/log/supervisord.log 5 | pidfile = /var/run/supervisord.pid 6 | 7 | [unix_http_server] 8 | file = /tmp/supervisor.sock 9 | 10 | [rpcinterface:supervisor] 11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 12 | 13 | [supervisorctl] 14 | serverurl = unix:///tmp/supervisor.sock 15 | prompt = master 16 | 17 | [include] 18 | files = /etc/master.supervisord.d/*.conf 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.d/hdfs-namenode.conf: -------------------------------------------------------------------------------- 1 | [program:hdfs-namenode] 2 | command=hdfs namenode 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=hdfs 6 | redirect_stderr=true 7 | 
stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-namenode.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.d/kerberos.conf: -------------------------------------------------------------------------------- 1 | [program:krb5kdc] 2 | command=/bin/bash -c "exec /usr/sbin/krb5kdc -r EXAMPLE.COM -P /var/run/krb5kdc.pid -n" 3 | redirect_stderr=true 4 | stdout_logfile=/dev/stdout 5 | stdout_logfile_maxbytes=0 6 | autostart=true 7 | autorestart=true 8 | 9 | [program:kadmind] 10 | command=/bin/bash -c "exec /usr/sbin/kadmind -r EXAMPLE.COM -P /var/run/kadmind.pid -nofork" 11 | redirect_stderr=true 12 | stdout_logfile=/dev/stdout 13 | stdout_logfile_maxbytes=0 14 | autostart=true 15 | autorestart=true 16 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.d/yarn-resourcemanager.conf: -------------------------------------------------------------------------------- 1 | [program:yarn-resourcemanager] 2 | command=yarn resourcemanager 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=yarn 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-resourcemanager.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/spark/conf/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # Spark on yarn configuration 2 | spark.master yarn 3 | spark.submit.deployMode client 4 | spark.yarn.queue default 5 | spark.yarn.jars local:/usr/lib/spark/jars/* 6 | 7 | # Minimize memory requirements in demo cluster 8 | spark.driver.memory 512m 9 | spark.executor.memory 512m 10 | spark.executor.instances 1 11 | 12 | # Pyspark configuration 13 | spark.pyspark.python /opt/jupyterhub/miniconda/bin/python 14 | 
spark.pyspark.driver.python /opt/jupyterhub/miniconda/bin/python 15 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/worker.supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | strip_ansi = true 3 | nodaemon = true 4 | logfile = /var/log/supervisord.log 5 | pidfile = /var/run/supervisord.pid 6 | 7 | [unix_http_server] 8 | file = /tmp/supervisor.sock 9 | 10 | [rpcinterface:supervisor] 11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 12 | 13 | [supervisorctl] 14 | serverurl = unix:///tmp/supervisor.sock 15 | prompt = worker 16 | 17 | [include] 18 | files = /etc/worker.supervisord.d/*.conf 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/worker.supervisord.d/hdfs-datanode.conf: -------------------------------------------------------------------------------- 1 | [program:hdfs-datanode] 2 | command=hdfs datanode 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=hdfs 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-datanode.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/worker.supervisord.d/yarn-nodemanager.conf: -------------------------------------------------------------------------------- 1 | [program:yarn-nodemanager] 2 | command=yarn nodemanager 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=yarn 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-nodemanager.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/opt/jupyterhub/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export 
PATH="/opt/jupyterhub/miniconda/bin:$PATH" 4 | cd /var/jupyterhub 5 | exec jupyterhub -f /etc/jupyterhub/jupyterhub_config.py 6 | -------------------------------------------------------------------------------- /docker-demo/image/files/root/init-hdfs.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Exponential backoff on testing hdfs status, then run init script 4 | echo "Waiting to connect to HDFS" 5 | timeout=2 6 | exit_code=0 7 | for attempt in {1..5}; do 8 | hdfs dfs -ls / 9 | exit_code=$? 10 | 11 | if [[ $exit_code == 0 ]]; then 12 | break 13 | fi 14 | 15 | echo "Retrying in $timeout.." 1>&2 16 | sleep $timeout 17 | timeout=$[$timeout * 2] 18 | done 19 | 20 | if [[ $exit_code != 0 ]]; then 21 | echo "Failed to connect to HDFS" 22 | exit $exit_code 23 | fi 24 | echo "HDFS connected, initializing directory structure" 25 | 26 | hdfs dfs -mkdir -p /tmp \ 27 | && hdfs dfs -chmod -R 1777 /tmp \ 28 | && hdfs dfs -mkdir -p /var/log \ 29 | && hdfs dfs -chmod -R 1775 /var/log \ 30 | && hdfs dfs -chown yarn:hadoop /var/log \ 31 | && hdfs dfs -mkdir -p /tmp/hadoop-yarn \ 32 | && hdfs dfs -chown -R mapred:hadoop /tmp/hadoop-yarn \ 33 | && hdfs dfs -mkdir -p /tmp/hadoop-yarn/staging/history/done_intermediate \ 34 | && hdfs dfs -chown -R mapred:hadoop /tmp/hadoop-yarn/staging \ 35 | && hdfs dfs -chmod -R 1777 /tmp \ 36 | && hdfs dfs -mkdir -p /var/log/hadoop-yarn/apps \ 37 | && hdfs dfs -chmod -R 1777 /var/log/hadoop-yarn/apps \ 38 | && hdfs dfs -chown yarn:hadoop /var/log/hadoop-yarn/apps \ 39 | && hdfs dfs -mkdir -p /user \ 40 | && hdfs dfs -mkdir -p /user/root \ 41 | && hdfs dfs -chmod -R 777 /user/root \ 42 | && hdfs dfs -chown root /user/root \ 43 | && hdfs dfs -mkdir -p /user/history \ 44 | && hdfs dfs -chmod -R 1777 /user/history \ 45 | && hdfs dfs -chown mapred:hadoop /user/history \ 46 | && hdfs dfs -mkdir -p /user/alice \ 47 | && hdfs dfs -chown alice /user/alice \ 48 | && hdfs dfs -mkdir -p /user/bob \ 
49 | && hdfs dfs -chown bob /user/bob \ 50 | && hdfs dfs -mkdir -p /user/carl \ 51 | && hdfs dfs -chown carl /user/carl 52 | 53 | exit_code=$? 54 | if [[ $exit_code != 0 ]]; then 55 | echo "Failed to initialize HDFS" 56 | exit $exit_code 57 | fi 58 | echo "Initialized HDFS" 59 | -------------------------------------------------------------------------------- /docker-demo/image/files/root/setup-jupyterhub.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # Make jupyterhub directories 6 | mkdir -p /etc/jupyterhub 7 | mkdir -p /opt/jupyterhub 8 | mkdir -p /var/jupyterhub 9 | mkdir -p /var/log/jupyterhub 10 | chown jupyterhub /var/jupyterhub 11 | chown jupyterhub /var/log/jupyterhub 12 | 13 | # Create jupyterhub cookie secret 14 | openssl rand -hex 32 > /etc/jupyterhub/jupyterhub_cookie_secret 15 | chmod 400 /etc/jupyterhub/jupyterhub_cookie_secret 16 | chown jupyterhub /etc/jupyterhub/jupyterhub_cookie_secret 17 | 18 | # Install miniconda 19 | curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh \ 20 | && /bin/bash /tmp/miniconda.sh -b -p /opt/jupyterhub/miniconda \ 21 | && rm /tmp/miniconda.sh \ 22 | && echo 'export PATH="/opt/jupyterhub/miniconda/bin:$PATH"' >> /root/.bashrc \ 23 | && source /root/.bashrc \ 24 | && conda config --set always_yes yes --set changeps1 no 25 | 26 | # Install JupyterHub, dependencies, and user packages. Normally you'd create a 27 | # separate Python environment here (and optionally package it and put it on 28 | # HDFS). However, to save memory usage in the docker images we'll use the same 29 | # environment for everything. 
Adding dependencies for use by users: 30 | conda install -c conda-forge \ 31 | jupyterhub \ 32 | jupyterhub-yarnspawner \ 33 | jupyter-hdfscm \ 34 | jupyter-server-proxy \ 35 | tornado==5.1.1 \ 36 | notebook \ 37 | jupyterlab \ 38 | ipywidgets \ 39 | pykerberos \ 40 | dask-yarn \ 41 | dask==1.2.0 \ 42 | pyarrow \ 43 | pandas==0.24.2 \ 44 | numpy==1.16.2 \ 45 | nomkl 46 | 47 | # Patch out no HTTPS warning in login script to give prettier demos. 48 | sed -i '/^