├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── README.rst ├── docker-demo ├── README.rst ├── docker-compose.yaml └── image │ ├── Dockerfile │ ├── cloudera-cdh6.repo │ └── files │ ├── etc │ ├── dask │ │ └── config.yaml │ ├── edge.supervisord.conf │ ├── edge.supervisord.d │ │ └── jupyterhub.conf │ ├── hadoop │ │ ├── conf.kerberos │ │ │ ├── capacity-scheduler.xml │ │ │ ├── container-executor.cfg │ │ │ ├── core-site.xml │ │ │ ├── hdfs-site.xml │ │ │ ├── mapred-site.xml │ │ │ ├── resource-types.xml │ │ │ └── yarn-site.xml │ │ └── conf.temp │ │ │ ├── core-site.xml │ │ │ └── hdfs-site.xml │ ├── jupyter │ │ └── jupyter_config.py │ ├── jupyterhub │ │ └── jupyterhub_config.py │ ├── krb5.conf │ ├── master.supervisord.conf │ ├── master.supervisord.d │ │ ├── hdfs-namenode.conf │ │ ├── kerberos.conf │ │ └── yarn-resourcemanager.conf │ ├── spark │ │ └── conf │ │ │ └── spark-defaults.conf │ ├── worker.supervisord.conf │ └── worker.supervisord.d │ │ ├── hdfs-datanode.conf │ │ └── yarn-nodemanager.conf │ ├── opt │ └── jupyterhub │ │ └── start.sh │ ├── root │ ├── init-hdfs.sh │ ├── setup-jupyterhub.sh │ └── setup.sh │ └── var │ └── kerberos │ └── krb5kdc │ ├── kadm5.acl │ └── kdc.conf └── docs ├── Makefile └── source ├── _images ├── architecture.svg ├── dask-usage.gif ├── jupyterlab_interface.png └── login-page-hello-world.png ├── _static └── custom.css ├── _templates └── help.html ├── conf.py ├── contents-managers.rst ├── customization.rst ├── dask.rst ├── demo.rst ├── enable-https.rst ├── index.rst ├── installation.rst ├── jupyterlab.rst ├── manual-authentication.rst ├── manual-installation.rst ├── manual-setup.rst ├── manual-spawner.rst └── spark.rst /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | docs/build/ 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 
2 2 | 3 | sphinx: 4 | configuration: docs/source/conf.py 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, Jim Crist 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | jupyterhub-on-hadoop 2 | ==================== 3 | 4 | |Doc Status| 5 | 6 | Documentation and resources for deploying and managing JupyterHub_ on a 7 | `Hadoop Cluster`_. 8 | 9 | See `the documentation`_. 10 | 11 | LICENSE 12 | ------- 13 | 14 | New BSD. See the `License File`_. 15 | 16 | .. _JupyterHub: https://jupyterhub.readthedocs.io/ 17 | .. _Hadoop Cluster: https://hadoop.apache.org/ 18 | .. _the documentation: https://jupyterhub-on-hadoop.readthedocs.io 19 | .. _License File: https://github.com/jupyterhub/jupyterhub-on-hadoop/blob/master/LICENSE 20 | 21 | .. |Doc Status| image:: https://readthedocs.org/projects/jupyterhub-on-hadoop/badge/?version=latest 22 | :target: https://jupyterhub-on-hadoop.readthedocs.io 23 | :alt: Documentation Status 24 | -------------------------------------------------------------------------------- /docker-demo/README.rst: -------------------------------------------------------------------------------- 1 | JupyterHub on Hadoop Docker Demo 2 | ================================ 3 | 4 | This is a demo setup of JupyterHub on Hadoop deployed via Docker Compose. 5 | 6 | Startup 7 | ------- 8 | 9 | From this directory: 10 | 11 | .. code-block:: shell 12 | 13 | $ docker-compose up -d 14 | 15 | Usage 16 | ----- 17 | 18 | Three user accounts have been created: 19 | 20 | - ``alice`` 21 | - ``bob`` 22 | - ``carl`` 23 | 24 | All have the same password ``testpass``. 25 | 26 | After logging in you should be dropped into a Jupyter Notebook with common 27 | Python libraries like Pandas, NumPy, and Dask installed. 28 | 29 | Shutdown 30 | -------- 31 | 32 | From this directory 33 | 34 | .. 
code-block:: shell 35 | 36 | $ docker-compose down 37 | -------------------------------------------------------------------------------- /docker-demo/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | networks: 4 | default: 5 | name: example.com 6 | 7 | services: 8 | master: 9 | image: jcrist/jupyterhub-hadoop-demo 10 | user: root 11 | command: supervisord -c /etc/master.supervisord.conf 12 | container_name: master 13 | hostname: master 14 | domainname: example.com 15 | ports: 16 | - 8020:8020 # NN 17 | - 9000:9000 # NN 18 | - 50070:50070 # NN webui 19 | - 8088:8088 # RM webui 20 | - 88:88/udp # Kerberos 21 | - 749:749 # Kerberos 22 | tmpfs: 23 | - /tmp:noexec 24 | 25 | worker: 26 | image: jcrist/jupyterhub-hadoop-demo 27 | user: root 28 | command: supervisord -c /etc/worker.supervisord.conf 29 | container_name: worker 30 | hostname: worker 31 | domainname: example.com 32 | ports: 33 | - 50075:50075 # DN webui 34 | - 8042:8042 # NM webui 35 | tmpfs: 36 | - /tmp:noexec 37 | 38 | edge: 39 | image: jcrist/jupyterhub-hadoop-demo 40 | user: root 41 | environment: 42 | - JHUB_AUTHENTICATOR=${JHUB_AUTHENTICATOR:-dummy} 43 | command: supervisord -c /etc/edge.supervisord.conf 44 | container_name: edge 45 | hostname: edge 46 | domainname: example.com 47 | ports: 48 | - 8888:8888 # jupyterhub 49 | -------------------------------------------------------------------------------- /docker-demo/image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:centos7 2 | MAINTAINER jcrist 3 | 4 | # Install common utilities and kerberos 5 | RUN yum install -y \ 6 | sudo \ 7 | bzip2 \ 8 | java-1.8.0-openjdk \ 9 | krb5-libs \ 10 | krb5-server \ 11 | krb5-workstation \ 12 | && yum clean all \ 13 | && rm -rf /var/cache/yum 14 | 15 | # Install supervisord 16 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \ 17 | && python get-pip.py \ 18 | && pip 
install supervisor \ 19 | && rm get-pip.py 20 | 21 | # Install CDH 22 | # Note: We force remove a couple unnecessary packages to shrink the docker 23 | # image slightly 24 | ADD cloudera-cdh6.repo /etc/yum.repos.d/ 25 | RUN rpm --import https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/RPM-GPG-KEY-cloudera \ 26 | && yum install -y \ 27 | hadoop-yarn-resourcemanager \ 28 | hadoop-hdfs-namenode \ 29 | hadoop-yarn-nodemanager \ 30 | hadoop-hdfs-datanode \ 31 | hadoop-client \ 32 | hadoop-libhdfs \ 33 | spark-core \ 34 | spark-python \ 35 | && yum clean all \ 36 | && rm -rf /var/cache/yum \ 37 | && rm -rf /usr/lib/kite \ 38 | && rm -rf /usr/lib/hive \ 39 | && rm -rf /usr/lib/solr \ 40 | && rm -rf /usr/lib/sentry \ 41 | && rm -rf /usr/lib/flume-ng 42 | 43 | # Copy over files 44 | COPY ./files / 45 | 46 | # Setup hadoop and kerberos 47 | RUN /root/setup.sh 48 | 49 | # Setup jupyterhub 50 | RUN /root/setup-jupyterhub.sh 51 | 52 | ENV HADOOP_TESTING_VERSION=$HADOOP_TESTING_VERSION 53 | ENV LIBHDFS3_CONF /etc/hadoop/conf/hdfs-site.xml 54 | ENV HADOOP_CONF_DIR /etc/hadoop/conf 55 | ENV HADOOP_HOME /usr/lib/hadoop 56 | ENV HADOOP_COMMON_HOME /usr/lib/hadoop 57 | ENV HADOOP_YARN_HOME /usr/lib/hadoop-yarn 58 | ENV HADOOP_HDFS_HOME /usr/lib/hadoop-hdfs 59 | ENV SPARK_HOME /usr/lib/spark 60 | ENV JAVA_HOME /usr/lib/jvm/jre 61 | -------------------------------------------------------------------------------- /docker-demo/image/cloudera-cdh6.repo: -------------------------------------------------------------------------------- 1 | [cloudera-cdh6] 2 | # Packages for Cloudera CDH, Version 6, on RedHat or CentOS 7 x86_64 3 | name=Cloudera CDH 4 | baseurl=https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/ 5 | gpgkey=https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/RPM-GPG-KEY-cloudera 6 | gpgcheck = 1 7 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/dask/config.yaml: 
-------------------------------------------------------------------------------- 1 | yarn: 2 | environment: venv:///opt/jupyterhub/miniconda 3 | 4 | deploy-mode: local 5 | 6 | scheduler: 7 | vcores: 1 8 | memory: 512 MiB 9 | 10 | worker: 11 | vcores: 1 12 | memory: 512 MiB 13 | 14 | distributed: 15 | dashboard: 16 | link: /user/{JUPYTERHUB_USER}/proxy/{port}/status 17 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/edge.supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | strip_ansi = true 3 | nodaemon = true 4 | logfile = /var/log/supervisord.log 5 | pidfile = /var/run/supervisord.pid 6 | 7 | [unix_http_server] 8 | file = /tmp/supervisor.sock 9 | 10 | [rpcinterface:supervisor] 11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 12 | 13 | [supervisorctl] 14 | serverurl = unix:///tmp/supervisor.sock 15 | prompt = edge 16 | 17 | [include] 18 | files = /etc/edge.supervisord.d/*.conf 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/edge.supervisord.d/jupyterhub.conf: -------------------------------------------------------------------------------- 1 | [program:jupyterhub] 2 | command=/opt/jupyterhub/start.sh 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=jupyterhub 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/jupyterhub/jupyterhub.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | yarn.scheduler.capacity.root.queues 10 | jupyterhub,default 11 | 12 | 13 | 14 | yarn.scheduler.capacity.maximum-am-resource-percent 15 | 0.75 16 | 17 | 18 | 19 | 20 | 
yarn.scheduler.capacity.root.jupyterhub.capacity 21 | 50.0 22 | 23 | 24 | 25 | yarn.scheduler.capacity.root.jupyterhub.maximum-capacity 26 | 100.0 27 | 28 | 29 | 30 | 31 | yarn.scheduler.capacity.root.default.capacity 32 | 50.0 33 | 34 | 35 | 36 | yarn.scheduler.capacity.root.default.maximum-capacity 37 | 100.0 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/container-executor.cfg: -------------------------------------------------------------------------------- 1 | yarn.nodemanager.local-dirs=/var/lib/hadoop-yarn/cache/yarn/nm-local-dir 2 | yarn.nodemanager.linux-container-executor.group=yarn 3 | yarn.nodemanager.log-dirs=/var/log/hadoop-yarn/containers 4 | banned.users=hdfs,yarn,mapred,bin 5 | 6 | min.user.id=500 7 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | hadoop.tmp.dir 5 | /var/tmp/ 6 | 7 | 8 | 9 | fs.defaultFS 10 | hdfs://master.example.com:9000 11 | 12 | 13 | 14 | hadoop.proxyuser.mapred.hosts 15 | * 16 | 17 | 18 | 19 | hadoop.proxyuser.mapred.groups 20 | * 21 | 22 | 23 | 24 | hadoop.proxyuser.jupyterhub.hosts 25 | * 26 | 27 | 28 | 29 | hadoop.proxyuser.jupyterhub.users 30 | * 31 | 32 | 33 | 34 | hadoop.security.authentication 35 | kerberos 36 | 37 | 38 | 39 | hadoop.security.authorization 40 | true 41 | 42 | 43 | 44 | hadoop.http.filter.initializers 45 | org.apache.hadoop.security.AuthenticationFilterInitializer 46 | 47 | 48 | 49 | hadoop.http.authentication.type 50 | kerberos 51 | 52 | 53 | 54 | hadoop.http.authentication.signature.secret.file 55 | /etc/hadoop/conf/http-secret-file 56 | 57 | 58 | 59 | hadoop.http.authentication.cookie.domain 60 | .example.com 61 | 62 | 63 | 64 | hadoop.http.authentication.simple.anonymous.allowed 65 | false 66 | 67 | 68 | 69 
| hadoop.http.authentication.kerberos.principal 70 | HTTP/master.example.com@EXAMPLE.COM 71 | 72 | 73 | 74 | hadoop.http.authentication.kerberos.keytab 75 | /etc/hadoop/conf/master-keytabs/HTTP.keytab 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.replication 5 | 1 6 | 7 | 8 | 9 | dfs.permissions.enabled 10 | true 11 | 12 | 13 | 14 | dfs.webhdfs.enabled 15 | true 16 | 17 | 18 | 19 | dfs.block.access.token.enable 20 | true 21 | 22 | 23 | 24 | dfs.namenode.keytab.file 25 | /etc/hadoop/conf/master-keytabs/hdfs.keytab 26 | 27 | 28 | 29 | dfs.namenode.kerberos.principal 30 | hdfs/master.example.com@EXAMPLE.COM 31 | 32 | 33 | 34 | dfs.namenode.kerberos.internal.spnego.principal 35 | HTTP/master.example.com@EXAMPLE.COM 36 | 37 | 38 | 39 | dfs.datanode.keytab.file 40 | /etc/hadoop/conf/worker-keytabs/hdfs.keytab 41 | 42 | 43 | 44 | dfs.datanode.kerberos.principal 45 | hdfs/worker.example.com@EXAMPLE.COM 46 | 47 | 48 | 49 | dfs.web.authentication.kerberos.principal 50 | HTTP/master.example.com@EXAMPLE.COM 51 | 52 | 53 | 54 | dfs.web.authentication.kerberos.keytab 55 | /etc/hadoop/conf/master-keytabs/HTTP.keytab 56 | 57 | 58 | 59 | ignore.secure.ports.for.testing 60 | true 61 | 62 | 63 | 64 | dfs.http.policy 65 | HTTP_ONLY 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | mapreduce.framework.name 5 | yarn 6 | 7 | 8 | 9 | mapreduce.jobhistory.address 10 | master.example.com:10020 11 | 12 | 13 | 14 | mapreduce.jobhistory.webapp.address 15 | master.example.com:19888 16 | 17 | 18 | 19 | mapreduce.jobhistory.keytab 20 | 
/etc/hadoop/conf/master-keytabs/mapred.keytab 21 | 22 | 23 | 24 | mapreduce.jobhistory.principal 25 | mapred/master.example.com@EXAMPLE.COM 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/resource-types.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | yarn.resource-types 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.kerberos/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.resourcemanager.hostname 5 | master.example.com 6 | 7 | 8 | 9 | yarn.nodemanager.aux-services 10 | mapreduce_shuffle 11 | 12 | 13 | 14 | yarn.application.classpath 15 | 16 | $HADOOP_CONF_DIR, 17 | $HADOOP_COMMON_HOME/*, 18 | $HADOOP_COMMON_HOME/lib/*, 19 | $HADOOP_HDFS_HOME/*, 20 | $HADOOP_HDFS_HOME/lib/*, 21 | $HADOOP_MAPRED_HOME/*, 22 | $HADOOP_MAPRED_HOME/lib/*, 23 | $HADOOP_YARN_HOME/*, 24 | $HADOOP_YARN_HOME/lib/* 25 | 26 | 27 | 28 | 29 | yarn.nodemanager.local-dirs 30 | file:///var/tmp/hadoop-yarn/local 31 | 32 | 33 | 34 | yarn.nodemanager.log-dirs 35 | file:///var/tmp/hadoop-yarn/logs 36 | 37 | 38 | 39 | yarn.log-aggregation-enable 40 | true 41 | 42 | 43 | 44 | yarn.nodemanager.remote-app-log-dir 45 | hdfs://master.example.com:9000/var/log/hadoop-yarn/apps 46 | 47 | 48 | 49 | yarn.scheduler.minimum-allocation-mb 50 | 32 51 | 52 | 53 | 54 | yarn.resource-types.memory-mb.increment-allocation 55 | ${yarn.scheduler.minimum-allocation-mb} 56 | 57 | 58 | 59 | yarn.scheduler.increment-allocation-mb 60 | ${yarn.scheduler.minimum-allocation-mb} 61 | 62 | 63 | 64 | yarn.nodemanager.resource.cpu-vcores 65 | 16 66 | 67 | 68 | 69 | yarn.resourcemanager.scheduler.class 70 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 71 | 72 | 73 | 74 | 
yarn.resourcemanager.keytab 75 | /etc/hadoop/conf/master-keytabs/yarn.keytab 76 | 77 | 78 | 79 | yarn.resourcemanager.principal 80 | yarn/master.example.com@EXAMPLE.COM 81 | 82 | 83 | 84 | yarn.nodemanager.keytab 85 | /etc/hadoop/conf/worker-keytabs/yarn.keytab 86 | 87 | 88 | 89 | yarn.nodemanager.principal 90 | yarn/worker.example.com@EXAMPLE.COM 91 | 92 | 93 | 94 | yarn.nodemanager.container-executor.class 95 | org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor 96 | 97 | 98 | 99 | yarn.nodemanager.linux-container-executor.path 100 | /usr/lib/hadoop-yarn/bin/container-executor 101 | 102 | 103 | 104 | yarn.nodemanager.linux-container-executor.group 105 | yarn 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.temp/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | hadoop.tmp.dir 5 | /var/tmp/ 6 | 7 | 8 | 9 | fs.defaultFS 10 | hdfs://master.example.com:9000 11 | 12 | 13 | 14 | hadoop.security.authentication 15 | simple 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/hadoop/conf.temp/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.replication 5 | 1 6 | 7 | 8 | 9 | dfs.permissions.enabled 10 | true 11 | 12 | 13 | 14 | dfs.webhdfs.enabled 15 | true 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/jupyter/jupyter_config.py: -------------------------------------------------------------------------------- 1 | c.NotebookApp.contents_manager_class = "hdfscm.HDFSContentsManager" 2 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/jupyterhub/jupyterhub_config.py: 
-------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | # Basic setup 5 | c.JupyterHub.bind_url = 'http://:8888' 6 | c.JupyterHub.cookie_secret_file = '/etc/jupyterhub/jupyterhub_cookie_secret' 7 | c.JupyterHub.db_url = 'sqlite:////var/jupyterhub/jupyterhub.sqlite' 8 | 9 | # Make the JupyterHub internal communication accessible from other machines 10 | # in the cluster 11 | c.JupyterHub.hub_ip = '' 12 | 13 | # Enable yarnspawner 14 | c.JupyterHub.spawner_class = 'yarnspawner.YarnSpawner' 15 | 16 | # Configuration for kerberos security 17 | c.YarnSpawner.principal = 'jupyterhub' 18 | c.YarnSpawner.keytab = '/etc/jupyterhub/jupyterhub.keytab' 19 | 20 | # Resource limits per-user 21 | c.YarnSpawner.mem_limit = '2 G' 22 | c.YarnSpawner.cpu_limit = 1 23 | 24 | ## Configure environment variables in user notebook sessions 25 | # Find pyspark modules to add to python path, so they can be used as regular 26 | # libraries 27 | pyspark = '/usr/lib/spark/python/' 28 | py4j = glob.glob(os.path.join(pyspark, 'lib', 'py4j-*.zip'))[0] 29 | pythonpath = ':'.join([pyspark, py4j]) 30 | c.YarnSpawner.environment = { 31 | 'PYTHONPATH': pythonpath, 32 | 'PYSPARK_PYTHON': '/opt/jupyterhub/miniconda/bin/python', 33 | 'PYSPARK_DRIVER_PYTHON': '/opt/jupyterhub/miniconda/bin/python', 34 | } 35 | 36 | # The YARN queue to use 37 | c.YarnSpawner.queue = 'jupyterhub' 38 | 39 | # Activate the JupyterHub conda environment 40 | c.YarnSpawner.prologue = 'source /opt/jupyterhub/miniconda/bin/activate' 41 | 42 | authenticator = os.environ.get('JHUB_AUTHENTICATOR', 'dummy').lower() 43 | if authenticator == 'kerberos': 44 | c.JupyterHub.authenticator_class = 'kerberosauthenticator.KerberosAuthenticator' 45 | c.KerberosAuthenticator.keytab = '/etc/jupyterhub/jupyterhub.keytab' 46 | else: 47 | c.JupyterHub.authenticator_class = 'dummyauthenticator.DummyAuthenticator' 48 | c.DummyAuthenticator.password = "testpass" 49 | # A whitelist of valid 
usernames. The kerberosauthenticator will enforce 50 | # only valid users are logged in, but the dummyauthenticator has no such 51 | # mechanism so we need to rely on a whitelist 52 | c.DummyAuthenticator.whitelist = [ 53 | 'alice', 54 | 'bob', 55 | 'carl' 56 | ] 57 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/krb5.conf: -------------------------------------------------------------------------------- 1 | [logging] 2 | default = FILE:/var/log/krb5libs.log 3 | kdc = FILE:/var/log/krb5kdc.log 4 | admin_server = FILE:/var/log/kadmind.log 5 | 6 | [libdefaults] 7 | default_realm = EXAMPLE.COM 8 | dns_lookup_realm = false 9 | dns_lookup_kdc = false 10 | ticket_lifetime = 24h 11 | renew_lifetime = 7d 12 | forwardable = true 13 | 14 | [realms] 15 | EXAMPLE.COM = { 16 | kdc = master.example.com 17 | admin_server = master.example.com 18 | } 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | strip_ansi = true 3 | nodaemon = true 4 | logfile = /var/log/supervisord.log 5 | pidfile = /var/run/supervisord.pid 6 | 7 | [unix_http_server] 8 | file = /tmp/supervisor.sock 9 | 10 | [rpcinterface:supervisor] 11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 12 | 13 | [supervisorctl] 14 | serverurl = unix:///tmp/supervisor.sock 15 | prompt = master 16 | 17 | [include] 18 | files = /etc/master.supervisord.d/*.conf 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.d/hdfs-namenode.conf: -------------------------------------------------------------------------------- 1 | [program:hdfs-namenode] 2 | command=hdfs namenode 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=hdfs 6 | redirect_stderr=true 7 | 
stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-namenode.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.d/kerberos.conf: -------------------------------------------------------------------------------- 1 | [program:krb5kdc] 2 | command=/bin/bash -c "exec /usr/sbin/krb5kdc -r EXAMPLE.COM -P /var/run/krb5kdc.pid -n" 3 | redirect_stderr=true 4 | stdout_logfile=/dev/stdout 5 | stdout_logfile_maxbytes=0 6 | autostart=true 7 | autorestart=true 8 | 9 | [program:kadmind] 10 | command=/bin/bash -c "exec /usr/sbin/kadmind -r EXAMPLE.COM -P /var/run/kadmind.pid -nofork" 11 | redirect_stderr=true 12 | stdout_logfile=/dev/stdout 13 | stdout_logfile_maxbytes=0 14 | autostart=true 15 | autorestart=true 16 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/master.supervisord.d/yarn-resourcemanager.conf: -------------------------------------------------------------------------------- 1 | [program:yarn-resourcemanager] 2 | command=yarn resourcemanager 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=yarn 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-resourcemanager.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/spark/conf/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # Spark on yarn configuration 2 | spark.master yarn 3 | spark.submit.deployMode client 4 | spark.yarn.queue default 5 | spark.yarn.jars local:/usr/lib/spark/jars/* 6 | 7 | # Minimize memory requirements in demo cluster 8 | spark.driver.memory 512m 9 | spark.executor.memory 512m 10 | spark.executor.instances 1 11 | 12 | # Pyspark configuration 13 | spark.pyspark.python /opt/jupyterhub/miniconda/bin/python 14 | 
spark.pyspark.driver.python /opt/jupyterhub/miniconda/bin/python 15 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/worker.supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | strip_ansi = true 3 | nodaemon = true 4 | logfile = /var/log/supervisord.log 5 | pidfile = /var/run/supervisord.pid 6 | 7 | [unix_http_server] 8 | file = /tmp/supervisor.sock 9 | 10 | [rpcinterface:supervisor] 11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 12 | 13 | [supervisorctl] 14 | serverurl = unix:///tmp/supervisor.sock 15 | prompt = worker 16 | 17 | [include] 18 | files = /etc/worker.supervisord.d/*.conf 19 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/worker.supervisord.d/hdfs-datanode.conf: -------------------------------------------------------------------------------- 1 | [program:hdfs-datanode] 2 | command=hdfs datanode 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=hdfs 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-datanode.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/etc/worker.supervisord.d/yarn-nodemanager.conf: -------------------------------------------------------------------------------- 1 | [program:yarn-nodemanager] 2 | command=yarn nodemanager 3 | startsecs=2 4 | stopwaitsecs=10 5 | user=yarn 6 | redirect_stderr=true 7 | stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-nodemanager.log 8 | autostart=true 9 | autorestart=false 10 | -------------------------------------------------------------------------------- /docker-demo/image/files/opt/jupyterhub/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export 
PATH="/opt/jupyterhub/miniconda/bin:$PATH" 4 | cd /var/jupyterhub 5 | exec jupyterhub -f /etc/jupyterhub/jupyterhub_config.py 6 | -------------------------------------------------------------------------------- /docker-demo/image/files/root/init-hdfs.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Exponential backoff on testing hdfs status, then run init script 4 | echo "Waiting to connect to HDFS" 5 | timeout=2 6 | exit_code=0 7 | for attempt in {1..5}; do 8 | hdfs dfs -ls / 9 | exit_code=$? 10 | 11 | if [[ $exit_code == 0 ]]; then 12 | break 13 | fi 14 | 15 | echo "Retrying in $timeout.." 1>&2 16 | sleep $timeout 17 | timeout=$[$timeout * 2] 18 | done 19 | 20 | if [[ $exit_code != 0 ]]; then 21 | echo "Failed to connect to HDFS" 22 | exit $exit_code 23 | fi 24 | echo "HDFS connected, initializing directory structure" 25 | 26 | hdfs dfs -mkdir -p /tmp \ 27 | && hdfs dfs -chmod -R 1777 /tmp \ 28 | && hdfs dfs -mkdir -p /var/log \ 29 | && hdfs dfs -chmod -R 1775 /var/log \ 30 | && hdfs dfs -chown yarn:hadoop /var/log \ 31 | && hdfs dfs -mkdir -p /tmp/hadoop-yarn \ 32 | && hdfs dfs -chown -R mapred:hadoop /tmp/hadoop-yarn \ 33 | && hdfs dfs -mkdir -p /tmp/hadoop-yarn/staging/history/done_intermediate \ 34 | && hdfs dfs -chown -R mapred:hadoop /tmp/hadoop-yarn/staging \ 35 | && hdfs dfs -chmod -R 1777 /tmp \ 36 | && hdfs dfs -mkdir -p /var/log/hadoop-yarn/apps \ 37 | && hdfs dfs -chmod -R 1777 /var/log/hadoop-yarn/apps \ 38 | && hdfs dfs -chown yarn:hadoop /var/log/hadoop-yarn/apps \ 39 | && hdfs dfs -mkdir -p /user \ 40 | && hdfs dfs -mkdir -p /user/root \ 41 | && hdfs dfs -chmod -R 777 /user/root \ 42 | && hdfs dfs -chown root /user/root \ 43 | && hdfs dfs -mkdir -p /user/history \ 44 | && hdfs dfs -chmod -R 1777 /user/history \ 45 | && hdfs dfs -chown mapred:hadoop /user/history \ 46 | && hdfs dfs -mkdir -p /user/alice \ 47 | && hdfs dfs -chown alice /user/alice \ 48 | && hdfs dfs -mkdir -p /user/bob \ 
49 | && hdfs dfs -chown bob /user/bob \ 50 | && hdfs dfs -mkdir -p /user/carl \ 51 | && hdfs dfs -chown carl /user/carl 52 | 53 | exit_code=$? 54 | if [[ $exit_code != 0 ]]; then 55 | echo "Failed to initialize HDFS" 56 | exit $exit_code 57 | fi 58 | echo "Initialized HDFS" 59 | -------------------------------------------------------------------------------- /docker-demo/image/files/root/setup-jupyterhub.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # Make jupyterhub directories 6 | mkdir -p /etc/jupyterhub 7 | mkdir -p /opt/jupyterhub 8 | mkdir -p /var/jupyterhub 9 | mkdir -p /var/log/jupyterhub 10 | chown jupyterhub /var/jupyterhub 11 | chown jupyterhub /var/log/jupyterhub 12 | 13 | # Create jupyterhub cookie secret 14 | openssl rand -hex 32 > /etc/jupyterhub/jupyterhub_cookie_secret 15 | chmod 400 /etc/jupyterhub/jupyterhub_cookie_secret 16 | chown jupyterhub /etc/jupyterhub/jupyterhub_cookie_secret 17 | 18 | # Install miniconda 19 | curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh \ 20 | && /bin/bash /tmp/miniconda.sh -b -p /opt/jupyterhub/miniconda \ 21 | && rm /tmp/miniconda.sh \ 22 | && echo 'export PATH="/opt/jupyterhub/miniconda/bin:$PATH"' >> /root/.bashrc \ 23 | && source /root/.bashrc \ 24 | && conda config --set always_yes yes --set changeps1 no 25 | 26 | # Install JupyterHub, dependencies, and user packages. Normally you'd create a 27 | # separate Python environment here (and optionally package it and put it on 28 | # HDFS). However, to save memory usage in the docker images we'll use the same 29 | # environment for everything. 
Adding dependencies for use by users: 30 | conda install -c conda-forge \ 31 | jupyterhub \ 32 | jupyterhub-yarnspawner \ 33 | jupyter-hdfscm \ 34 | jupyter-server-proxy \ 35 | tornado==5.1.1 \ 36 | notebook \ 37 | jupyterlab \ 38 | ipywidgets \ 39 | pykerberos \ 40 | dask-yarn \ 41 | dask==1.2.0 \ 42 | pyarrow \ 43 | pandas==0.24.2 \ 44 | numpy==1.16.2 \ 45 | nomkl 46 | 47 | # Patch out no HTTPS warning in login script to give prettier demos. 48 | sed -i '/^