├── .gitignore
├── .readthedocs.yml
├── LICENSE
├── README.rst
├── docker-demo
├── README.rst
├── docker-compose.yaml
└── image
│ ├── Dockerfile
│ ├── cloudera-cdh6.repo
│ └── files
│ ├── etc
│ ├── dask
│ │ └── config.yaml
│ ├── edge.supervisord.conf
│ ├── edge.supervisord.d
│ │ └── jupyterhub.conf
│ ├── hadoop
│ │ ├── conf.kerberos
│ │ │ ├── capacity-scheduler.xml
│ │ │ ├── container-executor.cfg
│ │ │ ├── core-site.xml
│ │ │ ├── hdfs-site.xml
│ │ │ ├── mapred-site.xml
│ │ │ ├── resource-types.xml
│ │ │ └── yarn-site.xml
│ │ └── conf.temp
│ │ │ ├── core-site.xml
│ │ │ └── hdfs-site.xml
│ ├── jupyter
│ │ └── jupyter_config.py
│ ├── jupyterhub
│ │ └── jupyterhub_config.py
│ ├── krb5.conf
│ ├── master.supervisord.conf
│ ├── master.supervisord.d
│ │ ├── hdfs-namenode.conf
│ │ ├── kerberos.conf
│ │ └── yarn-resourcemanager.conf
│ ├── spark
│ │ └── conf
│ │ │ └── spark-defaults.conf
│ ├── worker.supervisord.conf
│ └── worker.supervisord.d
│ │ ├── hdfs-datanode.conf
│ │ └── yarn-nodemanager.conf
│ ├── opt
│ └── jupyterhub
│ │ └── start.sh
│ ├── root
│ ├── init-hdfs.sh
│ ├── setup-jupyterhub.sh
│ └── setup.sh
│ └── var
│ └── kerberos
│ └── krb5kdc
│ ├── kadm5.acl
│ └── kdc.conf
└── docs
├── Makefile
└── source
├── _images
├── architecture.svg
├── dask-usage.gif
├── jupyterlab_interface.png
└── login-page-hello-world.png
├── _static
└── custom.css
├── _templates
└── help.html
├── conf.py
├── contents-managers.rst
├── customization.rst
├── dask.rst
├── demo.rst
├── enable-https.rst
├── index.rst
├── installation.rst
├── jupyterlab.rst
├── manual-authentication.rst
├── manual-installation.rst
├── manual-setup.rst
├── manual-spawner.rst
└── spark.rst
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | docs/build/
3 | .DS_Store
4 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sphinx:
4 | configuration: docs/source/conf.py
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2019, Jim Crist
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of the copyright holder nor the names of its contributors
15 | may be used to endorse or promote products derived from this software
16 | without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | jupyterhub-on-hadoop
2 | ====================
3 |
4 | |Doc Status|
5 |
6 | Documentation and resources for deploying and managing JupyterHub_ on a
7 | `Hadoop Cluster`_.
8 |
9 | See `the documentation`_.
10 |
11 | LICENSE
12 | -------
13 |
14 | New BSD. See the `License File`_.
15 |
16 | .. _JupyterHub: https://jupyterhub.readthedocs.io/
17 | .. _Hadoop Cluster: https://hadoop.apache.org/
18 | .. _the documentation: https://jupyterhub-on-hadoop.readthedocs.io
19 | .. _License File: https://github.com/jupyterhub/jupyterhub-on-hadoop/blob/master/LICENSE
20 |
21 | .. |Doc Status| image:: https://readthedocs.org/projects/jupyterhub-on-hadoop/badge/?version=latest
22 | :target: https://jupyterhub-on-hadoop.readthedocs.io
23 | :alt: Documentation Status
24 |
--------------------------------------------------------------------------------
/docker-demo/README.rst:
--------------------------------------------------------------------------------
1 | JupyterHub on Hadoop Docker Demo
2 | ================================
3 |
4 | This is a demo setup of JupyterHub on Hadoop deployed via Docker Compose.
5 |
6 | Startup
7 | -------
8 |
9 | From this directory:
10 |
11 | .. code-block:: shell
12 |
13 | $ docker-compose up -d
14 |
15 | Usage
16 | -----
17 |
18 | Three user accounts have been created:
19 |
20 | - ``alice``
21 | - ``bob``
22 | - ``carl``
23 |
24 | All have the same password ``testpass``.
25 |
26 | After logging in you should be dropped into a Jupyter Notebook with common
27 | Python libraries like Pandas, NumPy, and Dask installed.
28 |
29 | Shutdown
30 | --------
31 |
32 | From this directory:
33 |
34 | .. code-block:: shell
35 |
36 | $ docker-compose down
37 |
--------------------------------------------------------------------------------
/docker-demo/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | networks:
4 | default:
5 | name: example.com
6 |
7 | services:
8 | master:
9 | image: jcrist/jupyterhub-hadoop-demo
10 | user: root
11 | command: supervisord -c /etc/master.supervisord.conf
12 | container_name: master
13 | hostname: master
14 | domainname: example.com
15 | ports:
16 | - 8020:8020 # NN
17 | - 9000:9000 # NN
18 | - 50070:50070 # NN webui
19 | - 8088:8088 # RM webui
20 | - 88:88/udp # Kerberos
21 | - 749:749 # Kerberos
22 | tmpfs:
23 | - /tmp:noexec
24 |
25 | worker:
26 | image: jcrist/jupyterhub-hadoop-demo
27 | user: root
28 | command: supervisord -c /etc/worker.supervisord.conf
29 | container_name: worker
30 | hostname: worker
31 | domainname: example.com
32 | ports:
33 | - 50075:50075 # DN webui
34 | - 8042:8042 # NM webui
35 | tmpfs:
36 | - /tmp:noexec
37 |
38 | edge:
39 | image: jcrist/jupyterhub-hadoop-demo
40 | user: root
41 | environment:
42 | - JHUB_AUTHENTICATOR=${JHUB_AUTHENTICATOR:-dummy}
43 | command: supervisord -c /etc/edge.supervisord.conf
44 | container_name: edge
45 | hostname: edge
46 | domainname: example.com
47 | ports:
48 | - 8888:8888 # jupyterhub
49 |
--------------------------------------------------------------------------------
/docker-demo/image/Dockerfile:
--------------------------------------------------------------------------------
FROM centos:centos7

# MAINTAINER is deprecated; LABEL is the supported replacement.
LABEL maintainer="jcrist"

# Install common utilities and kerberos
RUN yum install -y \
        sudo \
        bzip2 \
        java-1.8.0-openjdk \
        krb5-libs \
        krb5-server \
        krb5-workstation \
    && yum clean all \
    && rm -rf /var/cache/yum

# Install supervisord.
# The base image's `python` is 2.7, so use the version-pinned get-pip.py
# bootstrap — the default https://bootstrap.pypa.io/get-pip.py dropped
# Python 2 support and fails on this image.
RUN curl https://bootstrap.pypa.io/pip/2.7/get-pip.py -o get-pip.py \
    && python get-pip.py \
    && pip install supervisor \
    && rm get-pip.py

# Install CDH
# Note: We force remove a couple unnecessary packages to shrink the docker
# image slightly
ADD cloudera-cdh6.repo /etc/yum.repos.d/
RUN rpm --import https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/RPM-GPG-KEY-cloudera \
    && yum install -y \
        hadoop-yarn-resourcemanager \
        hadoop-hdfs-namenode \
        hadoop-yarn-nodemanager \
        hadoop-hdfs-datanode \
        hadoop-client \
        hadoop-libhdfs \
        spark-core \
        spark-python \
    && yum clean all \
    && rm -rf /var/cache/yum \
    && rm -rf /usr/lib/kite \
    && rm -rf /usr/lib/hive \
    && rm -rf /usr/lib/solr \
    && rm -rf /usr/lib/sentry \
    && rm -rf /usr/lib/flume-ng

# Copy over files
COPY ./files /

# Setup hadoop and kerberos
RUN /root/setup.sh

# Setup jupyterhub
RUN /root/setup-jupyterhub.sh

# Declare the build arg so the ENV line below can resolve it; previously
# $HADOOP_TESTING_VERSION was never declared and always expanded empty.
ARG HADOOP_TESTING_VERSION
ENV HADOOP_TESTING_VERSION=$HADOOP_TESTING_VERSION
ENV LIBHDFS3_CONF /etc/hadoop/conf/hdfs-site.xml
ENV HADOOP_CONF_DIR /etc/hadoop/conf
ENV HADOOP_HOME /usr/lib/hadoop
ENV HADOOP_COMMON_HOME /usr/lib/hadoop
ENV HADOOP_YARN_HOME /usr/lib/hadoop-yarn
ENV HADOOP_HDFS_HOME /usr/lib/hadoop-hdfs
ENV SPARK_HOME /usr/lib/spark
ENV JAVA_HOME /usr/lib/jvm/jre
--------------------------------------------------------------------------------
/docker-demo/image/cloudera-cdh6.repo:
--------------------------------------------------------------------------------
1 | [cloudera-cdh6]
2 | # Packages for Cloudera CDH, Version 6, on RedHat or CentOS 7 x86_64
3 | name=Cloudera CDH
4 | baseurl=https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/
5 | gpgkey=https://archive.cloudera.com/cdh6/6.2.0/redhat7/yum/RPM-GPG-KEY-cloudera
6 | gpgcheck = 1
7 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/dask/config.yaml:
--------------------------------------------------------------------------------
1 | yarn:
2 | environment: venv:///opt/jupyterhub/miniconda
3 |
4 | deploy-mode: local
5 |
6 | scheduler:
7 | vcores: 1
8 | memory: 512 MiB
9 |
10 | worker:
11 | vcores: 1
12 | memory: 512 MiB
13 |
14 | distributed:
15 | dashboard:
16 | link: /user/{JUPYTERHUB_USER}/proxy/{port}/status
17 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/edge.supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | strip_ansi = true
3 | nodaemon = true
4 | logfile = /var/log/supervisord.log
5 | pidfile = /var/run/supervisord.pid
6 |
7 | [unix_http_server]
8 | file = /tmp/supervisor.sock
9 |
10 | [rpcinterface:supervisor]
11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
12 |
13 | [supervisorctl]
14 | serverurl = unix:///tmp/supervisor.sock
15 | prompt = edge
16 |
17 | [include]
18 | files = /etc/edge.supervisord.d/*.conf
19 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/edge.supervisord.d/jupyterhub.conf:
--------------------------------------------------------------------------------
1 | [program:jupyterhub]
2 | command=/opt/jupyterhub/start.sh
3 | startsecs=2
4 | stopwaitsecs=10
5 | user=jupyterhub
6 | redirect_stderr=true
7 | stdout_logfile=/var/log/jupyterhub/jupyterhub.log
8 | autostart=true
9 | autorestart=false
10 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.kerberos/capacity-scheduler.xml:
--------------------------------------------------------------------------------
1 |
2 |
7 |
8 |
9 | yarn.scheduler.capacity.root.queues
10 | jupyterhub,default
11 |
12 |
13 |
14 | yarn.scheduler.capacity.maximum-am-resource-percent
15 | 0.75
16 |
17 |
18 |
19 |
20 | yarn.scheduler.capacity.root.jupyterhub.capacity
21 | 50.0
22 |
23 |
24 |
25 | yarn.scheduler.capacity.root.jupyterhub.maximum-capacity
26 | 100.0
27 |
28 |
29 |
30 |
31 | yarn.scheduler.capacity.root.default.capacity
32 | 50.0
33 |
34 |
35 |
36 | yarn.scheduler.capacity.root.default.maximum-capacity
37 | 100.0
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.kerberos/container-executor.cfg:
--------------------------------------------------------------------------------
1 | yarn.nodemanager.local-dirs=/var/lib/hadoop-yarn/cache/yarn/nm-local-dir
2 | yarn.nodemanager.linux-container-executor.group=yarn
3 | yarn.nodemanager.log-dirs=/var/log/hadoop-yarn/containers
4 | banned.users=hdfs,yarn,mapred,bin
5 |
6 | min.user.id=500
7 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.kerberos/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | hadoop.tmp.dir
5 | /var/tmp/
6 |
7 |
8 |
9 | fs.defaultFS
10 | hdfs://master.example.com:9000
11 |
12 |
13 |
14 | hadoop.proxyuser.mapred.hosts
15 | *
16 |
17 |
18 |
19 | hadoop.proxyuser.mapred.groups
20 | *
21 |
22 |
23 |
24 | hadoop.proxyuser.jupyterhub.hosts
25 | *
26 |
27 |
28 |
29 | hadoop.proxyuser.jupyterhub.users
30 | *
31 |
32 |
33 |
34 | hadoop.security.authentication
35 | kerberos
36 |
37 |
38 |
39 | hadoop.security.authorization
40 | true
41 |
42 |
43 |
44 | hadoop.http.filter.initializers
45 | org.apache.hadoop.security.AuthenticationFilterInitializer
46 |
47 |
48 |
49 | hadoop.http.authentication.type
50 | kerberos
51 |
52 |
53 |
54 | hadoop.http.authentication.signature.secret.file
55 | /etc/hadoop/conf/http-secret-file
56 |
57 |
58 |
59 | hadoop.http.authentication.cookie.domain
60 | .example.com
61 |
62 |
63 |
64 | hadoop.http.authentication.simple.anonymous.allowed
65 | false
66 |
67 |
68 |
69 | hadoop.http.authentication.kerberos.principal
70 | HTTP/master.example.com@EXAMPLE.COM
71 |
72 |
73 |
74 | hadoop.http.authentication.kerberos.keytab
75 | /etc/hadoop/conf/master-keytabs/HTTP.keytab
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.kerberos/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | dfs.replication
5 | 1
6 |
7 |
8 |
9 | dfs.permissions.enabled
10 | true
11 |
12 |
13 |
14 | dfs.webhdfs.enabled
15 | true
16 |
17 |
18 |
19 | dfs.block.access.token.enable
20 | true
21 |
22 |
23 |
24 | dfs.namenode.keytab.file
25 | /etc/hadoop/conf/master-keytabs/hdfs.keytab
26 |
27 |
28 |
29 | dfs.namenode.kerberos.principal
30 | hdfs/master.example.com@EXAMPLE.COM
31 |
32 |
33 |
34 | dfs.namenode.kerberos.internal.spnego.principal
35 | HTTP/master.example.com@EXAMPLE.COM
36 |
37 |
38 |
39 | dfs.datanode.keytab.file
40 | /etc/hadoop/conf/worker-keytabs/hdfs.keytab
41 |
42 |
43 |
44 | dfs.datanode.kerberos.principal
45 | hdfs/worker.example.com@EXAMPLE.COM
46 |
47 |
48 |
49 | dfs.web.authentication.kerberos.principal
50 | HTTP/master.example.com@EXAMPLE.COM
51 |
52 |
53 |
54 | dfs.web.authentication.kerberos.keytab
55 | /etc/hadoop/conf/master-keytabs/HTTP.keytab
56 |
57 |
58 |
59 | ignore.secure.ports.for.testing
60 | true
61 |
62 |
63 |
64 | dfs.http.policy
65 | HTTP_ONLY
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.kerberos/mapred-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | mapreduce.framework.name
5 | yarn
6 |
7 |
8 |
9 | mapreduce.jobhistory.address
10 | master.example.com:10020
11 |
12 |
13 |
14 | mapreduce.jobhistory.webapp.address
15 | master.example.com:19888
16 |
17 |
18 |
19 | mapreduce.jobhistory.keytab
20 | /etc/hadoop/conf/master-keytabs/mapred.keytab
21 |
22 |
23 |
24 | mapreduce.jobhistory.principal
25 | mapred/master.example.com@EXAMPLE.COM
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.kerberos/resource-types.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | yarn.resource-types
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.kerberos/yarn-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | yarn.resourcemanager.hostname
5 | master.example.com
6 |
7 |
8 |
9 | yarn.nodemanager.aux-services
10 | mapreduce_shuffle
11 |
12 |
13 |
14 | yarn.application.classpath
15 |
16 | $HADOOP_CONF_DIR,
17 | $HADOOP_COMMON_HOME/*,
18 | $HADOOP_COMMON_HOME/lib/*,
19 | $HADOOP_HDFS_HOME/*,
20 | $HADOOP_HDFS_HOME/lib/*,
21 | $HADOOP_MAPRED_HOME/*,
22 | $HADOOP_MAPRED_HOME/lib/*,
23 | $HADOOP_YARN_HOME/*,
24 | $HADOOP_YARN_HOME/lib/*
25 |
26 |
27 |
28 |
29 | yarn.nodemanager.local-dirs
30 | file:///var/tmp/hadoop-yarn/local
31 |
32 |
33 |
34 | yarn.nodemanager.log-dirs
35 | file:///var/tmp/hadoop-yarn/logs
36 |
37 |
38 |
39 | yarn.log-aggregation-enable
40 | true
41 |
42 |
43 |
44 | yarn.nodemanager.remote-app-log-dir
45 | hdfs://master.example.com:9000/var/log/hadoop-yarn/apps
46 |
47 |
48 |
49 | yarn.scheduler.minimum-allocation-mb
50 | 32
51 |
52 |
53 |
54 | yarn.resource-types.memory-mb.increment-allocation
55 | ${yarn.scheduler.minimum-allocation-mb}
56 |
57 |
58 |
59 | yarn.scheduler.increment-allocation-mb
60 | ${yarn.scheduler.minimum-allocation-mb}
61 |
62 |
63 |
64 | yarn.nodemanager.resource.cpu-vcores
65 | 16
66 |
67 |
68 |
69 | yarn.resourcemanager.scheduler.class
70 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
71 |
72 |
73 |
74 | yarn.resourcemanager.keytab
75 | /etc/hadoop/conf/master-keytabs/yarn.keytab
76 |
77 |
78 |
79 | yarn.resourcemanager.principal
80 | yarn/master.example.com@EXAMPLE.COM
81 |
82 |
83 |
84 | yarn.nodemanager.keytab
85 | /etc/hadoop/conf/worker-keytabs/yarn.keytab
86 |
87 |
88 |
89 | yarn.nodemanager.principal
90 | yarn/worker.example.com@EXAMPLE.COM
91 |
92 |
93 |
94 | yarn.nodemanager.container-executor.class
95 | org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor
96 |
97 |
98 |
99 | yarn.nodemanager.linux-container-executor.path
100 | /usr/lib/hadoop-yarn/bin/container-executor
101 |
102 |
103 |
104 | yarn.nodemanager.linux-container-executor.group
105 | yarn
106 |
107 |
108 |
109 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.temp/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | hadoop.tmp.dir
5 | /var/tmp/
6 |
7 |
8 |
9 | fs.defaultFS
10 | hdfs://master.example.com:9000
11 |
12 |
13 |
14 | hadoop.security.authentication
15 | simple
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/hadoop/conf.temp/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | dfs.replication
5 | 1
6 |
7 |
8 |
9 | dfs.permissions.enabled
10 | true
11 |
12 |
13 |
14 | dfs.webhdfs.enabled
15 | true
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/jupyter/jupyter_config.py:
--------------------------------------------------------------------------------
# Store each user's notebooks in HDFS (via jupyter-hdfscm) rather than on the
# local filesystem of the transient YARN container running the notebook server,
# so files survive container restarts.  `c` is injected by Jupyter at
# config-load time.
c.NotebookApp.contents_manager_class = "hdfscm.HDFSContentsManager"
2 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/jupyterhub/jupyterhub_config.py:
--------------------------------------------------------------------------------
# JupyterHub configuration for the docker demo cluster.
# `c` is the traitlets config object injected by JupyterHub when it loads
# this file; it is not defined here.
import glob
import os

# Basic setup
c.JupyterHub.bind_url = 'http://:8888'
c.JupyterHub.cookie_secret_file = '/etc/jupyterhub/jupyterhub_cookie_secret'
c.JupyterHub.db_url = 'sqlite:////var/jupyterhub/jupyterhub.sqlite'

# Make the JupyterHub internal communication accessible from other machines
# in the cluster
c.JupyterHub.hub_ip = ''

# Enable yarnspawner: single-user servers are launched as YARN containers
c.JupyterHub.spawner_class = 'yarnspawner.YarnSpawner'

# Configuration for kerberos security
c.YarnSpawner.principal = 'jupyterhub'
c.YarnSpawner.keytab = '/etc/jupyterhub/jupyterhub.keytab'

# Resource limits per-user
c.YarnSpawner.mem_limit = '2 G'
c.YarnSpawner.cpu_limit = 1

## Configure environment variables in user notebook sessions
# Find pyspark modules to add to python path, so they can be used as regular
# libraries
pyspark = '/usr/lib/spark/python/'
# NOTE(review): assumes exactly one py4j-*.zip ships with the Spark install;
# glob(...)[0] raises IndexError if none is found — confirm against the image.
py4j = glob.glob(os.path.join(pyspark, 'lib', 'py4j-*.zip'))[0]
pythonpath = ':'.join([pyspark, py4j])
c.YarnSpawner.environment = {
    'PYTHONPATH': pythonpath,
    'PYSPARK_PYTHON': '/opt/jupyterhub/miniconda/bin/python',
    'PYSPARK_DRIVER_PYTHON': '/opt/jupyterhub/miniconda/bin/python',
}

# The YARN queue to use
c.YarnSpawner.queue = 'jupyterhub'

# Activate the JupyterHub conda environment before starting the notebook
c.YarnSpawner.prologue = 'source /opt/jupyterhub/miniconda/bin/activate'

# Pick the authenticator from the JHUB_AUTHENTICATOR environment variable
# (set by docker-compose.yaml; defaults to the dummy authenticator).
authenticator = os.environ.get('JHUB_AUTHENTICATOR', 'dummy').lower()
if authenticator == 'kerberos':
    c.JupyterHub.authenticator_class = 'kerberosauthenticator.KerberosAuthenticator'
    c.KerberosAuthenticator.keytab = '/etc/jupyterhub/jupyterhub.keytab'
else:
    c.JupyterHub.authenticator_class = 'dummyauthenticator.DummyAuthenticator'
    c.DummyAuthenticator.password = "testpass"
    # A whitelist of valid usernames. The kerberosauthenticator will enforce
    # only valid users are logged in, but the dummyauthenticator has no such
    # mechanism so we need to rely on a whitelist
    c.DummyAuthenticator.whitelist = [
        'alice',
        'bob',
        'carl'
    ]
57 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/krb5.conf:
--------------------------------------------------------------------------------
1 | [logging]
2 | default = FILE:/var/log/krb5libs.log
3 | kdc = FILE:/var/log/krb5kdc.log
4 | admin_server = FILE:/var/log/kadmind.log
5 |
6 | [libdefaults]
7 | default_realm = EXAMPLE.COM
8 | dns_lookup_realm = false
9 | dns_lookup_kdc = false
10 | ticket_lifetime = 24h
11 | renew_lifetime = 7d
12 | forwardable = true
13 |
14 | [realms]
15 | EXAMPLE.COM = {
16 | kdc = master.example.com
17 | admin_server = master.example.com
18 | }
19 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/master.supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | strip_ansi = true
3 | nodaemon = true
4 | logfile = /var/log/supervisord.log
5 | pidfile = /var/run/supervisord.pid
6 |
7 | [unix_http_server]
8 | file = /tmp/supervisor.sock
9 |
10 | [rpcinterface:supervisor]
11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
12 |
13 | [supervisorctl]
14 | serverurl = unix:///tmp/supervisor.sock
15 | prompt = master
16 |
17 | [include]
18 | files = /etc/master.supervisord.d/*.conf
19 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/master.supervisord.d/hdfs-namenode.conf:
--------------------------------------------------------------------------------
1 | [program:hdfs-namenode]
2 | command=hdfs namenode
3 | startsecs=2
4 | stopwaitsecs=10
5 | user=hdfs
6 | redirect_stderr=true
7 | stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-namenode.log
8 | autostart=true
9 | autorestart=false
10 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/master.supervisord.d/kerberos.conf:
--------------------------------------------------------------------------------
1 | [program:krb5kdc]
2 | command=/bin/bash -c "exec /usr/sbin/krb5kdc -r EXAMPLE.COM -P /var/run/krb5kdc.pid -n"
3 | redirect_stderr=true
4 | stdout_logfile=/dev/stdout
5 | stdout_logfile_maxbytes=0
6 | autostart=true
7 | autorestart=true
8 |
9 | [program:kadmind]
10 | command=/bin/bash -c "exec /usr/sbin/kadmind -r EXAMPLE.COM -P /var/run/kadmind.pid -nofork"
11 | redirect_stderr=true
12 | stdout_logfile=/dev/stdout
13 | stdout_logfile_maxbytes=0
14 | autostart=true
15 | autorestart=true
16 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/master.supervisord.d/yarn-resourcemanager.conf:
--------------------------------------------------------------------------------
1 | [program:yarn-resourcemanager]
2 | command=yarn resourcemanager
3 | startsecs=2
4 | stopwaitsecs=10
5 | user=yarn
6 | redirect_stderr=true
7 | stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-resourcemanager.log
8 | autostart=true
9 | autorestart=false
10 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/spark/conf/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | # Spark on yarn configuration
2 | spark.master yarn
3 | spark.submit.deployMode client
4 | spark.yarn.queue default
5 | spark.yarn.jars local:/usr/lib/spark/jars/*
6 |
7 | # Minimize memory requirements in demo cluster
8 | spark.driver.memory 512m
9 | spark.executor.memory 512m
10 | spark.executor.instances 1
11 |
12 | # Pyspark configuration
13 | spark.pyspark.python /opt/jupyterhub/miniconda/bin/python
14 | spark.pyspark.driver.python /opt/jupyterhub/miniconda/bin/python
15 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/worker.supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | strip_ansi = true
3 | nodaemon = true
4 | logfile = /var/log/supervisord.log
5 | pidfile = /var/run/supervisord.pid
6 |
7 | [unix_http_server]
8 | file = /tmp/supervisor.sock
9 |
10 | [rpcinterface:supervisor]
11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
12 |
13 | [supervisorctl]
14 | serverurl = unix:///tmp/supervisor.sock
15 | prompt = worker
16 |
17 | [include]
18 | files = /etc/worker.supervisord.d/*.conf
19 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/worker.supervisord.d/hdfs-datanode.conf:
--------------------------------------------------------------------------------
1 | [program:hdfs-datanode]
2 | command=hdfs datanode
3 | startsecs=2
4 | stopwaitsecs=10
5 | user=hdfs
6 | redirect_stderr=true
7 | stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-datanode.log
8 | autostart=true
9 | autorestart=false
10 |
--------------------------------------------------------------------------------
/docker-demo/image/files/etc/worker.supervisord.d/yarn-nodemanager.conf:
--------------------------------------------------------------------------------
1 | [program:yarn-nodemanager]
2 | command=yarn nodemanager
3 | startsecs=2
4 | stopwaitsecs=10
5 | user=yarn
6 | redirect_stderr=true
7 | stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-nodemanager.log
8 | autostart=true
9 | autorestart=false
10 |
--------------------------------------------------------------------------------
/docker-demo/image/files/opt/jupyterhub/start.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Entry point for JupyterHub (run by supervisord, see
# edge.supervisord.d/jupyterhub.conf).

# Put the JupyterHub conda environment first on PATH
export PATH="/opt/jupyterhub/miniconda/bin:$PATH"
# Run from JupyterHub's writable state directory (sqlite db lives here)
cd /var/jupyterhub
# exec so jupyterhub replaces this shell and receives signals directly
exec jupyterhub -f /etc/jupyterhub/jupyterhub_config.py
--------------------------------------------------------------------------------
/docker-demo/image/files/root/init-hdfs.sh:
--------------------------------------------------------------------------------
#! /bin/bash
# Wait for HDFS to come up (exponential backoff), then create the directory
# structure the demo cluster needs. Exits non-zero if HDFS never responds or
# any initialization step fails.

echo "Waiting to connect to HDFS"
timeout=2
exit_code=0
for attempt in {1..5}; do
  hdfs dfs -ls /
  exit_code=$?

  if [[ $exit_code == 0 ]]; then
    break
  fi

  echo "Retrying in $timeout.." 1>&2
  sleep $timeout
  # POSIX arithmetic expansion; the old $[...] form is deprecated
  timeout=$((timeout * 2))
done

if [[ $exit_code != 0 ]]; then
  echo "Failed to connect to HDFS"
  exit $exit_code
fi
echo "HDFS connected, initializing directory structure"

# Shared scratch/log dirs, then per-user home directories
hdfs dfs -mkdir -p /tmp \
  && hdfs dfs -chmod -R 1777 /tmp \
  && hdfs dfs -mkdir -p /var/log \
  && hdfs dfs -chmod -R 1775 /var/log \
  && hdfs dfs -chown yarn:hadoop /var/log \
  && hdfs dfs -mkdir -p /tmp/hadoop-yarn \
  && hdfs dfs -chown -R mapred:hadoop /tmp/hadoop-yarn \
  && hdfs dfs -mkdir -p /tmp/hadoop-yarn/staging/history/done_intermediate \
  && hdfs dfs -chown -R mapred:hadoop /tmp/hadoop-yarn/staging \
  && hdfs dfs -chmod -R 1777 /tmp \
  && hdfs dfs -mkdir -p /var/log/hadoop-yarn/apps \
  && hdfs dfs -chmod -R 1777 /var/log/hadoop-yarn/apps \
  && hdfs dfs -chown yarn:hadoop /var/log/hadoop-yarn/apps \
  && hdfs dfs -mkdir -p /user \
  && hdfs dfs -mkdir -p /user/root \
  && hdfs dfs -chmod -R 777 /user/root \
  && hdfs dfs -chown root /user/root \
  && hdfs dfs -mkdir -p /user/history \
  && hdfs dfs -chmod -R 1777 /user/history \
  && hdfs dfs -chown mapred:hadoop /user/history \
  && hdfs dfs -mkdir -p /user/alice \
  && hdfs dfs -chown alice /user/alice \
  && hdfs dfs -mkdir -p /user/bob \
  && hdfs dfs -chown bob /user/bob \
  && hdfs dfs -mkdir -p /user/carl \
  && hdfs dfs -chown carl /user/carl  # was "chown bob": copy-paste bug gave bob ownership of carl's home

exit_code=$?
if [[ $exit_code != 0 ]]; then
  echo "Failed to initialize HDFS"
  exit $exit_code
fi
echo "Initialized HDFS"
--------------------------------------------------------------------------------
/docker-demo/image/files/root/setup-jupyterhub.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | # Make jupyterhub directories
6 | mkdir -p /etc/jupyterhub
7 | mkdir -p /opt/jupyterhub
8 | mkdir -p /var/jupyterhub
9 | mkdir -p /var/log/jupyterhub
10 | chown jupyterhub /var/jupyterhub
11 | chown jupyterhub /var/log/jupyterhub
12 |
13 | # Create jupyterhub cookie secret
14 | openssl rand -hex 32 > /etc/jupyterhub/jupyterhub_cookie_secret
15 | chmod 400 /etc/jupyterhub/jupyterhub_cookie_secret
16 | chown jupyterhub /etc/jupyterhub/jupyterhub_cookie_secret
17 |
18 | # Install miniconda
19 | curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh \
20 | && /bin/bash /tmp/miniconda.sh -b -p /opt/jupyterhub/miniconda \
21 | && rm /tmp/miniconda.sh \
22 | && echo 'export PATH="/opt/jupyterhub/miniconda/bin:$PATH"' >> /root/.bashrc \
23 | && source /root/.bashrc \
24 | && conda config --set always_yes yes --set changeps1 no
25 |
26 | # Install JupyterHub, dependencies, and user packages. Normally you'd create a
27 | # separate Python environment here (and optionally package it and put it on
28 | # HDFS). However, to save memory usage in the docker images we'll use the same
29 | # environment for everything. Adding dependencies for use by users:
30 | conda install -c conda-forge \
31 | jupyterhub \
32 | jupyterhub-yarnspawner \
33 | jupyter-hdfscm \
34 | jupyter-server-proxy \
35 | tornado==5.1.1 \
36 | notebook \
37 | jupyterlab \
38 | ipywidgets \
39 | pykerberos \
40 | dask-yarn \
41 | dask==1.2.0 \
42 | pyarrow \
43 | pandas==0.24.2 \
44 | numpy==1.16.2 \
45 | nomkl
46 |
47 | # Patch out no HTTPS warning in login script to give prettier demos.
48 | sed -i '/^