├── batchspawner
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   └── test_spawners.py
│   ├── __init__.py
│   ├── singleuser.py
│   ├── api.py
│   └── batchspawner.py
├── requirements.txt
├── MANIFEST.in
├── .gitignore
├── CONTRIBUTING.md
├── version.py
├── .flake8
├── .github
│   └── workflows
│       ├── python-publish.yml
│       └── test.yml
├── LICENSE
├── .pre-commit-config.yaml
├── SPAWNERS.md
├── setup.py
├── CHANGELOG.md
└── README.md

/batchspawner/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | async_generator>=1.8
2 | jinja2
3 | jupyterhub>=0.9
--------------------------------------------------------------------------------
/batchspawner/__init__.py:
--------------------------------------------------------------------------------
1 | from .batchspawner import *
2 | from . import api
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md
2 | include LICENSE
3 | include version.py
4 | include requirements.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | *.log
3 | *.pyc
4 | __pycache__/
5 | .cache/
6 | .coverage
7 | .pytest_cache
8 | *~
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Welcome! As a [Jupyter](https://jupyter.org) project, we follow the [Jupyter contributor guide](https://jupyter.readthedocs.io/en/latest/contributing/content-contributor.html).
--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Jupyter Development Team.
2 | # Distributed under the terms of the Modified BSD License.
3 |
4 | version_info = (
5 |     1,
6 |     2,
7 |     0,
8 |     # "dev",  # comment-out this line for a release
9 | )
10 | __version__ = ".".join(map(str, version_info))
--------------------------------------------------------------------------------
/batchspawner/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Relevant pytest fixtures are re-used from JupyterHub's test suite"""
2 |
3 | # We only use "db" and "io_loop", but we also need event_loop which is used by
4 | # io_loop to be available with jupyterhub 1+.
5 | from jupyterhub.tests.conftest import db, io_loop
6 |
7 | try:
8 |     from jupyterhub.tests.conftest import event_loop
9 | except ImportError:
10 |     pass
11 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # Ignore style and complexity
3 | # E: style errors
4 | # W: style warnings
5 | # C: complexity
6 | # F401: module imported but unused
7 | # F403: import *
8 | # F811: redefinition of unused `name` from line `N`
9 | # F841: local variable assigned but never used
10 | # E402: module level import not at top of file
11 | # I100: Import statements are in the wrong order
12 | # I101: Imported names are in the wrong order. Should be
13 | ignore = E, W, C, F401, F403, F811, F841, E402, I100, I101, D400
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | #
4 | name: Upload Python Package
5 |
6 | on:
7 |   release:
8 |     types: [released]
9 |
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |
14 |     steps:
15 |       - uses: actions/checkout@v3
16 |       - uses: actions/setup-python@v3
17 |         with:
18 |           python-version: "3.x"
19 |
20 |       - name: install build package
21 |         run: |
22 |           pip install --upgrade pip
23 |           pip install build
24 |           pip freeze
25 |
26 |       - name: build release
27 |         run: |
28 |           python -m build --sdist --wheel .
29 |           ls -l dist
30 |           sha256sum dist/* | tee SHA256SUMS
31 |
32 |       - name: Publish to PyPI
33 |         env:
34 |           TWINE_USERNAME: __token__
35 |           TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
36 |         run: |
37 |           pip install twine
38 |           twine upload --skip-existing dist/*
--------------------------------------------------------------------------------
/batchspawner/singleuser.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from runpy import run_path
5 | from shutil import which
6 |
7 | from jupyterhub.utils import random_port, url_path_join
8 | from jupyterhub.services.auth import HubAuth
9 |
10 | import requests
11 |
12 |
13 | def main(argv=None):
14 |     port = random_port()
15 |     hub_auth = HubAuth()
16 |     hub_auth.client_ca = os.environ.get("JUPYTERHUB_SSL_CLIENT_CA", "")
17 |     hub_auth.certfile = os.environ.get("JUPYTERHUB_SSL_CERTFILE", "")
18 |     hub_auth.keyfile = os.environ.get("JUPYTERHUB_SSL_KEYFILE", "")
19 |
20 |     url = url_path_join(hub_auth.api_url, "batchspawner")
21 |     headers = {"Authorization": f"token {hub_auth.api_token}"}
22 |
23 |     # internal_ssl kwargs
24 |     kwargs = {}
25 |     if hub_auth.certfile and hub_auth.keyfile:
26 |         kwargs["cert"] = (hub_auth.certfile, hub_auth.keyfile)
27 |     if hub_auth.client_ca:
28 |         kwargs["verify"] = hub_auth.client_ca
29 |
30 |     r = requests.post(
31 |         url,
32 |         headers=headers,
33 |         json={"port": port},
34 |         **kwargs,
35 |     )
36 |
37 |     cmd_path = which(sys.argv[1])
38 |     sys.argv = sys.argv[1:] + ["--port={}".format(port)]
39 |     run_path(cmd_path, run_name="__main__")
40 |
41 |
42 | if __name__ == "__main__":
43 |     main()
--------------------------------------------------------------------------------
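Note: `singleuser.py` (above) and `api.py` (below) are the two halves of batchspawner's port hand-off. The `batchspawner-singleuser` wrapper runs inside the batch job in place of a bare `jupyterhub-singleuser`, picks a random port, reports it to the hub's `/batchspawner` API endpoint (handled by `api.py`), and then executes the real single-user command. A minimal sketch of just the report step, assuming a running hub and the standard `JUPYTERHUB_API_URL` / `JUPYTERHUB_API_TOKEN` environment variables that JupyterHub sets for single-user servers (the port value here is a made-up example):

```python
# Sketch of the POST that batchspawner-singleuser performs (no internal SSL).
import os
import requests

api_url = os.environ["JUPYTERHUB_API_URL"]  # e.g. http://127.0.0.1:8081/hub/api
token = os.environ["JUPYTERHUB_API_TOKEN"]

r = requests.post(
    api_url.rstrip("/") + "/batchspawner",
    headers={"Authorization": f"token {token}"},
    json={"port": 54321},  # hypothetical port; the real wrapper uses random_port()
)
r.raise_for_status()  # expect 201: "BatchSpawner data configured"
```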
/batchspawner/api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from tornado import web
3 | from jupyterhub.apihandlers import APIHandler, default_handlers
4 | from batchspawner import BatchSpawnerBase
5 |
6 |
7 | class BatchSpawnerAPIHandler(APIHandler):
8 |     @web.authenticated
9 |     def post(self):
10 |         """POST set user spawner data"""
11 |         if hasattr(self, "current_user"):
12 |             # JupyterHub compatibility (September 2018, d79a99323ef1d)
13 |             user = self.current_user
14 |         else:
15 |             # Previous JupyterHub, 0.9.4 and before.
16 |             user = self.get_current_user()
17 |         token = self.get_auth_token()
18 |         spawner = None
19 |         for s in user.spawners.values():
20 |             if s.api_token == token:
21 |                 spawner = s
22 |         # Fix for when the matched spawner is not a batchspawner (e.g. a
23 |         # wrapper spawner): descend through child_spawner until we reach one.
24 |         while not issubclass(spawner.__class__, BatchSpawnerBase):
25 |             if not hasattr(spawner, "child_spawner"):
26 |                 break
27 |             spawner = spawner.child_spawner
28 |         data = self.get_json_body()
29 |         for key, value in data.items():
30 |             if hasattr(spawner, key):
31 |                 setattr(spawner, key, value)
32 |         self.set_status(201)
33 |         self.finish(json.dumps({"message": "BatchSpawner data configured"}))
34 |
35 | default_handlers.append((r"/api/batchspawner", BatchSpawnerAPIHandler))
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2017, Project Jupyter Contributors
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # pre-commit is a tool to perform a predefined set of tasks manually and/or
2 | # automatically before git commits are made.
3 | #
4 | # Config reference: https://pre-commit.com/#pre-commit-configyaml---top-level
5 | #
6 | # Common tasks
7 | #
8 | # - Run on all files: pre-commit run --all-files
9 | # - Register git hooks: pre-commit install --install-hooks
10 | #
11 | repos:
12 |   # Autoformat: Python code
13 |   - repo: https://github.com/psf/black
14 |     rev: "23.9.1"
15 |     hooks:
16 |       - id: black
17 |         args:
18 |           - --target-version=py36
19 |           - --target-version=py37
20 |           - --target-version=py38
21 |           - --target-version=py39
22 |           - --target-version=py310
23 |           - --target-version=py311
24 |
25 |   # Autoformat: markdown, yaml
26 |   - repo: https://github.com/pre-commit/mirrors-prettier
27 |     rev: v3.0.3
28 |     hooks:
29 |       - id: prettier
30 |
31 |   # Lint: Python code
32 |   - repo: https://github.com/PyCQA/flake8
33 |     rev: "6.1.0"
34 |     hooks:
35 |       - id: flake8
36 |
37 |   # Misc...
38 |   - repo: https://github.com/pre-commit/pre-commit-hooks
39 |     rev: v4.4.0
40 |     # ref: https://github.com/pre-commit/pre-commit-hooks#hooks-available
41 |     hooks:
42 |       # Autoformat: Makes sure files end in a newline and only a newline.
43 |       - id: end-of-file-fixer
44 |
45 |       # Autoformat: Sorts entries in requirements.txt.
46 |       - id: requirements-txt-fixer
47 |
48 |       # Prevent giant (500kB) files from being committed.
49 |       - id: check-added-large-files
50 |
51 |       # Lint: Check for files with names that would conflict on a
52 |       # case-insensitive filesystem like MacOS HFS+ or Windows FAT.
53 |       - id: check-case-conflict
54 |
55 |       # Lint: Checks that non-binary executables have a proper shebang.
56 |       - id: check-executables-have-shebangs
57 |
58 | # pre-commit.ci config reference: https://pre-commit.ci/#configuration
59 | ci:
60 |   autoupdate_schedule: monthly
--------------------------------------------------------------------------------
/SPAWNERS.md:
--------------------------------------------------------------------------------
1 | # Notes on specific spawners
2 |
3 | **Spawner maintainers**: Included below are "spawner maintainers",
4 | when available. There aren't official obligations, but the general
5 | idea is that you should watch the repository and feel especially
6 | empowered to comment on issues when you think it might be relevant to
7 | you (obviously everyone should, but this is our attempt at even
8 | more outreach). You should let us know when we break something and
9 | provide a diversity of opinions in general. Submitting PRs and
10 | testing is nice but not required.
11 |
12 | To be listed as a maintainer, just submit an issue or PR adding yourself,
13 | and please watch the repository on GitHub.
14 |
15 | ## `TorqueSpawner`
16 |
17 | Maintainers:
18 |
19 | ## `MoabSpawner`
20 |
21 | Subclass of TorqueSpawner
22 |
23 | Maintainers:
24 |
25 | ## `SlurmSpawner`
26 |
27 | Maintainers: @rkdarst
28 |
29 | This spawner enforces the environment if `srun` is used to wrap the
30 | spawner command, which is the default. If you _do_ want the user
31 | environment to be used, set `req_srun=''`. However, this is not
32 | perfect: there is still a bash shell started as the user which could run
33 | arbitrary startup code, define shell aliases for `srun`, etc.
34 |
35 | Use of `srun` is required for the job to terminate gracefully.
36 |
37 | ## `GridengineSpawner`
38 |
39 | Maintainers:
40 |
41 | ## `CondorSpawner`
42 |
43 | Maintainers:
44 |
45 | ## `LsfSpawner`
46 |
47 | Maintainers:
48 |
49 | # Checklist for making spawners
50 |
51 | Please document each of these things under the spawner list above;
52 | even if it is "OK", we need to track the status of all spawners. If it is
53 | a bug, users really need to know.
54 |
55 | - Does your spawner read the shell environment before starting? (See
56 |   [Jupyterhub
57 |   Security](https://jupyterhub.readthedocs.io/en/stable/reference/websecurity.html).)
58 |
59 | - Does your spawner send SIGTERM to the jupyterhub-singleuser process
60 |   before SIGKILL? It should, so that the process can terminate
61 |   gracefully. Add `echo "terminated gracefully"` to the end of the
62 |   batch script - if you see this in your singleuser server output, you
63 |   know that you DO receive SIGTERM and terminate gracefully. If your
64 |   batch system cannot automatically send SIGTERM before SIGKILL, PR
65 |   #75 might help here; ask for it to be finished.
66 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # Copyright (c) Jupyter Development Team.
5 | # Distributed under the terms of the Modified BSD License.
6 |
7 | # -----------------------------------------------------------------------------
8 | # Minimal Python version sanity check (from IPython/Jupyterhub)
9 | # -----------------------------------------------------------------------------
10 |
11 | from __future__ import print_function
12 |
13 | import os
14 | import sys
15 |
16 | from setuptools import setup
17 | from glob import glob
18 |
19 | pjoin = os.path.join
20 | here = os.path.abspath(os.path.dirname(__file__))
21 |
22 | # Get the current package version.
23 | version_ns = {}
24 | with open(pjoin(here, "version.py")) as f:
25 |     exec(f.read(), {}, version_ns)
26 |
27 | with open(pjoin(here, "README.md"), encoding="utf-8") as f:
28 |     long_desc = f.read()
29 |
30 | setup_args = dict(
31 |     name="batchspawner",
32 |     entry_points={
33 |         "console_scripts": ["batchspawner-singleuser=batchspawner.singleuser:main"],
34 |     },
35 |     packages=["batchspawner"],
36 |     version=version_ns["__version__"],
37 |     description="""Batchspawner: A spawner for Jupyterhub to spawn notebooks using batch resource managers.""",
38 |     long_description=long_desc,
39 |     long_description_content_type="text/markdown",
40 |     author="Michael Milligan, Andrea Zonca, Mike Gilbert",
41 |     author_email="milligan@umn.edu",
42 |     url="http://jupyter.org",
43 |     license="BSD",
44 |     platforms="Linux, Mac OS X",
45 |     python_requires="~=3.5",
46 |     keywords=["Interactive", "Interpreter", "Shell", "Web", "Jupyter"],
47 |     classifiers=[
48 |         "Intended Audience :: Developers",
49 |         "Intended Audience :: System Administrators",
50 |         "Intended Audience :: Science/Research",
51 |         "License :: OSI Approved :: BSD License",
52 |         "Programming Language :: Python",
53 |         "Programming Language :: Python :: 3",
54 |     ],
55 |     project_urls={
56 |         "Bug Reports": "https://github.com/jupyterhub/batchspawner/issues",
57 |         "Source": "https://github.com/jupyterhub/batchspawner/",
58 |         "About Jupyterhub": "http://jupyterhub.readthedocs.io/en/latest/",
59 |         "Jupyter Project": "http://jupyter.org",
60 |     },
61 | )
62 |
63 | # setuptools requirements
64 | if "setuptools" in sys.modules:
65 |     setup_args["install_requires"] = install_requires = []
66 |     with open("requirements.txt") as f:
67 |         for line in f.readlines():
68 |             req = line.strip()
69 |             if not req or req.startswith(("-e", "#")):
70 |                 continue
71 |             install_requires.append(req)
72 |
73 |
74 | def main():
75 |     setup(**setup_args)
76 |
77 |
78 | if __name__ == "__main__":
79 |     main()
80 |
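Note: setup.py registers `batchspawner-singleuser` as a console-script entry point, so the wrapper must end up on `$PATH` wherever the single-user command runs. A quick, hypothetical post-install sanity check (not part of the repo):

```python
# Assumes batchspawner has been installed into the active environment.
from shutil import which

assert which("batchspawner-singleuser") is not None, "entry point not on PATH"
```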
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | # This is a GitHub workflow defining a set of jobs with a set of steps.
2 | # ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions
3 | #
4 | name: Test
5 |
6 | on:
7 |   pull_request:
8 |     paths-ignore:
9 |       - "**.md"
10 |       - "**.yml"
11 |       - "**.yaml"
12 |       - "!.github/workflows/test.yml"
13 |   push:
14 |     paths-ignore:
15 |       - "**.md"
16 |       - "**.yml"
17 |       - "**.yaml"
18 |       - "!.github/workflows/test.yml"
19 |     branches-ignore:
20 |       - "dependabot/**"
21 |       - "pre-commit-ci-update-config"
22 |     tags: ["**"]
23 |   workflow_dispatch:
24 |
25 | jobs:
26 |   pytest:
27 |     name: "Run pytest"
28 |     runs-on: ubuntu-20.04
29 |     continue-on-error: ${{ matrix.allow_failure }}
30 |     strategy:
31 |       # Keep running even if one variation of the job fails
32 |       fail-fast: false
33 |       matrix:
34 |         python-version:
35 |           - "3.6"
36 |           - "3.10"
37 |         JHUB_VER:
38 |           - "1.0.0"
39 |           - "1.5.1"
40 |           - "2.3.1"
41 |         allow_failure: [false]
42 |
43 |         exclude:
44 |           # JupyterHub 1.3.0 requires python 3.6+
45 |           - JHUB_VER: "1.3.0"
46 |             python-version: "3.5"
47 |           # JupyterHub 0.9.6 used a deprecated sqlalchemy feature removed in the py3.9 environment
48 |           - JHUB_VER: "0.9.6"
49 |             python-version: "3.9"
50 |         include:
51 |           - JHUB_VER: "main"
52 |             python-version: "3.9"
53 |             allow_failure: true
54 |           - JHUB_VER: "3.0.0"
55 |             python-version: "3.9"
56 |             allow_failure: true
57 |
58 |     steps:
59 |       - uses: actions/checkout@v3
60 |       - name: Set up Python ${{ matrix.python-version }}
61 |         uses: actions/setup-python@v3
62 |         with:
63 |           python-version: "${{ matrix.python-version }}"
64 |
65 |       - name: Install dependencies
66 |         run: |
67 |           python -m pip install --upgrade pip
68 |           python -m pip install pytest
69 |           pip install -r requirements.txt
70 |           pip list
71 |
72 |       - name: Install nodejs dependencies
73 |         run: |
74 |           sudo npm install -g configurable-http-proxy
75 |
76 |       # We need to check compatibility with different versions of the JH API,
77 |       # including latest development. For that, we also need to pull in the
78 |       # development dependencies of that old JH version (but we don't need
79 |       # conda/npm for our tests).
80 |       - name: install JupyterHub
81 |         run: |
82 |           git clone --quiet --branch ${{ matrix.JHUB_VER }} https://github.com/jupyterhub/jupyterhub.git ./jupyterhub
83 |           pip install -r ./jupyterhub/dev-requirements.txt
84 |           pip install ./jupyterhub
85 |
86 |       - name: pytest
87 |         run: |
88 |           pytest --verbose --color=yes --last-failed --cov batchspawner batchspawner/tests
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## unreleased changes
4 |
5 | Added (user)
6 |
7 | Added (developer)
8 |
9 | Changed
10 |
11 | Fixed
12 |
13 | ## v1.2
14 |
15 | Changed
16 |
17 | - PR #237: Replace use of scripts with entry_points
18 | - PR #208 #238 #239 #240 #241: updates to CI - bumping versions and aligning with Jupyterhub standards
19 | - PR #220: remove code supporting Jupyterhub earlier than 0.9
20 |
21 | Fixed
22 |
23 | - PR #229: LSF jobs with multiple slots display each hostname ':' separated
24 |
25 | ## v1.1
26 |
27 | Added (user)
28 |
29 | - PR #170: SlurmSpawner: add `req_gres` to specify `--gres`.
30 | - PR #137: GridEngineSpawner: spawner will now add the following system environment values to the spawner environment, in accordance with the Univa Admin Guide: `SGE_CELL`, `SGE_EXECD`, `SGE_ROOT`, `SGE_CLUSTER_NAME`, `SGE_QMASTER_PORT`, `SGE_EXECD_PORT`, `PATH`
31 |
32 | Added (developer)
33 |
34 | - PR #187: support for unknown job state
35 |
36 | Changed
37 |
38 | - PR #177: Fail on first error in batch script by adding `set -e` to script templates.
39 | - PR #165: SlurmSpawner: Update template to use `--chdir` instead of `--workdir`. Users of Slurm older than 17.11 may need to revert this locally.
40 | - PR #189: remove bashism from default script template
41 | - PR #195: fix exception handling in run_command
42 | - PR #198: change from Travis to gh-actions for testing
43 | - PR #196: documentation
44 | - PR #199: update setup.py
45 |
46 | ## v1.0 (requires minimum JupyterHub 0.9 and Python 3.5)
47 |
48 | Added (user)
49 |
50 | - Add support for JupyterHub named servers. #167
51 | - Add Jinja2 templating as an option for all scripts and commands. If `{{` or `{%` is used anywhere in the string, it is used as a jinja2 template.
52 | - Add a new option `exec_prefix`, which defaults to `sudo -E -u {username}`. This replaces explicit `sudo` in every batch command - changes in local commands may be needed.
53 | - New option: `req_keepvars_extra`, which allows keeping extra variables in addition to what is defined by JupyterHub itself (addition of variables to keep instead of replacement). #99
54 | - Add `req_prologue` and `req_epilogue` options to scripts which are inserted before/after the main jupyterhub-singleuser command, which allow for generic setup/cleanup without overriding the entire script. #96
55 | - SlurmSpawner: add the `req_reservation` option. #91
56 | - Add basic support for JupyterHub progress updates, but this is not used much yet. #86
57 |
58 | Added (developer)
59 |
60 | - Add many more tests.
61 | - Add a new page `SPAWNERS.md` with information on specific spawners. Begin trying to collect a list of spawner-specific contacts. #97
62 | - Rename `current_ip` and `current_port` commands to `ip` and `port`. No user impact. #139
63 | - Update to Python 3.5 `async` / `await` syntax to support JupyterHub progress updates. #90
64 |
65 | Changed
66 |
67 | - PR #58 and #141 changes the logic of port selection, so that it is selected _after_ the singleuser server starts. This means that the port number has to be conveyed back to JupyterHub. This requires the following changes:
68 |   - `jupyterhub_config.py` _must_ explicitly import `batchspawner`
69 |   - Add a new option `batchspawner_singleuser_cmd`, used as a wrapper in the single-user servers to convey the remote port back to JupyterHub. This is now an integral part of the spawn process.
70 |   - If you have installed with `pip install -e`, you will have to re-install so that the new script `batchspawner-singleuser` is added to `$PATH`.
71 | - Update minimum requirements to JupyterHub 0.9 and Python 3.5. #143
72 | - Update Slurm batch script. Now, the single-user notebook is run in a job step, with a wrapper of `srun`. This may need to be removed using `req_srun=''` if you don't want environment variables limited.
73 | - Pass the environment dictionary to the queue and cancel commands as well. This is mostly the user environment, but may be useful to these commands in some cases. #108, #111. If these environment variables were used for authentication as an admin, be aware that there are pre-existing security issues because they may be passed to the user via the batch submit command; see #82.
74 |
75 | Fixed
76 |
77 | - Improve debugging on failed submission by raising errors including error messages from the commands. #106
78 | - Many other non-user or developer visible changes. #107 #106 #100
79 | - In Travis CI, blacklist jsonschema=3.0.0a1 because it breaks tests
80 |
81 | Removed
82 |
83 | ## v0.8.1 (bugfix release)
84 |
85 | - Fix regression: single-user server binding address is overwritten by previous session server address, resulting in failure to start. Issue #76
86 |
87 | ## v0.8.0 (compatible with JupyterHub 0.5.0 through 0.8.1/0.9dev)
88 |
89 | - SlurmSpawner: Remove `--uid` for (at least) Slurm 17.11 compatibility. If you use `sudo`, this should not be necessary, but because this is security related you should check that user management is as you expect. If your configuration does not use `sudo` then you may need to add the `--uid` option in a custom `batch_script`.
90 | - add base options `req_ngpus` `req_partition` `req_account` and `req_options`
91 | - Fix up logging
92 | - Merge `user_options` with the template substitution vars instead of having it as a separate key
93 | - Update ip/port handling for JupyterHub 0.8
94 | - Add `LICENSE` (BSD3) and `CONTRIBUTING.md`
95 | - Add `LsfSpawner` for IBM LSF
96 | - Add `MultiSlurmSpawner`
97 | - Add `MoabSpawner`
98 | - Add `CondorSpawner`
99 | - Add `GridEngineSpawner`
100 | - SlurmSpawner: add `req_qos` option
101 | - WrapSpawner and ProfilesSpawner, which provide mechanisms for runtime configuration of spawners, have been split out and moved to the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner) package
102 | - Enable CI testing via Travis-CI
103 |
104 | ## v0.3 (tag: jhub-0.3, compatible with JupyterHub 0.3.0)
105 |
106 | - initial release containing `TorqueSpawner` and `SlurmSpawner`
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # batchspawner for Jupyterhub
2 |
3 | [![GitHub Workflow Status - Test](https://img.shields.io/github/workflow/status/jupyterhub/batchspawner/Test?logo=github&label=tests)](https://github.com/jupyterhub/batchspawner/actions)
4 | [![Latest PyPI version](https://img.shields.io/pypi/v/batchspawner?logo=pypi&logoColor=white)](https://pypi.python.org/pypi/batchspawner)
5 | [![GitHub](https://img.shields.io/badge/issue_tracking-github-blue?logo=github)](https://github.com/jupyterhub/batchspawner/issues)
6 | [![Discourse](https://img.shields.io/badge/help_forum-discourse-blue?logo=discourse)](https://discourse.jupyter.org/c/jupyterhub)
7 | [![Gitter](https://img.shields.io/badge/social_chat-gitter-blue?logo=gitter)](https://gitter.im/jupyterhub/jupyterhub)
8 | [![Contribute](https://img.shields.io/badge/I_want_to_contribute!-grey?logo=jupyter)](https://github.com/jupyterhub/batchspawner/blob/master/CONTRIBUTING.md)
9 |
10 | This is a custom spawner for [Jupyterhub](https://jupyterhub.readthedocs.io/) that is designed for installations on clusters using batch scheduling software.
11 |
12 | This began as a generalization of [mkgilbert's batchspawner](https://github.com/mkgilbert/slurmspawner) which in turn was inspired by [Andrea Zonca's blog post](http://zonca.github.io/2015/04/jupyterhub-hpc.html "Run jupyterhub on a Supercomputer") where he explains his implementation for a spawner that uses SSH and Torque. His github repo is found [here](http://www.github.com/zonca/remotespawner "RemoteSpawner").
13 |
14 | This package formerly included WrapSpawner and ProfilesSpawner, which provide mechanisms for runtime configuration of spawners. These have been split out and moved to the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner) package.
15 |
16 | ## Installation
17 |
18 | 1. From the root directory of this repo (where setup.py is), run `pip install -e .`
19 |
20 |    If you don't actually need an editable version, you can simply run
21 |    `pip install batchspawner`
22 |
23 | 2. Add lines in `jupyterhub_config.py` for the spawner you intend to use, e.g.
24 |
25 |    ```python
26 |    c = get_config()
27 |    c.JupyterHub.spawner_class = 'batchspawner.TorqueSpawner'
28 |    import batchspawner    # Even though not used, needed to register batchspawner interface
29 |    ```
30 |
31 | 3. Depending on the spawner, additional configuration will likely be needed.
32 |
33 | ## Batch Spawners
34 |
35 | For information on the specific spawners, see [SPAWNERS.md](SPAWNERS.md).
36 |
37 | ### Overview
38 |
39 | This file contains an abstraction layer for batch job queueing systems (`BatchSpawnerBase`), and implements
40 | Jupyterhub spawners for Torque, Moab, SLURM, SGE, HTCondor, LSF, and eventually others.
41 | Common attributes of batch submission / resource manager environments will include notions of:
42 |
43 | - queue names, resource manager addresses
44 | - resource limits including runtime, number of processes, memory
45 | - singleuser child process running on (usually remote) host not known until runtime
46 | - job submission and monitoring via resource manager utilities
47 | - remote execution via submission of templated scripts
48 | - job names instead of PIDs
49 |
50 | `BatchSpawnerBase` provides several general mechanisms:
51 |
52 | - configurable traits `req_foo` that are exposed as `{foo}` in job template scripts. Templates (submit scripts in particular) may also use the full power of [jinja2](http://jinja.pocoo.org/). Templates are automatically detected if a `{{` or `{%` is present; otherwise `str.format()` is used.
53 | - configurable command templates for submitting/querying/cancelling jobs
54 | - a generic concept of job-ID and ID-based job state tracking
55 | - overrideable hooks for subclasses to plug in logic at numerous points
56 |
57 | ### Example
58 |
59 | Every effort has been made to accommodate highly diverse systems through configuration
60 | only. This example consists of the (lightly edited) configuration used by the author
61 | to run Jupyter notebooks on an academic supercomputer cluster.
62 |
63 | ```python
64 | # Select the Torque backend and increase the timeout since batch jobs may take time to start
65 | import batchspawner
66 | c.JupyterHub.spawner_class = 'batchspawner.TorqueSpawner'
67 | c.Spawner.http_timeout = 120
68 |
69 | #------------------------------------------------------------------------------
70 | # BatchSpawnerBase configuration
71 | # These are simply setting parameters used in the job script template below
72 | #------------------------------------------------------------------------------
73 | c.BatchSpawnerBase.req_nprocs = '2'
74 | c.BatchSpawnerBase.req_queue = 'mesabi'
75 | c.BatchSpawnerBase.req_host = 'mesabi.xyz.edu'
76 | c.BatchSpawnerBase.req_runtime = '12:00:00'
77 | c.BatchSpawnerBase.req_memory = '4gb'
78 | #------------------------------------------------------------------------------
79 | # TorqueSpawner configuration
80 | # The script below is nearly identical to the default template, but we needed
81 | # to add a line for our local environment. For most sites the default templates
82 | # should be a good starting point.
83 | #------------------------------------------------------------------------------
84 | c.TorqueSpawner.batch_script = '''#!/bin/sh
85 | #PBS -q {queue}@{host}
86 | #PBS -l walltime={runtime}
87 | #PBS -l nodes=1:ppn={nprocs}
88 | #PBS -l mem={memory}
89 | #PBS -N jupyterhub-singleuser
90 | #PBS -v {keepvars}
91 | module load python3
92 | {cmd}
93 | '''
94 | # For our site we need to munge the execution hostname returned by qstat
95 | c.TorqueSpawner.state_exechost_exp = r'int-\1.mesabi.xyz.edu'
96 | ```
97 |
98 | ### Security
99 |
100 | Unless otherwise stated for a specific spawner, assume that spawners
101 | _do_ evaluate shell environment for users and thus the [security
102 | requirements of JupyterHub security for untrusted
103 | users](https://jupyterhub.readthedocs.io/en/stable/reference/websecurity.html)
104 | are not fulfilled because some (most?) spawners _do_ start a user
105 | shell which will execute arbitrary user environment configuration
106 | (`.profile`, `.bashrc` and the like) unless users do not have
107 | access to their own cluster user account. This is something which we
108 | are working on.
109 |
110 | ## Provide different configurations of BatchSpawner
111 |
112 | ### Overview
113 |
114 | `ProfilesSpawner`, available as part of the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner)
115 | package, allows the Jupyterhub administrator to define a set of different spawning configurations,
116 | both different spawners and different configurations of the same spawner.
117 | The user is then presented with a dropdown menu for choosing the most suitable configuration for their needs.
118 |
119 | This method provides an easy and safe way to provide different configurations of `BatchSpawner` to the
120 | users; see an example below.
121 |
122 | ### Example
123 |
124 | The following is based on the author's configuration (at the same site as the example above)
125 | showing how to give users access to multiple job configurations on the batch scheduled
126 | clusters, as well as an option to run a local notebook directly on the jupyterhub server.
127 |
128 | ```python
129 | # Same initial setup as the previous example
130 | import batchspawner
131 | c.JupyterHub.spawner_class = 'wrapspawner.ProfilesSpawner'
132 | c.Spawner.http_timeout = 120
133 | #------------------------------------------------------------------------------
134 | # BatchSpawnerBase configuration
135 | # Providing default values that we may omit in the profiles
136 | #------------------------------------------------------------------------------
137 | c.BatchSpawnerBase.req_host = 'mesabi.xyz.edu'
138 | c.BatchSpawnerBase.req_runtime = '12:00:00'
139 | c.TorqueSpawner.state_exechost_exp = r'in-\1.mesabi.xyz.edu'
140 | #------------------------------------------------------------------------------
141 | # ProfilesSpawner configuration
142 | #------------------------------------------------------------------------------
143 | # List of profiles to offer for selection. Signature is:
144 | # List(Tuple( Unicode, Unicode, Type(Spawner), Dict ))
145 | # corresponding to profile display name, unique key, Spawner class,
146 | # dictionary of spawner config options.
147 | #
148 | # The first three values will be exposed in the input_template as {display},
149 | # {key}, and {type}
150 | #
151 | c.ProfilesSpawner.profiles = [
152 |     ( "Local server", 'local', 'jupyterhub.spawner.LocalProcessSpawner', {'ip':'0.0.0.0'} ),
153 |     ('Mesabi - 2 cores, 4 GB, 8 hours', 'mesabi2c4g12h', 'batchspawner.TorqueSpawner',
154 |         dict(req_nprocs='2', req_queue='mesabi', req_runtime='8:00:00', req_memory='4gb')),
155 |     ('Mesabi - 12 cores, 128 GB, 4 hours', 'mesabi128gb', 'batchspawner.TorqueSpawner',
156 |         dict(req_nprocs='12', req_queue='ram256g', req_runtime='4:00:00', req_memory='125gb')),
157 |     ('Mesabi - 2 cores, 4 GB, 24 hours', 'mesabi2c4gb24h', 'batchspawner.TorqueSpawner',
158 |         dict(req_nprocs='2', req_queue='mesabi', req_runtime='24:00:00', req_memory='4gb')),
159 |     ('Interactive Cluster - 2 cores, 4 GB, 8 hours', 'lab', 'batchspawner.TorqueSpawner',
160 |         dict(req_nprocs='2', req_host='labhost.xyz.edu', req_queue='lab',
161 |             req_runtime='8:00:00', req_memory='4gb', state_exechost_exp='')),
162 |     ]
163 | c.ProfilesSpawner.ip = '0.0.0.0'
164 | ```
165 |
166 | ## Debugging batchspawner
167 |
168 | Sometimes it can be hard to debug batchspawner, but it's not really that
169 | hard once you know how the pieces interact. Check the following places for
170 | error messages:
171 |
172 | - Check the JupyterHub logs for errors.
173 |
174 | - Check the JupyterHub logs for the batch script that got submitted
175 |   and the command used to submit it. Are these correct? (Note that
176 |   there are submission environment variables too, which aren't
177 |   displayed.)
178 |
179 | - At this point, it's a matter of checking the batch system. Is the
180 |   job ever scheduled? Does it run? Does it succeed? Check the batch
181 |   system status and output of the job. The most common failure
182 |   patterns are a) the job never starting due to bad scheduler options, or
183 |   b) the job waiting in the queue beyond the `start_timeout`, causing
184 |   JupyterHub to kill the job.
185 |
186 | - At this point the job starts. Does it fail immediately, or before
187 |   Jupyter starts? Check the scheduler output files (stdout/stderr of
188 |   the job), wherever they are stored. To debug the job script, you can
189 |   add debugging into the batch script, such as an `env` or `set -x`.
190 |
191 | - At this point Jupyter itself starts - check its error messages. Is
192 |   it starting with the right options? Can it communicate with the
193 |   hub? At this point there usually isn't anything
194 |   batchspawner-specific, with the one exception below. The error log
195 |   would be in the batch script output (same file as above). There may
196 |   also be clues in the JupyterHub logfile.
197 | - Are you running on an NFS filesystem? It's possible for Jupyter to
198 |   experience issues due to varying implementations of the fcntl() system
199 |   call. (See also [Jupyterhub-Notes and Tips: SQLite](https://jupyterhub.readthedocs.io/en/latest/reference/database.html?highlight=NFS#sqlite))
200 |
201 | Common problems:
202 |
203 | - Did you `import batchspawner` in the `jupyterhub_config.py` file?
204 |   This is needed in order to activate the batchspawner API in
205 |   JupyterHub.
206 |
207 | ## Changelog
208 |
209 | See [CHANGELOG.md](CHANGELOG.md).
--------------------------------------------------------------------------------
/batchspawner/tests/test_spawners.py:
--------------------------------------------------------------------------------
1 | """Test BatchSpawner and subclasses"""
2 |
3 | import re
4 | from unittest import mock
5 | from .. import BatchSpawnerRegexStates, JobStatus
6 | from traitlets import Unicode
7 | import time
8 | import pytest
9 | from jupyterhub import orm
10 | from tornado import gen
11 |
12 | try:
13 |     from jupyterhub.objects import Hub, Server
14 |     from jupyterhub.user import User
15 | except ImportError:
16 |     pass
17 |
18 | testhost = "userhost123"
19 | testjob = "12345"
20 | testport = 54321
21 |
22 |
23 | class BatchDummy(BatchSpawnerRegexStates):
24 |     exec_prefix = ""
25 |     batch_submit_cmd = Unicode("cat > /dev/null; echo " + testjob)
26 |     batch_query_cmd = Unicode("echo RUN " + testhost)
27 |     batch_cancel_cmd = Unicode("echo STOP")
28 |     batch_script = Unicode("{cmd}")
29 |     state_pending_re = Unicode("PEND")
30 |     state_running_re = Unicode("RUN")
31 |     state_exechost_re = Unicode("RUN (.*)$")
32 |     state_unknown_re = Unicode("UNKNOWN")
33 |
34 |     cmd_expectlist = None
35 |     out_expectlist = None
36 |
37 |     def run_command(self, *args, **kwargs):
38 |         """Overwritten run_command to test templating and outputs"""
39 |         cmd = args[0]
40 |         # Test that the command matches the expectations
41 |         if self.cmd_expectlist:
42 |             run_re = self.cmd_expectlist.pop(0)
43 |             if run_re:
44 |                 print("run:", run_re)
45 |                 assert (
46 |                     run_re.search(cmd) is not None
47 |                 ), "Failed test: re={0} cmd={1}".format(run_re, cmd)
48 |         # Run command normally
49 |         out = super().run_command(*args, **kwargs)
50 |         # Test that the command matches the expectations
51 |         if self.out_expectlist:
52 |             out_re = self.out_expectlist.pop(0)
53 |             if out_re:
54 |                 print("out:", out_re)
55 |                 assert (
56 |                     out_re.search(cmd) is not None
57 |                 ), "Failed output: re={0} cmd={1} out={2}".format(out_re, cmd, out)
58 |         return out
59 |
60 |
61 | def new_spawner(db, spawner_class=BatchDummy, **kwargs):
62 |     kwargs.setdefault("cmd", ["singleuser_command"])
63 |     user = db.query(orm.User).first()
64 |     hub = Hub()
65 |     user = User(user, {})
66 |     server = Server()
67 |     # Set it after construction because it isn't a traitlet.
68 |     kwargs.setdefault("hub", hub)
69 |     kwargs.setdefault("user", user)
70 |     kwargs.setdefault("poll_interval", 1)
71 |
72 |     # These are not traitlets so we have to set them here
73 |     spawner = user._new_spawner("", spawner_class=spawner_class, **kwargs)
74 |     spawner.server = server
75 |     spawner.mock_port = testport
76 |     return spawner
77 |
78 |
79 | @pytest.mark.slow
80 | def test_stress_submit(db, io_loop):
81 |     for i in range(200):
82 |         time.sleep(0.01)
83 |         test_spawner_start_stop_poll(db, io_loop)
84 |
85 |
86 | def check_ip(spawner, value):
87 |     assert spawner.ip == value
88 |
89 |
90 | def test_spawner_start_stop_poll(db, io_loop):
91 |     spawner = new_spawner(db=db)
92 |
93 |     status = io_loop.run_sync(spawner.poll, timeout=5)
94 |     assert status == 1
95 |     assert spawner.job_id == ""
96 |     assert spawner.get_state() == {}
97 |
98 |     io_loop.run_sync(spawner.start, timeout=5)
99 |     check_ip(spawner, testhost)
100 |     assert spawner.job_id == testjob
101 |
102 |     status = io_loop.run_sync(spawner.poll, timeout=5)
103 |     assert status is None
104 |     spawner.batch_query_cmd = "echo NOPE"
105 |     io_loop.run_sync(spawner.stop, timeout=5)
106 |     status = io_loop.run_sync(spawner.poll, timeout=5)
107 |     assert status == 1
108 |     assert spawner.get_state() == {}
109 |
110 |
111 | def test_spawner_state_reload(db, io_loop):
112 |     spawner = new_spawner(db=db)
113 |     assert spawner.get_state() == {}
114 |
115 |     io_loop.run_sync(spawner.start, timeout=30)
116 |     check_ip(spawner, testhost)
117 |     assert spawner.job_id == testjob
118 |
119 |     state = spawner.get_state()
120 |     assert state == dict(job_id=testjob, job_status="RUN " + testhost)
121 |     spawner = new_spawner(db=db)
122 |     spawner.clear_state()
123 |     assert spawner.get_state() == {}
124 |     spawner.load_state(state)
125 |     # We used to check IP here, but that is actually only computed on start(),
126 |     # and is not part of the spawner's persistent state
127 |     assert spawner.job_id == testjob
128 |
129 |
130 | def test_submit_failure(db, io_loop):
131 |     spawner = new_spawner(db=db)
132 |     assert spawner.get_state() == {}
133 |     spawner.batch_submit_cmd = "cat > /dev/null; true"
134 |     with pytest.raises(RuntimeError) as e_info:
135 |         io_loop.run_sync(spawner.start, timeout=30)
136 |     assert spawner.job_id == ""
137 |     assert spawner.job_status == ""
138 |
139 |
140 | def test_submit_pending_fails(db, io_loop):
141 |     """Submission works, but the batch query command immediately fails"""
142 |     spawner = new_spawner(db=db)
143 |     assert spawner.get_state() == {}
144 |     spawner.batch_query_cmd = "echo xyz"
145 |     with pytest.raises(RuntimeError) as e_info:
146 |         io_loop.run_sync(spawner.start, timeout=30)
147 |     status = io_loop.run_sync(spawner.query_job_status, timeout=30)
148 |     assert status == JobStatus.NOTFOUND
149 |     assert spawner.job_id == ""
150 |     assert spawner.job_status == ""
151 |
152 |
153 | def test_poll_fails(db, io_loop):
154 |     """Submission works, but a later .poll() fails"""
155 |     spawner = new_spawner(db=db)
156 |     assert spawner.get_state() == {}
157 |     # The start is successful:
158 |     io_loop.run_sync(spawner.start, timeout=30)
159 |     spawner.batch_query_cmd = "echo xyz"
160 |     # Now, the poll fails:
161 |     io_loop.run_sync(spawner.poll, timeout=30)
162 |     # .poll() will run self.clear_state() if it's not found:
163 |     assert spawner.job_id == ""
164 |     assert spawner.job_status == ""
165 |
166 |
167 | def test_unknown_status(db, io_loop):
168 |     """Polling returns an unknown status"""
169 |     spawner = new_spawner(db=db)
170 |     assert spawner.get_state() == {}
171 |     # The start is successful:
172 |     io_loop.run_sync(spawner.start, timeout=30)
173 |     spawner.batch_query_cmd = "echo UNKNOWN"
174 |     # This poll should not fail:
175 |     io_loop.run_sync(spawner.poll, timeout=30)
176 |     status = io_loop.run_sync(spawner.query_job_status, timeout=30)
177 |     assert status == JobStatus.UNKNOWN
178 |     assert spawner.job_id == "12345"
179 |     assert spawner.job_status != ""
180 |
181 |
182 | def test_templates(db, io_loop):
183 |     """Test templates in the run_command commands"""
184 |     spawner = new_spawner(db=db)
185 |
186 |     # Test when not running
187 |     spawner.cmd_expectlist = [
188 |         re.compile(".*RUN"),
189 |     ]
190 |     status = io_loop.run_sync(spawner.poll, timeout=5)
191 |     assert status == 1
192 |     assert spawner.job_id == ""
193 |     assert spawner.get_state() == {}
194 |
195 |     # Test starting
196 |     spawner.cmd_expectlist = [
197 |         re.compile(".*echo"),
198 |         re.compile(".*RUN"),
199 |     ]
200 |     io_loop.run_sync(spawner.start, timeout=5)
201 |     check_ip(spawner, testhost)
202 |     assert spawner.job_id == testjob
203 |
204 |     # Test poll - running
205 |     spawner.cmd_expectlist = [
206 |         re.compile(".*RUN"),
207 |     ]
208 |     status = io_loop.run_sync(spawner.poll, timeout=5)
209 |     assert status is None
210 |
211 |     # Test stopping
212 |     spawner.batch_query_cmd = "echo NOPE"
213 |     spawner.cmd_expectlist = [
214 |         re.compile(".*STOP"),
215 |         re.compile(".*NOPE"),
216 |     ]
217 |     io_loop.run_sync(spawner.stop, timeout=5)
218 |     status = io_loop.run_sync(spawner.poll, timeout=5)
219 |     assert status == 1
220 |     assert spawner.get_state() == {}
221 |
222 |
223 | def test_batch_script(db, io_loop):
224 |     """Test that the batch script substitutes {cmd}"""
225 |
226 |     class BatchDummyTestScript(BatchDummy):
227 |         @gen.coroutine
228 |         def _get_batch_script(self, **subvars):
229 |             script = yield super()._get_batch_script(**subvars)
230 |             assert "singleuser_command" in script
231 |             return script
232 |
233 |     spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript)
234 |     # status = io_loop.run_sync(spawner.poll, timeout=5)
235 |     io_loop.run_sync(spawner.start, timeout=5)
236 |     # status = io_loop.run_sync(spawner.poll, timeout=5)
237 |     # io_loop.run_sync(spawner.stop, timeout=5)
238 |
239 |
240 | def test_exec_prefix(db, io_loop):
241 |     """Test that all run_commands have exec_prefix"""
242 |
243 |     class BatchDummyTestScript(BatchDummy):
244 |         exec_prefix = "PREFIX"
245 |
246 |         @gen.coroutine
247 |         def run_command(self, cmd, *args, **kwargs):
248 |             assert cmd.startswith("PREFIX ")
249 |             cmd = cmd[7:]
250 |             print(cmd)
251 |             out = yield super().run_command(cmd, *args, **kwargs)
252 |             return out
253 |
254 |     spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript)
255 |     # Not running
256 |     status = io_loop.run_sync(spawner.poll, timeout=5)
257 |     assert status == 1
258 |     # Start
259 |     io_loop.run_sync(spawner.start, timeout=5)
260 |     assert spawner.job_id == testjob
261 |     # Poll
262 |     status = io_loop.run_sync(spawner.poll, timeout=5)
263 |     assert status is None
264 |     # Stop
265 |     spawner.batch_query_cmd = "echo NOPE"
266 |     io_loop.run_sync(spawner.stop, timeout=5)
267 |     status = io_loop.run_sync(spawner.poll, timeout=5)
268 |     assert status == 1
269 |
270 |
271 | def run_spawner_script(
272 |     db, io_loop, spawner, script, batch_script_re_list=None, spawner_kwargs={}
273 | ):
274 |     """Run a spawner script and test that the output and behavior is as expected.
275 |
276 |     db: same as in this module
277 |     io_loop: same as in this module
278 |     spawner: the BatchSpawnerBase subclass to test
279 |     script: list of (input_re_to_match, output)
280 |     batch_script_re_list: if given, assert batch script matches all of these
281 |     """
282 |     # Create the expected scripts
283 |     cmd_expectlist, out_list = zip(*script)
284 |     cmd_expectlist = list(cmd_expectlist)
285 |     out_list = list(out_list)
286 |
287 |     class BatchDummyTestScript(spawner):
288 |         @gen.coroutine
289 |         def run_command(self, cmd, input=None, env=None):
290 |             # Test the input
291 |             run_re = cmd_expectlist.pop(0)
292 |             if run_re:
293 |                 print('run: "{}" [{}]'.format(cmd, run_re))
294 |                 assert (
295 |                     run_re.search(cmd) is not None
296 |                 ), "Failed test: re={0} cmd={1}".format(run_re, cmd)
297 |             # Test the stdin - will only be the batch script. For
298 |             # each regular expression in batch_script_re_list, assert that
299 |             # each re in that list matches the batch script.
300 |             if batch_script_re_list and input:
301 |                 batch_script = input
302 |                 for match_re in batch_script_re_list:
303 |                     assert (
304 |                         match_re.search(batch_script) is not None
305 |                     ), "Batch script does not match {}".format(match_re)
306 |             # Return expected output.
307 |             out = out_list.pop(0)
308 |             print(" --> " + out)
309 |             return out
310 |
311 |     spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript, **spawner_kwargs)
312 |     # Not running at beginning (no command run)
313 |     status = io_loop.run_sync(spawner.poll, timeout=5)
314 |     assert status == 1
315 |     # batch_submit_cmd
316 |     # batch_query_cmd (result=pending)
317 |     # batch_query_cmd (result=running)
318 |     io_loop.run_sync(spawner.start, timeout=5)
319 |     assert spawner.job_id == testjob
320 |     check_ip(spawner, testhost)
321 |     # batch_query_cmd
322 |     status = io_loop.run_sync(spawner.poll, timeout=5)
323 |     assert status is None
324 |     # batch_cancel_cmd
325 |     io_loop.run_sync(spawner.stop, timeout=5)
326 |     # batch_query_cmd
327 |     status = io_loop.run_sync(spawner.poll, timeout=5)
328 |     assert status == 1
329 |
330 |
331 | def test_torque(db, io_loop):
332 |     spawner_kwargs = {
333 |         "req_nprocs": "5",
334 |         "req_memory": "5678",
335 |         "req_options": "some_option_asdf",
336 |         "req_prologue": "PROLOGUE",
337 |         "req_epilogue": "EPILOGUE",
338 |     }
339 |     batch_script_re_list = [
340 |         re.compile(
341 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
342 |             re.S | re.M,
343 |         ),
344 |         re.compile(r"mem=5678"),
345 |         re.compile(r"ppn=5"),
346 |         re.compile(r"^#PBS some_option_asdf", re.M),
347 |     ]
348 |     script = [
349 |         (re.compile(r"sudo.*qsub"), str(testjob)),
350 |         (
351 |             re.compile(r"sudo.*qstat"),
352 |             "Q",
353 |         ),  # pending
354 |         (
355 |             re.compile(r"sudo.*qstat"),
356 |             "R{}/1".format(testhost),
357 |         ),  # running
358 |         (
359 |             re.compile(r"sudo.*qstat"),
360 |             "R{}/1".format(testhost),
361 |         ),  # running
362 |         (re.compile(r"sudo.*qdel"), "STOP"),
363 |         (re.compile(r"sudo.*qstat"), ""),
364 |     ]
365 |     from .. import TorqueSpawner
366 |
367 |     run_spawner_script(
368 |         db,
369 |         io_loop,
370 |         TorqueSpawner,
371 |         script,
372 |         batch_script_re_list=batch_script_re_list,
373 |         spawner_kwargs=spawner_kwargs,
374 |     )
375 |
376 |
377 | def test_moab(db, io_loop):
378 |     spawner_kwargs = {
379 |         "req_nprocs": "5",
380 |         "req_memory": "5678",
381 |         "req_options": "some_option_asdf",
382 |         "req_prologue": "PROLOGUE",
383 |         "req_epilogue": "EPILOGUE",
384 |     }
385 |     batch_script_re_list = [
386 |         re.compile(
387 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
388 |             re.S | re.M,
389 |         ),
390 |         re.compile(r"mem=5678"),
391 |         re.compile(r"ppn=5"),
392 |         re.compile(r"^#PBS some_option_asdf", re.M),
393 |     ]
394 |     script = [
395 |         (re.compile(r"sudo.*msub"), str(testjob)),
396 |         (re.compile(r"sudo.*mdiag"), 'State="Idle"'),  # pending
397 |         (
398 |             re.compile(r"sudo.*mdiag"),
399 |             'State="Running" AllocNodeList="{}"'.format(testhost),
400 |         ),  # running
401 |         (
402 |             re.compile(r"sudo.*mdiag"),
403 |             'State="Running" AllocNodeList="{}"'.format(testhost),
404 |         ),  # running
405 |         (re.compile(r"sudo.*mjobctl.*-c"), "STOP"),
406 |         (re.compile(r"sudo.*mdiag"), ""),
407 |     ]
408 |     from .. import MoabSpawner
409 |
410 |     run_spawner_script(
411 |         db,
412 |         io_loop,
413 |         MoabSpawner,
414 |         script,
415 |         batch_script_re_list=batch_script_re_list,
416 |         spawner_kwargs=spawner_kwargs,
417 |     )
418 |
419 |
420 | def test_pbs(db, io_loop):
421 |     spawner_kwargs = {
422 |         "req_nprocs": "4",
423 |         "req_memory": "10256",
424 |         "req_options": "some_option_asdf",
425 |         "req_host": "some_pbs_admin_node",
426 |         "req_runtime": "08:00:00",
427 |     }
428 |     batch_script_re_list = [
429 |         re.compile(r"singleuser_command"),
430 |         re.compile(r"select=1"),
431 |         re.compile(r"ncpus=4"),
432 |         re.compile(r"mem=10256"),
433 |         re.compile(r"walltime=08:00:00"),
434 |         re.compile(r"@some_pbs_admin_node"),
435 |         re.compile(r"^#PBS some_option_asdf", re.M),
436 |     ]
437 |     script = [
438 |         (re.compile(r"sudo.*qsub"), str(testjob)),
439 |         (re.compile(r"sudo.*qstat"), "job_state = Q"),  # pending
440 |         (
441 |             re.compile(r"sudo.*qstat"),
442 |             "job_state = R\nexec_host = {}/2*1".format(testhost),
443 |         ),  # running
444 |         (
445 |             re.compile(r"sudo.*qstat"),
446 |             "job_state = R\nexec_host = {}/2*1".format(testhost),
447 |         ),  # running
448 |         (re.compile(r"sudo.*qdel"), "STOP"),
449 |         (re.compile(r"sudo.*qstat"), ""),
450 |     ]
451 |     from .. import PBSSpawner
452 |
453 |     run_spawner_script(
454 |         db,
455 |         io_loop,
456 |         PBSSpawner,
457 |         script,
458 |         batch_script_re_list=batch_script_re_list,
459 |         spawner_kwargs=spawner_kwargs,
460 |     )
461 |
462 |
463 | def test_slurm(db, io_loop):
464 |     spawner_kwargs = {
465 |         "req_runtime": "3-05:10:10",
466 |         "req_nprocs": "5",
467 |         "req_memory": "5678",
468 |         "req_options": "some_option_asdf",
469 |         "req_prologue": "PROLOGUE",
470 |         "req_epilogue": "EPILOGUE",
471 |         "req_reservation": "RES123",
472 |         "req_gres": "GRES123",
473 |     }
474 |     batch_script_re_list = [
475 |         re.compile(
476 |             r"PROLOGUE.*srun batchspawner-singleuser singleuser_command.*EPILOGUE", re.S
477 |         ),
478 |         re.compile(r"^#SBATCH \s+ --cpus-per-task=5", re.X | re.M),
479 |         re.compile(r"^#SBATCH \s+ --time=3-05:10:10", re.X | re.M),
480 |         re.compile(r"^#SBATCH \s+ some_option_asdf", re.X | re.M),
481 |         re.compile(r"^#SBATCH \s+ --reservation=RES123", re.X | re.M),
482 |         re.compile(r"^#SBATCH \s+ --gres=GRES123", re.X | re.M),
483 |     ]
484 |     from .. import SlurmSpawner
485 |
486 |     run_spawner_script(
487 |         db,
488 |         io_loop,
489 |         SlurmSpawner,
490 |         normal_slurm_script,
491 |         batch_script_re_list=batch_script_re_list,
492 |         spawner_kwargs=spawner_kwargs,
493 |     )
494 |
495 |
496 | # We tend to use slurm as our typical example job. These allow quick
497 | # Slurm examples.
498 | normal_slurm_script = [
499 |     (re.compile(r"sudo.*sbatch"), str(testjob)),
500 |     (re.compile(r"sudo.*squeue"), "PENDING "),  # pending
501 |     (
502 |         re.compile(r"sudo.*squeue"),
503 |         "slurm_load_jobs error: Unable to contact slurm controller",
504 |     ),  # unknown
505 |     (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),  # running
506 |     (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
507 |     (re.compile(r"sudo.*scancel"), "STOP"),
508 |     (re.compile(r"sudo.*squeue"), ""),
509 | ]
510 | from .. import SlurmSpawner
511 |
512 |
513 | def run_typical_slurm_spawner(
514 |     db,
515 |     io_loop,
516 |     spawner=SlurmSpawner,
517 |     script=normal_slurm_script,
518 |     batch_script_re_list=None,
519 |     spawner_kwargs={},
520 | ):
521 |     """Run a full slurm job with default (overrideable) parameters.
522 |
523 |     This is useful, for example, for changing options and testing the effect
524 |     of batch scripts.
525 |     """
526 |     return run_spawner_script(
527 |         db,
528 |         io_loop,
529 |         spawner,
530 |         script,
531 |         batch_script_re_list=batch_script_re_list,
532 |         spawner_kwargs=spawner_kwargs,
533 |     )
534 |
535 |
536 | # def test_gridengine(db, io_loop):
537 | #     spawner_kwargs = {
538 | #         'req_options': 'some_option_asdf',
539 | #     }
540 | #     batch_script_re_list = [
541 | #         re.compile(r'singleuser_command'),
542 | #         re.compile(r'#$\s+some_option_asdf'),
543 | #     ]
544 | #     script = [
545 | #         (re.compile(r'sudo.*qsub'), 'x x '+str(testjob)),
546 | #         (re.compile(r'sudo.*qstat'), 'PENDING '),
547 | #         (re.compile(r'sudo.*qstat'), 'RUNNING '+testhost),
548 | #         (re.compile(r'sudo.*qstat'), 'RUNNING '+testhost),
549 | #         (re.compile(r'sudo.*qdel'), 'STOP'),
550 | #         (re.compile(r'sudo.*qstat'), ''),
551 | #     ]
552 | #     from .. import GridengineSpawner
553 | #     run_spawner_script(db, io_loop, GridengineSpawner, script,
554 | #                        batch_script_re_list=batch_script_re_list,
555 | #                        spawner_kwargs=spawner_kwargs)
556 |
557 |
558 | def test_condor(db, io_loop):
559 |     spawner_kwargs = {
560 |         "req_nprocs": "5",
561 |         "req_memory": "5678",
562 |         "req_options": "some_option_asdf",
563 |     }
564 |     batch_script_re_list = [
565 |         re.compile(r"exec batchspawner-singleuser singleuser_command"),
566 |         re.compile(r"RequestCpus = 5"),
567 |         re.compile(r"RequestMemory = 5678"),
568 |         re.compile(r"^some_option_asdf", re.M),
569 |     ]
570 |     script = [
571 |         (
572 |             re.compile(r"sudo.*condor_submit"),
573 |             "submitted to cluster {}".format(str(testjob)),
574 |         ),
575 |         (re.compile(r"sudo.*condor_q"), "1,"),  # pending
576 |         (re.compile(r"sudo.*condor_q"), "2, @{}".format(testhost)),  # running
577 |         (re.compile(r"sudo.*condor_q"), "2, @{}".format(testhost)),
578 |         (re.compile(r"sudo.*condor_rm"), "STOP"),
579 |         (re.compile(r"sudo.*condor_q"), ""),
580 |     ]
581 |     from .. import CondorSpawner
582 |
583 |     run_spawner_script(
584 |         db,
585 |         io_loop,
586 |         CondorSpawner,
587 |         script,
588 |         batch_script_re_list=batch_script_re_list,
589 |         spawner_kwargs=spawner_kwargs,
590 |     )
591 |
592 |
593 | def test_lfs(db, io_loop):
594 |     spawner_kwargs = {
595 |         "req_nprocs": "5",
596 |         "req_memory": "5678",
597 |         "req_options": "some_option_asdf",
598 |         "req_queue": "some_queue",
599 |         "req_prologue": "PROLOGUE",
600 |         "req_epilogue": "EPILOGUE",
601 |     }
602 |     batch_script_re_list = [
603 |         re.compile(
604 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
605 |             re.S | re.M,
606 |         ),
607 |         re.compile(r"#BSUB\s+-q\s+some_queue", re.M),
608 |     ]
609 |     script = [
610 |         (
611 |             re.compile(r"sudo.*bsub"),
612 |             "Job <{}> is submitted to default queue ".format(str(testjob)),
613 |         ),
614 |         (re.compile(r"sudo.*bjobs"), "PEND "),  # pending
615 |         (re.compile(r"sudo.*bjobs"), "RUN {}".format(testhost)),  # running
616 |         (re.compile(r"sudo.*bjobs"), "RUN {}".format(testhost)),
617 |         (re.compile(r"sudo.*bkill"), "STOP"),
618 |         (re.compile(r"sudo.*bjobs"), ""),
619 |     ]
620 |     from .. import LsfSpawner
621 |
622 |     run_spawner_script(
623 |         db,
624 |         io_loop,
625 |         LsfSpawner,
626 |         script,
627 |         batch_script_re_list=batch_script_re_list,
628 |         spawner_kwargs=spawner_kwargs,
629 |     )
630 |
631 |
632 | def test_keepvars(db, io_loop):
633 |     # req_keepvars
634 |     spawner_kwargs = {
635 |         "req_keepvars": "ABCDE",
636 |     }
637 |     batch_script_re_list = [
638 |         re.compile(r"--export=ABCDE", re.X | re.M),
639 |     ]
640 |     run_typical_slurm_spawner(
641 |         db,
642 |         io_loop,
643 |         spawner_kwargs=spawner_kwargs,
644 |         batch_script_re_list=batch_script_re_list,
645 |     )
646 |
647 |     # req_keepvars AND req_keepvars_extra together
648 |     spawner_kwargs = {
649 |         "req_keepvars": "ABCDE",
650 |         "req_keepvars_extra": "XYZ",
651 |     }
652 |     batch_script_re_list = [
653 |         re.compile(r"--export=ABCDE,XYZ", re.X | re.M),
654 |     ]
655 |     run_typical_slurm_spawner(
656 |         db,
657 |         io_loop,
658 |         spawner_kwargs=spawner_kwargs,
659 |         batch_script_re_list=batch_script_re_list,
660 |     )
--------------------------------------------------------------------------------
/batchspawner/batchspawner.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Regents of the University of Minnesota
2 | # Copyright (c) Michael Gilbert
3 | # Distributed under the terms of the Modified BSD License.
4 |
5 | """Batch spawners
6 |
7 | This file contains an abstraction layer for batch job queueing systems, and implements
8 | Jupyterhub spawners for Torque, SLURM, and eventually others.
9 |
10 | Common attributes of batch submission / resource manager environments will include notions of:
11 |   * queue names, resource manager addresses
12 |   * resource limits including runtime, number of processes, memory
13 |   * singleuser child process running on (usually remote) host not known until runtime
14 |   * job submission and monitoring via resource manager utilities
15 |   * remote execution via submission of templated scripts
16 |   * job names instead of PIDs
17 | """
18 | import asyncio
19 | from async_generator import async_generator, yield_
20 | import pwd
21 | import os
22 | import re
23 |
24 | import xml.etree.ElementTree as ET
25 |
26 | from enum import Enum
27 |
28 | from jinja2 import Template
29 |
30 | from tornado import gen
31 |
32 | from jupyterhub.spawner import Spawner
33 | from traitlets import Integer, Unicode, Float, Dict, default
34 |
35 | from jupyterhub.spawner import set_user_setuid
36 |
37 |
38 | def format_template(template, *args, **kwargs):
39 |     """Format a template, either using jinja2 or str.format().
40 |
41 |     Use jinja2 if the template is a jinja2.Template, or contains '{{' or
42 |     '{%'. Otherwise, use str.format() for backwards compatibility with
43 |     old scripts (but you can't mix them).
44 |     """
45 |     if isinstance(template, Template):
46 |         return template.render(*args, **kwargs)
47 |     elif "{{" in template or "{%" in template:
48 |         return Template(template).render(*args, **kwargs)
49 |     return template.format(*args, **kwargs)
50 |
51 |
52 | class JobStatus(Enum):
53 |     NOTFOUND = 0
54 |     RUNNING = 1
55 |     PENDING = 2
56 |     UNKNOWN = 3
57 |
58 |
59 | class BatchSpawnerBase(Spawner):
60 |     """Base class for spawners using resource manager batch job submission mechanisms
61 |
62 |     This base class is developed targeting the TorqueSpawner and SlurmSpawner, so by default
63 |     assumes a qsub-like command that reads a script from its stdin for starting jobs,
64 |     a qstat-like command that outputs some data that can be parsed to check if the job is running
65 |     and on what remote node, and a qdel-like command to cancel a job. The goal is to be
66 |     sufficiently general that a broad range of systems can be supported with minimal overrides.
67 |
68 |     At minimum, subclasses should provide reasonable defaults for the traits:
69 |         batch_script
70 |         batch_submit_cmd
71 |         batch_query_cmd
72 |         batch_cancel_cmd
73 |
74 |     and must provide implementations for the methods:
75 |         state_ispending
76 |         state_isrunning
77 |         state_gethost
78 |     """
79 |
80 |     # override default since batch systems typically need longer
81 |     start_timeout = Integer(300).tag(config=True)
82 |
83 |     # override default server ip since batch jobs normally run remotely
84 |     ip = Unicode(
85 |         "0.0.0.0",
86 |         help="Address for singleuser server to listen at",
87 |     ).tag(config=True)
88 |
89 |     exec_prefix = Unicode(
90 |         "sudo -E -u {username}",
91 |         help="Standard execution prefix (e.g. 
the default sudo -E -u {username})", 92 | ).tag(config=True) 93 | 94 | # all these req_foo traits will be available as substvars for templated strings 95 | req_queue = Unicode( 96 | "", 97 | help="Queue name to submit job to resource manager", 98 | ).tag(config=True) 99 | 100 | req_host = Unicode( 101 | "", 102 | help="Host name of batch server to submit job to resource manager", 103 | ).tag(config=True) 104 | 105 | req_memory = Unicode( 106 | "", 107 | help="Memory to request from resource manager", 108 | ).tag(config=True) 109 | 110 | req_nprocs = Unicode( 111 | "", 112 | help="Number of processors to request from resource manager", 113 | ).tag(config=True) 114 | 115 | req_ngpus = Unicode( 116 | "", 117 | help="Number of GPUs to request from resource manager", 118 | ).tag(config=True) 119 | 120 | req_runtime = Unicode( 121 | "", 122 | help="Length of time for submitted job to run", 123 | ).tag(config=True) 124 | 125 | req_partition = Unicode( 126 | "", 127 | help="Partition name to submit job to resource manager", 128 | ).tag(config=True) 129 | 130 | req_account = Unicode( 131 | "", 132 | help="Account name string to pass to the resource manager", 133 | ).tag(config=True) 134 | 135 | req_options = Unicode( 136 | "", 137 | help="Other options to include into job submission script", 138 | ).tag(config=True) 139 | 140 | req_prologue = Unicode( 141 | "", 142 | help="Script to run before single user server starts.", 143 | ).tag(config=True) 144 | 145 | req_epilogue = Unicode( 146 | "", 147 | help="Script to run after single user server ends.", 148 | ).tag(config=True) 149 | 150 | req_username = Unicode() 151 | 152 | @default("req_username") 153 | def _req_username_default(self): 154 | return self.user.name 155 | 156 | # Useful IF getpwnam on submit host returns correct info for exec host 157 | req_homedir = Unicode() 158 | 159 | @default("req_homedir") 160 | def _req_homedir_default(self): 161 | return pwd.getpwnam(self.user.name).pw_dir 162 | 163 | req_keepvars = Unicode() 164 | 165 | @default("req_keepvars") 166 | def _req_keepvars_default(self): 167 | return ",".join(self.get_env().keys()) 168 | 169 | req_keepvars_extra = Unicode( 170 | help="Extra environment variables which should be configured, " 171 | "added to the defaults in keepvars, " 172 | "comma separated list.", 173 | ) 174 | 175 | batch_script = Unicode( 176 | "", 177 | help="Template for job submission script. Traits on this class named like req_xyz " 178 | "will be substituted in the template for {xyz} using string.Formatter. " 179 | "Must include {cmd} which will be replaced with the jupyterhub-singleuser command line.", 180 | ).tag(config=True) 181 | 182 | batchspawner_singleuser_cmd = Unicode( 183 | "batchspawner-singleuser", 184 | help="A wrapper which is capable of special batchspawner setup: currently sets the port on " 185 | "the remote host. 
Not needed to be set under normal circumstances, unless path needs " 186 | "specification.", 187 | ).tag(config=True) 188 | 189 | # Raw output of job submission command unless overridden 190 | job_id = Unicode() 191 | 192 | # Will get the raw output of the job status command unless overridden 193 | job_status = Unicode() 194 | 195 | # Prepare substitution variables for templates using req_xyz traits 196 | def get_req_subvars(self): 197 | reqlist = [t for t in self.trait_names() if t.startswith("req_")] 198 | subvars = {} 199 | for t in reqlist: 200 | subvars[t[4:]] = getattr(self, t) 201 | if subvars.get("keepvars_extra"): 202 | subvars["keepvars"] += "," + subvars["keepvars_extra"] 203 | return subvars 204 | 205 | batch_submit_cmd = Unicode( 206 | "", 207 | help="Command to run to submit batch scripts. Formatted using req_xyz traits as {xyz}.", 208 | ).tag(config=True) 209 | 210 | def parse_job_id(self, output): 211 | "Parse output of submit command to get job id." 212 | return output 213 | 214 | def cmd_formatted_for_batch(self): 215 | """The command which is substituted inside of the batch script""" 216 | return " ".join([self.batchspawner_singleuser_cmd] + self.cmd + self.get_args()) 217 | 218 | async def run_command(self, cmd, input=None, env=None): 219 | proc = await asyncio.create_subprocess_shell( 220 | cmd, 221 | env=env, 222 | stdin=asyncio.subprocess.PIPE, 223 | stdout=asyncio.subprocess.PIPE, 224 | stderr=asyncio.subprocess.PIPE, 225 | ) 226 | inbytes = None 227 | 228 | if input: 229 | inbytes = input.encode() 230 | 231 | try: 232 | out, eout = await proc.communicate(input=inbytes) 233 | except: 234 | self.log.debug("Exception raised when trying to run command: %s" % cmd) 235 | proc.kill() 236 | self.log.debug("Running command failed, killed process.") 237 | try: 238 | out, eout = await asyncio.wait_for(proc.communicate(), timeout=2) 239 | out = out.decode().strip() 240 | eout = eout.decode().strip() 241 | self.log.error("Subprocess returned exitcode %s" % proc.returncode) 242 | self.log.error("Stdout:") 243 | self.log.error(out) 244 | self.log.error("Stderr:") 245 | self.log.error(eout) 246 | raise RuntimeError( 247 | "{} exit status {}: {}".format(cmd, proc.returncode, eout) 248 | ) 249 | except asyncio.TimeoutError: 250 | self.log.error( 251 | "Encountered timeout trying to clean up command, process probably killed already: %s" 252 | % cmd 253 | ) 254 | return "" 255 | except: 256 | self.log.error( 257 | "Encountered exception trying to clean up command: %s" % cmd 258 | ) 259 | raise 260 | else: 261 | eout = eout.decode().strip() 262 | err = proc.returncode 263 | if err != 0: 264 | self.log.error("Subprocess returned exitcode %s" % err) 265 | self.log.error(eout) 266 | raise RuntimeError(eout) 267 | 268 | out = out.decode().strip() 269 | return out 270 | 271 | async def _get_batch_script(self, **subvars): 272 | """Format batch script from vars""" 273 | # Could be overridden by subclasses, but mainly useful for testing 274 | return format_template(self.batch_script, **subvars) 275 | 276 | async def submit_batch_script(self): 277 | subvars = self.get_req_subvars() 278 | # `cmd` is submitted to the batch system 279 | cmd = " ".join( 280 | ( 281 | format_template(self.exec_prefix, **subvars), 282 | format_template(self.batch_submit_cmd, **subvars), 283 | ) 284 | ) 285 | # `subvars['cmd']` is what is run _inside_ the batch script, 286 | # put into the template. 
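        # A worked example with hypothetical values: the default exec_prefix
        # "sudo -E -u {username}" plus a batch_submit_cmd of "sbatch" yields,
        # for user "alice", cmd = "sudo -E -u alice sbatch"; run_command()
        # then pipes the rendered batch script to that command's stdin.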
287 |         subvars["cmd"] = self.cmd_formatted_for_batch()
288 |         if hasattr(self, "user_options"):
289 |             subvars.update(self.user_options)
290 |         script = await self._get_batch_script(**subvars)
291 |         self.log.info("Spawner submitting job using " + cmd)
292 |         self.log.info("Spawner submitted script:\n" + script)
293 |         out = await self.run_command(cmd, input=script, env=self.get_env())
294 |         try:
295 |             self.log.info("Job submitted. cmd: " + cmd + " output: " + out)
296 |             self.job_id = self.parse_job_id(out)
297 |         except Exception:
298 |             self.log.error("Job submission failed; command output: " + out)
299 |             self.job_id = ""
300 |         return self.job_id
301 |
302 |     # Override if your batch system needs something more elaborate to query the job status
303 |     batch_query_cmd = Unicode(
304 |         "",
305 |         help="Command to run to query job status. Formatted using req_xyz traits as {xyz} "
306 |         "and self.job_id as {job_id}.",
307 |     ).tag(config=True)
308 |
309 |     async def query_job_status(self):
310 |         """Check job status, return JobStatus object."""
311 |         if self.job_id is None or len(self.job_id) == 0:
312 |             self.job_status = ""
313 |             return JobStatus.NOTFOUND
314 |         subvars = self.get_req_subvars()
315 |         subvars["job_id"] = self.job_id
316 |         cmd = " ".join(
317 |             (
318 |                 format_template(self.exec_prefix, **subvars),
319 |                 format_template(self.batch_query_cmd, **subvars),
320 |             )
321 |         )
322 |         self.log.debug("Spawner querying job: " + cmd)
323 |         try:
324 |             self.job_status = await self.run_command(cmd)
325 |         except RuntimeError as e:
326 |             # e.args[0] is stderr from the process
327 |             self.job_status = e.args[0]
328 |         except Exception as e:
329 |             self.log.error("Error querying job " + self.job_id + ": " + str(e))
330 |             self.job_status = ""
331 |
332 |         if self.state_isrunning():
333 |             return JobStatus.RUNNING
334 |         elif self.state_ispending():
335 |             return JobStatus.PENDING
336 |         elif self.state_isunknown():
337 |             return JobStatus.UNKNOWN
338 |         else:
339 |             return JobStatus.NOTFOUND
340 |
341 |     batch_cancel_cmd = Unicode(
342 |         "",
343 |         help="Command to stop/cancel a previously submitted job. Formatted like batch_query_cmd.",
344 |     ).tag(config=True)
345 |
346 |     async def cancel_batch_job(self):
347 |         subvars = self.get_req_subvars()
348 |         subvars["job_id"] = self.job_id
349 |         cmd = " ".join(
350 |             (
351 |                 format_template(self.exec_prefix, **subvars),
352 |                 format_template(self.batch_cancel_cmd, **subvars),
353 |             )
354 |         )
355 |         self.log.info("Cancelling job " + self.job_id + ": " + cmd)
356 |         await self.run_command(cmd)
357 |
358 |     def load_state(self, state):
359 |         """load job_id from state"""
360 |         super(BatchSpawnerBase, self).load_state(state)
361 |         self.job_id = state.get("job_id", "")
362 |         self.job_status = state.get("job_status", "")
363 |
364 |     def get_state(self):
365 |         """add job_id to state"""
366 |         state = super(BatchSpawnerBase, self).get_state()
367 |         if self.job_id:
368 |             state["job_id"] = self.job_id
369 |         if self.job_status:
370 |             state["job_status"] = self.job_status
371 |         return state
372 |
373 |     def clear_state(self):
374 |         """clear job_id state"""
375 |         super(BatchSpawnerBase, self).clear_state()
376 |         self.job_id = ""
377 |         self.job_status = ""
378 |
379 |     def make_preexec_fn(self, name):
380 |         """make preexec fn to change uid (if running as root) before job submission"""
381 |         return set_user_setuid(name)
382 |
383 |     def state_ispending(self):
384 |         "Return boolean indicating if job is still waiting to run, likely by parsing self.job_status"
385 |         raise NotImplementedError("Subclass must provide implementation")
386 |
387 |     def state_isrunning(self):
388 |         "Return boolean indicating if job is running, likely by parsing self.job_status"
389 |         raise NotImplementedError("Subclass must provide implementation")
390 |
391 |     def state_isunknown(self):
392 |         "Return boolean indicating if job state retrieval failed because of the resource manager"
393 |         return None
394 |
395 |     def state_gethost(self):
396 |         "Return string, hostname or addr of running job, likely by parsing self.job_status"
397 |         raise NotImplementedError("Subclass must provide implementation")
398 |
399 |     async def poll(self):
400 |         """Poll the process"""
401 |         status = await self.query_job_status()
402 |         if status in (JobStatus.PENDING, JobStatus.RUNNING, JobStatus.UNKNOWN):
403 |             return None
404 |         else:
405 |             self.clear_state()
406 |             return 1
407 |
408 |     startup_poll_interval = Float(
409 |         0.5,
410 |         help="Polling interval (seconds) to check job state during startup",
411 |     ).tag(config=True)
412 |
413 |     async def start(self):
414 |         """Start the process"""
415 |         self.ip = self.traits()["ip"].default_value
416 |         self.port = self.traits()["port"].default_value
417 |
418 |         if self.server:
419 |             self.server.port = self.port
420 |
421 |         job = await self.submit_batch_script()
422 |
423 |         # We are called with a timeout, and if the timeout expires this function will
424 |         # be interrupted at the next yield, and self.stop() will be called.
425 |         # So this function should not return unless successful, and if unsuccessful
426 |         # should either raise an Exception or loop forever.
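        # parse_job_id() resets self.job_id to "" when it cannot parse the
        # submission output (see submit_batch_script above), so an empty
        # job_id here means the submission itself failed.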
427 | if len(self.job_id) == 0: 428 | raise RuntimeError( 429 | "Jupyter batch job submission failure (no jobid in output)" 430 | ) 431 | while True: 432 | status = await self.query_job_status() 433 | if status == JobStatus.RUNNING: 434 | break 435 | elif status == JobStatus.PENDING: 436 | self.log.debug("Job " + self.job_id + " still pending") 437 | elif status == JobStatus.UNKNOWN: 438 | self.log.debug("Job " + self.job_id + " still unknown") 439 | else: 440 | self.log.warning( 441 | "Job " 442 | + self.job_id 443 | + " neither pending nor running.\n" 444 | + self.job_status 445 | ) 446 | self.clear_state() 447 | raise RuntimeError( 448 | "The Jupyter batch job has disappeared" 449 | " while pending in the queue or died immediately" 450 | " after starting." 451 | ) 452 | await gen.sleep(self.startup_poll_interval) 453 | 454 | self.ip = self.state_gethost() 455 | while self.port == 0: 456 | await gen.sleep(self.startup_poll_interval) 457 | # Test framework: For testing, mock_port is set because we 458 | # don't actually run the single-user server yet. 459 | if hasattr(self, "mock_port"): 460 | self.port = self.mock_port 461 | 462 | self.db.commit() 463 | self.log.info( 464 | "Notebook server job {0} started at {1}:{2}".format( 465 | self.job_id, self.ip, self.port 466 | ) 467 | ) 468 | 469 | return self.ip, self.port 470 | 471 | async def stop(self, now=False): 472 | """Stop the singleuser server job. 473 | 474 | Returns immediately after sending job cancellation command if now=True, otherwise 475 | tries to confirm that job is no longer running.""" 476 | 477 | self.log.info("Stopping server job " + self.job_id) 478 | await self.cancel_batch_job() 479 | if now: 480 | return 481 | for i in range(10): 482 | status = await self.query_job_status() 483 | if status not in (JobStatus.RUNNING, JobStatus.UNKNOWN): 484 | return 485 | await gen.sleep(1.0) 486 | if self.job_id: 487 | self.log.warning( 488 | "Notebook server job {0} at {1}:{2} possibly failed to terminate".format( 489 | self.job_id, self.ip, self.port 490 | ) 491 | ) 492 | 493 | @async_generator 494 | async def progress(self): 495 | while True: 496 | if self.state_ispending(): 497 | await yield_( 498 | { 499 | "message": "Pending in queue...", 500 | } 501 | ) 502 | elif self.state_isrunning(): 503 | await yield_( 504 | { 505 | "message": "Cluster job running... waiting to connect", 506 | } 507 | ) 508 | return 509 | else: 510 | await yield_( 511 | { 512 | "message": "Unknown status...", 513 | } 514 | ) 515 | await gen.sleep(1) 516 | 517 | 518 | class BatchSpawnerRegexStates(BatchSpawnerBase): 519 | """Subclass of BatchSpawnerBase that uses config-supplied regular expressions 520 | to interact with batch submission system state. Provides implementations of 521 | state_ispending 522 | state_isrunning 523 | state_gethost 524 | 525 | In their place, the user should supply the following configuration: 526 | state_pending_re - regex that matches job_status if job is waiting to run 527 | state_running_re - regex that matches job_status if job is running 528 | state_exechost_re - regex with at least one capture group that extracts 529 | execution host from job_status 530 | state_exechost_exp - if empty, notebook IP will be set to the contents of the 531 | first capture group. If this variable is set, the match object 532 | will be expanded using this string to obtain the notebook IP. 
533 |         See Python docs: re.match.expand
534 |     """
535 |
536 |     state_pending_re = Unicode(
537 |         "",
538 |         help="Regex that matches job_status if job is waiting to run",
539 |     ).tag(config=True)
540 |     state_running_re = Unicode(
541 |         "",
542 |         help="Regex that matches job_status if job is running",
543 |     ).tag(config=True)
544 |     state_exechost_re = Unicode(
545 |         "",
546 |         help="Regex with at least one capture group that extracts "
547 |         "the execution host from job_status output",
548 |     ).tag(config=True)
549 |     state_exechost_exp = Unicode(
550 |         "",
551 |         help="""If empty, notebook IP will be set to the contents of the first capture group.
552 |
553 |         If this variable is set, the match object will be expanded using this string
554 |         to obtain the notebook IP.
555 |         See Python docs: re.match.expand""",
556 |     ).tag(config=True)
557 |     state_unknown_re = Unicode(
558 |         "",
559 |         help="Regex that matches job_status if the resource manager is not answering. "
560 |         "Blank indicates not used.",
561 |     ).tag(config=True)
562 |
563 |     def state_ispending(self):
564 |         assert self.state_pending_re, "Misconfigured: define state_pending_re"
565 |         return self.job_status and re.search(self.state_pending_re, self.job_status)
566 |
567 |     def state_isrunning(self):
568 |         assert self.state_running_re, "Misconfigured: define state_running_re"
569 |         return self.job_status and re.search(self.state_running_re, self.job_status)
570 |
571 |     def state_isunknown(self):
572 |         # A blank regex means "not set"; in that case this always returns None.
573 |         if self.state_unknown_re:
574 |             return self.job_status and re.search(self.state_unknown_re, self.job_status)
575 |
576 |     def state_gethost(self):
577 |         assert self.state_exechost_re, "Misconfigured: define state_exechost_re"
578 |         match = re.search(self.state_exechost_re, self.job_status)
579 |         if not match:
580 |             self.log.error(
581 |                 "Spawner unable to match host addr in job status: " + self.job_status
582 |             )
583 |             return
584 |         if not self.state_exechost_exp:
585 |             return match.groups()[0]
586 |         else:
587 |             return match.expand(self.state_exechost_exp)
588 |
589 |
590 | class TorqueSpawner(BatchSpawnerRegexStates):
591 |     batch_script = Unicode(
592 |         """#!/bin/sh
593 | #PBS -q {queue}@{host}
594 | #PBS -l walltime={runtime}
595 | #PBS -l nodes=1:ppn={nprocs}
596 | #PBS -l mem={memory}
597 | #PBS -N jupyterhub-singleuser
598 | #PBS -v {keepvars}
599 | #PBS {options}
600 |
601 | set -eu
602 |
603 | {prologue}
604 | {cmd}
605 | {epilogue}
606 | """
607 |     ).tag(config=True)
608 |
609 |     # outputs job id string
610 |     batch_submit_cmd = Unicode("qsub").tag(config=True)
611 |     # outputs job data XML string
612 |     batch_query_cmd = Unicode("qstat -x {job_id}").tag(config=True)
613 |     batch_cancel_cmd = Unicode("qdel {job_id}").tag(config=True)
614 |     # search XML string for job_state - [QH] = pending, R = running, [CE] = done
615 |     state_pending_re = Unicode(r"[QH]").tag(config=True)
616 |     state_running_re = Unicode(r"R").tag(config=True)
617 |     state_exechost_re = Unicode(r"((?:[\w_-]+\.?)+)/\d+").tag(config=True)
618 |
619 |
620 | class MoabSpawner(TorqueSpawner):
621 |     # outputs job id string
622 |     batch_submit_cmd = Unicode("msub").tag(config=True)
623 |     # outputs job data XML string
624 |     batch_query_cmd = Unicode("mdiag -j {job_id} --xml").tag(config=True)
625 |     batch_cancel_cmd = Unicode("mjobctl -c {job_id}").tag(config=True)
626 |     state_pending_re = Unicode(r'State="Idle"').tag(config=True)
627 |     state_running_re = Unicode(r'State="Running"').tag(config=True)
628 |     state_exechost_re 
= Unicode(r'AllocNodeList="([^\r\n\t\f :"]*)').tag(config=True) 629 | 630 | 631 | class PBSSpawner(TorqueSpawner): 632 | batch_script = Unicode( 633 | """#!/bin/sh 634 | {% if queue or host %}#PBS -q {% if queue %}{{queue}}{% endif %}\ 635 | {% if host %}@{{host}}{% endif %}{% endif %} 636 | #PBS -l walltime={{runtime}} 637 | #PBS -l select=1:ncpus={{nprocs}}:mem={{memory}} 638 | #PBS -N jupyterhub-singleuser 639 | #PBS -o {{homedir}}/.jupyterhub.pbs.out 640 | #PBS -e {{homedir}}/.jupyterhub.pbs.err 641 | #PBS -v {{keepvars}} 642 | {% if options %}#PBS {{options}}{% endif %} 643 | 644 | set -eu 645 | 646 | {{prologue}} 647 | {{cmd}} 648 | {{epilogue}} 649 | """ 650 | ).tag(config=True) 651 | 652 | # outputs job data XML string 653 | batch_query_cmd = Unicode("qstat -fx {job_id}").tag(config=True) 654 | 655 | state_pending_re = Unicode(r"job_state = [QH]").tag(config=True) 656 | state_running_re = Unicode(r"job_state = R").tag(config=True) 657 | #state_exechost_re = Unicode(r"exec_host = ([\w_-]+)/").tag(config=True) 658 | state_exechost_re = Unicode(r"exec_host = ([\w_-]+)").tag(config=True) 659 | 660 | 661 | class UserEnvMixin: 662 | """Mixin class that computes values for USER, SHELL and HOME in the environment passed to 663 | the job submission subprocess in case the batch system needs these for the batch script. 664 | """ 665 | 666 | def user_env(self, env): 667 | """get user environment""" 668 | env["USER"] = self.user.name 669 | home = pwd.getpwnam(self.user.name).pw_dir 670 | shell = pwd.getpwnam(self.user.name).pw_shell 671 | if home: 672 | env["HOME"] = home 673 | if shell: 674 | env["SHELL"] = shell 675 | return env 676 | 677 | def get_env(self): 678 | """Get user environment variables to be passed to the user's job 679 | 680 | Everything here should be passed to the user's job as 681 | environment. Caution: If these variables are used for 682 | authentication to the batch system commands as an admin, be 683 | aware that the user will receive access to these as well. 
684 | """ 685 | env = super().get_env() 686 | env = self.user_env(env) 687 | return env 688 | 689 | 690 | class SlurmSpawner(UserEnvMixin, BatchSpawnerRegexStates): 691 | batch_script = Unicode( 692 | """#!/bin/bash 693 | #SBATCH --output={{homedir}}/jupyterhub_slurmspawner_%j.log 694 | #SBATCH --job-name=spawner-jupyterhub 695 | #SBATCH --chdir={{homedir}} 696 | #SBATCH --export={{keepvars}} 697 | #SBATCH --get-user-env=L 698 | {% if partition %}#SBATCH --partition={{partition}} 699 | {% endif %}{% if runtime %}#SBATCH --time={{runtime}} 700 | {% endif %}{% if memory %}#SBATCH --mem={{memory}} 701 | {% endif %}{% if gres %}#SBATCH --gres={{gres}} 702 | {% endif %}{% if nprocs %}#SBATCH --cpus-per-task={{nprocs}} 703 | {% endif %}{% if reservation%}#SBATCH --reservation={{reservation}} 704 | {% endif %}{% if options %}#SBATCH {{options}}{% endif %} 705 | 706 | set -euo pipefail 707 | 708 | trap 'echo SIGTERM received' TERM 709 | {{prologue}} 710 | which jupyterhub-singleuser 711 | {% if srun %}{{srun}} {% endif %}{{cmd}} 712 | echo "jupyterhub-singleuser ended gracefully" 713 | {{epilogue}} 714 | """ 715 | ).tag(config=True) 716 | 717 | # all these req_foo traits will be available as substvars for templated strings 718 | req_cluster = Unicode( 719 | "", 720 | help="Cluster name to submit job to resource manager", 721 | ).tag(config=True) 722 | 723 | req_qos = Unicode( 724 | "", 725 | help="QoS name to submit job to resource manager", 726 | ).tag(config=True) 727 | 728 | req_srun = Unicode( 729 | "srun", 730 | help="Set req_srun='' to disable running in job step, and note that " 731 | "this affects environment handling. This is effectively a " 732 | "prefix for the singleuser command.", 733 | ).tag(config=True) 734 | 735 | req_reservation = Unicode( 736 | "", 737 | help="Reservation name to submit to resource manager", 738 | ).tag(config=True) 739 | 740 | req_gres = Unicode( 741 | "", 742 | help="Additional resources (e.g. GPUs) requested", 743 | ).tag(config=True) 744 | 745 | # outputs line like "Submitted batch job 209" 746 | batch_submit_cmd = Unicode("sbatch --parsable").tag(config=True) 747 | # outputs status and exec node like "RUNNING hostname" 748 | batch_query_cmd = Unicode("squeue -h -j {job_id} -o '%T %B'").tag(config=True) 749 | batch_cancel_cmd = Unicode("scancel {job_id}").tag(config=True) 750 | # use long-form states: PENDING, CONFIGURING = pending 751 | # RUNNING, COMPLETING = running 752 | state_pending_re = Unicode(r"^(?:PENDING|CONFIGURING)").tag(config=True) 753 | state_running_re = Unicode(r"^(?:RUNNING|COMPLETING)").tag(config=True) 754 | state_unknown_re = Unicode( 755 | r"^slurm_load_jobs error: (?:Socket timed out on send/recv|Unable to contact slurm controller)" 756 | ).tag(config=True) 757 | state_exechost_re = Unicode(r"\s+((?:[\w_-]+\.?)+)$").tag(config=True) 758 | 759 | def parse_job_id(self, output): 760 | # make sure jobid is really a number 761 | try: 762 | # use only last line to circumvent slurm bug 763 | output = output.splitlines()[-1] 764 | id = output.split(";")[0] 765 | int(id) 766 | except Exception as e: 767 | self.log.error("SlurmSpawner unable to parse job ID from text: " + output) 768 | raise e 769 | return id 770 | 771 | 772 | class MultiSlurmSpawner(SlurmSpawner): 773 | """When slurm has been compiled with --enable-multiple-slurmd, the 774 | administrator sets the name of the slurmd instance via the slurmd -N 775 | option. This node name is usually different from the hostname and may 776 | not be resolvable by JupyterHub. 
Here we enable the administrator to 777 | map the node names onto the real hostnames via a traitlet.""" 778 | 779 | daemon_resolver = Dict( 780 | {}, 781 | help="Map node names to hostnames", 782 | ).tag(config=True) 783 | 784 | def state_gethost(self): 785 | host = SlurmSpawner.state_gethost(self) 786 | return self.daemon_resolver.get(host, host) 787 | 788 | 789 | class GridengineSpawner(BatchSpawnerBase): 790 | batch_script = Unicode( 791 | """#!/bin/bash 792 | #$ -j yes 793 | #$ -N spawner-jupyterhub 794 | #$ -o {homedir}/.jupyterhub.sge.out 795 | #$ -e {homedir}/.jupyterhub.sge.err 796 | #$ -v {keepvars} 797 | #$ {options} 798 | 799 | set -euo pipefail 800 | 801 | {prologue} 802 | {cmd} 803 | {epilogue} 804 | """ 805 | ).tag(config=True) 806 | 807 | # outputs job id string 808 | batch_submit_cmd = Unicode("qsub").tag(config=True) 809 | # outputs job data XML string 810 | batch_query_cmd = Unicode("qstat -xml").tag(config=True) 811 | batch_cancel_cmd = Unicode("qdel {job_id}").tag(config=True) 812 | 813 | def parse_job_id(self, output): 814 | return output.split(" ")[2] 815 | 816 | def state_ispending(self): 817 | if self.job_status: 818 | job_info = ET.fromstring(self.job_status).find( 819 | ".//job_list[JB_job_number='{0}']".format(self.job_id) 820 | ) 821 | if job_info is not None: 822 | return job_info.attrib.get("state") == "pending" 823 | return False 824 | 825 | def state_isrunning(self): 826 | if self.job_status: 827 | job_info = ET.fromstring(self.job_status).find( 828 | ".//job_list[JB_job_number='{0}']".format(self.job_id) 829 | ) 830 | if job_info is not None: 831 | return job_info.attrib.get("state") == "running" 832 | return False 833 | 834 | def state_gethost(self): 835 | if self.job_status: 836 | queue_name = ET.fromstring(self.job_status).find( 837 | ".//job_list[JB_job_number='{0}']/queue_name".format(self.job_id) 838 | ) 839 | if queue_name is not None and queue_name.text: 840 | return queue_name.text.split("@")[1] 841 | 842 | self.log.error( 843 | "Spawner unable to match host addr in job {0} with status {1}".format( 844 | self.job_id, self.job_status 845 | ) 846 | ) 847 | return 848 | 849 | def get_env(self): 850 | env = super().get_env() 851 | 852 | # SGE relies on environment variables to launch local jobs. Ensure that these values are included 853 | # in the environment used to run the spawner. 
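        # Each variable is copied only if it is present in the Hub's
        # environment and not already set, so configured values are
        # never overridden.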
854 |         for key in [
855 |             "SGE_CELL",
856 |             "SGE_EXECD",
857 |             "SGE_ROOT",
858 |             "SGE_CLUSTER_NAME",
859 |             "SGE_QMASTER_PORT",
860 |             "SGE_EXECD_PORT",
861 |             "PATH",
862 |         ]:
863 |             if key in os.environ and key not in env:
864 |                 env[key] = os.environ[key]
865 |         return env
866 |
867 |
868 | class CondorSpawner(UserEnvMixin, BatchSpawnerRegexStates):
869 |     batch_script = Unicode(
870 |         """
871 | Executable = /bin/sh
872 | RequestMemory = {memory}
873 | RequestCpus = {nprocs}
874 | Arguments = \"-c 'exec {cmd}'\"
875 | Remote_Initialdir = {homedir}
876 | Output = {homedir}/.jupyterhub.condor.out
877 | Error = {homedir}/.jupyterhub.condor.err
878 | ShouldTransferFiles = False
879 | GetEnv = True
880 | {options}
881 | Queue
882 | """
883 |     ).tag(config=True)
884 |
885 |     # outputs job id string
886 |     batch_submit_cmd = Unicode("condor_submit").tag(config=True)
887 |     # outputs job data XML string
888 |     batch_query_cmd = Unicode(
889 |         'condor_q {job_id} -format "%s, " JobStatus -format "%s" RemoteHost -format "\n" True'
890 |     ).tag(config=True)
891 |     batch_cancel_cmd = Unicode("condor_rm {job_id}").tag(config=True)
892 |     # job status: 1 = pending, 2 = running
893 |     state_pending_re = Unicode(r"^1,").tag(config=True)
894 |     state_running_re = Unicode(r"^2,").tag(config=True)
895 |     state_exechost_re = Unicode(r"^\w*, .*@([^ ]*)").tag(config=True)
896 |
897 |     def parse_job_id(self, output):
898 |         match = re.search(r".*submitted to cluster ([0-9]+)", output)
899 |         if match:
900 |             return match.groups()[0]
901 |
902 |         error_msg = "CondorSpawner unable to parse jobID from text: " + output
903 |         self.log.error(error_msg)
904 |         raise Exception(error_msg)
905 |
906 |     def cmd_formatted_for_batch(self):
907 |         return (
908 |             super(CondorSpawner, self)
909 |             .cmd_formatted_for_batch()
910 |             .replace('"', '""')
911 |             .replace("'", "''")
912 |         )
913 |
914 |
915 | class LsfSpawner(BatchSpawnerBase):
916 |     """A Spawner that uses IBM's Platform Load Sharing Facility (LSF) to launch notebooks."""
917 |
918 |     batch_script = Unicode(
919 |         """#!/bin/sh
920 | #BSUB -R "select[type==any]"  # Allow spawning on non-uniform hardware
921 | #BSUB -R "span[hosts=1]"  # Only spawn job on one server
922 | #BSUB -q {queue}
923 | #BSUB -J spawner-jupyterhub
924 | #BSUB -o {homedir}/.jupyterhub.lsf.out
925 | #BSUB -e {homedir}/.jupyterhub.lsf.err
926 |
927 | set -eu
928 |
929 | {prologue}
930 | {cmd}
931 | {epilogue}
932 | """
933 |     ).tag(config=True)
934 |
935 |     batch_submit_cmd = Unicode("bsub").tag(config=True)
936 |     batch_query_cmd = Unicode('bjobs -a -noheader -o "STAT EXEC_HOST" {job_id}').tag(
937 |         config=True
938 |     )
939 |     batch_cancel_cmd = Unicode("bkill {job_id}").tag(config=True)
940 |
941 |     def get_env(self):
942 |         env = super().get_env()
943 |
944 |         # LSF relies on environment variables to launch local jobs. Ensure that these values are included
945 |         # in the environment used to run the spawner.
946 |         for key in [
947 |             "LSF_ENVDIR",
948 |             "LSF_SERVERDIR",
949 |             "LSF_FULL_VERSION",
950 |             "LSF_LIBDIR",
951 |             "LSF_BINDIR",
952 |         ]:
953 |             if key in os.environ and key not in env:
954 |                 env[key] = os.environ[key]
955 |         return env
956 |
957 |     def parse_job_id(self, output):
958 |         # Assumes output in the following form:
959 |         # "Job <1815> is submitted to default queue <normal>."
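        # Token [1] is "<1815>"; strip("<>") leaves the bare job id "1815".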
960 |         return output.split(" ")[1].strip("<>")
961 |
962 |     def state_ispending(self):
963 |         # Parse the STAT column of batch_query_cmd's output;
964 |         # PSUSP covers jobs suspended while pending.
965 |         if self.job_status:
966 |             return self.job_status.split(" ")[0].upper() in {"PEND", "PSUSP"}
967 |
968 |     def state_isrunning(self):
969 |         if self.job_status:
970 |             return self.job_status.split(" ")[0].upper() == "RUN"
971 |
972 |     def state_gethost(self):
973 |         if self.job_status:
974 |             return self.job_status.split(" ")[1].strip().split(":")[0]
975 |
976 |         self.log.error(
977 |             "Spawner unable to match host addr in job {0} with status {1}".format(
978 |                 self.job_id, self.job_status
979 |             )
980 |         )
981 |         return
982 |
983 |
984 | # vim: set ai expandtab softtabstop=4:
985 |
--------------------------------------------------------------------------------
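A minimal sketch of wiring one of these spawners into a Hub, assuming Slurm
is available; the req_* values below are illustrative assumptions, not
package defaults:

# jupyterhub_config.py
import batchspawner  # importing registers the batchspawner API handler

c.JupyterHub.spawner_class = "batchspawner.SlurmSpawner"
c.SlurmSpawner.req_partition = "interactive"  # assumed partition name
c.SlurmSpawner.req_runtime = "8:00:00"        # assumed walltime limit
c.SlurmSpawner.req_memory = "4gb"             # assumed memory request
c.SlurmSpawner.req_nprocs = "2"               # assumed CPU count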