├── batchspawner
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   └── test_spawners.py
│   ├── __init__.py
│   ├── singleuser.py
│   ├── api.py
│   └── batchspawner.py
├── requirements.txt
├── MANIFEST.in
├── .gitignore
├── CONTRIBUTING.md
├── version.py
├── .flake8
├── .github
│   └── workflows
│       ├── python-publish.yml
│       └── test.yml
├── LICENSE
├── .pre-commit-config.yaml
├── SPAWNERS.md
├── setup.py
├── CHANGELOG.md
└── README.md

/batchspawner/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | async_generator>=1.8
2 | jinja2
3 | jupyterhub>=0.9
--------------------------------------------------------------------------------
/batchspawner/__init__.py:
--------------------------------------------------------------------------------
1 | from .batchspawner import *
2 | from . import api
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md
2 | include LICENSE
3 | include version.py
4 | include requirements.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | *.log
3 | *.pyc
4 | __pycache__/
5 | .cache/
6 | .coverage
7 | .pytest_cache
8 | *~
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Welcome! As a [Jupyter](https://jupyter.org) project, we follow the [Jupyter contributor guide](https://jupyter.readthedocs.io/en/latest/contributing/content-contributor.html).
--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Jupyter Development Team.
2 | # Distributed under the terms of the Modified BSD License.
3 |
4 | version_info = (
5 |     1,
6 |     2,
7 |     0,
8 |     # "dev",  # comment-out this line for a release
9 | )
10 | __version__ = ".".join(map(str, version_info))
--------------------------------------------------------------------------------
/batchspawner/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Relevant pytest fixtures are re-used from JupyterHub's test suite"""
2 |
3 | # We only use "db" and "io_loop", but we also need event_loop which is used by
4 | # io_loop to be available with jupyterhub 1+.
5 | from jupyterhub.tests.conftest import db, io_loop
6 |
7 | try:
8 |     from jupyterhub.tests.conftest import event_loop
9 | except ImportError:
10 |     pass
11 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # Ignore style and complexity
3 | # E: style errors
4 | # W: style warnings
5 | # C: complexity
6 | # F401: module imported but unused
7 | # F403: import *
8 | # F811: redefinition of unused `name` from line `N`
9 | # F841: local variable assigned but never used
10 | # E402: module level import not at top of file
11 | # I100: Import statements are in the wrong order
12 | # I101: Imported names are in the wrong order. Should be
13 | ignore = E, W, C, F401, F403, F811, F841, E402, I100, I101, D400
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | #
4 | name: Upload Python Package
5 |
6 | on:
7 |   release:
8 |     types: [released]
9 |
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |
14 |     steps:
15 |       - uses: actions/checkout@v3
16 |       - uses: actions/setup-python@v3
17 |         with:
18 |           python-version: "3.x"
19 |
20 |       - name: install build package
21 |         run: |
22 |           pip install --upgrade pip
23 |           pip install build
24 |           pip freeze
25 |
26 |       - name: build release
27 |         run: |
28 |           python -m build --sdist --wheel .
29 |           ls -l dist
30 |           sha256sum dist/* | tee SHA256SUMS
31 |
32 |       - name: Publish to PyPI
33 |         env:
34 |           TWINE_USERNAME: __token__
35 |           TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
36 |         run: |
37 |           pip install twine
38 |           twine upload --skip-existing dist/*
--------------------------------------------------------------------------------
/batchspawner/singleuser.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from runpy import run_path
5 | from shutil import which
6 |
7 | from jupyterhub.utils import random_port, url_path_join
8 | from jupyterhub.services.auth import HubAuth
9 |
10 | import requests
11 |
12 |
13 | def main(argv=None):
14 |     port = random_port()
15 |     hub_auth = HubAuth()
16 |     hub_auth.client_ca = os.environ.get("JUPYTERHUB_SSL_CLIENT_CA", "")
17 |     hub_auth.certfile = os.environ.get("JUPYTERHUB_SSL_CERTFILE", "")
18 |     hub_auth.keyfile = os.environ.get("JUPYTERHUB_SSL_KEYFILE", "")
19 |
20 |     url = url_path_join(hub_auth.api_url, "batchspawner")
21 |     headers = {"Authorization": f"token {hub_auth.api_token}"}
22 |
23 |     # internal_ssl kwargs
24 |     kwargs = {}
25 |     if hub_auth.certfile and hub_auth.keyfile:
26 |         kwargs["cert"] = (hub_auth.certfile, hub_auth.keyfile)
27 |     if hub_auth.client_ca:
28 |         kwargs["verify"] = hub_auth.client_ca
29 |
30 |     r = requests.post(
31 |         url,
32 |         headers=headers,
33 |         json={"port": port},
34 |         **kwargs,
35 |     )
36 |
37 |     cmd_path = which(sys.argv[1])
38 |     sys.argv = sys.argv[1:] + ["--port={}".format(port)]
39 |     run_path(cmd_path, run_name="__main__")
40 |
41 |
42 | if __name__ == "__main__":
43 |     main()
--------------------------------------------------------------------------------
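Note: `singleuser.py` (above) and `api.py` (below) are the two halves of batchspawner's port hand-off. The `batchspawner-singleuser` wrapper runs inside the batch job in place of a bare `jupyterhub-singleuser`, picks a random port, reports it to the hub's `/batchspawner` API endpoint (handled by `api.py`), and then executes the real single-user command. A minimal sketch of just the report step, assuming a running hub and the standard `JUPYTERHUB_API_URL` / `JUPYTERHUB_API_TOKEN` environment variables that JupyterHub sets for single-user servers (the port value here is a made-up example):

```python
# Sketch of the POST that batchspawner-singleuser performs (no internal SSL).
import os
import requests

api_url = os.environ["JUPYTERHUB_API_URL"]  # e.g. http://127.0.0.1:8081/hub/api
token = os.environ["JUPYTERHUB_API_TOKEN"]

r = requests.post(
    api_url.rstrip("/") + "/batchspawner",
    headers={"Authorization": f"token {token}"},
    json={"port": 54321},  # hypothetical port; the real wrapper uses random_port()
)
r.raise_for_status()  # expect 201: "BatchSpawner data configured"
```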
/batchspawner/api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from tornado import web
3 | from jupyterhub.apihandlers import APIHandler, default_handlers
4 | from batchspawner import BatchSpawnerBase
5 |
6 |
7 | class BatchSpawnerAPIHandler(APIHandler):
8 |     @web.authenticated
9 |     def post(self):
10 |         """POST set user spawner data"""
11 |         if hasattr(self, "current_user"):
12 |             # JupyterHub compatibility (September 2018, d79a99323ef1d)
13 |             user = self.current_user
14 |         else:
15 |             # Previous JupyterHub, 0.9.4 and before.
16 |             user = self.get_current_user()
17 |         token = self.get_auth_token()
18 |         spawner = None
19 |         for s in user.spawners.values():
20 |             if s.api_token == token:
21 |                 spawner = s
22 |         # Fix for when the matched spawner is not a batchspawner (e.g. a
23 |         # wrapper spawner): descend through child_spawner until we reach one.
24 |         while not issubclass(spawner.__class__, BatchSpawnerBase):
25 |             if not hasattr(spawner, "child_spawner"):
26 |                 break
27 |             spawner = spawner.child_spawner
28 |         data = self.get_json_body()
29 |         for key, value in data.items():
30 |             if hasattr(spawner, key):
31 |                 setattr(spawner, key, value)
32 |         self.set_status(201)
33 |         self.finish(json.dumps({"message": "BatchSpawner data configured"}))
34 |
35 | default_handlers.append((r"/api/batchspawner", BatchSpawnerAPIHandler))
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2017, Project Jupyter Contributors
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # pre-commit is a tool to perform a predefined set of tasks manually and/or
2 | # automatically before git commits are made.
3 | #
4 | # Config reference: https://pre-commit.com/#pre-commit-configyaml---top-level
5 | #
6 | # Common tasks
7 | #
8 | # - Run on all files: pre-commit run --all-files
9 | # - Register git hooks: pre-commit install --install-hooks
10 | #
11 | repos:
12 |   # Autoformat: Python code
13 |   - repo: https://github.com/psf/black
14 |     rev: "23.9.1"
15 |     hooks:
16 |       - id: black
17 |         args:
18 |           - --target-version=py36
19 |           - --target-version=py37
20 |           - --target-version=py38
21 |           - --target-version=py39
22 |           - --target-version=py310
23 |           - --target-version=py311
24 |
25 |   # Autoformat: markdown, yaml
26 |   - repo: https://github.com/pre-commit/mirrors-prettier
27 |     rev: v3.0.3
28 |     hooks:
29 |       - id: prettier
30 |
31 |   # Lint: Python code
32 |   - repo: https://github.com/PyCQA/flake8
33 |     rev: "6.1.0"
34 |     hooks:
35 |       - id: flake8
36 |
37 |   # Misc...
38 |   - repo: https://github.com/pre-commit/pre-commit-hooks
39 |     rev: v4.4.0
40 |     # ref: https://github.com/pre-commit/pre-commit-hooks#hooks-available
41 |     hooks:
42 |       # Autoformat: Makes sure files end in a newline and only a newline.
43 |       - id: end-of-file-fixer
44 |
45 |       # Autoformat: Sorts entries in requirements.txt.
46 |       - id: requirements-txt-fixer
47 |
48 |       # Prevent giant (500kB) files from being committed.
49 |       - id: check-added-large-files
50 |
51 |       # Lint: Check for files with names that would conflict on a
52 |       # case-insensitive filesystem like MacOS HFS+ or Windows FAT.
53 |       - id: check-case-conflict
54 |
55 |       # Lint: Checks that non-binary executables have a proper shebang.
56 |       - id: check-executables-have-shebangs
57 |
58 | # pre-commit.ci config reference: https://pre-commit.ci/#configuration
59 | ci:
60 |   autoupdate_schedule: monthly
--------------------------------------------------------------------------------
/SPAWNERS.md:
--------------------------------------------------------------------------------
1 | # Notes on specific spawners
2 |
3 | **Spawner maintainers**: Included below are "spawner maintainers",
4 | when available. There aren't official obligations, but the general
5 | idea is that you should watch the repository and feel especially
6 | empowered to comment on issues when you think it might be relevant to
7 | you (obviously everyone should, but this is our attempt at even
8 | more outreach). You should let us know when we break something and
9 | provide a diversity of opinions in general. Submitting PRs and
10 | testing is nice but not required.
11 |
12 | To be listed as a maintainer, just submit an issue or PR adding yourself,
13 | and please watch the repository on GitHub.
14 |
15 | ## `TorqueSpawner`
16 |
17 | Maintainers:
18 |
19 | ## `MoabSpawner`
20 |
21 | Subclass of TorqueSpawner
22 |
23 | Maintainers:
24 |
25 | ## `SlurmSpawner`
26 |
27 | Maintainers: @rkdarst
28 |
29 | This spawner enforces the environment if `srun` is used to wrap the
30 | spawner command, which is the default. If you _do_ want the user
31 | environment to be used, set `req_srun=''`. However, this is not
32 | perfect: there is still a bash shell started as the user which could run
33 | arbitrary startup code, define shell aliases for `srun`, etc.
34 |
35 | Use of `srun` is required for the job to terminate gracefully.
36 |
37 | ## `GridengineSpawner`
38 |
39 | Maintainers:
40 |
41 | ## `CondorSpawner`
42 |
43 | Maintainers:
44 |
45 | ## `LsfSpawner`
46 |
47 | Maintainers:
48 |
49 | # Checklist for making spawners
50 |
51 | Please document each of these things under the spawner list above;
52 | even if it is "OK", we need to track the status of all spawners. If it is
53 | a bug, users really need to know.
54 |
55 | - Does your spawner read the shell environment before starting? (See
56 |   [Jupyterhub
57 |   Security](https://jupyterhub.readthedocs.io/en/stable/reference/websecurity.html).)
58 |
59 | - Does your spawner send SIGTERM to the jupyterhub-singleuser process
60 |   before SIGKILL? It should, so that the process can terminate
61 |   gracefully. Add `echo "terminated gracefully"` to the end of the
62 |   batch script - if you see this in your singleuser server output, you
63 |   know that you DO receive SIGTERM and terminate gracefully. If your
64 |   batch system cannot automatically send SIGTERM before SIGKILL, PR
65 |   #75 might help here; ask for it to be finished.
66 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # Copyright (c) Jupyter Development Team.
5 | # Distributed under the terms of the Modified BSD License.
6 |
7 | # -----------------------------------------------------------------------------
8 | # Minimal Python version sanity check (from IPython/Jupyterhub)
9 | # -----------------------------------------------------------------------------
10 |
11 | from __future__ import print_function
12 |
13 | import os
14 | import sys
15 |
16 | from setuptools import setup
17 | from glob import glob
18 |
19 | pjoin = os.path.join
20 | here = os.path.abspath(os.path.dirname(__file__))
21 |
22 | # Get the current package version.
23 | version_ns = {}
24 | with open(pjoin(here, "version.py")) as f:
25 |     exec(f.read(), {}, version_ns)
26 |
27 | with open(pjoin(here, "README.md"), encoding="utf-8") as f:
28 |     long_desc = f.read()
29 |
30 | setup_args = dict(
31 |     name="batchspawner",
32 |     entry_points={
33 |         "console_scripts": ["batchspawner-singleuser=batchspawner.singleuser:main"],
34 |     },
35 |     packages=["batchspawner"],
36 |     version=version_ns["__version__"],
37 |     description="""Batchspawner: A spawner for Jupyterhub to spawn notebooks using batch resource managers.""",
38 |     long_description=long_desc,
39 |     long_description_content_type="text/markdown",
40 |     author="Michael Milligan, Andrea Zonca, Mike Gilbert",
41 |     author_email="milligan@umn.edu",
42 |     url="http://jupyter.org",
43 |     license="BSD",
44 |     platforms="Linux, Mac OS X",
45 |     python_requires="~=3.5",
46 |     keywords=["Interactive", "Interpreter", "Shell", "Web", "Jupyter"],
47 |     classifiers=[
48 |         "Intended Audience :: Developers",
49 |         "Intended Audience :: System Administrators",
50 |         "Intended Audience :: Science/Research",
51 |         "License :: OSI Approved :: BSD License",
52 |         "Programming Language :: Python",
53 |         "Programming Language :: Python :: 3",
54 |     ],
55 |     project_urls={
56 |         "Bug Reports": "https://github.com/jupyterhub/batchspawner/issues",
57 |         "Source": "https://github.com/jupyterhub/batchspawner/",
58 |         "About Jupyterhub": "http://jupyterhub.readthedocs.io/en/latest/",
59 |         "Jupyter Project": "http://jupyter.org",
60 |     },
61 | )
62 |
63 | # setuptools requirements
64 | if "setuptools" in sys.modules:
65 |     setup_args["install_requires"] = install_requires = []
66 |     with open("requirements.txt") as f:
67 |         for line in f.readlines():
68 |             req = line.strip()
69 |             if not req or req.startswith(("-e", "#")):
70 |                 continue
71 |             install_requires.append(req)
72 |
73 |
74 | def main():
75 |     setup(**setup_args)
76 |
77 |
78 | if __name__ == "__main__":
79 |     main()
80 |
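Note: setup.py registers `batchspawner-singleuser` as a console-script entry point, so the wrapper must end up on `$PATH` wherever the single-user command runs. A quick, hypothetical post-install sanity check (not part of the repo):

```python
# Assumes batchspawner has been installed into the active environment.
from shutil import which

assert which("batchspawner-singleuser") is not None, "entry point not on PATH"
```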
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | # This is a GitHub workflow defining a set of jobs with a set of steps.
2 | # ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions
3 | #
4 | name: Test
5 |
6 | on:
7 |   pull_request:
8 |     paths-ignore:
9 |       - "**.md"
10 |       - "**.yml"
11 |       - "**.yaml"
12 |       - "!.github/workflows/test.yml"
13 |   push:
14 |     paths-ignore:
15 |       - "**.md"
16 |       - "**.yml"
17 |       - "**.yaml"
18 |       - "!.github/workflows/test.yml"
19 |     branches-ignore:
20 |       - "dependabot/**"
21 |       - "pre-commit-ci-update-config"
22 |     tags: ["**"]
23 |   workflow_dispatch:
24 |
25 | jobs:
26 |   pytest:
27 |     name: "Run pytest"
28 |     runs-on: ubuntu-20.04
29 |     continue-on-error: ${{ matrix.allow_failure }}
30 |     strategy:
31 |       # Keep running even if one variation of the job fails
32 |       fail-fast: false
33 |       matrix:
34 |         python-version:
35 |           - "3.6"
36 |           - "3.10"
37 |         JHUB_VER:
38 |           - "1.0.0"
39 |           - "1.5.1"
40 |           - "2.3.1"
41 |         allow_failure: [false]
42 |
43 |         exclude:
44 |           # JupyterHub 1.3.0 requires python 3.6+
45 |           - JHUB_VER: "1.3.0"
46 |             python-version: "3.5"
47 |           # JupyterHub 0.9.6 used a deprecated sqlalchemy feature removed in the py3.9 environment
48 |           - JHUB_VER: "0.9.6"
49 |             python-version: "3.9"
50 |         include:
51 |           - JHUB_VER: "main"
52 |             python-version: "3.9"
53 |             allow_failure: true
54 |           - JHUB_VER: "3.0.0"
55 |             python-version: "3.9"
56 |             allow_failure: true
57 |
58 |     steps:
59 |       - uses: actions/checkout@v3
60 |       - name: Set up Python ${{ matrix.python-version }}
61 |         uses: actions/setup-python@v3
62 |         with:
63 |           python-version: "${{ matrix.python-version }}"
64 |
65 |       - name: Install dependencies
66 |         run: |
67 |           python -m pip install --upgrade pip
68 |           python -m pip install pytest
69 |           pip install -r requirements.txt
70 |           pip list
71 |
72 |       - name: Install nodejs dependencies
73 |         run: |
74 |           sudo npm install -g configurable-http-proxy
75 |
76 |       # We need to check compatibility with different versions of the JH API,
77 |       # including latest development. For that, we also need to pull in the
78 |       # development dependencies of that old JH version (but we don't need
79 |       # conda/npm for our tests).
80 |       - name: install JupyterHub
81 |         run: |
82 |           git clone --quiet --branch ${{ matrix.JHUB_VER }} https://github.com/jupyterhub/jupyterhub.git ./jupyterhub
83 |           pip install -r ./jupyterhub/dev-requirements.txt
84 |           pip install ./jupyterhub
85 |
86 |       - name: pytest
87 |         run: |
88 |           pytest --verbose --color=yes --last-failed --cov batchspawner batchspawner/tests
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## unreleased changes
4 |
5 | Added (user)
6 |
7 | Added (developer)
8 |
9 | Changed
10 |
11 | Fixed
12 |
13 | ## v1.2
14 |
15 | Changed
16 |
17 | - PR #237: Replace use of scripts with entry_points
18 | - PR #208 #238 #239 #240 #241: updates to CI - bumping versions and aligning with Jupyterhub standards
19 | - PR #220: remove code supporting Jupyterhub earlier than 0.9
20 |
21 | Fixed
22 |
23 | - PR #229: LSF jobs with multiple slots display each hostname ':' separated
24 |
25 | ## v1.1
26 |
27 | Added (user)
28 |
29 | - PR #170: SlurmSpawner: add `req_gres` to specify `--gres`.
30 | - PR #137: GridEngineSpawner: spawner will now add the following system environment values to the spawner environment, in accordance with the Univa Admin Guide: `SGE_CELL`, `SGE_EXECD`, `SGE_ROOT`, `SGE_CLUSTER_NAME`, `SGE_QMASTER_PORT`, `SGE_EXECD_PORT`, `PATH`
31 |
32 | Added (developer)
33 |
34 | - PR #187: support for unknown job state
35 |
36 | Changed
37 |
38 | - PR #177: Fail on first error in batch script by adding `set -e` to script templates.
39 | - PR #165: SlurmSpawner: Update template to use `--chdir` instead of `--workdir`. Users of Slurm older than 17.11 may need to revert this locally.
40 | - PR #189: remove bashism from default script template
41 | - PR #195: fix exception handling in run_command
42 | - PR #198: change from Travis to gh-actions for testing
43 | - PR #196: documentation
44 | - PR #199: update setup.py
45 |
46 | ## v1.0 (requires minimum JupyterHub 0.9 and Python 3.5)
47 |
48 | Added (user)
49 |
50 | - Add support for JupyterHub named servers. #167
51 | - Add Jinja2 templating as an option for all scripts and commands. If `{{` or `{%` is used anywhere in the string, it is used as a jinja2 template.
52 | - Add a new option `exec_prefix`, which defaults to `sudo -E -u {username}`. This replaces explicit `sudo` in every batch command - changes in local commands may be needed.
53 | - New option: `req_keepvars_extra`, which allows keeping extra variables in addition to what is defined by JupyterHub itself (addition of variables to keep instead of replacement). #99
54 | - Add `req_prologue` and `req_epilogue` options to scripts which are inserted before/after the main jupyterhub-singleuser command, which allow for generic setup/cleanup without overriding the entire script. #96
55 | - SlurmSpawner: add the `req_reservation` option. #91
56 | - Add basic support for JupyterHub progress updates, but this is not used much yet. #86
57 |
58 | Added (developer)
59 |
60 | - Add many more tests.
61 | - Add a new page `SPAWNERS.md` with information on specific spawners. Begin trying to collect a list of spawner-specific contacts. #97
62 | - Rename `current_ip` and `current_port` commands to `ip` and `port`. No user impact. #139
63 | - Update to Python 3.5 `async` / `await` syntax to support JupyterHub progress updates. #90
64 |
65 | Changed
66 |
67 | - PR #58 and #141 changes the logic of port selection, so that it is selected _after_ the singleuser server starts. This means that the port number has to be conveyed back to JupyterHub. This requires the following changes:
68 |   - `jupyterhub_config.py` _must_ explicitly import `batchspawner`
69 |   - Add a new option `batchspawner_singleuser_cmd`, used as a wrapper in the single-user servers to convey the remote port back to JupyterHub. This is now an integral part of the spawn process.
70 |   - If you have installed with `pip install -e`, you will have to re-install so that the new script `batchspawner-singleuser` is added to `$PATH`.
71 | - Update minimum requirements to JupyterHub 0.9 and Python 3.5. #143
72 | - Update Slurm batch script. Now, the single-user notebook is run in a job step, with a wrapper of `srun`. This may need to be removed using `req_srun=''` if you don't want environment variables limited.
73 | - Pass the environment dictionary to the queue and cancel commands as well. This is mostly the user environment, but may be useful to these commands in some cases. #108, #111. If these environment variables were used for authentication as an admin, be aware that there are pre-existing security issues because they may be passed to the user via the batch submit command; see #82.
74 |
75 | Fixed
76 |
77 | - Improve debugging on failed submission by raising errors including error messages from the commands. #106
78 | - Many other non-user or developer visible changes. #107 #106 #100
79 | - In Travis CI, blacklist jsonschema=3.0.0a1 because it breaks tests
80 |
81 | Removed
82 |
83 | ## v0.8.1 (bugfix release)
84 |
85 | - Fix regression: single-user server binding address is overwritten by previous session server address, resulting in failure to start. Issue #76
86 |
87 | ## v0.8.0 (compatible with JupyterHub 0.5.0 through 0.8.1/0.9dev)
88 |
89 | - SlurmSpawner: Remove `--uid` for (at least) Slurm 17.11 compatibility. If you use `sudo`, this should not be necessary, but because this is security related you should check that user management is as you expect. If your configuration does not use `sudo` then you may need to add the `--uid` option in a custom `batch_script`.
90 | - add base options `req_ngpus` `req_partition` `req_account` and `req_options`
91 | - Fix up logging
92 | - Merge `user_options` with the template substitution vars instead of having it as a separate key
93 | - Update ip/port handling for JupyterHub 0.8
94 | - Add `LICENSE` (BSD3) and `CONTRIBUTING.md`
95 | - Add `LsfSpawner` for IBM LSF
96 | - Add `MultiSlurmSpawner`
97 | - Add `MoabSpawner`
98 | - Add `CondorSpawner`
99 | - Add `GridEngineSpawner`
100 | - SlurmSpawner: add `req_qos` option
101 | - WrapSpawner and ProfilesSpawner, which provide mechanisms for runtime configuration of spawners, have been split out and moved to the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner) package
102 | - Enable CI testing via Travis-CI
103 |
104 | ## v0.3 (tag: jhub-0.3, compatible with JupyterHub 0.3.0)
105 |
106 | - initial release containing `TorqueSpawner` and `SlurmSpawner`
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # batchspawner for Jupyterhub
2 |
3 | [![GitHub Workflow Status - Test](https://img.shields.io/github/workflow/status/jupyterhub/batchspawner/Test?logo=github&label=tests)](https://github.com/jupyterhub/batchspawner/actions)
4 | [![Latest PyPI version](https://img.shields.io/pypi/v/batchspawner?logo=pypi&logoColor=white)](https://pypi.python.org/pypi/batchspawner)
5 | [![GitHub](https://img.shields.io/badge/issue_tracking-github-blue?logo=github)](https://github.com/jupyterhub/batchspawner/issues)
6 | [![Discourse](https://img.shields.io/badge/help_forum-discourse-blue?logo=discourse)](https://discourse.jupyter.org/c/jupyterhub)
7 | [![Gitter](https://img.shields.io/badge/social_chat-gitter-blue?logo=gitter)](https://gitter.im/jupyterhub/jupyterhub)
8 | [![Contribute](https://img.shields.io/badge/I_want_to_contribute!-grey?logo=jupyter)](https://github.com/jupyterhub/batchspawner/blob/master/CONTRIBUTING.md)
9 |
10 | This is a custom spawner for [Jupyterhub](https://jupyterhub.readthedocs.io/) that is designed for installations on clusters using batch scheduling software.
11 |
12 | This began as a generalization of [mkgilbert's batchspawner](https://github.com/mkgilbert/slurmspawner) which in turn was inspired by [Andrea Zonca's blog post](http://zonca.github.io/2015/04/jupyterhub-hpc.html "Run jupyterhub on a Supercomputer") where he explains his implementation for a spawner that uses SSH and Torque. His github repo is found [here](http://www.github.com/zonca/remotespawner "RemoteSpawner").
13 |
14 | This package formerly included WrapSpawner and ProfilesSpawner, which provide mechanisms for runtime configuration of spawners. These have been split out and moved to the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner) package.
15 |
16 | ## Installation
17 |
18 | 1. From the root directory of this repo (where setup.py is), run `pip install -e .`
19 |
20 |    If you don't actually need an editable version, you can simply run
21 |    `pip install batchspawner`
22 |
23 | 2. Add lines in `jupyterhub_config.py` for the spawner you intend to use, e.g.
24 |
25 |    ```python
26 |    c = get_config()
27 |    c.JupyterHub.spawner_class = 'batchspawner.TorqueSpawner'
28 |    import batchspawner    # Even though not used, needed to register batchspawner interface
29 |    ```
30 |
31 | 3. Depending on the spawner, additional configuration will likely be needed.
32 |
33 | ## Batch Spawners
34 |
35 | For information on the specific spawners, see [SPAWNERS.md](SPAWNERS.md).
36 |
37 | ### Overview
38 |
39 | This file contains an abstraction layer for batch job queueing systems (`BatchSpawnerBase`), and implements
40 | Jupyterhub spawners for Torque, Moab, SLURM, SGE, HTCondor, LSF, and eventually others.
41 | Common attributes of batch submission / resource manager environments will include notions of:
42 |
43 | - queue names, resource manager addresses
44 | - resource limits including runtime, number of processes, memory
45 | - singleuser child process running on (usually remote) host not known until runtime
46 | - job submission and monitoring via resource manager utilities
47 | - remote execution via submission of templated scripts
48 | - job names instead of PIDs
49 |
50 | `BatchSpawnerBase` provides several general mechanisms:
51 |
52 | - configurable traits `req_foo` that are exposed as `{foo}` in job template scripts. Templates (submit scripts in particular) may also use the full power of [jinja2](http://jinja.pocoo.org/). Templates are automatically detected if a `{{` or `{%` is present; otherwise `str.format()` is used.
53 | - configurable command templates for submitting/querying/cancelling jobs
54 | - a generic concept of job-ID and ID-based job state tracking
55 | - overrideable hooks for subclasses to plug in logic at numerous points
56 |
57 | ### Example
58 |
59 | Every effort has been made to accommodate highly diverse systems through configuration
60 | only. This example consists of the (lightly edited) configuration used by the author
61 | to run Jupyter notebooks on an academic supercomputer cluster.
62 |
63 | ```python
64 | # Select the Torque backend and increase the timeout since batch jobs may take time to start
65 | import batchspawner
66 | c.JupyterHub.spawner_class = 'batchspawner.TorqueSpawner'
67 | c.Spawner.http_timeout = 120
68 |
69 | #------------------------------------------------------------------------------
70 | # BatchSpawnerBase configuration
71 | # These are simply setting parameters used in the job script template below
72 | #------------------------------------------------------------------------------
73 | c.BatchSpawnerBase.req_nprocs = '2'
74 | c.BatchSpawnerBase.req_queue = 'mesabi'
75 | c.BatchSpawnerBase.req_host = 'mesabi.xyz.edu'
76 | c.BatchSpawnerBase.req_runtime = '12:00:00'
77 | c.BatchSpawnerBase.req_memory = '4gb'
78 | #------------------------------------------------------------------------------
79 | # TorqueSpawner configuration
80 | # The script below is nearly identical to the default template, but we needed
81 | # to add a line for our local environment. For most sites the default templates
82 | # should be a good starting point.
83 | #------------------------------------------------------------------------------
84 | c.TorqueSpawner.batch_script = '''#!/bin/sh
85 | #PBS -q {queue}@{host}
86 | #PBS -l walltime={runtime}
87 | #PBS -l nodes=1:ppn={nprocs}
88 | #PBS -l mem={memory}
89 | #PBS -N jupyterhub-singleuser
90 | #PBS -v {keepvars}
91 | module load python3
92 | {cmd}
93 | '''
94 | # For our site we need to munge the execution hostname returned by qstat
95 | c.TorqueSpawner.state_exechost_exp = r'int-\1.mesabi.xyz.edu'
96 | ```
97 |
98 | ### Security
99 |
100 | Unless otherwise stated for a specific spawner, assume that spawners
101 | _do_ evaluate shell environment for users and thus the [security
102 | requirements of JupyterHub security for untrusted
103 | users](https://jupyterhub.readthedocs.io/en/stable/reference/websecurity.html)
104 | are not fulfilled because some (most?) spawners _do_ start a user
105 | shell which will execute arbitrary user environment configuration
106 | (`.profile`, `.bashrc` and the like) unless users do not have
107 | access to their own cluster user account. This is something which we
108 | are working on.
109 |
110 | ## Provide different configurations of BatchSpawner
111 |
112 | ### Overview
113 |
114 | `ProfilesSpawner`, available as part of the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner)
115 | package, allows the Jupyterhub administrator to define a set of different spawning configurations,
116 | both different spawners and different configurations of the same spawner.
117 | The user is then presented with a dropdown menu for choosing the most suitable configuration for their needs.
118 |
119 | This method provides an easy and safe way to provide different configurations of `BatchSpawner` to the
120 | users; see an example below.
121 |
122 | ### Example
123 |
124 | The following is based on the author's configuration (at the same site as the example above)
125 | showing how to give users access to multiple job configurations on the batch scheduled
126 | clusters, as well as an option to run a local notebook directly on the jupyterhub server.
127 |
128 | ```python
129 | # Same initial setup as the previous example
130 | import batchspawner
131 | c.JupyterHub.spawner_class = 'wrapspawner.ProfilesSpawner'
132 | c.Spawner.http_timeout = 120
133 | #------------------------------------------------------------------------------
134 | # BatchSpawnerBase configuration
135 | # Providing default values that we may omit in the profiles
136 | #------------------------------------------------------------------------------
137 | c.BatchSpawnerBase.req_host = 'mesabi.xyz.edu'
138 | c.BatchSpawnerBase.req_runtime = '12:00:00'
139 | c.TorqueSpawner.state_exechost_exp = r'in-\1.mesabi.xyz.edu'
140 | #------------------------------------------------------------------------------
141 | # ProfilesSpawner configuration
142 | #------------------------------------------------------------------------------
143 | # List of profiles to offer for selection. Signature is:
144 | # List(Tuple( Unicode, Unicode, Type(Spawner), Dict ))
145 | # corresponding to profile display name, unique key, Spawner class,
146 | # dictionary of spawner config options.
147 | #
148 | # The first three values will be exposed in the input_template as {display},
149 | # {key}, and {type}
150 | #
151 | c.ProfilesSpawner.profiles = [
152 |     ( "Local server", 'local', 'jupyterhub.spawner.LocalProcessSpawner', {'ip':'0.0.0.0'} ),
153 |     ('Mesabi - 2 cores, 4 GB, 8 hours', 'mesabi2c4g12h', 'batchspawner.TorqueSpawner',
154 |         dict(req_nprocs='2', req_queue='mesabi', req_runtime='8:00:00', req_memory='4gb')),
155 |     ('Mesabi - 12 cores, 128 GB, 4 hours', 'mesabi128gb', 'batchspawner.TorqueSpawner',
156 |         dict(req_nprocs='12', req_queue='ram256g', req_runtime='4:00:00', req_memory='125gb')),
157 |     ('Mesabi - 2 cores, 4 GB, 24 hours', 'mesabi2c4gb24h', 'batchspawner.TorqueSpawner',
158 |         dict(req_nprocs='2', req_queue='mesabi', req_runtime='24:00:00', req_memory='4gb')),
159 |     ('Interactive Cluster - 2 cores, 4 GB, 8 hours', 'lab', 'batchspawner.TorqueSpawner',
160 |         dict(req_nprocs='2', req_host='labhost.xyz.edu', req_queue='lab',
161 |             req_runtime='8:00:00', req_memory='4gb', state_exechost_exp='')),
162 |     ]
163 | c.ProfilesSpawner.ip = '0.0.0.0'
164 | ```
165 |
166 | ## Debugging batchspawner
167 |
168 | Sometimes it can be hard to debug batchspawner, but it's not really that
169 | hard once you know how the pieces interact. Check the following places for
170 | error messages:
171 |
172 | - Check the JupyterHub logs for errors.
173 |
174 | - Check the JupyterHub logs for the batch script that got submitted
175 |   and the command used to submit it. Are these correct? (Note that
176 |   there are submission environment variables too, which aren't
177 |   displayed.)
178 |
179 | - At this point, it's a matter of checking the batch system. Is the
180 |   job ever scheduled? Does it run? Does it succeed? Check the batch
181 |   system status and output of the job. The most common failure
182 |   patterns are a) the job never starting due to bad scheduler options, or
183 |   b) the job waiting in the queue beyond the `start_timeout`, causing
184 |   JupyterHub to kill the job.
185 |
186 | - At this point the job starts. Does it fail immediately, or before
187 |   Jupyter starts? Check the scheduler output files (stdout/stderr of
188 |   the job), wherever they are stored. To debug the job script, you can
189 |   add debugging into the batch script, such as an `env` or `set -x`.
190 |
191 | - At this point Jupyter itself starts - check its error messages. Is
192 |   it starting with the right options? Can it communicate with the
193 |   hub? At this point there usually isn't anything
194 |   batchspawner-specific, with the one exception below. The error log
195 |   would be in the batch script output (same file as above). There may
196 |   also be clues in the JupyterHub logfile.
197 | - Are you running on an NFS filesystem? It's possible for Jupyter to
198 |   experience issues due to varying implementations of the fcntl() system
199 |   call. (See also [Jupyterhub-Notes and Tips: SQLite](https://jupyterhub.readthedocs.io/en/latest/reference/database.html?highlight=NFS#sqlite))
200 |
201 | Common problems:
202 |
203 | - Did you `import batchspawner` in the `jupyterhub_config.py` file?
204 |   This is needed in order to activate the batchspawner API in
205 |   JupyterHub.
206 |
207 | ## Changelog
208 |
209 | See [CHANGELOG.md](CHANGELOG.md).
--------------------------------------------------------------------------------
/batchspawner/tests/test_spawners.py:
--------------------------------------------------------------------------------
1 | """Test BatchSpawner and subclasses"""
2 |
3 | import re
4 | from unittest import mock
5 | from .. import BatchSpawnerRegexStates, JobStatus
6 | from traitlets import Unicode
7 | import time
8 | import pytest
9 | from jupyterhub import orm
10 | from tornado import gen
11 |
12 | try:
13 |     from jupyterhub.objects import Hub, Server
14 |     from jupyterhub.user import User
15 | except ImportError:
16 |     pass
17 |
18 | testhost = "userhost123"
19 | testjob = "12345"
20 | testport = 54321
21 |
22 |
23 | class BatchDummy(BatchSpawnerRegexStates):
24 |     exec_prefix = ""
25 |     batch_submit_cmd = Unicode("cat > /dev/null; echo " + testjob)
26 |     batch_query_cmd = Unicode("echo RUN " + testhost)
27 |     batch_cancel_cmd = Unicode("echo STOP")
28 |     batch_script = Unicode("{cmd}")
29 |     state_pending_re = Unicode("PEND")
30 |     state_running_re = Unicode("RUN")
31 |     state_exechost_re = Unicode("RUN (.*)$")
32 |     state_unknown_re = Unicode("UNKNOWN")
33 |
34 |     cmd_expectlist = None
35 |     out_expectlist = None
36 |
37 |     def run_command(self, *args, **kwargs):
38 |         """Overwritten run_command to test templating and outputs"""
39 |         cmd = args[0]
40 |         # Test that the command matches the expectations
41 |         if self.cmd_expectlist:
42 |             run_re = self.cmd_expectlist.pop(0)
43 |             if run_re:
44 |                 print("run:", run_re)
45 |                 assert (
46 |                     run_re.search(cmd) is not None
47 |                 ), "Failed test: re={0} cmd={1}".format(run_re, cmd)
48 |         # Run command normally
49 |         out = super().run_command(*args, **kwargs)
50 |         # Test that the command matches the expectations
51 |         if self.out_expectlist:
52 |             out_re = self.out_expectlist.pop(0)
53 |             if out_re:
54 |                 print("out:", out_re)
55 |                 assert (
56 |                     out_re.search(cmd) is not None
57 |                 ), "Failed output: re={0} cmd={1} out={2}".format(out_re, cmd, out)
58 |         return out
59 |
60 |
61 | def new_spawner(db, spawner_class=BatchDummy, **kwargs):
62 |     kwargs.setdefault("cmd", ["singleuser_command"])
63 |     user = db.query(orm.User).first()
64 |     hub = Hub()
65 |     user = User(user, {})
66 |     server = Server()
67 |     # Set it after construction because it isn't a traitlet.
68 |     kwargs.setdefault("hub", hub)
69 |     kwargs.setdefault("user", user)
70 |     kwargs.setdefault("poll_interval", 1)
71 |
72 |     # These are not traitlets so we have to set them here
73 |     spawner = user._new_spawner("", spawner_class=spawner_class, **kwargs)
74 |     spawner.server = server
75 |     spawner.mock_port = testport
76 |     return spawner
77 |
78 |
79 | @pytest.mark.slow
80 | def test_stress_submit(db, io_loop):
81 |     for i in range(200):
82 |         time.sleep(0.01)
83 |         test_spawner_start_stop_poll(db, io_loop)
84 |
85 |
86 | def check_ip(spawner, value):
87 |     assert spawner.ip == value
88 |
89 |
90 | def test_spawner_start_stop_poll(db, io_loop):
91 |     spawner = new_spawner(db=db)
92 |
93 |     status = io_loop.run_sync(spawner.poll, timeout=5)
94 |     assert status == 1
95 |     assert spawner.job_id == ""
96 |     assert spawner.get_state() == {}
97 |
98 |     io_loop.run_sync(spawner.start, timeout=5)
99 |     check_ip(spawner, testhost)
100 |     assert spawner.job_id == testjob
101 |
102 |     status = io_loop.run_sync(spawner.poll, timeout=5)
103 |     assert status is None
104 |     spawner.batch_query_cmd = "echo NOPE"
105 |     io_loop.run_sync(spawner.stop, timeout=5)
106 |     status = io_loop.run_sync(spawner.poll, timeout=5)
107 |     assert status == 1
108 |     assert spawner.get_state() == {}
109 |
110 |
111 | def test_spawner_state_reload(db, io_loop):
112 |     spawner = new_spawner(db=db)
113 |     assert spawner.get_state() == {}
114 |
115 |     io_loop.run_sync(spawner.start, timeout=30)
116 |     check_ip(spawner, testhost)
117 |     assert spawner.job_id == testjob
118 |
119 |     state = spawner.get_state()
120 |     assert state == dict(job_id=testjob, job_status="RUN " + testhost)
121 |     spawner = new_spawner(db=db)
122 |     spawner.clear_state()
123 |     assert spawner.get_state() == {}
124 |     spawner.load_state(state)
125 |     # We used to check IP here, but that is actually only computed on start(),
126 |     # and is not part of the spawner's persistent state
127 |     assert spawner.job_id == testjob
128 |
129 |
130 | def test_submit_failure(db, io_loop):
131 |     spawner = new_spawner(db=db)
132 |     assert spawner.get_state() == {}
133 |     spawner.batch_submit_cmd = "cat > /dev/null; true"
134 |     with pytest.raises(RuntimeError) as e_info:
135 |         io_loop.run_sync(spawner.start, timeout=30)
136 |     assert spawner.job_id == ""
137 |     assert spawner.job_status == ""
138 |
139 |
140 | def test_submit_pending_fails(db, io_loop):
141 |     """Submission works, but the batch query command immediately fails"""
142 |     spawner = new_spawner(db=db)
143 |     assert spawner.get_state() == {}
144 |     spawner.batch_query_cmd = "echo xyz"
145 |     with pytest.raises(RuntimeError) as e_info:
146 |         io_loop.run_sync(spawner.start, timeout=30)
147 |     status = io_loop.run_sync(spawner.query_job_status, timeout=30)
148 |     assert status == JobStatus.NOTFOUND
149 |     assert spawner.job_id == ""
150 |     assert spawner.job_status == ""
151 |
152 |
153 | def test_poll_fails(db, io_loop):
154 |     """Submission works, but a later .poll() fails"""
155 |     spawner = new_spawner(db=db)
156 |     assert spawner.get_state() == {}
157 |     # The start is successful:
158 |     io_loop.run_sync(spawner.start, timeout=30)
159 |     spawner.batch_query_cmd = "echo xyz"
160 |     # Now, the poll fails:
161 |     io_loop.run_sync(spawner.poll, timeout=30)
162 |     # .poll() will run self.clear_state() if it's not found:
163 |     assert spawner.job_id == ""
164 |     assert spawner.job_status == ""
165 |
166 |
167 | def test_unknown_status(db, io_loop):
168 |     """Polling returns an unknown status"""
169 |     spawner = new_spawner(db=db)
170 |     assert spawner.get_state() == {}
171 |     # The start is successful:
172 |     io_loop.run_sync(spawner.start, timeout=30)
173 |     spawner.batch_query_cmd = "echo UNKNOWN"
174 |     # This poll should not fail:
175 |     io_loop.run_sync(spawner.poll, timeout=30)
176 |     status = io_loop.run_sync(spawner.query_job_status, timeout=30)
177 |     assert status == JobStatus.UNKNOWN
178 |     assert spawner.job_id == "12345"
179 |     assert spawner.job_status != ""
180 |
181 |
182 | def test_templates(db, io_loop):
183 |     """Test templates in the run_command commands"""
184 |     spawner = new_spawner(db=db)
185 |
186 |     # Test when not running
187 |     spawner.cmd_expectlist = [
188 |         re.compile(".*RUN"),
189 |     ]
190 |     status = io_loop.run_sync(spawner.poll, timeout=5)
191 |     assert status == 1
192 |     assert spawner.job_id == ""
193 |     assert spawner.get_state() == {}
194 |
195 |     # Test starting
196 |     spawner.cmd_expectlist = [
197 |         re.compile(".*echo"),
198 |         re.compile(".*RUN"),
199 |     ]
200 |     io_loop.run_sync(spawner.start, timeout=5)
201 |     check_ip(spawner, testhost)
202 |     assert spawner.job_id == testjob
203 |
204 |     # Test poll - running
205 |     spawner.cmd_expectlist = [
206 |         re.compile(".*RUN"),
207 |     ]
208 |     status = io_loop.run_sync(spawner.poll, timeout=5)
209 |     assert status is None
210 |
211 |     # Test stopping
212 |     spawner.batch_query_cmd = "echo NOPE"
213 |     spawner.cmd_expectlist = [
214 |         re.compile(".*STOP"),
215 |         re.compile(".*NOPE"),
216 |     ]
217 |     io_loop.run_sync(spawner.stop, timeout=5)
218 |     status = io_loop.run_sync(spawner.poll, timeout=5)
219 |     assert status == 1
220 |     assert spawner.get_state() == {}
221 |
222 |
223 | def test_batch_script(db, io_loop):
224 |     """Test that the batch script substitutes {cmd}"""
225 |
226 |     class BatchDummyTestScript(BatchDummy):
227 |         @gen.coroutine
228 |         def _get_batch_script(self, **subvars):
229 |             script = yield super()._get_batch_script(**subvars)
230 |             assert "singleuser_command" in script
231 |             return script
232 |
233 |     spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript)
234 |     # status = io_loop.run_sync(spawner.poll, timeout=5)
235 |     io_loop.run_sync(spawner.start, timeout=5)
236 |     # status = io_loop.run_sync(spawner.poll, timeout=5)
237 |     # io_loop.run_sync(spawner.stop, timeout=5)
238 |
239 |
240 | def test_exec_prefix(db, io_loop):
241 |     """Test that all run_commands have exec_prefix"""
242 |
243 |     class BatchDummyTestScript(BatchDummy):
244 |         exec_prefix = "PREFIX"
245 |
246 |         @gen.coroutine
247 |         def run_command(self, cmd, *args, **kwargs):
248 |             assert cmd.startswith("PREFIX ")
249 |             cmd = cmd[7:]
250 |             print(cmd)
251 |             out = yield super().run_command(cmd, *args, **kwargs)
252 |             return out
253 |
254 |     spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript)
255 |     # Not running
256 |     status = io_loop.run_sync(spawner.poll, timeout=5)
257 |     assert status == 1
258 |     # Start
259 |     io_loop.run_sync(spawner.start, timeout=5)
260 |     assert spawner.job_id == testjob
261 |     # Poll
262 |     status = io_loop.run_sync(spawner.poll, timeout=5)
263 |     assert status is None
264 |     # Stop
265 |     spawner.batch_query_cmd = "echo NOPE"
266 |     io_loop.run_sync(spawner.stop, timeout=5)
267 |     status = io_loop.run_sync(spawner.poll, timeout=5)
268 |     assert status == 1
269 |
270 |
271 | def run_spawner_script(
272 |     db, io_loop, spawner, script, batch_script_re_list=None, spawner_kwargs={}
273 | ):
274 |     """Run a spawner script and test that the output and behavior is as expected.
275 |
276 |     db: same as in this module
277 |     io_loop: same as in this module
278 |     spawner: the BatchSpawnerBase subclass to test
279 |     script: list of (input_re_to_match, output)
280 |     batch_script_re_list: if given, assert batch script matches all of these
281 |     """
282 |     # Create the expected scripts
283 |     cmd_expectlist, out_list = zip(*script)
284 |     cmd_expectlist = list(cmd_expectlist)
285 |     out_list = list(out_list)
286 |
287 |     class BatchDummyTestScript(spawner):
288 |         @gen.coroutine
289 |         def run_command(self, cmd, input=None, env=None):
290 |             # Test the input
291 |             run_re = cmd_expectlist.pop(0)
292 |             if run_re:
293 |                 print('run: "{}" [{}]'.format(cmd, run_re))
294 |                 assert (
295 |                     run_re.search(cmd) is not None
296 |                 ), "Failed test: re={0} cmd={1}".format(run_re, cmd)
297 |             # Test the stdin - will only be the batch script. For
298 |             # each regular expression in batch_script_re_list, assert that
299 |             # each re in that list matches the batch script.
300 |             if batch_script_re_list and input:
301 |                 batch_script = input
302 |                 for match_re in batch_script_re_list:
303 |                     assert (
304 |                         match_re.search(batch_script) is not None
305 |                     ), "Batch script does not match {}".format(match_re)
306 |             # Return expected output.
307 |             out = out_list.pop(0)
308 |             print(" --> " + out)
309 |             return out
310 |
311 |     spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript, **spawner_kwargs)
312 |     # Not running at beginning (no command run)
313 |     status = io_loop.run_sync(spawner.poll, timeout=5)
314 |     assert status == 1
315 |     # batch_submit_cmd
316 |     # batch_query_cmd (result=pending)
317 |     # batch_query_cmd (result=running)
318 |     io_loop.run_sync(spawner.start, timeout=5)
319 |     assert spawner.job_id == testjob
320 |     check_ip(spawner, testhost)
321 |     # batch_query_cmd
322 |     status = io_loop.run_sync(spawner.poll, timeout=5)
323 |     assert status is None
324 |     # batch_cancel_cmd
325 |     io_loop.run_sync(spawner.stop, timeout=5)
326 |     # batch_query_cmd
327 |     status = io_loop.run_sync(spawner.poll, timeout=5)
328 |     assert status == 1
329 |
330 |
331 | def test_torque(db, io_loop):
332 |     spawner_kwargs = {
333 |         "req_nprocs": "5",
334 |         "req_memory": "5678",
335 |         "req_options": "some_option_asdf",
336 |         "req_prologue": "PROLOGUE",
337 |         "req_epilogue": "EPILOGUE",
338 |     }
339 |     batch_script_re_list = [
340 |         re.compile(
341 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
342 |             re.S | re.M,
343 |         ),
344 |         re.compile(r"mem=5678"),
345 |         re.compile(r"ppn=5"),
346 |         re.compile(r"^#PBS some_option_asdf", re.M),
347 |     ]
348 |     script = [
349 |         (re.compile(r"sudo.*qsub"), str(testjob)),
350 |         (
351 |             re.compile(r"sudo.*qstat"),
352 |             "Q",
353 |         ),  # pending
354 |         (
355 |             re.compile(r"sudo.*qstat"),
356 |             "R{}/1".format(testhost),
357 |         ),  # running
358 |         (
359 |             re.compile(r"sudo.*qstat"),
360 |             "R{}/1".format(testhost),
361 |         ),  # running
362 |         (re.compile(r"sudo.*qdel"), "STOP"),
363 |         (re.compile(r"sudo.*qstat"), ""),
364 |     ]
365 |     from .. import TorqueSpawner
366 |
367 |     run_spawner_script(
368 |         db,
369 |         io_loop,
370 |         TorqueSpawner,
371 |         script,
372 |         batch_script_re_list=batch_script_re_list,
373 |         spawner_kwargs=spawner_kwargs,
374 |     )
375 |
376 |
377 | def test_moab(db, io_loop):
378 |     spawner_kwargs = {
379 |         "req_nprocs": "5",
380 |         "req_memory": "5678",
381 |         "req_options": "some_option_asdf",
382 |         "req_prologue": "PROLOGUE",
383 |         "req_epilogue": "EPILOGUE",
384 |     }
385 |     batch_script_re_list = [
386 |         re.compile(
387 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
388 |             re.S | re.M,
389 |         ),
390 |         re.compile(r"mem=5678"),
391 |         re.compile(r"ppn=5"),
392 |         re.compile(r"^#PBS some_option_asdf", re.M),
393 |     ]
394 |     script = [
395 |         (re.compile(r"sudo.*msub"), str(testjob)),
396 |         (re.compile(r"sudo.*mdiag"), 'State="Idle"'),  # pending
397 |         (
398 |             re.compile(r"sudo.*mdiag"),
399 |             'State="Running" AllocNodeList="{}"'.format(testhost),
400 |         ),  # running
401 |         (
402 |             re.compile(r"sudo.*mdiag"),
403 |             'State="Running" AllocNodeList="{}"'.format(testhost),
404 |         ),  # running
405 |         (re.compile(r"sudo.*mjobctl.*-c"), "STOP"),
406 |         (re.compile(r"sudo.*mdiag"), ""),
407 |     ]
408 |     from .. import MoabSpawner
409 |
410 |     run_spawner_script(
411 |         db,
412 |         io_loop,
413 |         MoabSpawner,
414 |         script,
415 |         batch_script_re_list=batch_script_re_list,
416 |         spawner_kwargs=spawner_kwargs,
417 |     )
418 |
419 |
420 | def test_pbs(db, io_loop):
421 |     spawner_kwargs = {
422 |         "req_nprocs": "4",
423 |         "req_memory": "10256",
424 |         "req_options": "some_option_asdf",
425 |         "req_host": "some_pbs_admin_node",
426 |         "req_runtime": "08:00:00",
427 |     }
428 |     batch_script_re_list = [
429 |         re.compile(r"singleuser_command"),
430 |         re.compile(r"select=1"),
431 |         re.compile(r"ncpus=4"),
432 |         re.compile(r"mem=10256"),
433 |         re.compile(r"walltime=08:00:00"),
434 |         re.compile(r"@some_pbs_admin_node"),
435 |         re.compile(r"^#PBS some_option_asdf", re.M),
436 |     ]
437 |     script = [
438 |         (re.compile(r"sudo.*qsub"), str(testjob)),
439 |         (re.compile(r"sudo.*qstat"), "job_state = Q"),  # pending
440 |         (
441 |             re.compile(r"sudo.*qstat"),
442 |             "job_state = R\nexec_host = {}/2*1".format(testhost),
443 |         ),  # running
444 |         (
445 |             re.compile(r"sudo.*qstat"),
446 |             "job_state = R\nexec_host = {}/2*1".format(testhost),
447 |         ),  # running
448 |         (re.compile(r"sudo.*qdel"), "STOP"),
449 |         (re.compile(r"sudo.*qstat"), ""),
450 |     ]
451 |     from .. import PBSSpawner
452 |
453 |     run_spawner_script(
454 |         db,
455 |         io_loop,
456 |         PBSSpawner,
457 |         script,
458 |         batch_script_re_list=batch_script_re_list,
459 |         spawner_kwargs=spawner_kwargs,
460 |     )
461 |
462 |
463 | def test_slurm(db, io_loop):
464 |     spawner_kwargs = {
465 |         "req_runtime": "3-05:10:10",
466 |         "req_nprocs": "5",
467 |         "req_memory": "5678",
468 |         "req_options": "some_option_asdf",
469 |         "req_prologue": "PROLOGUE",
470 |         "req_epilogue": "EPILOGUE",
471 |         "req_reservation": "RES123",
472 |         "req_gres": "GRES123",
473 |     }
474 |     batch_script_re_list = [
475 |         re.compile(
476 |             r"PROLOGUE.*srun batchspawner-singleuser singleuser_command.*EPILOGUE", re.S
477 |         ),
478 |         re.compile(r"^#SBATCH \s+ --cpus-per-task=5", re.X | re.M),
479 |         re.compile(r"^#SBATCH \s+ --time=3-05:10:10", re.X | re.M),
480 |         re.compile(r"^#SBATCH \s+ some_option_asdf", re.X | re.M),
481 |         re.compile(r"^#SBATCH \s+ --reservation=RES123", re.X | re.M),
482 |         re.compile(r"^#SBATCH \s+ --gres=GRES123", re.X | re.M),
483 |     ]
484 |     from .. import SlurmSpawner
485 |
486 |     run_spawner_script(
487 |         db,
488 |         io_loop,
489 |         SlurmSpawner,
490 |         normal_slurm_script,
491 |         batch_script_re_list=batch_script_re_list,
492 |         spawner_kwargs=spawner_kwargs,
493 |     )
494 |
495 |
496 | # We tend to use slurm as our typical example job. These allow quick
497 | # Slurm examples.
498 | normal_slurm_script = [
499 |     (re.compile(r"sudo.*sbatch"), str(testjob)),
500 |     (re.compile(r"sudo.*squeue"), "PENDING "),  # pending
501 |     (
502 |         re.compile(r"sudo.*squeue"),
503 |         "slurm_load_jobs error: Unable to contact slurm controller",
504 |     ),  # unknown
505 |     (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),  # running
506 |     (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
507 |     (re.compile(r"sudo.*scancel"), "STOP"),
508 |     (re.compile(r"sudo.*squeue"), ""),
509 | ]
510 | from .. import SlurmSpawner
511 |
512 |
513 | def run_typical_slurm_spawner(
514 |     db,
515 |     io_loop,
516 |     spawner=SlurmSpawner,
517 |     script=normal_slurm_script,
518 |     batch_script_re_list=None,
519 |     spawner_kwargs={},
520 | ):
521 |     """Run a full slurm job with default (overrideable) parameters.
522 |
523 |     This is useful, for example, for changing options and testing the effect
524 |     of batch scripts.
525 |     """
526 |     return run_spawner_script(
527 |         db,
528 |         io_loop,
529 |         spawner,
530 |         script,
531 |         batch_script_re_list=batch_script_re_list,
532 |         spawner_kwargs=spawner_kwargs,
533 |     )
534 |
535 |
536 | # def test_gridengine(db, io_loop):
537 | #     spawner_kwargs = {
538 | #         'req_options': 'some_option_asdf',
539 | #     }
540 | #     batch_script_re_list = [
541 | #         re.compile(r'singleuser_command'),
542 | #         re.compile(r'#$\s+some_option_asdf'),
543 | #     ]
544 | #     script = [
545 | #         (re.compile(r'sudo.*qsub'), 'x x '+str(testjob)),
546 | #         (re.compile(r'sudo.*qstat'), 'PENDING '),
547 | #         (re.compile(r'sudo.*qstat'), 'RUNNING '+testhost),
548 | #         (re.compile(r'sudo.*qstat'), 'RUNNING '+testhost),
549 | #         (re.compile(r'sudo.*qdel'), 'STOP'),
550 | #         (re.compile(r'sudo.*qstat'), ''),
551 | #     ]
552 | #     from .. import GridengineSpawner
553 | #     run_spawner_script(db, io_loop, GridengineSpawner, script,
554 | #                        batch_script_re_list=batch_script_re_list,
555 | #                        spawner_kwargs=spawner_kwargs)
556 |
557 |
558 | def test_condor(db, io_loop):
559 |     spawner_kwargs = {
560 |         "req_nprocs": "5",
561 |         "req_memory": "5678",
562 |         "req_options": "some_option_asdf",
563 |     }
564 |     batch_script_re_list = [
565 |         re.compile(r"exec batchspawner-singleuser singleuser_command"),
566 |         re.compile(r"RequestCpus = 5"),
567 |         re.compile(r"RequestMemory = 5678"),
568 |         re.compile(r"^some_option_asdf", re.M),
569 |     ]
570 |     script = [
571 |         (
572 |             re.compile(r"sudo.*condor_submit"),
573 |             "submitted to cluster {}".format(str(testjob)),
574 |         ),
575 |         (re.compile(r"sudo.*condor_q"), "1,"),  # pending
576 |         (re.compile(r"sudo.*condor_q"), "2, @{}".format(testhost)),  # running
577 |         (re.compile(r"sudo.*condor_q"), "2, @{}".format(testhost)),
578 |         (re.compile(r"sudo.*condor_rm"), "STOP"),
579 |         (re.compile(r"sudo.*condor_q"), ""),
580 |     ]
581 |     from .. import CondorSpawner
582 |
583 |     run_spawner_script(
584 |         db,
585 |         io_loop,
586 |         CondorSpawner,
587 |         script,
588 |         batch_script_re_list=batch_script_re_list,
589 |         spawner_kwargs=spawner_kwargs,
590 |     )
591 |
592 |
593 | def test_lfs(db, io_loop):
594 |     spawner_kwargs = {
595 |         "req_nprocs": "5",
596 |         "req_memory": "5678",
597 |         "req_options": "some_option_asdf",
598 |         "req_queue": "some_queue",
599 |         "req_prologue": "PROLOGUE",
600 |         "req_epilogue": "EPILOGUE",
601 |     }
602 |     batch_script_re_list = [
603 |         re.compile(
604 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
605 |             re.S | re.M,
606 |         ),
607 |         re.compile(r"#BSUB\s+-q\s+some_queue", re.M),
608 |     ]
609 |     script = [
610 |         (
611 |             re.compile(r"sudo.*bsub"),
612 |             "Job <{}> is submitted to default queue ".format(str(testjob)),
613 |         ),
614 |         (re.compile(r"sudo.*bjobs"), "PEND "),  # pending
615 |         (re.compile(r"sudo.*bjobs"), "RUN {}".format(testhost)),  # running
616 |         (re.compile(r"sudo.*bjobs"), "RUN {}".format(testhost)),
617 |         (re.compile(r"sudo.*bkill"), "STOP"),
618 |         (re.compile(r"sudo.*bjobs"), ""),
619 |     ]
620 |     from .. import LsfSpawner
621 |
622 |     run_spawner_script(
623 |         db,
624 |         io_loop,
625 |         LsfSpawner,
626 |         script,
627 |         batch_script_re_list=batch_script_re_list,
628 |         spawner_kwargs=spawner_kwargs,
629 |     )
630 |
631 |
632 | def test_keepvars(db, io_loop):
633 |     # req_keepvars
634 |     spawner_kwargs = {
635 |         "req_keepvars": "ABCDE",
636 |     }
637 |     batch_script_re_list = [
638 |         re.compile(r"--export=ABCDE", re.X | re.M),
639 |     ]
640 |     run_typical_slurm_spawner(
641 |         db,
642 |         io_loop,
643 |         spawner_kwargs=spawner_kwargs,
644 |         batch_script_re_list=batch_script_re_list,
645 |     )
646 |
647 |     # req_keepvars AND req_keepvars_extra together
648 |     spawner_kwargs = {
649 |         "req_keepvars": "ABCDE",
650 |         "req_keepvars_extra": "XYZ",
651 |     }
652 |     batch_script_re_list = [
653 |         re.compile(r"--export=ABCDE,XYZ", re.X | re.M),
654 |     ]
655 |     run_typical_slurm_spawner(
656 |         db,
657 |         io_loop,
658 |         spawner_kwargs=spawner_kwargs,
659 |         batch_script_re_list=batch_script_re_list,
660 |     )
--------------------------------------------------------------------------------
/batchspawner/batchspawner.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Regents of the University of Minnesota
2 | # Copyright (c) Michael Gilbert
3 | # Distributed under the terms of the Modified BSD License.
4 |
5 | """Batch spawners
6 |
7 | This file contains an abstraction layer for batch job queueing systems, and implements
8 | Jupyterhub spawners for Torque, SLURM, and eventually others.
9 |
10 | Common attributes of batch submission / resource manager environments will include notions of:
11 |   * queue names, resource manager addresses
12 |   * resource limits including runtime, number of processes, memory
13 |   * singleuser child process running on (usually remote) host not known until runtime
14 |   * job submission and monitoring via resource manager utilities
15 |   * remote execution via submission of templated scripts
16 |   * job names instead of PIDs
17 | """
18 | import asyncio
19 | from async_generator import async_generator, yield_
20 | import pwd
21 | import os
22 | import re
23 |
24 | import xml.etree.ElementTree as ET
25 |
26 | from enum import Enum
27 |
28 | from jinja2 import Template
29 |
30 | from tornado import gen
31 |
32 | from jupyterhub.spawner import Spawner
33 | from traitlets import Integer, Unicode, Float, Dict, default
34 |
35 | from jupyterhub.spawner import set_user_setuid
36 |
37 |
38 | def format_template(template, *args, **kwargs):
39 |     """Format a template, either using jinja2 or str.format().
40 |
41 |     Use jinja2 if the template is a jinja2.Template, or contains '{{' or
42 |     '{%'. Otherwise, use str.format() for backwards compatibility with
43 |     old scripts (but you can't mix them).
44 |     """
45 |     if isinstance(template, Template):
46 |         return template.render(*args, **kwargs)
47 |     elif "{{" in template or "{%" in template:
48 |         return Template(template).render(*args, **kwargs)
49 |     return template.format(*args, **kwargs)
50 |
51 |
52 | class JobStatus(Enum):
53 |     NOTFOUND = 0
54 |     RUNNING = 1
55 |     PENDING = 2
56 |     UNKNOWN = 3
57 |
58 |
59 | class BatchSpawnerBase(Spawner):
60 |     """Base class for spawners using resource manager batch job submission mechanisms
61 |
62 |     This base class is developed targeting the TorqueSpawner and SlurmSpawner, so by default
63 |     assumes a qsub-like command that reads a script from its stdin for starting jobs,
64 |     a qstat-like command that outputs some data that can be parsed to check if the job is running
65 |     and on what remote node, and a qdel-like command to cancel a job. The goal is to be
66 |     sufficiently general that a broad range of systems can be supported with minimal overrides.
67 |
68 |     At minimum, subclasses should provide reasonable defaults for the traits:
69 |         batch_script
70 |         batch_submit_cmd
71 |         batch_query_cmd
72 |         batch_cancel_cmd
73 |
74 |     and must provide implementations for the methods:
75 |         state_ispending
76 |         state_isrunning
77 |         state_gethost
78 |     """
79 |
80 |     # override default since batch systems typically need longer
81 |     start_timeout = Integer(300).tag(config=True)
82 |
83 |     # override default server ip since batch jobs normally run remotely
84 |     ip = Unicode(
85 |         "0.0.0.0",
86 |         help="Address for singleuser server to listen at",
87 |     ).tag(config=True)
88 |
89 |     exec_prefix = Unicode(
90 |         "sudo -E -u {username}",
91 |         help="Standard execution prefix (e.g. 
the default sudo -E -u {username})", 92 | ).tag(config=True) 93 | 94 | # all these req_foo traits will be available as substvars for templated strings 95 | req_queue = Unicode( 96 | "", 97 | help="Queue name to submit job to resource manager", 98 | ).tag(config=True) 99 | 100 | req_host = Unicode( 101 | "", 102 | help="Host name of batch server to submit job to resource manager", 103 | ).tag(config=True) 104 | 105 | req_memory = Unicode( 106 | "", 107 | help="Memory to request from resource manager", 108 | ).tag(config=True) 109 | 110 | req_nprocs = Unicode( 111 | "", 112 | help="Number of processors to request from resource manager", 113 | ).tag(config=True) 114 | 115 | req_ngpus = Unicode( 116 | "", 117 | help="Number of GPUs to request from resource manager", 118 | ).tag(config=True) 119 | 120 | req_runtime = Unicode( 121 | "", 122 | help="Length of time for submitted job to run", 123 | ).tag(config=True) 124 | 125 | req_partition = Unicode( 126 | "", 127 | help="Partition name to submit job to resource manager", 128 | ).tag(config=True) 129 | 130 | req_account = Unicode( 131 | "", 132 | help="Account name string to pass to the resource manager", 133 | ).tag(config=True) 134 | 135 | req_options = Unicode( 136 | "", 137 | help="Other options to include into job submission script", 138 | ).tag(config=True) 139 | 140 | req_prologue = Unicode( 141 | "", 142 | help="Script to run before single user server starts.", 143 | ).tag(config=True) 144 | 145 | req_epilogue = Unicode( 146 | "", 147 | help="Script to run after single user server ends.", 148 | ).tag(config=True) 149 | 150 | req_username = Unicode() 151 | 152 | @default("req_username") 153 | def _req_username_default(self): 154 | return self.user.name 155 | 156 | # Useful IF getpwnam on submit host returns correct info for exec host 157 | req_homedir = Unicode() 158 | 159 | @default("req_homedir") 160 | def _req_homedir_default(self): 161 | return pwd.getpwnam(self.user.name).pw_dir 162 | 163 | req_keepvars = Unicode() 164 | 165 | @default("req_keepvars") 166 | def _req_keepvars_default(self): 167 | return ",".join(self.get_env().keys()) 168 | 169 | req_keepvars_extra = Unicode( 170 | help="Extra environment variables which should be configured, " 171 | "added to the defaults in keepvars, " 172 | "comma separated list.", 173 | ) 174 | 175 | batch_script = Unicode( 176 | "", 177 | help="Template for job submission script. Traits on this class named like req_xyz " 178 | "will be substituted in the template for {xyz} using string.Formatter. " 179 | "Must include {cmd} which will be replaced with the jupyterhub-singleuser command line.", 180 | ).tag(config=True) 181 | 182 | batchspawner_singleuser_cmd = Unicode( 183 | "batchspawner-singleuser", 184 | help="A wrapper which is capable of special batchspawner setup: currently sets the port on " 185 | "the remote host. 
Not needed to be set under normal circumstances, unless path needs " 186 | "specification.", 187 | ).tag(config=True) 188 | 189 | # Raw output of job submission command unless overridden 190 | job_id = Unicode() 191 | 192 | # Will get the raw output of the job status command unless overridden 193 | job_status = Unicode() 194 | 195 | # Prepare substitution variables for templates using req_xyz traits 196 | def get_req_subvars(self): 197 | reqlist = [t for t in self.trait_names() if t.startswith("req_")] 198 | subvars = {} 199 | for t in reqlist: 200 | subvars[t[4:]] = getattr(self, t) 201 | if subvars.get("keepvars_extra"): 202 | subvars["keepvars"] += "," + subvars["keepvars_extra"] 203 | return subvars 204 | 205 | batch_submit_cmd = Unicode( 206 | "", 207 | help="Command to run to submit batch scripts. Formatted using req_xyz traits as {xyz}.", 208 | ).tag(config=True) 209 | 210 | def parse_job_id(self, output): 211 | "Parse output of submit command to get job id." 212 | return output 213 | 214 | def cmd_formatted_for_batch(self): 215 | """The command which is substituted inside of the batch script""" 216 | return " ".join([self.batchspawner_singleuser_cmd] + self.cmd + self.get_args()) 217 | 218 | async def run_command(self, cmd, input=None, env=None): 219 | proc = await asyncio.create_subprocess_shell( 220 | cmd, 221 | env=env, 222 | stdin=asyncio.subprocess.PIPE, 223 | stdout=asyncio.subprocess.PIPE, 224 | stderr=asyncio.subprocess.PIPE, 225 | ) 226 | inbytes = None 227 | 228 | if input: 229 | inbytes = input.encode() 230 | 231 | try: 232 | out, eout = await proc.communicate(input=inbytes) 233 | except: 234 | self.log.debug("Exception raised when trying to run command: %s" % cmd) 235 | proc.kill() 236 | self.log.debug("Running command failed, killed process.") 237 | try: 238 | out, eout = await asyncio.wait_for(proc.communicate(), timeout=2) 239 | out = out.decode().strip() 240 | eout = eout.decode().strip() 241 | self.log.error("Subprocess returned exitcode %s" % proc.returncode) 242 | self.log.error("Stdout:") 243 | self.log.error(out) 244 | self.log.error("Stderr:") 245 | self.log.error(eout) 246 | raise RuntimeError( 247 | "{} exit status {}: {}".format(cmd, proc.returncode, eout) 248 | ) 249 | except asyncio.TimeoutError: 250 | self.log.error( 251 | "Encountered timeout trying to clean up command, process probably killed already: %s" 252 | % cmd 253 | ) 254 | return "" 255 | except: 256 | self.log.error( 257 | "Encountered exception trying to clean up command: %s" % cmd 258 | ) 259 | raise 260 | else: 261 | eout = eout.decode().strip() 262 | err = proc.returncode 263 | if err != 0: 264 | self.log.error("Subprocess returned exitcode %s" % err) 265 | self.log.error(eout) 266 | raise RuntimeError(eout) 267 | 268 | out = out.decode().strip() 269 | return out 270 | 271 | async def _get_batch_script(self, **subvars): 272 | """Format batch script from vars""" 273 | # Could be overridden by subclasses, but mainly useful for testing 274 | return format_template(self.batch_script, **subvars) 275 | 276 | async def submit_batch_script(self): 277 | subvars = self.get_req_subvars() 278 | # `cmd` is submitted to the batch system 279 | cmd = " ".join( 280 | ( 281 | format_template(self.exec_prefix, **subvars), 282 | format_template(self.batch_submit_cmd, **subvars), 283 | ) 284 | ) 285 | # `subvars['cmd']` is what is run _inside_ the batch script, 286 | # put into the template. 
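        # A worked example with hypothetical values: the default exec_prefix
        # "sudo -E -u {username}" plus a batch_submit_cmd of "sbatch" yields,
        # for user "alice", cmd = "sudo -E -u alice sbatch"; run_command()
        # then pipes the rendered batch script to that command's stdin.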
287 |         subvars["cmd"] = self.cmd_formatted_for_batch()
288 |         if hasattr(self, "user_options"):
289 |             subvars.update(self.user_options)
290 |         script = await self._get_batch_script(**subvars)
291 |         self.log.info("Spawner submitting job using " + cmd)
292 |         self.log.info("Spawner submitted script:\n" + script)
293 |         out = await self.run_command(cmd, input=script, env=self.get_env())
294 |         try:
295 |             self.log.info("Job submitted. cmd: " + cmd + " output: " + out)
296 |             self.job_id = self.parse_job_id(out)
297 |         except Exception:
298 |             self.log.error("Job submission failed; command output: " + out)
299 |             self.job_id = ""
300 |         return self.job_id
301 |
302 |     # Override if your batch system needs something more elaborate to query the job status
303 |     batch_query_cmd = Unicode(
304 |         "",
305 |         help="Command to run to query job status. Formatted using req_xyz traits as {xyz} "
306 |         "and self.job_id as {job_id}.",
307 |     ).tag(config=True)
308 |
309 |     async def query_job_status(self):
310 |         """Check job status, return JobStatus object."""
311 |         if self.job_id is None or len(self.job_id) == 0:
312 |             self.job_status = ""
313 |             return JobStatus.NOTFOUND
314 |         subvars = self.get_req_subvars()
315 |         subvars["job_id"] = self.job_id
316 |         cmd = " ".join(
317 |             (
318 |                 format_template(self.exec_prefix, **subvars),
319 |                 format_template(self.batch_query_cmd, **subvars),
320 |             )
321 |         )
322 |         self.log.debug("Spawner querying job: " + cmd)
323 |         try:
324 |             self.job_status = await self.run_command(cmd)
325 |         except RuntimeError as e:
326 |             # e.args[0] is stderr from the process
327 |             self.job_status = e.args[0]
328 |         except Exception as e:
329 |             self.log.error("Error querying job " + self.job_id + ": " + str(e))
330 |             self.job_status = ""
331 |
332 |         if self.state_isrunning():
333 |             return JobStatus.RUNNING
334 |         elif self.state_ispending():
335 |             return JobStatus.PENDING
336 |         elif self.state_isunknown():
337 |             return JobStatus.UNKNOWN
338 |         else:
339 |             return JobStatus.NOTFOUND
340 |
341 |     batch_cancel_cmd = Unicode(
342 |         "",
343 |         help="Command to stop/cancel a previously submitted job. Formatted like batch_query_cmd.",
344 |     ).tag(config=True)
345 |
346 |     async def cancel_batch_job(self):
347 |         subvars = self.get_req_subvars()
348 |         subvars["job_id"] = self.job_id
349 |         cmd = " ".join(
350 |             (
351 |                 format_template(self.exec_prefix, **subvars),
352 |                 format_template(self.batch_cancel_cmd, **subvars),
353 |             )
354 |         )
355 |         self.log.info("Cancelling job " + self.job_id + ": " + cmd)
356 |         await self.run_command(cmd)
357 |
358 |     def load_state(self, state):
359 |         """load job_id from state"""
360 |         super(BatchSpawnerBase, self).load_state(state)
361 |         self.job_id = state.get("job_id", "")
362 |         self.job_status = state.get("job_status", "")
363 |
364 |     def get_state(self):
365 |         """add job_id to state"""
366 |         state = super(BatchSpawnerBase, self).get_state()
367 |         if self.job_id:
368 |             state["job_id"] = self.job_id
369 |         if self.job_status:
370 |             state["job_status"] = self.job_status
371 |         return state
372 |
373 |     def clear_state(self):
374 |         """clear job_id state"""
375 |         super(BatchSpawnerBase, self).clear_state()
376 |         self.job_id = ""
377 |         self.job_status = ""
378 |
379 |     def make_preexec_fn(self, name):
380 |         """make preexec fn to change uid (if running as root) before job submission"""
381 |         return set_user_setuid(name)
382 |
383 |     def state_ispending(self):
384 |         "Return boolean indicating if job is still waiting to run, likely by parsing self.job_status"
385 |         raise NotImplementedError("Subclass must provide implementation")
386 |
387 |     def state_isrunning(self):
388 |         "Return boolean indicating if job is running, likely by parsing self.job_status"
389 |         raise NotImplementedError("Subclass must provide implementation")
390 |
391 |     def state_isunknown(self):
392 |         "Return boolean indicating if job state retrieval failed because of the resource manager"
393 |         return None
394 |
395 |     def state_gethost(self):
396 |         "Return string, hostname or addr of running job, likely by parsing self.job_status"
397 |         raise NotImplementedError("Subclass must provide implementation")
398 |
399 |     async def poll(self):
400 |         """Poll the process"""
401 |         status = await self.query_job_status()
402 |         if status in (JobStatus.PENDING, JobStatus.RUNNING, JobStatus.UNKNOWN):
403 |             return None
404 |         else:
405 |             self.clear_state()
406 |             return 1
407 |
408 |     startup_poll_interval = Float(
409 |         0.5,
410 |         help="Polling interval (seconds) to check job state during startup",
411 |     ).tag(config=True)
412 |
413 |     async def start(self):
414 |         """Start the process"""
415 |         self.ip = self.traits()["ip"].default_value
416 |         self.port = self.traits()["port"].default_value
417 |
418 |         if self.server:
419 |             self.server.port = self.port
420 |
421 |         job = await self.submit_batch_script()
422 |
423 |         # We are called with a timeout, and if the timeout expires this function will
424 |         # be interrupted at the next yield, and self.stop() will be called.
425 |         # So this function should not return unless successful, and if unsuccessful
426 |         # should either raise an Exception or loop forever.
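        # parse_job_id() resets self.job_id to "" when it cannot parse the
        # submission output (see submit_batch_script above), so an empty
        # job_id here means the submission itself failed.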
427 | if len(self.job_id) == 0: 428 | raise RuntimeError( 429 | "Jupyter batch job submission failure (no jobid in output)" 430 | ) 431 | while True: 432 | status = await self.query_job_status() 433 | if status == JobStatus.RUNNING: 434 | break 435 | elif status == JobStatus.PENDING: 436 | self.log.debug("Job " + self.job_id + " still pending") 437 | elif status == JobStatus.UNKNOWN: 438 | self.log.debug("Job " + self.job_id + " still unknown") 439 | else: 440 | self.log.warning( 441 | "Job " 442 | + self.job_id 443 | + " neither pending nor running.\n" 444 | + self.job_status 445 | ) 446 | self.clear_state() 447 | raise RuntimeError( 448 | "The Jupyter batch job has disappeared" 449 | " while pending in the queue or died immediately" 450 | " after starting." 451 | ) 452 | await gen.sleep(self.startup_poll_interval) 453 | 454 | self.ip = self.state_gethost() 455 | while self.port == 0: 456 | await gen.sleep(self.startup_poll_interval) 457 | # Test framework: For testing, mock_port is set because we 458 | # don't actually run the single-user server yet. 459 | if hasattr(self, "mock_port"): 460 | self.port = self.mock_port 461 | 462 | self.db.commit() 463 | self.log.info( 464 | "Notebook server job {0} started at {1}:{2}".format( 465 | self.job_id, self.ip, self.port 466 | ) 467 | ) 468 | 469 | return self.ip, self.port 470 | 471 | async def stop(self, now=False): 472 | """Stop the singleuser server job. 473 | 474 | Returns immediately after sending job cancellation command if now=True, otherwise 475 | tries to confirm that job is no longer running.""" 476 | 477 | self.log.info("Stopping server job " + self.job_id) 478 | await self.cancel_batch_job() 479 | if now: 480 | return 481 | for i in range(10): 482 | status = await self.query_job_status() 483 | if status not in (JobStatus.RUNNING, JobStatus.UNKNOWN): 484 | return 485 | await gen.sleep(1.0) 486 | if self.job_id: 487 | self.log.warning( 488 | "Notebook server job {0} at {1}:{2} possibly failed to terminate".format( 489 | self.job_id, self.ip, self.port 490 | ) 491 | ) 492 | 493 | @async_generator 494 | async def progress(self): 495 | while True: 496 | if self.state_ispending(): 497 | await yield_( 498 | { 499 | "message": "Pending in queue...", 500 | } 501 | ) 502 | elif self.state_isrunning(): 503 | await yield_( 504 | { 505 | "message": "Cluster job running... waiting to connect", 506 | } 507 | ) 508 | return 509 | else: 510 | await yield_( 511 | { 512 | "message": "Unknown status...", 513 | } 514 | ) 515 | await gen.sleep(1) 516 | 517 | 518 | class BatchSpawnerRegexStates(BatchSpawnerBase): 519 | """Subclass of BatchSpawnerBase that uses config-supplied regular expressions 520 | to interact with batch submission system state. Provides implementations of 521 | state_ispending 522 | state_isrunning 523 | state_gethost 524 | 525 | In their place, the user should supply the following configuration: 526 | state_pending_re - regex that matches job_status if job is waiting to run 527 | state_running_re - regex that matches job_status if job is running 528 | state_exechost_re - regex with at least one capture group that extracts 529 | execution host from job_status 530 | state_exechost_exp - if empty, notebook IP will be set to the contents of the 531 | first capture group. If this variable is set, the match object 532 | will be expanded using this string to obtain the notebook IP. 
533 |         See Python docs: re.match.expand
534 |     """
535 |
536 |     state_pending_re = Unicode(
537 |         "",
538 |         help="Regex that matches job_status if job is waiting to run",
539 |     ).tag(config=True)
540 |     state_running_re = Unicode(
541 |         "",
542 |         help="Regex that matches job_status if job is running",
543 |     ).tag(config=True)
544 |     state_exechost_re = Unicode(
545 |         "",
546 |         help="Regex with at least one capture group that extracts "
547 |         "the execution host from job_status output",
548 |     ).tag(config=True)
549 |     state_exechost_exp = Unicode(
550 |         "",
551 |         help="""If empty, notebook IP will be set to the contents of the first capture group.
552 |
553 |         If this variable is set, the match object will be expanded using this string
554 |         to obtain the notebook IP.
555 |         See Python docs: re.match.expand""",
556 |     ).tag(config=True)
557 |     state_unknown_re = Unicode(
558 |         "",
559 |         help="Regex that matches job_status if the resource manager is not answering. "
560 |         "Blank indicates not used.",
561 |     ).tag(config=True)
562 |
563 |     def state_ispending(self):
564 |         assert self.state_pending_re, "Misconfigured: define state_pending_re"
565 |         return self.job_status and re.search(self.state_pending_re, self.job_status)
566 |
567 |     def state_isrunning(self):
568 |         assert self.state_running_re, "Misconfigured: define state_running_re"
569 |         return self.job_status and re.search(self.state_running_re, self.job_status)
570 |
571 |     def state_isunknown(self):
572 |         # A blank regex means "not set"; in that case this always returns None.
573 |         if self.state_unknown_re:
574 |             return self.job_status and re.search(self.state_unknown_re, self.job_status)
575 |
576 |     def state_gethost(self):
577 |         assert self.state_exechost_re, "Misconfigured: define state_exechost_re"
578 |         match = re.search(self.state_exechost_re, self.job_status)
579 |         if not match:
580 |             self.log.error(
581 |                 "Spawner unable to match host addr in job status: " + self.job_status
582 |             )
583 |             return
584 |         if not self.state_exechost_exp:
585 |             return match.groups()[0]
586 |         else:
587 |             return match.expand(self.state_exechost_exp)
588 |
589 |
590 | class TorqueSpawner(BatchSpawnerRegexStates):
591 |     batch_script = Unicode(
592 |         """#!/bin/sh
593 | #PBS -q {queue}@{host}
594 | #PBS -l walltime={runtime}
595 | #PBS -l nodes=1:ppn={nprocs}
596 | #PBS -l mem={memory}
597 | #PBS -N jupyterhub-singleuser
598 | #PBS -v {keepvars}
599 | #PBS {options}
600 |
601 | set -eu
602 |
603 | {prologue}
604 | {cmd}
605 | {epilogue}
606 | """
607 |     ).tag(config=True)
608 |
609 |     # outputs job id string
610 |     batch_submit_cmd = Unicode("qsub").tag(config=True)
611 |     # outputs job data XML string
612 |     batch_query_cmd = Unicode("qstat -x {job_id}").tag(config=True)
613 |     batch_cancel_cmd = Unicode("qdel {job_id}").tag(config=True)
614 |     # search XML string for job_state - [QH] = pending, R = running, [CE] = done
615 |     state_pending_re = Unicode(r"[QH]").tag(config=True)
616 |     state_running_re = Unicode(r"R").tag(config=True)
617 |     state_exechost_re = Unicode(r"((?:[\w_-]+\.?)+)/\d+").tag(config=True)
618 |
619 |
620 | class MoabSpawner(TorqueSpawner):
621 |     # outputs job id string
622 |     batch_submit_cmd = Unicode("msub").tag(config=True)
623 |     # outputs job data XML string
624 |     batch_query_cmd = Unicode("mdiag -j {job_id} --xml").tag(config=True)
625 |     batch_cancel_cmd = Unicode("mjobctl -c {job_id}").tag(config=True)
626 |     state_pending_re = Unicode(r'State="Idle"').tag(config=True)
627 |     state_running_re = Unicode(r'State="Running"').tag(config=True)
628 |     state_exechost_re 
= Unicode(r'AllocNodeList="([^\r\n\t\f :"]*)').tag(config=True) 629 | 630 | 631 | class PBSSpawner(TorqueSpawner): 632 | batch_script = Unicode( 633 | """#!/bin/sh 634 | {% if queue or host %}#PBS -q {% if queue %}{{queue}}{% endif %}\ 635 | {% if host %}@{{host}}{% endif %}{% endif %} 636 | #PBS -l walltime={{runtime}} 637 | #PBS -l select=1:ncpus={{nprocs}}:mem={{memory}} 638 | #PBS -N jupyterhub-singleuser 639 | #PBS -o {{homedir}}/.jupyterhub.pbs.out 640 | #PBS -e {{homedir}}/.jupyterhub.pbs.err 641 | #PBS -v {{keepvars}} 642 | {% if options %}#PBS {{options}}{% endif %} 643 | 644 | set -eu 645 | 646 | {{prologue}} 647 | {{cmd}} 648 | {{epilogue}} 649 | """ 650 | ).tag(config=True) 651 | 652 | # outputs job data XML string 653 | batch_query_cmd = Unicode("qstat -fx {job_id}").tag(config=True) 654 | 655 | state_pending_re = Unicode(r"job_state = [QH]").tag(config=True) 656 | state_running_re = Unicode(r"job_state = R").tag(config=True) 657 | #state_exechost_re = Unicode(r"exec_host = ([\w_-]+)/").tag(config=True) 658 | state_exechost_re = Unicode(r"exec_host = ([\w_-]+)").tag(config=True) 659 | 660 | 661 | class UserEnvMixin: 662 | """Mixin class that computes values for USER, SHELL and HOME in the environment passed to 663 | the job submission subprocess in case the batch system needs these for the batch script. 664 | """ 665 | 666 | def user_env(self, env): 667 | """get user environment""" 668 | env["USER"] = self.user.name 669 | home = pwd.getpwnam(self.user.name).pw_dir 670 | shell = pwd.getpwnam(self.user.name).pw_shell 671 | if home: 672 | env["HOME"] = home 673 | if shell: 674 | env["SHELL"] = shell 675 | return env 676 | 677 | def get_env(self): 678 | """Get user environment variables to be passed to the user's job 679 | 680 | Everything here should be passed to the user's job as 681 | environment. Caution: If these variables are used for 682 | authentication to the batch system commands as an admin, be 683 | aware that the user will receive access to these as well. 
684 | """ 685 | env = super().get_env() 686 | env = self.user_env(env) 687 | return env 688 | 689 | 690 | class SlurmSpawner(UserEnvMixin, BatchSpawnerRegexStates): 691 | batch_script = Unicode( 692 | """#!/bin/bash 693 | #SBATCH --output={{homedir}}/jupyterhub_slurmspawner_%j.log 694 | #SBATCH --job-name=spawner-jupyterhub 695 | #SBATCH --chdir={{homedir}} 696 | #SBATCH --export={{keepvars}} 697 | #SBATCH --get-user-env=L 698 | {% if partition %}#SBATCH --partition={{partition}} 699 | {% endif %}{% if runtime %}#SBATCH --time={{runtime}} 700 | {% endif %}{% if memory %}#SBATCH --mem={{memory}} 701 | {% endif %}{% if gres %}#SBATCH --gres={{gres}} 702 | {% endif %}{% if nprocs %}#SBATCH --cpus-per-task={{nprocs}} 703 | {% endif %}{% if reservation%}#SBATCH --reservation={{reservation}} 704 | {% endif %}{% if options %}#SBATCH {{options}}{% endif %} 705 | 706 | set -euo pipefail 707 | 708 | trap 'echo SIGTERM received' TERM 709 | {{prologue}} 710 | which jupyterhub-singleuser 711 | {% if srun %}{{srun}} {% endif %}{{cmd}} 712 | echo "jupyterhub-singleuser ended gracefully" 713 | {{epilogue}} 714 | """ 715 | ).tag(config=True) 716 | 717 | # all these req_foo traits will be available as substvars for templated strings 718 | req_cluster = Unicode( 719 | "", 720 | help="Cluster name to submit job to resource manager", 721 | ).tag(config=True) 722 | 723 | req_qos = Unicode( 724 | "", 725 | help="QoS name to submit job to resource manager", 726 | ).tag(config=True) 727 | 728 | req_srun = Unicode( 729 | "srun", 730 | help="Set req_srun='' to disable running in job step, and note that " 731 | "this affects environment handling. This is effectively a " 732 | "prefix for the singleuser command.", 733 | ).tag(config=True) 734 | 735 | req_reservation = Unicode( 736 | "", 737 | help="Reservation name to submit to resource manager", 738 | ).tag(config=True) 739 | 740 | req_gres = Unicode( 741 | "", 742 | help="Additional resources (e.g. GPUs) requested", 743 | ).tag(config=True) 744 | 745 | # outputs line like "Submitted batch job 209" 746 | batch_submit_cmd = Unicode("sbatch --parsable").tag(config=True) 747 | # outputs status and exec node like "RUNNING hostname" 748 | batch_query_cmd = Unicode("squeue -h -j {job_id} -o '%T %B'").tag(config=True) 749 | batch_cancel_cmd = Unicode("scancel {job_id}").tag(config=True) 750 | # use long-form states: PENDING, CONFIGURING = pending 751 | # RUNNING, COMPLETING = running 752 | state_pending_re = Unicode(r"^(?:PENDING|CONFIGURING)").tag(config=True) 753 | state_running_re = Unicode(r"^(?:RUNNING|COMPLETING)").tag(config=True) 754 | state_unknown_re = Unicode( 755 | r"^slurm_load_jobs error: (?:Socket timed out on send/recv|Unable to contact slurm controller)" 756 | ).tag(config=True) 757 | state_exechost_re = Unicode(r"\s+((?:[\w_-]+\.?)+)$").tag(config=True) 758 | 759 | def parse_job_id(self, output): 760 | # make sure jobid is really a number 761 | try: 762 | # use only last line to circumvent slurm bug 763 | output = output.splitlines()[-1] 764 | id = output.split(";")[0] 765 | int(id) 766 | except Exception as e: 767 | self.log.error("SlurmSpawner unable to parse job ID from text: " + output) 768 | raise e 769 | return id 770 | 771 | 772 | class MultiSlurmSpawner(SlurmSpawner): 773 | """When slurm has been compiled with --enable-multiple-slurmd, the 774 | administrator sets the name of the slurmd instance via the slurmd -N 775 | option. This node name is usually different from the hostname and may 776 | not be resolvable by JupyterHub. 
Here we enable the administrator to 777 | map the node names onto the real hostnames via a traitlet.""" 778 | 779 | daemon_resolver = Dict( 780 | {}, 781 | help="Map node names to hostnames", 782 | ).tag(config=True) 783 | 784 | def state_gethost(self): 785 | host = SlurmSpawner.state_gethost(self) 786 | return self.daemon_resolver.get(host, host) 787 | 788 | 789 | class GridengineSpawner(BatchSpawnerBase): 790 | batch_script = Unicode( 791 | """#!/bin/bash 792 | #$ -j yes 793 | #$ -N spawner-jupyterhub 794 | #$ -o {homedir}/.jupyterhub.sge.out 795 | #$ -e {homedir}/.jupyterhub.sge.err 796 | #$ -v {keepvars} 797 | #$ {options} 798 | 799 | set -euo pipefail 800 | 801 | {prologue} 802 | {cmd} 803 | {epilogue} 804 | """ 805 | ).tag(config=True) 806 | 807 | # outputs job id string 808 | batch_submit_cmd = Unicode("qsub").tag(config=True) 809 | # outputs job data XML string 810 | batch_query_cmd = Unicode("qstat -xml").tag(config=True) 811 | batch_cancel_cmd = Unicode("qdel {job_id}").tag(config=True) 812 | 813 | def parse_job_id(self, output): 814 | return output.split(" ")[2] 815 | 816 | def state_ispending(self): 817 | if self.job_status: 818 | job_info = ET.fromstring(self.job_status).find( 819 | ".//job_list[JB_job_number='{0}']".format(self.job_id) 820 | ) 821 | if job_info is not None: 822 | return job_info.attrib.get("state") == "pending" 823 | return False 824 | 825 | def state_isrunning(self): 826 | if self.job_status: 827 | job_info = ET.fromstring(self.job_status).find( 828 | ".//job_list[JB_job_number='{0}']".format(self.job_id) 829 | ) 830 | if job_info is not None: 831 | return job_info.attrib.get("state") == "running" 832 | return False 833 | 834 | def state_gethost(self): 835 | if self.job_status: 836 | queue_name = ET.fromstring(self.job_status).find( 837 | ".//job_list[JB_job_number='{0}']/queue_name".format(self.job_id) 838 | ) 839 | if queue_name is not None and queue_name.text: 840 | return queue_name.text.split("@")[1] 841 | 842 | self.log.error( 843 | "Spawner unable to match host addr in job {0} with status {1}".format( 844 | self.job_id, self.job_status 845 | ) 846 | ) 847 | return 848 | 849 | def get_env(self): 850 | env = super().get_env() 851 | 852 | # SGE relies on environment variables to launch local jobs. Ensure that these values are included 853 | # in the environment used to run the spawner. 
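        # Each variable is copied only if it is present in the Hub's
        # environment and not already set, so configured values are
        # never overridden.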
854 |         for key in [
855 |             "SGE_CELL",
856 |             "SGE_EXECD",
857 |             "SGE_ROOT",
858 |             "SGE_CLUSTER_NAME",
859 |             "SGE_QMASTER_PORT",
860 |             "SGE_EXECD_PORT",
861 |             "PATH",
862 |         ]:
863 |             if key in os.environ and key not in env:
864 |                 env[key] = os.environ[key]
865 |         return env
866 |
867 |
868 | class CondorSpawner(UserEnvMixin, BatchSpawnerRegexStates):
869 |     batch_script = Unicode(
870 |         """
871 | Executable = /bin/sh
872 | RequestMemory = {memory}
873 | RequestCpus = {nprocs}
874 | Arguments = \"-c 'exec {cmd}'\"
875 | Remote_Initialdir = {homedir}
876 | Output = {homedir}/.jupyterhub.condor.out
877 | Error = {homedir}/.jupyterhub.condor.err
878 | ShouldTransferFiles = False
879 | GetEnv = True
880 | {options}
881 | Queue
882 | """
883 |     ).tag(config=True)
884 |
885 |     # outputs job id string
886 |     batch_submit_cmd = Unicode("condor_submit").tag(config=True)
887 |     # outputs job data XML string
888 |     batch_query_cmd = Unicode(
889 |         'condor_q {job_id} -format "%s, " JobStatus -format "%s" RemoteHost -format "\n" True'
890 |     ).tag(config=True)
891 |     batch_cancel_cmd = Unicode("condor_rm {job_id}").tag(config=True)
892 |     # job status: 1 = pending, 2 = running
893 |     state_pending_re = Unicode(r"^1,").tag(config=True)
894 |     state_running_re = Unicode(r"^2,").tag(config=True)
895 |     state_exechost_re = Unicode(r"^\w*, .*@([^ ]*)").tag(config=True)
896 |
897 |     def parse_job_id(self, output):
898 |         match = re.search(r".*submitted to cluster ([0-9]+)", output)
899 |         if match:
900 |             return match.groups()[0]
901 |
902 |         error_msg = "CondorSpawner unable to parse jobID from text: " + output
903 |         self.log.error(error_msg)
904 |         raise Exception(error_msg)
905 |
906 |     def cmd_formatted_for_batch(self):
907 |         return (
908 |             super(CondorSpawner, self)
909 |             .cmd_formatted_for_batch()
910 |             .replace('"', '""')
911 |             .replace("'", "''")
912 |         )
913 |
914 |
915 | class LsfSpawner(BatchSpawnerBase):
916 |     """A Spawner that uses IBM's Platform Load Sharing Facility (LSF) to launch notebooks."""
917 |
918 |     batch_script = Unicode(
919 |         """#!/bin/sh
920 | #BSUB -R "select[type==any]"  # Allow spawning on non-uniform hardware
921 | #BSUB -R "span[hosts=1]"  # Only spawn job on one server
922 | #BSUB -q {queue}
923 | #BSUB -J spawner-jupyterhub
924 | #BSUB -o {homedir}/.jupyterhub.lsf.out
925 | #BSUB -e {homedir}/.jupyterhub.lsf.err
926 |
927 | set -eu
928 |
929 | {prologue}
930 | {cmd}
931 | {epilogue}
932 | """
933 |     ).tag(config=True)
934 |
935 |     batch_submit_cmd = Unicode("bsub").tag(config=True)
936 |     batch_query_cmd = Unicode('bjobs -a -noheader -o "STAT EXEC_HOST" {job_id}').tag(
937 |         config=True
938 |     )
939 |     batch_cancel_cmd = Unicode("bkill {job_id}").tag(config=True)
940 |
941 |     def get_env(self):
942 |         env = super().get_env()
943 |
944 |         # LSF relies on environment variables to launch local jobs. Ensure that these values are included
945 |         # in the environment used to run the spawner.
946 |         for key in [
947 |             "LSF_ENVDIR",
948 |             "LSF_SERVERDIR",
949 |             "LSF_FULL_VERSION",
950 |             "LSF_LIBDIR",
951 |             "LSF_BINDIR",
952 |         ]:
953 |             if key in os.environ and key not in env:
954 |                 env[key] = os.environ[key]
955 |         return env
956 |
957 |     def parse_job_id(self, output):
958 |         # Assumes output in the following form:
959 |         # "Job <1815> is submitted to default queue <normal>."
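        # Token [1] is "<1815>"; strip("<>") leaves the bare job id "1815".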
960 |         return output.split(" ")[1].strip("<>")
961 |
962 |     def state_ispending(self):
963 |         # Parse the STAT column of batch_query_cmd's output;
964 |         # PSUSP covers jobs suspended while pending.
965 |         if self.job_status:
966 |             return self.job_status.split(" ")[0].upper() in {"PEND", "PSUSP"}
967 |
968 |     def state_isrunning(self):
969 |         if self.job_status:
970 |             return self.job_status.split(" ")[0].upper() == "RUN"
971 |
972 |     def state_gethost(self):
973 |         if self.job_status:
974 |             return self.job_status.split(" ")[1].strip().split(":")[0]
975 |
976 |         self.log.error(
977 |             "Spawner unable to match host addr in job {0} with status {1}".format(
978 |                 self.job_id, self.job_status
979 |             )
980 |         )
981 |         return
982 |
983 |
984 | # vim: set ai expandtab softtabstop=4:
985 |
--------------------------------------------------------------------------------
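A minimal sketch of wiring one of these spawners into a Hub, assuming Slurm
is available; the req_* values below are illustrative assumptions, not
package defaults:

# jupyterhub_config.py
import batchspawner  # importing registers the batchspawner API handler

c.JupyterHub.spawner_class = "batchspawner.SlurmSpawner"
c.SlurmSpawner.req_partition = "interactive"  # assumed partition name
c.SlurmSpawner.req_runtime = "8:00:00"        # assumed walltime limit
c.SlurmSpawner.req_memory = "4gb"             # assumed memory request
c.SlurmSpawner.req_nprocs = "2"               # assumed CPU count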