├── .flake8
├── .github
│   ├── dependabot.yaml
│   └── workflows
│       ├── python-publish.yml
│       └── test.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── RELEASE.md
├── SPAWNERS.md
├── batchspawner
│   ├── __init__.py
│   ├── _version.py
│   ├── api.py
│   ├── batchspawner.py
│   ├── singleuser.py
│   └── tests
│       ├── __init__.py
│       ├── conftest.py
│       └── test_spawners.py
├── pyproject.toml
└── setup.py

--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
# flake8 is used for linting Python code, set up to run automatically with
# pre-commit.
#
# ref: https://flake8.pycqa.org/en/latest/user/configuration.html
#
[flake8]
# E: style errors
# W: style warnings
# C: complexity
# D: docstring warnings (unused pydocstyle extension)
ignore = E, C, W, D

--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
# dependabot.yaml reference: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
#
# Notes:
# - Status and logs from dependabot are provided at
#   https://github.com/jupyterhub/batchspawner/network/updates.
#
version: 2
updates:
  # Maintain dependencies in our GitHub Workflows
  - package-ecosystem: github-actions
    directory: /
    labels: [ci]
    schedule:
      interval: monthly
      time: "05:00"
      timezone: Etc/UTC

--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
#
name: Upload Python Package

on:
  release:
    types: [released]

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: install build package
        run: |
          pip install --upgrade pip
          pip install build
          pip freeze

      - name: build release
        run: |
          python -m build --sdist --wheel .
          ls -l dist
          sha256sum dist/* | tee SHA256SUMS

      - name: Publish to PyPI
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: |
          pip install twine
          twine upload --skip-existing dist/*

--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
# This is a GitHub workflow defining a set of jobs with a set of steps.
# ref: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
#
name: Test

on:
  pull_request:
    paths-ignore:
      - "**.md"
      - ".github/workflows/*.yaml"
      - "!.github/workflows/test.yaml"
  push:
    paths-ignore:
      - "**.md"
      - ".github/workflows/*.yaml"
      - "!.github/workflows/test.yaml"
    branches-ignore:
      - "dependabot/**"
      - "pre-commit-ci-update-config"
    tags: ["**"]
  workflow_dispatch:

jobs:
  pytest:
    name: Run pytest
    runs-on: ${{ matrix.runs-on || 'ubuntu-22.04' }}

    strategy:
      fail-fast: false
      matrix:
        include:
          # test oldest supported version
          - python-version: "3.6"
            pip-install-spec: "jupyterhub==1.5.1 sqlalchemy==1.*"
            runs-on: ubuntu-20.04 # python 3.6 is only available in 20.04

          - python-version: "3.7"
            pip-install-spec: "jupyterhub==2.* sqlalchemy==1.*"
          - python-version: "3.8"
            pip-install-spec: "jupyterhub==3.*"
          - python-version: "3.10"
            pip-install-spec: "jupyterhub==4.*"
          - python-version: "3.11"
            pip-install-spec: "jupyterhub==4.*"
          - python-version: "3.12"
            pip-install-spec: "jupyterhub==4.*"

          # test unreleased jupyterhub, failures tolerated
          - python-version: "3.X"
            pip-install-spec: "git+https://github.com/jupyterhub/jupyterhub"
            allow-failure: true

    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "lts/*"
      - uses: actions/setup-python@v5
        with:
          python-version: "${{ matrix.python-version }}"

      - name: Install Node dependencies
        run: |
          npm install -g configurable-http-proxy

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install ${{ matrix.pip-install-spec }}
          pip install -e ".[test]"

      - name: List dependencies
        run: |
          pip freeze

      - name: pytest
        run: |
          pytest

      # GitHub action reference: https://github.com/codecov/codecov-action
      - uses: codecov/codecov-action@v4

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.egg-info/
*.log
*.pyc
__pycache__/
.cache/
.coverage
.pytest_cache
*~

--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
# pre-commit is a tool to perform a predefined set of tasks manually and/or
# automatically before git commits are made.
#
# Config reference: https://pre-commit.com/#pre-commit-configyaml---top-level
#
# Common tasks
#
# - Run on all files:   pre-commit run --all-files
# - Register git hooks: pre-commit install --install-hooks
#
repos:
  # Autoformat: Python code, syntax patterns are modernized
  - repo: https://github.com/asottile/pyupgrade
    rev: v3.15.2
    hooks:
      - id: pyupgrade
        args:
          - --py38-plus

  # Autoformat: Python code
  - repo: https://github.com/PyCQA/autoflake
    rev: v2.3.1
    hooks:
      - id: autoflake
        # args ref: https://github.com/PyCQA/autoflake#advanced-usage
        args:
          - --in-place

  # Autoformat: Python code
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort

  # Autoformat: Python code
  - repo: https://github.com/psf/black
    rev: "24.4.2"
    hooks:
      - id: black

  # Autoformat: markdown, yaml
  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v4.0.0-alpha.8
    hooks:
      - id: prettier

  # Lint: Python code
  - repo: https://github.com/PyCQA/flake8
    rev: "7.0.0"
    hooks:
      - id: flake8

  # Misc...
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    # ref: https://github.com/pre-commit/pre-commit-hooks#hooks-available
    hooks:
      # Autoformat: Makes sure files end in a newline and only a newline.
      - id: end-of-file-fixer

      # Autoformat: Sorts entries in requirements.txt.
      - id: requirements-txt-fixer

      # Prevent giant (500kB) files from being committed.
      - id: check-added-large-files

      # Lint: Check for files with names that would conflict on a
      # case-insensitive filesystem like MacOS HFS+ or Windows FAT.
      - id: check-case-conflict

      # Lint: Checks that non-binary executables have a proper shebang.
      - id: check-executables-have-shebangs

# pre-commit.ci config reference: https://pre-commit.ci/#configuration
ci:
  autoupdate_schedule: monthly

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
# Changelog

## v1.3

### v1.3.0 - 2024-03-19

This release requires Python >=3.6 and JupyterHub >=1.5.1.

#### New features added

- allow for req_keepvars_extra to be configured [#295](https://github.com/jupyterhub/batchspawner/pull/295) ([@mark-tomich](https://github.com/mark-tomich), [@minrk](https://github.com/minrk))

#### Bugs fixed

- Remove `which jupyterhub-singleuser` command from `SlurmSpawner.batch_script` [#265](https://github.com/jupyterhub/batchspawner/pull/265) ([@t20100](https://github.com/t20100), [@consideRatio](https://github.com/consideRatio))

#### Maintenance and upkeep improvements

- TST: don't assume test user is OS user [#301](https://github.com/jupyterhub/batchspawner/pull/301) ([@minrk](https://github.com/minrk))
- Add python 3.12 for tests [#299](https://github.com/jupyterhub/batchspawner/pull/299) ([@Ph0tonic](https://github.com/Ph0tonic), [@consideRatio](https://github.com/consideRatio))
- maint: req py36+ and jh 1.5.1+, fix tests, add RELEASE.md, add pre-commit hooks, add dependabot [#273](https://github.com/jupyterhub/batchspawner/pull/273) ([@consideRatio](https://github.com/consideRatio), [@mbmilligan](https://github.com/mbmilligan), [@ryanlovett](https://github.com/ryanlovett), [@yuvipanda](https://github.com/yuvipanda), [@mahendrapaipuri](https://github.com/mahendrapaipuri))
- Upgrade singleuser.py to JupyterHub 4 [#267](https://github.com/jupyterhub/batchspawner/pull/267) ([@mahendrapaipuri](https://github.com/mahendrapaipuri), [@minrk](https://github.com/minrk), [@consideRatio](https://github.com/consideRatio))
- Remove reading/setting HubAuth SSL attributes in singeuser [#259](https://github.com/jupyterhub/batchspawner/pull/259) ([@cmd-ntrf](https://github.com/cmd-ntrf), [@consideRatio](https://github.com/consideRatio))
- Fix Slurm test used regular expression [#256](https://github.com/jupyterhub/batchspawner/pull/256) ([@t20100](https://github.com/t20100), [@consideRatio](https://github.com/consideRatio))
- Quell async warning, and POST with body for jupyterhub 3.0 [#247](https://github.com/jupyterhub/batchspawner/pull/247) ([@ryanlovett](https://github.com/ryanlovett), [@mbmilligan](https://github.com/mbmilligan), [@rcthomas](https://github.com/rcthomas), [@minrk](https://github.com/minrk), [@jbeal-work](https://github.com/jbeal-work), [@mawigh](https://github.com/mawigh), [@cmd-ntrf](https://github.com/cmd-ntrf), [@jaescartin1](https://github.com/jaescartin1))
- Improve submit_batch_script logging [#219](https://github.com/jupyterhub/batchspawner/pull/219) ([@cmd-ntrf](https://github.com/cmd-ntrf), [@consideRatio](https://github.com/consideRatio), [@mbmilligan](https://github.com/mbmilligan))

#### Documentation improvements

- Add temporary info about a temporary bug with JupyterHub 3+ [#290](https://github.com/jupyterhub/batchspawner/pull/290) ([@krokicki](https://github.com/krokicki), [@consideRatio](https://github.com/consideRatio))

#### Continuous integration improvements

- Modernize test matrix [#252](https://github.com/jupyterhub/batchspawner/pull/252) ([@mbmilligan](https://github.com/mbmilligan))

#### Contributors to this release

The following people contributed discussions, new ideas, code and documentation contributions, and review.
See [our definition of contributors](https://github-activity.readthedocs.io/en/latest/#how-does-this-tool-define-contributions-in-the-reports).

([GitHub contributors page for this release](https://github.com/jupyterhub/batchspawner/graphs/contributors?from=2022-10-05&to=2024-03-19&type=c))

@basnijholt ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Abasnijholt+updated%3A2022-10-05..2024-03-19&type=Issues)) | @cmd-ntrf ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Acmd-ntrf+updated%3A2022-10-05..2024-03-19&type=Issues)) | @consideRatio ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3AconsideRatio+updated%3A2022-10-05..2024-03-19&type=Issues)) | @jaescartin1 ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Ajaescartin1+updated%3A2022-10-05..2024-03-19&type=Issues)) | @jbeal-work ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Ajbeal-work+updated%3A2022-10-05..2024-03-19&type=Issues)) | @krokicki ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Akrokicki+updated%3A2022-10-05..2024-03-19&type=Issues)) | @mahendrapaipuri ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Amahendrapaipuri+updated%3A2022-10-05..2024-03-19&type=Issues)) | @mark-tomich ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Amark-tomich+updated%3A2022-10-05..2024-03-19&type=Issues)) | @mawigh ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Amawigh+updated%3A2022-10-05..2024-03-19&type=Issues)) | @mbmilligan ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Ambmilligan+updated%3A2022-10-05..2024-03-19&type=Issues)) | @minrk ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Aminrk+updated%3A2022-10-05..2024-03-19&type=Issues)) | @opoplawski ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Aopoplawski+updated%3A2022-10-05..2024-03-19&type=Issues)) | @Ph0tonic ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3APh0tonic+updated%3A2022-10-05..2024-03-19&type=Issues)) | @rcthomas ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Arcthomas+updated%3A2022-10-05..2024-03-19&type=Issues)) | @ryanlovett ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Aryanlovett+updated%3A2022-10-05..2024-03-19&type=Issues)) | @t20100 ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3At20100+updated%3A2022-10-05..2024-03-19&type=Issues)) | @yuvipanda ([activity](https://github.com/search?q=repo%3Ajupyterhub%2Fbatchspawner+involves%3Ayuvipanda+updated%3A2022-10-05..2024-03-19&type=Issues))

## v1.2

### v1.2.0 - 2022-10-04

Changed

- PR #237: Replace use of scripts with entry_points
- PR #208 #238 #239 #240 #241: updates to CI - bumping versions and aligning with Jupyterhub standards
- PR #220: remove code supporting Jupyterhub earlier than 0.9

Fixed

- PR #229: LSF jobs with multiple slots display each hostname ':' separated

## v1.1

### v1.1.0 - 2021-04-07

Added (user)

- PR #170: SlurmSpawner: add `req_gres` to specify `--gres`.
- PR #137: GridEngineSpawner: spawner will now add the following system environment values to the spawner environment, in accordance with the Univa Admin Guide: `SGE_CELL`, `SGE_EXECD`, `SGE_ROOT`, `SGE_CLUSTER_NAME`, `SGE_QMASTER_PORT`, `SGE_EXECD_PORT`, `PATH`

Added (developer)

- PR #187: support for unknown job state

Changed

- PR #177: Fail on first error in batch script by setting `set -e` in script templates.
- PR #165: SlurmSpawner: Update template to use `--chdir` instead of `--workdir`. Users of Slurm older than 17.11 may need to revert this locally.
- PR #189: remove bashism from default script template
- PR #195: fix exception handling in run_command
- PR #198: change from Travis to gh-actions for testing
- PR #196: documentation
- PR #199: update setup.py

## v1.0

### v1.0.1 - 2020-11-04

- PR #189: batchspawner/batchspawner: Don't use `-o pipefail` in /bin/sh scripts
- PR #180: travis: Attempt to fix CI
- PR #177: Fail hard on first error in batch script
- PR #170: add 'gres' option to SlurmSpawner
- PR #165: Update batchspawner.py to use --chdir instead of --workdir
- PR #137: Grab environment variables needed for grid engine

### v1.0.0 - 2020-07-21

This release requires minimum JupyterHub 0.9 and Python 3.5.

Added (user)

- Add support for JupyterHub named servers. #167
- Add Jinja2 templating as an option for all scripts and commands. If `{{` or `{%` is used anywhere in the string, it is used as a jinja2 template.
- Add new option exec_prefix, which defaults to `sudo -E -u {username}`. This replaces explicit `sudo` in every batch command - changes in local commands may be needed.
- New option: `req_keepvars_extra`, which allows keeping extra variables in addition to what is defined by JupyterHub itself (addition of variables to keep instead of replacement). #99
- Add `req_prologue` and `req_epilogue` options to scripts which are inserted before/after the main jupyterhub-singleuser command, which allow for generic setup/cleanup without overriding the entire script. #96
- SlurmSpawner: add the `req_reservation` option. #91
- Add basic support for JupyterHub progress updates, but this is not used much yet. #86

Added (developer)

- Add many more tests.
- Add a new page `SPAWNERS.md` with information on specific spawners. Begin trying to collect a list of spawner-specific contacts. #97
- Rename `current_ip` and `current_port` commands to `ip` and `port`. No user impact. #139
- Update to Python 3.5 `async` / `await` syntax to support JupyterHub progress updates. #90

Changed

- PR #58 and #141 change the logic of port selection, so that it is selected _after_ the singleuser server starts. This means that the port number has to be conveyed back to JupyterHub. This requires the following changes:
  - `jupyterhub_config.py` _must_ explicitly import `batchspawner`
  - Add a new option `batchspawner_singleuser_cmd` which is used as a wrapper in the single-user servers, which conveys the remote port back to JupyterHub. This is now an integral part of the spawn process.
  - If you have installed with `pip install -e`, you will have to re-install so that the new script `batchspawner-singleuser` is added to `$PATH`.
- Update minimum requirements to JupyterHub 0.9 and Python 3.5. #143
- Update Slurm batch script. Now, the single-user notebook is run in a job step, with a wrapper of `srun`. This may need to be removed using `req_srun=''` if you don't want environment variables limited.
- Pass the environment dictionary to the queue and cancel commands as well. This is mostly user environment, but may be useful to these commands in some cases. #108, #111 If these environment variables were used for authentication as an admin, be aware that there are pre-existing security issues because they may be passed to the user via the batch submit command, see #82.

Fixed

- Improve debugging on failed submission by raising errors including error messages from the commands. #106
- Many other non-user or developer visible changes. #107 #106 #100
- In Travis CI, blacklist jsonschema=3.0.0a1 because it breaks tests

Removed

## v0.8

### v0.8.1 - 2018-05-02

- Fix regression: single-user server binding address is overwritten by previous session server address, resulting in failure to start. Issue #76

### v0.8.0 - 2018-04-24

This release is compatible with JupyterHub 0.5.0 through 0.8.1/0.9dev.

- SlurmSpawner: Remove `--uid` for (at least) Slurm 17.11 compatibility. If you use `sudo`, this should not be necessary, but because this is security related you should check that user management is as you expect. If your configuration does not use `sudo` then you may need to add the `--uid` option in a custom `batch_script`.
- add base options `req_ngpus` `req_partition` `req_account` and `req_options`
- Fix up logging
- Merge `user_options` with the template substitution vars instead of having it as a separate key
- Update ip/port handling for JupyterHub 0.8
- Add `LICENSE` (BSD3) and `CONTRIBUTING.md`
- Add `LsfSpawner` for IBM LSF
- Add `MultiSlurmSpawner`
- Add `MoabSpawner`
- Add `condorSpawner`
- Add `GridEngineSpawner`
- SlurmSpawner: add `req_qos` option
- WrapSpawner and ProfilesSpawner, which provide mechanisms for runtime configuration of spawners, have been split out and moved to the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner) package
- Enable CI testing via Travis-CI

## v0.3

### v0.3.0 - 2015-11-30

- initial release containing `TorqueSpawner` and `SlurmSpawner`

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing

Welcome! As a [Jupyter](https://jupyter.org) project, we follow the [Jupyter contributor guide](https://jupyter.readthedocs.io/en/latest/contributing/content-contributor.html).

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2017, Project Jupyter Contributors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.md
include LICENSE

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# batchspawner for Jupyterhub

[![Latest PyPI version](https://img.shields.io/pypi/v/batchspawner?logo=pypi)](https://pypi.python.org/pypi/batchspawner)
[![Latest conda-forge version](https://img.shields.io/conda/vn/conda-forge/batchspawner?logo=conda-forge)](https://anaconda.org/conda-forge/batchspawner)
[![GitHub Workflow Status - Test](https://img.shields.io/github/actions/workflow/status/jupyterhub/batchspawner/test.yaml?logo=github&label=tests)](https://github.com/jupyterhub/batchspawner/actions)
[![Test coverage of code](https://codecov.io/gh/jupyterhub/batchspawner/branch/main/graph/badge.svg)](https://codecov.io/gh/jupyterhub/batchspawner)
[![Issue tracking - GitHub](https://img.shields.io/badge/issue_tracking-github-blue?logo=github)](https://github.com/jupyterhub/batchspawner/issues)
[![Help forum - Discourse](https://img.shields.io/badge/help_forum-discourse-blue?logo=discourse)](https://discourse.jupyter.org/c/jupyterhub)
[![Contribute](https://img.shields.io/badge/I_want_to_contribute!-grey?logo=jupyter)](https://github.com/jupyterhub/batchspawner/blob/master/CONTRIBUTING.md)

This is a custom spawner for [Jupyterhub](https://jupyterhub.readthedocs.io/) that is designed for installations on clusters using batch scheduling software.

This began as a generalization of [mkgilbert's batchspawner](https://github.com/mkgilbert/slurmspawner) which in turn was inspired by [Andrea Zonca's blog post](http://zonca.github.io/2015/04/jupyterhub-hpc.html "Run jupyterhub on a Supercomputer") where he explains his implementation for a spawner that uses SSH and Torque. His github repo is found [here](http://www.github.com/zonca/remotespawner "RemoteSpawner").

This package formerly included WrapSpawner and ProfilesSpawner, which provide mechanisms for runtime configuration of spawners. These have been split out and moved to the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner) package.

## Installation

1. From the root directory of this repo (where setup.py is), run `pip install -e .`

   If you don't actually need an editable version, you can simply run
   `pip install batchspawner`

2. Add lines in jupyterhub_config.py for the spawner you intend to use, e.g.

   ```python
   c = get_config()
   c.JupyterHub.spawner_class = 'batchspawner.TorqueSpawner'
   import batchspawner # Even though not used, needed to register batchspawner interface
   ```

3. Depending on the spawner, additional configuration will likely be needed.

## Batch Spawners

For information on the specific spawners, see [SPAWNERS.md](SPAWNERS.md).

### Overview

This file contains an abstraction layer for batch job queueing systems (`BatchSpawnerBase`), and implements
Jupyterhub spawners for Torque, Moab, SLURM, SGE, HTCondor, LSF, and eventually others.
Common attributes of batch submission / resource manager environments will include notions of:

- queue names, resource manager addresses
- resource limits including runtime, number of processes, memory
- singleuser child process running on (usually remote) host not known until runtime
- job submission and monitoring via resource manager utilities
- remote execution via submission of templated scripts
- job names instead of PIDs

`BatchSpawnerBase` provides several general mechanisms:

- configurable traits `req_foo` that are exposed as `{foo}` in job template scripts. Templates (submit scripts in particular) may also use the full power of [jinja2](http://jinja.pocoo.org/). Templates are automatically detected if a `{{` or `{%` is present, otherwise str.format() is used.
- configurable command templates for submitting/querying/cancelling jobs
- a generic concept of job-ID and ID-based job state tracking
- overrideable hooks for subclasses to plug in logic at numerous points

### Example

Every effort has been made to accommodate highly diverse systems through configuration
only. This example consists of the (lightly edited) configuration used by the author
to run Jupyter notebooks on an academic supercomputer cluster.

```python
# Select the Torque backend and increase the timeout since batch jobs may take time to start
import batchspawner
c.JupyterHub.spawner_class = 'batchspawner.TorqueSpawner'
c.Spawner.http_timeout = 120

#------------------------------------------------------------------------------
# BatchSpawnerBase configuration
# These are simply setting parameters used in the job script template below
#------------------------------------------------------------------------------
c.BatchSpawnerBase.req_nprocs = '2'
c.BatchSpawnerBase.req_queue = 'mesabi'
c.BatchSpawnerBase.req_host = 'mesabi.xyz.edu'
c.BatchSpawnerBase.req_runtime = '12:00:00'
c.BatchSpawnerBase.req_memory = '4gb'
#------------------------------------------------------------------------------
# TorqueSpawner configuration
# The script below is nearly identical to the default template, but we needed
# to add a line for our local environment. For most sites the default templates
# should be a good starting point.
#------------------------------------------------------------------------------
c.TorqueSpawner.batch_script = '''#!/bin/sh
#PBS -q {queue}@{host}
#PBS -l walltime={runtime}
#PBS -l nodes=1:ppn={nprocs}
#PBS -l mem={memory}
#PBS -N jupyterhub-singleuser
#PBS -v {keepvars}
module load python3
{cmd}
'''
# For our site we need to munge the execution hostname returned by qstat
c.TorqueSpawner.state_exechost_exp = r'int-\1.mesabi.xyz.edu'
```

### Security

Unless otherwise stated for a specific spawner, assume that spawners
_do_ evaluate the shell environment for users, and thus the [security
requirements of JupyterHub for untrusted
users](https://jupyterhub.readthedocs.io/en/stable/reference/websecurity.html)
are not fulfilled: most spawners start a user shell, which executes
arbitrary user environment configuration (`.profile`, `.bashrc`, and
the like) unless users have no access to their own cluster user
account. This is something we are working on.

## Provide different configurations of BatchSpawner

### Overview

`ProfilesSpawner`, available as part of the [`wrapspawner`](https://github.com/jupyterhub/wrapspawner)
package, allows the Jupyterhub administrator to define a set of different spawning configurations,
both different spawners and different configurations of the same spawner.
The user is then presented a dropdown menu for choosing the most suitable configuration for their needs.

This method provides an easy and safe way to provide different configurations of `BatchSpawner` to the
users, see an example below.

### Example

The following is based on the author's configuration (at the same site as the example above)
showing how to give users access to multiple job configurations on the batch scheduled
clusters, as well as an option to run a local notebook directly on the jupyterhub server.

```python
# Same initial setup as the previous example
import batchspawner
c.JupyterHub.spawner_class = 'wrapspawner.ProfilesSpawner'
c.Spawner.http_timeout = 120
#------------------------------------------------------------------------------
# BatchSpawnerBase configuration
# Providing default values that we may omit in the profiles
#------------------------------------------------------------------------------
c.BatchSpawnerBase.req_host = 'mesabi.xyz.edu'
c.BatchSpawnerBase.req_runtime = '12:00:00'
c.TorqueSpawner.state_exechost_exp = r'in-\1.mesabi.xyz.edu'
#------------------------------------------------------------------------------
# ProfilesSpawner configuration
#------------------------------------------------------------------------------
# List of profiles to offer for selection. Signature is:
#   List(Tuple( Unicode, Unicode, Type(Spawner), Dict ))
# corresponding to profile display name, unique key, Spawner class,
# dictionary of spawner config options.
#
# The first three values will be exposed in the input_template as {display},
# {key}, and {type}
#
c.ProfilesSpawner.profiles = [
    ( "Local server", 'local', 'jupyterhub.spawner.LocalProcessSpawner', {'ip':'0.0.0.0'} ),
    ('Mesabi - 2 cores, 4 GB, 8 hours', 'mesabi2c4g12h', 'batchspawner.TorqueSpawner',
        dict(req_nprocs='2', req_queue='mesabi', req_runtime='8:00:00', req_memory='4gb')),
    ('Mesabi - 12 cores, 128 GB, 4 hours', 'mesabi128gb', 'batchspawner.TorqueSpawner',
        dict(req_nprocs='12', req_queue='ram256g', req_runtime='4:00:00', req_memory='125gb')),
    ('Mesabi - 2 cores, 4 GB, 24 hours', 'mesabi2c4gb24h', 'batchspawner.TorqueSpawner',
        dict(req_nprocs='2', req_queue='mesabi', req_runtime='24:00:00', req_memory='4gb')),
    ('Interactive Cluster - 2 cores, 4 GB, 8 hours', 'lab', 'batchspawner.TorqueSpawner',
        dict(req_nprocs='2', req_host='labhost.xyz.edu', req_queue='lab',
            req_runtime='8:00:00', req_memory='4gb', state_exechost_exp='')),
]
c.ProfilesSpawner.ip = '0.0.0.0'
```

## Debugging batchspawner

Sometimes it can be hard to debug batchspawner, but it's not really
hard once you know how the pieces interact. Check the following places for
error messages:

- Check the JupyterHub logs for errors.

- Check the JupyterHub logs for the batch script that got submitted
  and the command used to submit it. Are these correct? (Note that
  there are submission environment variables too, which aren't
  displayed.)

- At this point, it's a matter of checking the batch system. Is the
  job ever scheduled? Does it run? Does it succeed? Check the batch
  system status and output of the job. The most common failure
  patterns are a) the job never starting due to bad scheduler options,
  b) the job waiting in the queue beyond the `start_timeout`, causing
  JupyterHub to kill the job.

- At this point the job starts. Does it fail immediately, or before
  Jupyter starts? Check the scheduler output files (stdout/stderr of
  the job), wherever they are stored. To debug the job script, you can
  add debugging into the batch script, such as an `env` or `set -x`.

- At this point Jupyter itself starts - check its error messages. Is
  it starting with the right options? Can it communicate with the
  hub? At this point there usually isn't anything
  batchspawner-specific, with the one exception below. The error log
  would be in the batch script output (same file as above). There may
  also be clues in the JupyterHub logfile.
- Are you running on an NFS filesystem? It's possible for Jupyter to
  experience issues due to varying implementations of the fcntl() system
  call. (See also [Jupyterhub-Notes and Tips: SQLite](https://jupyterhub.readthedocs.io/en/latest/reference/database.html?highlight=NFS#sqlite))

Common problems:

- Did you `import batchspawner` in the `jupyterhub_config.py` file?
  This is needed in order to activate the batchspawner API in
  JupyterHub.

## Changelog

See [CHANGELOG.md](CHANGELOG.md).

--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
# How to make a release

`batchspawner` is a package available on [PyPI] and on [conda-forge].

These are the instructions on how to make a release.

## Pre-requisites

- Push rights to this GitHub repository

## Steps to make a release

1. Create a PR updating `CHANGELOG.md` with [github-activity] and continue when
   it's merged.

   Advice on this procedure can be found in [this team compass
   issue](https://github.com/jupyterhub/team-compass/issues/563).

2. Checkout main and make sure it is up to date.

   ```shell
   git checkout main
   git fetch origin main
   git reset --hard origin/main
   ```

3. Update the version, make commits, and push a git tag with `tbump`.

   ```shell
   pip install tbump
   ```

   `tbump` will ask for confirmation before doing anything.

   ```shell
   # Example versions to set: 1.0.0, 1.0.0b1
   VERSION=
   tbump ${VERSION}
   ```

   Following this, the [CI system] will build and publish a release.

4. Reset the version back to dev, e.g. `1.0.1.dev` after releasing `1.0.0`.

   ```shell
   # Example version to set: 1.0.1.dev
   NEXT_VERSION=
   tbump --no-tag ${NEXT_VERSION}.dev
   ```

5. Following the release to PyPI, an automated PR should arrive within 24 hours
   to [conda-forge/batchspawner-feedstock] with instructions on releasing to
   conda-forge. You are welcome to volunteer to do this, but it isn't required
   as part of making this release to PyPI.

[github-activity]: https://github.com/executablebooks/github-activity
[pypi]: https://pypi.org/project/batchspawner/
[ci system]: https://github.com/jupyterhub/batchspawner/actions/workflows/release.yaml
[conda-forge]: https://anaconda.org/conda-forge/batchspawner
[conda-forge/batchspawner-feedstock]: https://github.com/conda-forge/batchspawner-feedstock

--------------------------------------------------------------------------------
/SPAWNERS.md:
--------------------------------------------------------------------------------
# Notes on specific spawners

**Spawner maintainers**: Included below are "spawner maintainers",
when available. There aren't official obligations, but the general
idea is that you should watch the repository and feel especially
empowered to comment on issues when you think it might be relevant to
you (obviously everyone should be, but this is our attempt at even
more outreach). You should let us know when we break something and
provide a diversity of opinions in general. Submitting PRs and
testing is nice but not required.

To be listed as a maintainer, just submit an issue or PR adding you,
and please watch the repository on Github.

## `TorqueSpawner`

Maintainers:

## `MoabSpawner`

Subclass of TorqueSpawner

Maintainers:

## `SlurmSpawner`

Maintainers: @rkdarst

This spawner enforces the environment if `srun` is used to wrap the
spawner command, which is the default. If you _do_ want the user
environment to be used, set `req_srun=''`. However, this is not
perfect: there is still a bash shell begun as the user which could run
arbitrary startup, define shell aliases for `srun`, etc.

Use of `srun` is required for the singleuser server to terminate gracefully.
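
For example, a minimal `jupyterhub_config.py` sketch (values are illustrative,
not a recommendation - see the caveats above before disabling `srun`):

```python
c.JupyterHub.spawner_class = "batchspawner.SlurmSpawner"

# Default behaviour: the singleuser command is wrapped in srun, which
# enforces the environment that JupyterHub passes to the job.
# To opt out and let the user's own shell environment apply, clear req_srun:
# c.SlurmSpawner.req_srun = ""
```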

## `GridengineSpawner`

Maintainers:

## `CondorSpawner`

Maintainers:

## `LsfSpawner`

Maintainers:

# Checklist for making spawners

Please document each of these things under the spawner list above -
even if it is "OK", we need to track the status of all spawners. If it
is a bug, users really need to know.

- Does your spawner read the shell environment before starting? (See
  [Jupyterhub
  Security](https://jupyterhub.readthedocs.io/en/stable/reference/websecurity.html).)

- Does your spawner send SIGTERM to the jupyterhub-singleuser process
  before SIGKILL? It should, so that the process can terminate
  gracefully. Add `echo "terminated gracefully"` to the end of the
  batch script - if you see this in your singleuser server output, you
  know that you DO receive SIGTERM and terminate gracefully. If your
  batch system cannot automatically send SIGTERM before SIGKILL, PR
  #75 might help here - ask for it to be finished.

--------------------------------------------------------------------------------
/batchspawner/__init__.py:
--------------------------------------------------------------------------------
from . import api  # noqa
from ._version import __version__, version_info  # noqa
from .batchspawner import *  # noqa

--------------------------------------------------------------------------------
/batchspawner/_version.py:
--------------------------------------------------------------------------------
# __version__ should be updated using tbump, based on configuration in
# pyproject.toml, according to instructions in RELEASE.md.
#
__version__ = "1.3.1.dev"

# version_info looks like (1, 2, 3, "dev") if __version__ is 1.2.3.dev
version_info = tuple(int(p) if p.isdigit() else p for p in __version__.split("."))

--------------------------------------------------------------------------------
/batchspawner/api.py:
--------------------------------------------------------------------------------
import json

from jupyterhub.apihandlers import APIHandler, default_handlers
from tornado import web


class BatchSpawnerAPIHandler(APIHandler):
    @web.authenticated
    def post(self):
        """POST set user spawner data"""
        if hasattr(self, "current_user"):
            # JupyterHub compatibility (September 2018, d79a99323ef1d)
            user = self.current_user
        else:
            # Previous jupyterhub, 0.9.4 and before.
            user = self.get_current_user()
        token = self.get_auth_token()
        spawner = None
        for s in user.spawners.values():
            if s.api_token == token:
                spawner = s
                break
        data = self.get_json_body()
        for key, value in data.items():
            if hasattr(spawner, key):
                setattr(spawner, key, value)
        self.finish(json.dumps({"message": "BatchSpawner data configured"}))
        self.set_status(201)


default_handlers.append((r"/api/batchspawner", BatchSpawnerAPIHandler))
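
# Illustrative note (an assumed example, not part of the original module): the
# `batchspawner-singleuser` wrapper script POSTs JSON such as {"port": 40177}
# to this endpoint using the spawner's API token; the handler above copies any
# matching keys onto the Spawner instance, which is how the port picked at
# runtime on the remote host gets conveyed back to JupyterHub.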

--------------------------------------------------------------------------------
/batchspawner/batchspawner.py:
--------------------------------------------------------------------------------
# Copyright (c) Regents of the University of Minnesota
# Copyright (c) Michael Gilbert
# Distributed under the terms of the Modified BSD License.

"""Batch spawners

This file contains an abstraction layer for batch job queueing systems, and implements
Jupyterhub spawners for Torque, SLURM, and eventually others.

Common attributes of batch submission / resource manager environments will include notions of:
* queue names, resource manager addresses
* resource limits including runtime, number of processes, memory
* singleuser child process running on (usually remote) host not known until runtime
* job submission and monitoring via resource manager utilities
* remote execution via submission of templated scripts
* job names instead of PIDs
"""
import asyncio
import os
import pwd
import re
import xml.etree.ElementTree as ET
from enum import Enum

from jinja2 import Template
from jupyterhub.spawner import Spawner, set_user_setuid
from traitlets import Dict, Float, Integer, Unicode, default


def format_template(template, *args, **kwargs):
    """Format a template, either using jinja2 or str.format().

    Use jinja2 if the template is a jinja2.Template, or contains '{{' or
    '{%'. Otherwise, use str.format() for backwards compatibility with
    old scripts (but you can't mix them).
    """
    if isinstance(template, Template):
        return template.render(*args, **kwargs)
    elif "{{" in template or "{%" in template:
        return Template(template).render(*args, **kwargs)
    return template.format(*args, **kwargs)


class JobStatus(Enum):
    NOTFOUND = 0
    RUNNING = 1
    PENDING = 2
    UNKNOWN = 3


class BatchSpawnerBase(Spawner):
    """Base class for spawners using resource manager batch job submission mechanisms

    This base class is developed targeting the TorqueSpawner and SlurmSpawner, so by default
    assumes a qsub-like command that reads a script from its stdin for starting jobs,
    a qstat-like command that outputs some data that can be parsed to check if the job is running
    and on what remote node, and a qdel-like command to cancel a job. The goal is to be
    sufficiently general that a broad range of systems can be supported with minimal overrides.

    At minimum, subclasses should provide reasonable defaults for the traits:
        batch_script
        batch_submit_cmd
        batch_query_cmd
        batch_cancel_cmd

    and must provide implementations for the methods:
        state_ispending
        state_isrunning
        state_gethost
    """

    # override default since batch systems typically need longer
    start_timeout = Integer(300).tag(config=True)

    # override default server ip since batch jobs normally running remotely
    ip = Unicode(
        "0.0.0.0",
        help="Address for singleuser server to listen at",
    ).tag(config=True)

    exec_prefix = Unicode(
        "sudo -E -u {username}",
        help="Standard execution prefix (e.g. the default sudo -E -u {username})",
    ).tag(config=True)

    # all these req_foo traits will be available as substvars for templated strings
    req_queue = Unicode(
        "",
        help="Queue name to submit job to resource manager",
    ).tag(config=True)

    req_host = Unicode(
        "",
        help="Host name of batch server to submit job to resource manager",
    ).tag(config=True)

    req_memory = Unicode(
        "",
        help="Memory to request from resource manager",
    ).tag(config=True)

    req_nprocs = Unicode(
        "",
        help="Number of processors to request from resource manager",
    ).tag(config=True)

    req_ngpus = Unicode(
        "",
        help="Number of GPUs to request from resource manager",
    ).tag(config=True)

    req_runtime = Unicode(
        "",
        help="Length of time for submitted job to run",
    ).tag(config=True)

    req_partition = Unicode(
        "",
        help="Partition name to submit job to resource manager",
    ).tag(config=True)

    req_account = Unicode(
        "",
        help="Account name string to pass to the resource manager",
    ).tag(config=True)

    req_options = Unicode(
        "",
        help="Other options to include into job submission script",
    ).tag(config=True)

    req_prologue = Unicode(
        "",
        help="Script to run before single user server starts.",
    ).tag(config=True)

    req_epilogue = Unicode(
        "",
        help="Script to run after single user server ends.",
    ).tag(config=True)

    req_username = Unicode()

    @default("req_username")
    def _req_username_default(self):
        return self.user.name

    # Useful IF getpwnam on submit host returns correct info for exec host
    req_homedir = Unicode()

    @default("req_homedir")
    def _req_homedir_default(self):
        return pwd.getpwnam(self.user.name).pw_dir

    req_keepvars = Unicode()

    @default("req_keepvars")
    def _req_keepvars_default(self):
        return ",".join(self.get_env().keys())

    req_keepvars_extra = Unicode(
        help="Extra environment variables which should be configured, "
        "added to the defaults in keepvars, "
        "comma separated list.",
    ).tag(config=True)

    batch_script = Unicode(
        "",
        help="Template for job submission script. Traits on this class named like req_xyz "
        "will be substituted in the template for {xyz} using string.Formatter. "
        "Must include {cmd} which will be replaced with the jupyterhub-singleuser command line.",
    ).tag(config=True)

    batchspawner_singleuser_cmd = Unicode(
        "batchspawner-singleuser",
        help="A wrapper which is capable of special batchspawner setup: currently sets the port on "
        "the remote host. Not needed to be set under normal circumstances, unless path needs "
        "specification.",
    ).tag(config=True)

    # Raw output of job submission command unless overridden
    job_id = Unicode()

    # Will get the raw output of the job status command unless overridden
    job_status = Unicode()

    # Prepare substitution variables for templates using req_xyz traits
    def get_req_subvars(self):
        reqlist = [t for t in self.trait_names() if t.startswith("req_")]
        subvars = {}
        for t in reqlist:
            subvars[t[4:]] = getattr(self, t)
        if subvars.get("keepvars_extra"):
            subvars["keepvars"] += "," + subvars["keepvars_extra"]
        return subvars

    batch_submit_cmd = Unicode(
        "",
        help="Command to run to submit batch scripts. Formatted using req_xyz traits as {xyz}.",
    ).tag(config=True)

    def parse_job_id(self, output):
        "Parse output of submit command to get job id."
        return output

    def cmd_formatted_for_batch(self):
        """The command which is substituted inside of the batch script"""
        return " ".join([self.batchspawner_singleuser_cmd] + self.cmd + self.get_args())

    async def run_command(self, cmd, input=None, env=None):
        proc = await asyncio.create_subprocess_shell(
            cmd,
            env=env,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        inbytes = None

        if input:
            inbytes = input.encode()

        try:
            out, eout = await proc.communicate(input=inbytes)
        except:
            self.log.debug("Exception raised when trying to run command: %s" % cmd)
            proc.kill()
            self.log.debug("Running command failed, killed process.")
            try:
                out, eout = await asyncio.wait_for(proc.communicate(), timeout=2)
                out = out.decode().strip()
                eout = eout.decode().strip()
                self.log.error("Subprocess returned exitcode %s" % proc.returncode)
                self.log.error("Stdout:")
                self.log.error(out)
                self.log.error("Stderr:")
                self.log.error(eout)
                raise RuntimeError(f"{cmd} exit status {proc.returncode}: {eout}")
            except asyncio.TimeoutError:
                self.log.error(
                    "Encountered timeout trying to clean up command, process probably killed already: %s"
                    % cmd
                )
                return ""
            except:
                self.log.error(
                    "Encountered exception trying to clean up command: %s" % cmd
                )
                raise
        else:
            eout = eout.decode().strip()
            err = proc.returncode
            if err != 0:
                self.log.error("Subprocess returned exitcode %s" % err)
                self.log.error(eout)
                raise RuntimeError(eout)

        out = out.decode().strip()
        return out

    async def _get_batch_script(self, **subvars):
        """Format batch script from vars"""
        # Could be overridden by subclasses, but mainly useful for testing
        return format_template(self.batch_script, **subvars)

    async def submit_batch_script(self):
        subvars = self.get_req_subvars()
        # `cmd` is submitted to the batch system
        cmd = " ".join(
            (
                format_template(self.exec_prefix, **subvars),
                format_template(self.batch_submit_cmd, **subvars),
            )
        )
        # `subvars['cmd']` is what is run _inside_ the batch script,
        # put into the template.
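        # Illustrative example (assumed values, they vary by deployment): with
        # the defaults this joins batchspawner_singleuser_cmd, self.cmd, and
        # self.get_args() into something like
        #   batchspawner-singleuser jupyterhub-singleuser --port=...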
        subvars["cmd"] = self.cmd_formatted_for_batch()
        if hasattr(self, "user_options"):
            subvars.update(self.user_options)
        script = await self._get_batch_script(**subvars)
        self.log.info("Spawner script options: %s", subvars)
        self.log.info("Spawner submitting command: %s", cmd)
        self.log.debug("Spawner submitting script:\n%s", script)
        self.log.debug("Spawner submitting environment: %s", self.get_env())
        out = await self.run_command(cmd, input=script, env=self.get_env())
        try:
            self.log.info("Job submitted. output: %s", out)
            self.job_id = self.parse_job_id(out)
        except:
            self.log.error("Job submission failed. exit code: %s", out)
            self.job_id = ""
        return self.job_id

    # Override if your batch system needs something more elaborate to query the job status
    batch_query_cmd = Unicode(
        "",
        help="Command to run to query job status. Formatted using req_xyz traits as {xyz} "
        "and self.job_id as {job_id}.",
    ).tag(config=True)

    async def query_job_status(self):
        """Check job status, return JobStatus object."""
        if self.job_id is None or len(self.job_id) == 0:
            self.job_status = ""
            return JobStatus.NOTFOUND
        subvars = self.get_req_subvars()
        subvars["job_id"] = self.job_id
        cmd = " ".join(
            (
                format_template(self.exec_prefix, **subvars),
                format_template(self.batch_query_cmd, **subvars),
            )
        )
        self.log.debug("Spawner querying job: " + cmd)
        try:
            self.job_status = await self.run_command(cmd)
        except RuntimeError as e:
            # e.args[0] is stderr from the process
            self.job_status = e.args[0]
        except Exception:
            self.log.error("Error querying job " + self.job_id)
            self.job_status = ""

        if self.state_isrunning():
            return JobStatus.RUNNING
        elif self.state_ispending():
            return JobStatus.PENDING
        elif self.state_isunknown():
            return JobStatus.UNKNOWN
        else:
            return JobStatus.NOTFOUND

    batch_cancel_cmd = Unicode(
        "",
        help="Command to stop/cancel a previously submitted job. Formatted like batch_query_cmd.",
    ).tag(config=True)

    async def cancel_batch_job(self):
        subvars = self.get_req_subvars()
        subvars["job_id"] = self.job_id
        cmd = " ".join(
            (
                format_template(self.exec_prefix, **subvars),
                format_template(self.batch_cancel_cmd, **subvars),
            )
        )
        self.log.info("Cancelling job " + self.job_id + ": " + cmd)
        await self.run_command(cmd)

    def load_state(self, state):
        """load job_id from state"""
        super().load_state(state)
        self.job_id = state.get("job_id", "")
        self.job_status = state.get("job_status", "")

    def get_state(self):
        """add job_id to state"""
        state = super().get_state()
        if self.job_id:
            state["job_id"] = self.job_id
        if self.job_status:
            state["job_status"] = self.job_status
        return state

    def clear_state(self):
        """clear job_id state"""
        super().clear_state()
        self.job_id = ""
        self.job_status = ""

    def make_preexec_fn(self, name):
        """make preexec fn to change uid (if running as root) before job submission"""
        return set_user_setuid(name)

    def state_ispending(self):
        "Return boolean indicating if job is still waiting to run, likely by parsing self.job_status"
        raise NotImplementedError("Subclass must provide implementation")

    def state_isrunning(self):
        "Return boolean indicating if job is running, likely by parsing self.job_status"
        raise NotImplementedError("Subclass must provide implementation")

    def state_isunknown(self):
        "Return boolean indicating if job state retrieval failed because of the resource manager"
        return None

    def state_gethost(self):
        "Return string, hostname or addr of running job, likely by parsing self.job_status"
        raise NotImplementedError("Subclass must provide implementation")

    async def poll(self):
        """Poll the process"""
        status = await self.query_job_status()
        if status in (JobStatus.PENDING, JobStatus.RUNNING, JobStatus.UNKNOWN):
            return None
        else:
            self.clear_state()
            return 1

    startup_poll_interval = Float(
        0.5,
        help="Polling interval (seconds) to check job state during startup",
    ).tag(config=True)

    async def start(self):
        """Start the process"""
        self.ip = self.traits()["ip"].default_value
        self.port = self.traits()["port"].default_value

        if self.server:
            self.server.port = self.port

        await self.submit_batch_script()

        # We are called with a timeout, and if the timeout expires this function will
        # be interrupted at the next yield, and self.stop() will be called.
        # So this function should not return unless successful, and if unsuccessful
        # should either raise an Exception or loop forever.
419 | if len(self.job_id) == 0: 420 | raise RuntimeError( 421 | "Jupyter batch job submission failure (no jobid in output)" 422 | ) 423 | while True: 424 | status = await self.query_job_status() 425 | if status == JobStatus.RUNNING: 426 | break 427 | elif status == JobStatus.PENDING: 428 | self.log.debug("Job " + self.job_id + " still pending") 429 | elif status == JobStatus.UNKNOWN: 430 | self.log.debug("Job " + self.job_id + " still unknown") 431 | else: 432 | self.log.warning( 433 | "Job " 434 | + self.job_id 435 | + " neither pending nor running.\n" 436 | + self.job_status 437 | ) 438 | self.clear_state() 439 | raise RuntimeError( 440 | "The Jupyter batch job has disappeared" 441 | " while pending in the queue or died immediately" 442 | " after starting." 443 | ) 444 | await asyncio.sleep(self.startup_poll_interval) 445 | 446 | self.ip = self.state_gethost() 447 | while self.port == 0: 448 | await asyncio.sleep(self.startup_poll_interval) 449 | # Test framework: For testing, mock_port is set because we 450 | # don't actually run the single-user server yet. 451 | if hasattr(self, "mock_port"): 452 | self.port = self.mock_port 453 | # Check if job is still running 454 | status = await self.poll() 455 | if status: 456 | raise RuntimeError( 457 | "The Jupyter batch job started" 458 | " but died before launching the single-user server." 459 | ) 460 | 461 | self.db.commit() 462 | self.log.info( 463 | "Notebook server job {} started at {}:{}".format( 464 | self.job_id, self.ip, self.port 465 | ) 466 | ) 467 | 468 | return self.ip, self.port 469 | 470 | async def stop(self, now=False): 471 | """Stop the singleuser server job. 472 | 473 | Returns immediately after sending job cancellation command if now=True, otherwise 474 | tries to confirm that job is no longer running.""" 475 | 476 | self.log.info("Stopping server job " + self.job_id) 477 | await self.cancel_batch_job() 478 | if now: 479 | return 480 | for i in range(10): 481 | status = await self.query_job_status() 482 | if status not in (JobStatus.RUNNING, JobStatus.UNKNOWN): 483 | return 484 | await asyncio.sleep(1) 485 | if self.job_id: 486 | self.log.warning( 487 | "Notebook server job {} at {}:{} possibly failed to terminate".format( 488 | self.job_id, self.ip, self.port 489 | ) 490 | ) 491 | 492 | async def progress(self): 493 | while True: 494 | if self.state_ispending(): 495 | yield {"message": "Pending in queue..."} 496 | elif self.state_isrunning(): 497 | yield {"message": "Cluster job running... waiting to connect"} 498 | return 499 | else: 500 | yield {"message": "Unknown status..."} 501 | await asyncio.sleep(1) 502 | 503 | 504 | class BatchSpawnerRegexStates(BatchSpawnerBase): 505 | """Subclass of BatchSpawnerBase that uses config-supplied regular expressions 506 | to interact with batch submission system state. Provides implementations of 507 | state_ispending 508 | state_isrunning 509 | state_gethost 510 | 511 | In their place, the user should supply the following configuration: 512 | state_pending_re - regex that matches job_status if job is waiting to run 513 | state_running_re - regex that matches job_status if job is running 514 | state_exechost_re - regex with at least one capture group that extracts 515 | execution host from job_status 516 | state_exechost_exp - if empty, notebook IP will be set to the contents of the 517 | first capture group. If this variable is set, the match object 518 | will be expanded using this string to obtain the notebook IP. 
519 |         See Python docs: re.match.expand
520 |     """
521 | 
522 |     state_pending_re = Unicode(
523 |         "",
524 |         help="Regex that matches job_status if job is waiting to run",
525 |     ).tag(config=True)
526 |     state_running_re = Unicode(
527 |         "",
528 |         help="Regex that matches job_status if job is running",
529 |     ).tag(config=True)
530 |     state_exechost_re = Unicode(
531 |         "",
532 |         help="Regex with at least one capture group that extracts "
533 |         "the execution host from job_status output",
534 |     ).tag(config=True)
535 |     state_exechost_exp = Unicode(
536 |         "",
537 |         help="""If empty, notebook IP will be set to the contents of the first capture group.
538 | 
539 |         If this variable is set, the match object will be expanded using this string
540 |         to obtain the notebook IP.
541 |         See Python docs: re.match.expand""",
542 |     ).tag(config=True)
543 |     state_unknown_re = Unicode(
544 |         "",
545 |         help="Regex that matches job_status if the resource manager is not answering. "
546 |         "Blank indicates not used.",
547 |     ).tag(config=True)
548 | 
549 |     def state_ispending(self):
550 |         assert self.state_pending_re, "Misconfigured: define state_pending_re"
551 |         return self.job_status and re.search(self.state_pending_re, self.job_status)
552 | 
553 |     def state_isrunning(self):
554 |         assert self.state_running_re, "Misconfigured: define state_running_re"
555 |         return self.job_status and re.search(self.state_running_re, self.job_status)
556 | 
557 |     def state_isunknown(self):
558 |         # Blank means "not set"; in that case this function returns None.
559 |         if self.state_unknown_re:
560 |             return self.job_status and re.search(self.state_unknown_re, self.job_status)
561 | 
562 |     def state_gethost(self):
563 |         assert self.state_exechost_re, "Misconfigured: define state_exechost_re"
564 |         match = re.search(self.state_exechost_re, self.job_status)
565 |         if not match:
566 |             self.log.error(
567 |                 "Spawner unable to match host addr in job status: " + self.job_status
568 |             )
569 |             return
570 |         if not self.state_exechost_exp:
571 |             return match.groups()[0]
572 |         else:
573 |             return match.expand(self.state_exechost_exp)
574 | 
575 | 
576 | class TorqueSpawner(BatchSpawnerRegexStates):
577 |     batch_script = Unicode(
578 |         """#!/bin/sh
579 | #PBS -q {queue}@{host}
580 | #PBS -l walltime={runtime}
581 | #PBS -l nodes=1:ppn={nprocs}
582 | #PBS -l mem={memory}
583 | #PBS -N jupyterhub-singleuser
584 | #PBS -v {keepvars}
585 | #PBS {options}
586 | 
587 | set -eu
588 | 
589 | {prologue}
590 | {cmd}
591 | {epilogue}
592 | """
593 |     ).tag(config=True)
594 | 
595 |     # outputs job id string
596 |     batch_submit_cmd = Unicode("qsub").tag(config=True)
597 |     # outputs job data XML string
598 |     batch_query_cmd = Unicode("qstat -x {job_id}").tag(config=True)
599 |     batch_cancel_cmd = Unicode("qdel {job_id}").tag(config=True)
600 |     # search XML string for job_state - [QH] = pending, R = running, [CE] = done
601 |     state_pending_re = Unicode(r"<job_state>[QH]").tag(config=True)
602 |     state_running_re = Unicode(r"<job_state>R").tag(config=True)
603 |     state_exechost_re = Unicode(r"<exec_host>((?:[\w_-]+\.?)+)/\d+").tag(config=True)
604 | 
605 | 
606 | class MoabSpawner(TorqueSpawner):
607 |     # outputs job id string
608 |     batch_submit_cmd = Unicode("msub").tag(config=True)
609 |     # outputs job data XML string
610 |     batch_query_cmd = Unicode("mdiag -j {job_id} --xml").tag(config=True)
611 |     batch_cancel_cmd = Unicode("mjobctl -c {job_id}").tag(config=True)
612 |     state_pending_re = Unicode(r'State="Idle"').tag(config=True)
613 |     state_running_re = Unicode(r'State="Running"').tag(config=True)
614 |     state_exechost_re 
= Unicode(r'AllocNodeList="([^\r\n\t\f :"]*)').tag(config=True) 615 | 616 | 617 | class PBSSpawner(TorqueSpawner): 618 | batch_script = Unicode( 619 | """#!/bin/sh 620 | {% if queue or host %}#PBS -q {% if queue %}{{queue}}{% endif %}\ 621 | {% if host %}@{{host}}{% endif %}{% endif %} 622 | #PBS -l walltime={{runtime}} 623 | #PBS -l select=1:ncpus={{nprocs}}:mem={{memory}} 624 | #PBS -N jupyterhub-singleuser 625 | #PBS -o {{homedir}}/.jupyterhub.pbs.out 626 | #PBS -e {{homedir}}/.jupyterhub.pbs.err 627 | #PBS -v {{keepvars}} 628 | {% if options %}#PBS {{options}}{% endif %} 629 | 630 | set -eu 631 | 632 | {{prologue}} 633 | {{cmd}} 634 | {{epilogue}} 635 | """ 636 | ).tag(config=True) 637 | 638 | # outputs job data XML string 639 | batch_query_cmd = Unicode("qstat -fx {job_id}").tag(config=True) 640 | 641 | state_pending_re = Unicode(r"job_state = [QH]").tag(config=True) 642 | state_running_re = Unicode(r"job_state = R").tag(config=True) 643 | state_exechost_re = Unicode(r"exec_host = ([\w_-]+)/").tag(config=True) 644 | 645 | 646 | class UserEnvMixin: 647 | """Mixin class that computes values for USER, SHELL and HOME in the environment passed to 648 | the job submission subprocess in case the batch system needs these for the batch script. 649 | """ 650 | 651 | def user_env(self, env): 652 | """get user environment""" 653 | env["USER"] = self.user.name 654 | home = pwd.getpwnam(self.user.name).pw_dir 655 | shell = pwd.getpwnam(self.user.name).pw_shell 656 | if home: 657 | env["HOME"] = home 658 | if shell: 659 | env["SHELL"] = shell 660 | return env 661 | 662 | def get_env(self): 663 | """Get user environment variables to be passed to the user's job 664 | 665 | Everything here should be passed to the user's job as 666 | environment. Caution: If these variables are used for 667 | authentication to the batch system commands as an admin, be 668 | aware that the user will receive access to these as well. 669 | """ 670 | env = super().get_env() 671 | env = self.user_env(env) 672 | return env 673 | 674 | 675 | class SlurmSpawner(UserEnvMixin, BatchSpawnerRegexStates): 676 | batch_script = Unicode( 677 | """#!/bin/bash 678 | #SBATCH --output={{homedir}}/jupyterhub_slurmspawner_%j.log 679 | #SBATCH --job-name=spawner-jupyterhub 680 | #SBATCH --chdir={{homedir}} 681 | #SBATCH --export={{keepvars}} 682 | #SBATCH --get-user-env=L 683 | {% if partition %}#SBATCH --partition={{partition}} 684 | {% endif %}{% if runtime %}#SBATCH --time={{runtime}} 685 | {% endif %}{% if memory %}#SBATCH --mem={{memory}} 686 | {% endif %}{% if gres %}#SBATCH --gres={{gres}} 687 | {% endif %}{% if nprocs %}#SBATCH --cpus-per-task={{nprocs}} 688 | {% endif %}{% if reservation%}#SBATCH --reservation={{reservation}} 689 | {% endif %}{% if options %}#SBATCH {{options}}{% endif %} 690 | 691 | set -euo pipefail 692 | 693 | trap 'echo SIGTERM received' TERM 694 | {{prologue}} 695 | {% if srun %}{{srun}} {% endif %}{{cmd}} 696 | echo "jupyterhub-singleuser ended gracefully" 697 | {{epilogue}} 698 | """ 699 | ).tag(config=True) 700 | 701 | # all these req_foo traits will be available as substvars for templated strings 702 | req_cluster = Unicode( 703 | "", 704 | help="Cluster name to submit job to resource manager", 705 | ).tag(config=True) 706 | 707 | req_qos = Unicode( 708 | "", 709 | help="QoS name to submit job to resource manager", 710 | ).tag(config=True) 711 | 712 | req_srun = Unicode( 713 | "srun", 714 | help="Set req_srun='' to disable running in job step, and note that " 715 | "this affects environment handling. 
This is effectively a "
716 |         "prefix for the singleuser command.",
717 |     ).tag(config=True)
718 | 
719 |     req_reservation = Unicode(
720 |         "",
721 |         help="Reservation name to submit to resource manager",
722 |     ).tag(config=True)
723 | 
724 |     req_gres = Unicode(
725 |         "",
726 |         help="Additional resources (e.g. GPUs) requested",
727 |     ).tag(config=True)
728 | 
729 |     # with --parsable, outputs the job id, optionally followed by ";clustername"
730 |     batch_submit_cmd = Unicode("sbatch --parsable").tag(config=True)
731 |     # outputs status and exec node like "RUNNING hostname"
732 |     batch_query_cmd = Unicode("squeue -h -j {job_id} -o '%T %B'").tag(config=True)
733 |     batch_cancel_cmd = Unicode("scancel {job_id}").tag(config=True)
734 |     # use long-form states: PENDING, CONFIGURING = pending
735 |     #  RUNNING, COMPLETING = running
736 |     state_pending_re = Unicode(r"^(?:PENDING|CONFIGURING)").tag(config=True)
737 |     state_running_re = Unicode(r"^(?:RUNNING|COMPLETING)").tag(config=True)
738 |     state_unknown_re = Unicode(
739 |         r"^slurm_load_jobs error: (?:Socket timed out on send/recv|Unable to contact slurm controller)"
740 |     ).tag(config=True)
741 |     state_exechost_re = Unicode(r"\s+((?:[\w_-]+\.?)+)$").tag(config=True)
742 | 
743 |     def parse_job_id(self, output):
744 |         # make sure jobid is really a number
745 |         try:
746 |             # use only last line to circumvent slurm bug
747 |             output = output.splitlines()[-1]
748 |             job_id = output.split(";")[0]
749 |             int(job_id)
750 |         except Exception as e:
751 |             self.log.error("SlurmSpawner unable to parse job ID from text: " + output)
752 |             raise e
753 |         return job_id
754 | 
755 | 
756 | class MultiSlurmSpawner(SlurmSpawner):
757 |     """When slurm has been compiled with --enable-multiple-slurmd, the
758 |     administrator sets the name of the slurmd instance via the slurmd -N
759 |     option. This node name is usually different from the hostname and may
760 |     not be resolvable by JupyterHub. 
Here we enable the administrator to 761 | map the node names onto the real hostnames via a traitlet.""" 762 | 763 | daemon_resolver = Dict( 764 | {}, 765 | help="Map node names to hostnames", 766 | ).tag(config=True) 767 | 768 | def state_gethost(self): 769 | host = SlurmSpawner.state_gethost(self) 770 | return self.daemon_resolver.get(host, host) 771 | 772 | 773 | class GridengineSpawner(BatchSpawnerBase): 774 | batch_script = Unicode( 775 | """#!/bin/bash 776 | #$ -j yes 777 | #$ -N spawner-jupyterhub 778 | #$ -o {homedir}/.jupyterhub.sge.out 779 | #$ -e {homedir}/.jupyterhub.sge.err 780 | #$ -v {keepvars} 781 | #$ {options} 782 | 783 | set -euo pipefail 784 | 785 | {prologue} 786 | {cmd} 787 | {epilogue} 788 | """ 789 | ).tag(config=True) 790 | 791 | # outputs job id string 792 | batch_submit_cmd = Unicode("qsub").tag(config=True) 793 | # outputs job data XML string 794 | batch_query_cmd = Unicode("qstat -xml").tag(config=True) 795 | batch_cancel_cmd = Unicode("qdel {job_id}").tag(config=True) 796 | 797 | def parse_job_id(self, output): 798 | return output.split(" ")[2] 799 | 800 | def state_ispending(self): 801 | if self.job_status: 802 | job_info = ET.fromstring(self.job_status).find( 803 | f".//job_list[JB_job_number='{self.job_id}']" 804 | ) 805 | if job_info is not None: 806 | return job_info.attrib.get("state") == "pending" 807 | return False 808 | 809 | def state_isrunning(self): 810 | if self.job_status: 811 | job_info = ET.fromstring(self.job_status).find( 812 | f".//job_list[JB_job_number='{self.job_id}']" 813 | ) 814 | if job_info is not None: 815 | return job_info.attrib.get("state") == "running" 816 | return False 817 | 818 | def state_gethost(self): 819 | if self.job_status: 820 | queue_name = ET.fromstring(self.job_status).find( 821 | f".//job_list[JB_job_number='{self.job_id}']/queue_name" 822 | ) 823 | if queue_name is not None and queue_name.text: 824 | return queue_name.text.split("@")[1] 825 | 826 | self.log.error( 827 | "Spawner unable to match host addr in job {} with status {}".format( 828 | self.job_id, self.job_status 829 | ) 830 | ) 831 | return 832 | 833 | def get_env(self): 834 | env = super().get_env() 835 | 836 | # SGE relies on environment variables to launch local jobs. Ensure that these values are included 837 | # in the environment used to run the spawner. 
838 |         for key in [
839 |             "SGE_CELL",
840 |             "SGE_EXECD",
841 |             "SGE_ROOT",
842 |             "SGE_CLUSTER_NAME",
843 |             "SGE_QMASTER_PORT",
844 |             "SGE_EXECD_PORT",
845 |             "PATH",
846 |         ]:
847 |             if key in os.environ and key not in env:
848 |                 env[key] = os.environ[key]
849 |         return env
850 | 
851 | 
852 | class CondorSpawner(UserEnvMixin, BatchSpawnerRegexStates):
853 |     batch_script = Unicode(
854 |         """
855 | Executable = /bin/sh
856 | RequestMemory = {memory}
857 | RequestCpus = {nprocs}
858 | Arguments = \"-c 'exec {cmd}'\"
859 | Remote_Initialdir = {homedir}
860 | Output = {homedir}/.jupyterhub.condor.out
861 | Error = {homedir}/.jupyterhub.condor.err
862 | ShouldTransferFiles = False
863 | GetEnv = True
864 | {options}
865 | Queue
866 | """
867 |     ).tag(config=True)
868 | 
869 |     # outputs job id string
870 |     batch_submit_cmd = Unicode("condor_submit").tag(config=True)
871 |     # outputs job data XML string
872 |     batch_query_cmd = Unicode(
873 |         'condor_q {job_id} -format "%s, " JobStatus -format "%s" RemoteHost -format "\n" True'
874 |     ).tag(config=True)
875 |     batch_cancel_cmd = Unicode("condor_rm {job_id}").tag(config=True)
876 |     # job status: 1 = pending, 2 = running
877 |     state_pending_re = Unicode(r"^1,").tag(config=True)
878 |     state_running_re = Unicode(r"^2,").tag(config=True)
879 |     state_exechost_re = Unicode(r"^\w*, .*@([^ ]*)").tag(config=True)
880 | 
881 |     def parse_job_id(self, output):
882 |         match = re.search(r".*submitted to cluster ([0-9]+)", output)
883 |         if match:
884 |             return match.groups()[0]
885 | 
886 |         error_msg = "CondorSpawner unable to parse jobID from text: " + output
887 |         self.log.error(error_msg)
888 |         raise Exception(error_msg)
889 | 
890 |     def cmd_formatted_for_batch(self):
891 |         return super().cmd_formatted_for_batch().replace('"', '""').replace("'", "''")
892 | 
893 | 
894 | class LsfSpawner(BatchSpawnerBase):
895 |     """A Spawner that uses IBM's Platform Load Sharing Facility (LSF) to launch notebooks."""
896 | 
897 |     batch_script = Unicode(
898 |         """#!/bin/sh
899 | #BSUB -R "select[type==any]"  # Allow spawning on non-uniform hardware
900 | #BSUB -R "span[hosts=1]"  # Only spawn job on one server
901 | #BSUB -q {queue}
902 | #BSUB -J spawner-jupyterhub
903 | #BSUB -o {homedir}/.jupyterhub.lsf.out
904 | #BSUB -e {homedir}/.jupyterhub.lsf.err
905 | 
906 | set -eu
907 | 
908 | {prologue}
909 | {cmd}
910 | {epilogue}
911 | """
912 |     ).tag(config=True)
913 | 
914 |     batch_submit_cmd = Unicode("bsub").tag(config=True)
915 |     batch_query_cmd = Unicode('bjobs -a -noheader -o "STAT EXEC_HOST" {job_id}').tag(
916 |         config=True
917 |     )
918 |     batch_cancel_cmd = Unicode("bkill {job_id}").tag(config=True)
919 | 
920 |     def get_env(self):
921 |         env = super().get_env()
922 | 
923 |         # LSF relies on environment variables to launch local jobs. Ensure that these values are included
924 |         # in the environment used to run the spawner.
925 |         for key in [
926 |             "LSF_ENVDIR",
927 |             "LSF_SERVERDIR",
928 |             "LSF_FULL_VERSION",
929 |             "LSF_LIBDIR",
930 |             "LSF_BINDIR",
931 |         ]:
932 |             if key in os.environ and key not in env:
933 |                 env[key] = os.environ[key]
934 |         return env
935 | 
936 |     def parse_job_id(self, output):
937 |         # Assumes output in the following form:
938 |         #   "Job <1815> is submitted to default queue <normal>."
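        #   e.g. output.split(" ")[1] -> "<1815>", then .strip("<>") -> "1815"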
939 |         return output.split(" ")[1].strip("<>")
940 | 
941 |     def state_ispending(self):
942 |         # Parse results of batch_query_cmd
943 |         # Output determined by results of self.batch_query_cmd
944 |         if self.job_status:
945 |             return self.job_status.split(" ")[0].upper() in {"PEND", "PSUSP"}
946 | 
947 |     def state_isrunning(self):
948 |         if self.job_status:
949 |             return self.job_status.split(" ")[0].upper() == "RUN"
950 | 
951 |     def state_gethost(self):
952 |         if self.job_status:
953 |             return self.job_status.split(" ")[1].strip().split(":")[0]
954 | 
955 |         self.log.error(
956 |             "Spawner unable to match host addr in job {} with status {}".format(
957 |                 self.job_id, self.job_status
958 |             )
959 |         )
960 |         return
961 | 
962 | 
963 | # vim: set ai expandtab softtabstop=4:
964 | 
--------------------------------------------------------------------------------
/batchspawner/singleuser.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from runpy import run_path
 4 | from shutil import which
 5 | from urllib.parse import urlparse, urlunparse
 6 | 
 7 | import requests
 8 | from jupyterhub.services.auth import HubAuth
 9 | from jupyterhub.utils import random_port, url_path_join
10 | 
11 | 
12 | def main(argv=None):
13 |     port = random_port()
14 |     hub_auth = HubAuth()
15 | 
16 |     url = url_path_join(hub_auth.api_url, "batchspawner")
17 |     headers = {"Authorization": f"token {hub_auth.api_token}"}
18 | 
19 |     # internal_ssl kwargs
20 |     kwargs = {}
21 |     if hub_auth.certfile and hub_auth.keyfile:
22 |         kwargs["cert"] = (hub_auth.certfile, hub_auth.keyfile)
23 |     if hub_auth.client_ca:
24 |         kwargs["verify"] = hub_auth.client_ca
25 | 
26 |     requests.post(
27 |         url,
28 |         headers=headers,
29 |         json={"port": port},
30 |         **kwargs,
31 |     )
32 | 
33 |     # Read the env var JUPYTERHUB_SERVICE_URL and replace port in the URL
34 |     # with free port that we found here
35 |     # JUPYTERHUB_SERVICE_URL is added in JupyterHub 2.0
36 |     service_url_env = os.environ.get("JUPYTERHUB_SERVICE_URL", "")
37 |     if service_url_env:
38 |         url = urlparse(os.environ["JUPYTERHUB_SERVICE_URL"])
39 |         url = url._replace(netloc=f"{url.hostname}:{port}")
40 |         os.environ["JUPYTERHUB_SERVICE_URL"] = urlunparse(url)
41 |     else:
42 |         # JupyterHub < 2.0 specifies port on the command-line
43 |         sys.argv.append(f"--port={port}")
44 | 
45 |     cmd_path = which(sys.argv[1])
46 |     sys.argv = sys.argv[1:]
47 |     run_path(cmd_path, run_name="__main__")
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()
52 | 
--------------------------------------------------------------------------------
/batchspawner/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jupyterhub/batchspawner/87874004b2dc761598405a564a26e5f6775bb473/batchspawner/tests/__init__.py
--------------------------------------------------------------------------------
/batchspawner/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Relevant pytest fixtures are re-used from JupyterHub's test suite"""
2 | 
3 | # We use "db" directly, but we also need event_loop
4 | from jupyterhub.tests.conftest import db, event_loop  # noqa
5 | 
--------------------------------------------------------------------------------
/batchspawner/tests/test_spawners.py:
--------------------------------------------------------------------------------
 1 | """Test BatchSpawner and subclasses"""
 2 | 
 3 | import asyncio
 4 | import pwd
 5 | import re
 6 | import time
 7 | from getpass import getuser
 8 | from unittest import mock
 9 | 
10 | import pytest
11 | from jupyterhub import orm
12 | from jupyterhub.objects import Hub, Server
13 | from jupyterhub.user import User
14 | from traitlets import Unicode
15 | 
16 | from .. import BatchSpawnerRegexStates, JobStatus
17 | 
18 | testhost = "userhost123"
19 | testjob = "12345"
20 | testport = 54321
21 | 
22 | 
23 | @pytest.fixture(autouse=True)
24 | def _always_get_my_home():
25 |     # pwd.getpwnam() is always called with the current user
26 |     # ignoring the requested name, which usually doesn't exist
27 |     getpwnam = pwd.getpwnam
28 |     with mock.patch.object(pwd, "getpwnam", lambda name: getpwnam(getuser())):
29 |         yield
30 | 
31 | 
32 | class BatchDummy(BatchSpawnerRegexStates):
33 |     exec_prefix = ""
34 |     batch_submit_cmd = Unicode("cat > /dev/null; echo " + testjob)
35 |     batch_query_cmd = Unicode("echo RUN " + testhost)
36 |     batch_cancel_cmd = Unicode("echo STOP")
37 |     batch_script = Unicode("{cmd}")
38 |     state_pending_re = Unicode("PEND")
39 |     state_running_re = Unicode("RUN")
40 |     state_exechost_re = Unicode("RUN (.*)$")
41 |     state_unknown_re = Unicode("UNKNOWN")
42 | 
43 |     cmd_expectlist = None
44 |     out_expectlist = None
45 | 
46 |     async def run_command(self, *args, **kwargs):
47 |         """Overridden run command to test templating and outputs"""
48 |         cmd = args[0]
49 |         # Test that the command matches the expectations
50 |         if self.cmd_expectlist:
51 |             run_re = self.cmd_expectlist.pop(0)
52 |             if run_re:
53 |                 print("run:", run_re)
54 |                 assert (
55 |                     run_re.search(cmd) is not None
56 |                 ), f"Failed test: re={run_re} cmd={cmd}"
57 |         # Run command normally
58 |         out = await super().run_command(*args, **kwargs)
59 |         # Test that the output matches the expectations
60 |         if self.out_expectlist:
61 |             out_re = self.out_expectlist.pop(0)
62 |             if out_re:
63 |                 print("out:", out_re)
64 |                 assert (
65 |                     out_re.search(out) is not None
66 |                 ), f"Failed output: re={out_re} cmd={cmd} out={out}"
67 |         return out
68 | 
69 | 
70 | def new_spawner(db, spawner_class=BatchDummy, **kwargs):
71 |     kwargs.setdefault("cmd", ["singleuser_command"])
72 |     user = db.query(orm.User).first()
73 |     hub = Hub()
74 |     user = User(user, {})
75 |     server = Server()
76 |     # Set it after construction because it isn't a traitlet.
77 | kwargs.setdefault("hub", hub) 78 | kwargs.setdefault("user", user) 79 | kwargs.setdefault("poll_interval", 1) 80 | 81 | # These are not traitlets so we have to set them here 82 | spawner = user._new_spawner("", spawner_class=spawner_class, **kwargs) 83 | spawner.server = server 84 | spawner.mock_port = testport 85 | return spawner 86 | 87 | 88 | def check_ip(spawner, value): 89 | assert spawner.ip == value 90 | 91 | 92 | async def test_spawner_start_stop_poll(db, event_loop): 93 | spawner = new_spawner(db=db) 94 | 95 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 96 | assert status == 1 97 | assert spawner.job_id == "" 98 | assert spawner.get_state() == {} 99 | 100 | await asyncio.wait_for(spawner.start(), timeout=5) 101 | check_ip(spawner, testhost) 102 | assert spawner.job_id == testjob 103 | 104 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 105 | assert status is None 106 | spawner.batch_query_cmd = "echo NOPE" 107 | await asyncio.wait_for(spawner.stop(), timeout=5) 108 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 109 | assert status == 1 110 | assert spawner.get_state() == {} 111 | 112 | 113 | async def test_stress_submit(db, event_loop): 114 | for i in range(200): 115 | time.sleep(0.01) 116 | test_spawner_start_stop_poll(db, event_loop) 117 | 118 | 119 | async def test_spawner_state_reload(db, event_loop): 120 | spawner = new_spawner(db=db) 121 | assert spawner.get_state() == {} 122 | 123 | await asyncio.wait_for(spawner.start(), timeout=30) 124 | check_ip(spawner, testhost) 125 | assert spawner.job_id == testjob 126 | 127 | state = spawner.get_state() 128 | assert state == dict(job_id=testjob, job_status="RUN " + testhost) 129 | spawner = new_spawner(db=db) 130 | spawner.clear_state() 131 | assert spawner.get_state() == {} 132 | spawner.load_state(state) 133 | # We used to check IP here, but that is actually only computed on start(), 134 | # and is not part of the spawner's persistent state 135 | assert spawner.job_id == testjob 136 | 137 | 138 | async def test_submit_failure(db, event_loop): 139 | spawner = new_spawner(db=db) 140 | assert spawner.get_state() == {} 141 | spawner.batch_submit_cmd = "cat > /dev/null; true" 142 | with pytest.raises(RuntimeError): 143 | await asyncio.wait_for(spawner.start(), timeout=30) 144 | assert spawner.job_id == "" 145 | assert spawner.job_status == "" 146 | 147 | 148 | async def test_submit_pending_fails(db, event_loop): 149 | """Submission works, but the batch query command immediately fails""" 150 | spawner = new_spawner(db=db) 151 | assert spawner.get_state() == {} 152 | spawner.batch_query_cmd = "echo xyz" 153 | with pytest.raises(RuntimeError): 154 | await asyncio.wait_for(spawner.start(), timeout=30) 155 | status = await asyncio.wait_for(spawner.query_job_status(), timeout=30) 156 | assert status == JobStatus.NOTFOUND 157 | assert spawner.job_id == "" 158 | assert spawner.job_status == "" 159 | 160 | 161 | async def test_poll_fails(db, event_loop): 162 | """Submission works, but a later .poll() fails""" 163 | spawner = new_spawner(db=db) 164 | assert spawner.get_state() == {} 165 | # The start is successful: 166 | await asyncio.wait_for(spawner.start(), timeout=30) 167 | spawner.batch_query_cmd = "echo xyz" 168 | # Now, the poll fails: 169 | await asyncio.wait_for(spawner.poll(), timeout=30) 170 | # .poll() will run self.clear_state() if it's not found: 171 | assert spawner.job_id == "" 172 | assert spawner.job_status == "" 173 | 174 | 175 | async def test_unknown_status(db, event_loop): 176 
| """Polling returns an unknown status""" 177 | spawner = new_spawner(db=db) 178 | assert spawner.get_state() == {} 179 | # The start is successful: 180 | await asyncio.wait_for(spawner.start(), timeout=30) 181 | spawner.batch_query_cmd = "echo UNKNOWN" 182 | # This poll should not fail: 183 | await asyncio.wait_for(spawner.poll(), timeout=30) 184 | status = await asyncio.wait_for(spawner.query_job_status(), timeout=30) 185 | assert status == JobStatus.UNKNOWN 186 | assert spawner.job_id == "12345" 187 | assert spawner.job_status != "" 188 | 189 | 190 | async def test_templates(db, event_loop): 191 | """Test templates in the run_command commands""" 192 | spawner = new_spawner(db=db) 193 | 194 | # Test when not running 195 | spawner.cmd_expectlist = [ 196 | re.compile(".*RUN"), 197 | ] 198 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 199 | assert status == 1 200 | assert spawner.job_id == "" 201 | assert spawner.get_state() == {} 202 | 203 | # Test starting 204 | spawner.cmd_expectlist = [ 205 | re.compile(".*echo"), 206 | re.compile(".*RUN"), 207 | ] 208 | await asyncio.wait_for(spawner.start(), timeout=5) 209 | check_ip(spawner, testhost) 210 | assert spawner.job_id == testjob 211 | 212 | # Test poll - running 213 | spawner.cmd_expectlist = [ 214 | re.compile(".*RUN"), 215 | ] 216 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 217 | assert status is None 218 | 219 | # Test stopping 220 | spawner.batch_query_cmd = "echo NOPE" 221 | spawner.cmd_expectlist = [ 222 | re.compile(".*STOP"), 223 | re.compile(".*NOPE"), 224 | ] 225 | await asyncio.wait_for(spawner.stop(), timeout=5) 226 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 227 | assert status == 1 228 | assert spawner.get_state() == {} 229 | 230 | 231 | async def test_batch_script(db, event_loop): 232 | """Test that the batch script substitutes {cmd}""" 233 | 234 | class BatchDummyTestScript(BatchDummy): 235 | async def _get_batch_script(self, **subvars): 236 | script = await super()._get_batch_script(**subvars) 237 | assert "singleuser_command" in script 238 | return script 239 | 240 | spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript) 241 | # status = await asyncio.wait_for(spawner.poll(), timeout=5) 242 | await asyncio.wait_for(spawner.start(), timeout=5) 243 | # status = await asyncio.wait_for(spawner.poll(), timeout=5) 244 | # await asyncio.wait_for(spawner.stop(), timeout=5) 245 | 246 | 247 | async def test_exec_prefix(db, event_loop): 248 | """Test that all run_commands have exec_prefix""" 249 | 250 | class BatchDummyTestScript(BatchDummy): 251 | exec_prefix = "PREFIX" 252 | 253 | async def run_command(self, cmd, *args, **kwargs): 254 | assert cmd.startswith("PREFIX ") 255 | cmd = cmd[7:] 256 | print(cmd) 257 | out = await super().run_command(cmd, *args, **kwargs) 258 | return out 259 | 260 | spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript) 261 | # Not running 262 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 263 | assert status == 1 264 | # Start 265 | await asyncio.wait_for(spawner.start(), timeout=5) 266 | assert spawner.job_id == testjob 267 | # Poll 268 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 269 | assert status is None 270 | # Stop 271 | spawner.batch_query_cmd = "echo NOPE" 272 | await asyncio.wait_for(spawner.stop(), timeout=5) 273 | status = await asyncio.wait_for(spawner.poll(), timeout=5) 274 | assert status == 1 275 | 276 | 277 | async def run_spawner_script( 278 | db, spawner, script, 
batch_script_re_list=None, spawner_kwargs={}
279 | ):
280 |     """Run a spawner script and test that the output and behavior is as expected.
281 | 
282 |     db: same as in this module
283 |     spawner: the BatchSpawnerBase subclass to test
284 |     script: list of (input_re_to_match, output)
285 |     batch_script_re_list: if given, assert batch script matches all of these
286 |     """
287 |     # Create the expected scripts
288 |     cmd_expectlist, out_list = zip(*script)
289 |     cmd_expectlist = list(cmd_expectlist)
290 |     out_list = list(out_list)
291 | 
292 |     class BatchDummyTestScript(spawner):
293 |         async def run_command(self, cmd, input=None, env=None):
294 |             # Test the input
295 |             run_re = cmd_expectlist.pop(0)
296 |             if run_re:
297 |                 print(f'run: "{cmd}" [{run_re}]')
298 |                 assert (
299 |                     run_re.search(cmd) is not None
300 |                 ), f"Failed test: re={run_re} cmd={cmd}"
301 |             # Test the stdin - it will only ever be the batch script. Assert
302 |             # that each regular expression in batch_script_re_list matches
303 |             # the batch script.
304 |             if batch_script_re_list and input:
305 |                 batch_script = input
306 |                 for match_re in batch_script_re_list:
307 |                     assert (
308 |                         match_re.search(batch_script) is not None
309 |                     ), f"Batch script does not match {match_re}"
310 |             # Return expected output.
311 |             out = out_list.pop(0)
312 |             print("  --> " + out)
313 |             return out
314 | 
315 |     spawner = new_spawner(db=db, spawner_class=BatchDummyTestScript, **spawner_kwargs)
316 |     # Not running at beginning (no command run)
317 |     status = await asyncio.wait_for(spawner.poll(), timeout=5)
318 |     assert status == 1
319 |     # batch_submit_cmd
320 |     # batch_query_cmd    (result=pending)
321 |     # batch_query_cmd    (result=running)
322 |     await asyncio.wait_for(spawner.start(), timeout=5)
323 |     assert spawner.job_id == testjob
324 |     check_ip(spawner, testhost)
325 |     # batch_query_cmd
326 |     status = await asyncio.wait_for(spawner.poll(), timeout=5)
327 |     assert status is None
328 |     # batch_cancel_cmd
329 |     await asyncio.wait_for(spawner.stop(), timeout=5)
330 |     # batch_poll_cmd
331 |     status = await asyncio.wait_for(spawner.poll(), timeout=5)
332 |     assert status == 1
333 | 
334 | 
335 | async def test_torque(db, event_loop):
336 |     spawner_kwargs = {
337 |         "req_nprocs": "5",
338 |         "req_memory": "5678",
339 |         "req_options": "some_option_asdf",
340 |         "req_prologue": "PROLOGUE",
341 |         "req_epilogue": "EPILOGUE",
342 |     }
343 |     batch_script_re_list = [
344 |         re.compile(
345 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
346 |             re.S | re.M,
347 |         ),
348 |         re.compile(r"mem=5678"),
349 |         re.compile(r"ppn=5"),
350 |         re.compile(r"^#PBS some_option_asdf", re.M),
351 |     ]
352 |     poll_running = (
353 |         re.compile(r"sudo.*qstat"),
354 |         f"<job_state>R</job_state><exec_host>{testhost}/1</exec_host>",
355 |     )
356 |     script = [
357 |         (re.compile(r"sudo.*qsub"), str(testjob)),
358 |         (
359 |             re.compile(r"sudo.*qstat"),
360 |             "<job_state>Q</job_state>",
361 |         ),  # pending
362 |         poll_running,
363 |         poll_running,
364 |         poll_running,
365 |         (re.compile(r"sudo.*qdel"), "STOP"),
366 |         (re.compile(r"sudo.*qstat"), ""),
367 |     ]
368 |     from .. 
import TorqueSpawner 369 | 370 | await run_spawner_script( 371 | db, 372 | TorqueSpawner, 373 | script, 374 | batch_script_re_list=batch_script_re_list, 375 | spawner_kwargs=spawner_kwargs, 376 | ) 377 | 378 | 379 | async def test_moab(db, event_loop): 380 | spawner_kwargs = { 381 | "req_nprocs": "5", 382 | "req_memory": "5678", 383 | "req_options": "some_option_asdf", 384 | "req_prologue": "PROLOGUE", 385 | "req_epilogue": "EPILOGUE", 386 | } 387 | batch_script_re_list = [ 388 | re.compile( 389 | r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE", 390 | re.S | re.M, 391 | ), 392 | re.compile(r"mem=5678"), 393 | re.compile(r"ppn=5"), 394 | re.compile(r"^#PBS some_option_asdf", re.M), 395 | ] 396 | poll_running = ( 397 | re.compile(r"sudo.*mdiag"), 398 | f'State="Running" AllocNodeList="{testhost}"', 399 | ) 400 | script = [ 401 | (re.compile(r"sudo.*msub"), str(testjob)), 402 | (re.compile(r"sudo.*mdiag"), 'State="Idle"'), # pending 403 | poll_running, 404 | poll_running, 405 | poll_running, 406 | (re.compile(r"sudo.*mjobctl.*-c"), "STOP"), 407 | (re.compile(r"sudo.*mdiag"), ""), 408 | ] 409 | from .. import MoabSpawner 410 | 411 | await run_spawner_script( 412 | db, 413 | MoabSpawner, 414 | script, 415 | batch_script_re_list=batch_script_re_list, 416 | spawner_kwargs=spawner_kwargs, 417 | ) 418 | 419 | 420 | async def test_pbs(db, event_loop): 421 | spawner_kwargs = { 422 | "req_nprocs": "4", 423 | "req_memory": "10256", 424 | "req_options": "some_option_asdf", 425 | "req_host": "some_pbs_admin_node", 426 | "req_runtime": "08:00:00", 427 | } 428 | batch_script_re_list = [ 429 | re.compile(r"singleuser_command"), 430 | re.compile(r"select=1"), 431 | re.compile(r"ncpus=4"), 432 | re.compile(r"mem=10256"), 433 | re.compile(r"walltime=08:00:00"), 434 | re.compile(r"@some_pbs_admin_node"), 435 | re.compile(r"^#PBS some_option_asdf", re.M), 436 | ] 437 | poll_running = ( 438 | re.compile(r"sudo.*qstat"), 439 | f"job_state = R\nexec_host = {testhost}/2*1", 440 | ) 441 | script = [ 442 | (re.compile(r"sudo.*qsub"), str(testjob)), 443 | (re.compile(r"sudo.*qstat"), "job_state = Q"), # pending 444 | poll_running, 445 | poll_running, 446 | poll_running, 447 | (re.compile(r"sudo.*qdel"), "STOP"), 448 | (re.compile(r"sudo.*qstat"), ""), 449 | ] 450 | from .. import PBSSpawner 451 | 452 | await run_spawner_script( 453 | db, 454 | PBSSpawner, 455 | script, 456 | batch_script_re_list=batch_script_re_list, 457 | spawner_kwargs=spawner_kwargs, 458 | ) 459 | 460 | 461 | async def test_slurm(db, event_loop): 462 | spawner_kwargs = { 463 | "req_runtime": "3-05:10:10", 464 | "req_nprocs": "5", 465 | "req_memory": "5678", 466 | "req_options": "some_option_asdf", 467 | "req_prologue": "PROLOGUE", 468 | "req_epilogue": "EPILOGUE", 469 | "req_reservation": "RES123", 470 | "req_gres": "GRES123", 471 | } 472 | batch_script_re_list = [ 473 | re.compile( 474 | r"PROLOGUE.*srun batchspawner-singleuser singleuser_command.*EPILOGUE", re.S 475 | ), 476 | re.compile(r"^\#SBATCH \s+ --cpus-per-task=5", re.X | re.M), 477 | re.compile(r"^\#SBATCH \s+ --time=3-05:10:10", re.X | re.M), 478 | re.compile(r"^\#SBATCH \s+ some_option_asdf", re.X | re.M), 479 | re.compile(r"^\#SBATCH \s+ --reservation=RES123", re.X | re.M), 480 | re.compile(r"^\#SBATCH \s+ --gres=GRES123", re.X | re.M), 481 | ] 482 | from .. 
import SlurmSpawner
483 | 
484 |     await run_spawner_script(
485 |         db,
486 |         SlurmSpawner,
487 |         normal_slurm_script,
488 |         batch_script_re_list=batch_script_re_list,
489 |         spawner_kwargs=spawner_kwargs,
490 |     )
491 | 
492 | 
493 | # We tend to use slurm as our typical example job. These allow quick
494 | # Slurm examples.
495 | normal_slurm_script = [
496 |     (re.compile(r"sudo.*sbatch"), str(testjob)),
497 |     (re.compile(r"sudo.*squeue"), "PENDING "),  # pending
498 |     (
499 |         re.compile(r"sudo.*squeue"),
500 |         "slurm_load_jobs error: Unable to contact slurm controller",
501 |     ),  # unknown
502 |     (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),  # running
503 |     (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
504 |     (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
505 |     (re.compile(r"sudo.*scancel"), "STOP"),
506 |     (re.compile(r"sudo.*squeue"), ""),
507 | ]
508 | from .. import SlurmSpawner
509 | 
510 | 
511 | async def run_typical_slurm_spawner(
512 |     db,
513 |     spawner=SlurmSpawner,
514 |     script=normal_slurm_script,
515 |     batch_script_re_list=None,
516 |     spawner_kwargs={},
517 | ):
518 |     """Run a full slurm job with default (overrideable) parameters.
519 | 
520 |     This is useful, for example, for changing options and testing the
521 |     effect of batch scripts.
522 |     """
523 |     return await run_spawner_script(
524 |         db,
525 |         spawner,
526 |         script,
527 |         batch_script_re_list=batch_script_re_list,
528 |         spawner_kwargs=spawner_kwargs,
529 |     )
530 | 
531 | 
532 | # async def test_gridengine(db, event_loop):
533 | #     spawner_kwargs = {
534 | #         'req_options': 'some_option_asdf',
535 | #     }
536 | #     batch_script_re_list = [
537 | #         re.compile(r'singleuser_command'),
538 | #         re.compile(r'#$\s+some_option_asdf'),
539 | #     ]
540 | #     script = [
541 | #         (re.compile(r'sudo.*qsub'), 'x x '+str(testjob)),
542 | #         (re.compile(r'sudo.*qstat'), 'PENDING '),
543 | #         (re.compile(r'sudo.*qstat'), 'RUNNING '+testhost),
544 | #         (re.compile(r'sudo.*qstat'), 'RUNNING '+testhost),
545 | #         (re.compile(r'sudo.*qdel'), 'STOP'),
546 | #         (re.compile(r'sudo.*qstat'), ''),
547 | #     ]
548 | #     from .. import GridengineSpawner
549 | #     await run_spawner_script(db, GridengineSpawner, script,
550 | #                              batch_script_re_list=batch_script_re_list,
551 | #                              spawner_kwargs=spawner_kwargs)
552 | 
553 | 
554 | async def test_condor(db, event_loop):
555 |     spawner_kwargs = {
556 |         "req_nprocs": "5",
557 |         "req_memory": "5678",
558 |         "req_options": "some_option_asdf",
559 |     }
560 |     batch_script_re_list = [
561 |         re.compile(r"exec batchspawner-singleuser singleuser_command"),
562 |         re.compile(r"RequestCpus = 5"),
563 |         re.compile(r"RequestMemory = 5678"),
564 |         re.compile(r"^some_option_asdf", re.M),
565 |     ]
566 |     script = [
567 |         (
568 |             re.compile(r"sudo.*condor_submit"),
569 |             f"submitted to cluster {str(testjob)}",
570 |         ),
571 |         (re.compile(r"sudo.*condor_q"), "1,"),  # pending
572 |         (re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),  # running
573 |         (re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
574 |         (re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
575 |         (re.compile(r"sudo.*condor_rm"), "STOP"),
576 |         (re.compile(r"sudo.*condor_q"), ""),
577 |     ]
578 |     from .. import CondorSpawner
579 | 
580 |     await run_spawner_script(
581 |         db,
582 |         CondorSpawner,
583 |         script,
584 |         batch_script_re_list=batch_script_re_list,
585 |         spawner_kwargs=spawner_kwargs,
586 |     )
587 | 
588 | 
589 | async def test_lsf(db, event_loop):
590 |     spawner_kwargs = {
591 |         "req_nprocs": "5",
592 |         "req_memory": "5678",
593 |         "req_options": "some_option_asdf",
594 |         "req_queue": "some_queue",
595 |         "req_prologue": "PROLOGUE",
596 |         "req_epilogue": "EPILOGUE",
597 |     }
598 |     batch_script_re_list = [
599 |         re.compile(
600 |             r"^PROLOGUE.*^batchspawner-singleuser singleuser_command.*^EPILOGUE",
601 |             re.S | re.M,
602 |         ),
603 |         re.compile(r"#BSUB\s+-q\s+some_queue", re.M),
604 |     ]
605 |     script = [
606 |         (
607 |             re.compile(r"sudo.*bsub"),
608 |             f"Job <{str(testjob)}> is submitted to default queue <normal>.",
609 |         ),
610 |         (re.compile(r"sudo.*bjobs"), "PEND "),  # pending
611 |         (re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),  # running
612 |         (re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
613 |         (re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
614 |         (re.compile(r"sudo.*bkill"), "STOP"),
615 |         (re.compile(r"sudo.*bjobs"), ""),
616 |     ]
617 |     from .. import LsfSpawner
618 | 
619 |     await run_spawner_script(
620 |         db,
621 |         LsfSpawner,
622 |         script,
623 |         batch_script_re_list=batch_script_re_list,
624 |         spawner_kwargs=spawner_kwargs,
625 |     )
626 | 
627 | 
628 | async def test_keepvars(db, event_loop):
629 |     # req_keepvars
630 |     spawner_kwargs = {
631 |         "req_keepvars": "ABCDE",
632 |     }
633 |     batch_script_re_list = [
634 |         re.compile(r"--export=ABCDE", re.X | re.M),
635 |     ]
636 |     await run_typical_slurm_spawner(
637 |         db,
638 |         spawner_kwargs=spawner_kwargs,
639 |         batch_script_re_list=batch_script_re_list,
640 |     )
641 | 
642 |     # req_keepvars AND req_keepvars_extra together
643 |     spawner_kwargs = {
644 |         "req_keepvars": "ABCDE",
645 |         "req_keepvars_extra": "XYZ",
646 |     }
647 |     batch_script_re_list = [
648 |         re.compile(r"--export=ABCDE,XYZ", re.X | re.M),
649 |     ]
650 |     await run_typical_slurm_spawner(
651 |         db,
652 |         spawner_kwargs=spawner_kwargs,
653 |         batch_script_re_list=batch_script_re_list,
654 |     )
655 | 
656 | 
657 | async def test_early_stop(db, event_loop):
658 |     script = [
659 |         (re.compile(r"sudo.*sbatch"), str(testjob)),
660 |         (re.compile(r"sudo.*squeue"), "PENDING "),  # pending
661 |         (
662 |             re.compile(r"sudo.*squeue"),
663 |             "slurm_load_jobs error: Unable to contact slurm controller",
664 |         ),  # unknown
665 |         # job exits early during start
666 |         (re.compile(r"sudo.*squeue"), ""),
667 |         (re.compile(r"sudo.*scancel"), "STOP"),
668 |     ]
669 |     with pytest.raises(RuntimeError, match="job has disappeared"):
670 |         await run_spawner_script(db, SlurmSpawner, script)
671 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | 
 6 | # autoflake is used for autoformatting Python code
 7 | #
 8 | # ref: https://github.com/PyCQA/autoflake#readme
 9 | #
10 | [tool.autoflake]
11 | ignore-init-module-imports = true
12 | #remove-all-unused-imports = true
13 | remove-duplicate-keys = true
14 | remove-unused-variables = true
15 | 
16 | 
17 | # isort is used for autoformatting Python code
18 | #
19 | # ref: https://pycqa.github.io/isort/
20 | #
21 | [tool.isort]
22 | profile = "black"
23 | 
24 | 
25 | # black is used for autoformatting Python code
26 | #
27 | # ref: https://black.readthedocs.io/en/stable/
28 | #
29 | [tool.black]
30 | target_version = [
31 |     "py36",
32 |     "py37",
33 |     "py38",
34 |     "py39",
35 |     "py310",
36 |     "py311",
37 |     "py312",
38 | ]
39 | 
40 | 
41 | # pytest is used for running Python based tests
42 | #
43 | # ref: https://docs.pytest.org/en/stable/
44 | #
45 | [tool.pytest.ini_options]
46 | addopts = "--verbose --color=yes --durations=10 --cov=batchspawner"
47 | asyncio_mode = "auto"
48 | testpaths = ["batchspawner/tests"]
49 | 
50 | 
51 | # pytest-cov / coverage is used to measure code coverage of tests
52 | #
53 | # ref: https://coverage.readthedocs.io/en/stable/config.html
54 | #
55 | [tool.coverage.run]
56 | omit = [
57 |     "batchspawner/tests/*",
58 | ]
59 | 
60 | 
61 | # tbump is used to simplify and standardize the release process when updating
62 | # the version, making a git commit and tag, and pushing changes.
63 | #
64 | # ref: https://github.com/your-tools/tbump#readme
65 | #
66 | [tool.tbump]
67 | github_url = "https://github.com/jupyterhub/batchspawner"
68 | 
69 | [tool.tbump.version]
70 | current = "1.3.1.dev"
71 | regex = '''
72 |     (?P<major>\d+)
73 |     \.
74 |     (?P<minor>\d+)
75 |     \.
76 |     (?P<patch>\d+)
77 |     (?P<pre>((a|b|rc)\d+)|)
78 |     \.?
79 |     (?P<dev>(?<=\.)dev\d*|)
80 | '''
81 | 
82 | [tool.tbump.git]
83 | message_template = "Bump to {new_version}"
84 | tag_template = "v{new_version}"
85 | 
86 | [[tool.tbump.file]]
87 | src = "setup.py"
88 | 
89 | [[tool.tbump.file]]
90 | src = "batchspawner/_version.py"
91 | 
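
The tbump version regex above can be sanity-checked in isolation. A quick standalone snippet (not part of the repository) that compiles the same pattern with re.VERBOSE and prints the named groups it extracts:

import re

# Same pattern as in [tool.tbump.version] above, compiled with re.VERBOSE so
# the multi-line layout is preserved.
VERSION_RE = re.compile(
    r"""
    (?P<major>\d+)
    \.
    (?P<minor>\d+)
    \.
    (?P<patch>\d+)
    (?P<pre>((a|b|rc)\d+)|)
    \.?
    (?P<dev>(?<=\.)dev\d*|)
    """,
    re.VERBOSE,
)

for version in ("1.3.1.dev", "1.3.1", "1.3.0rc1"):
    print(version, "->", VERSION_RE.fullmatch(version).groupdict())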


--------------------------------------------------------------------------------
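
For reference, the regex-driven state parsing that BatchSpawnerRegexStates subclasses rely on can be exercised directly. A minimal sketch (not part of the repository) using SlurmSpawner's default patterns against a fabricated squeue line:

import re

# Fabricated output in the shape produced by SlurmSpawner.batch_query_cmd,
# i.e. squeue -h -j {job_id} -o '%T %B'
job_status = "RUNNING node042.cluster"

state_running_re = r"^(?:RUNNING|COMPLETING)"  # SlurmSpawner default
state_exechost_re = r"\s+((?:[\w_-]+\.?)+)$"  # SlurmSpawner default

assert re.search(state_running_re, job_status)
match = re.search(state_exechost_re, job_status)
print(match.groups()[0])  # -> node042.cluster, used as the notebook ip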
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open("README.md") as f:
 4 |     long_description = f.read()
 5 | 
 6 | setup(
 7 |     name="batchspawner",
 8 |     entry_points={
 9 |         "console_scripts": ["batchspawner-singleuser=batchspawner.singleuser:main"],
10 |     },
11 |     packages=["batchspawner"],
12 |     version="1.3.1.dev",
13 |     description="""Batchspawner: A spawner for JupyterHub to spawn notebooks using batch resource managers.""",
14 |     long_description=long_description,
15 |     long_description_content_type="text/markdown",
16 |     author="Michael Milligan, Andrea Zonca, Mike Gilbert",
17 |     author_email="milligan@umn.edu",
18 |     url="http://jupyter.org",
19 |     license="BSD",
20 |     platforms="Linux, Mac OS X",
21 |     keywords=["Interactive", "Interpreter", "Shell", "Web", "Jupyter"],
22 |     classifiers=[
23 |         "Intended Audience :: Developers",
24 |         "Intended Audience :: System Administrators",
25 |         "Intended Audience :: Science/Research",
26 |         "License :: OSI Approved :: BSD License",
27 |         "Programming Language :: Python",
28 |         "Programming Language :: Python :: 3",
29 |     ],
30 |     project_urls={
31 |         "Bug Reports": "https://github.com/jupyterhub/batchspawner/issues",
32 |         "Source": "https://github.com/jupyterhub/batchspawner/",
33 |         "About Jupyterhub": "http://jupyterhub.readthedocs.io/en/latest/",
34 |         "Jupyter Project": "http://jupyter.org",
35 |     },
36 |     python_requires=">=3.6",
37 |     install_requires=[
38 |         "jinja2",
39 |         "jupyterhub>=1.5.1",
40 |     ],
41 |     extras_require={
42 |         "test": [
43 |             "pytest",
44 |             "pytest-asyncio",
45 |             "pytest-cov",
46 |             "notebook",
47 |         ],
48 |     },
49 | )
50 | 


--------------------------------------------------------------------------------
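
As a closing illustration, a minimal jupyterhub_config.py wiring JupyterHub to one of the spawners above might look like the sketch below. It is not part of the repository; the partition name and resource values are placeholders, and req_partition is assumed to be one of the req_* traits defined earlier in batchspawner.py (the SlurmSpawner batch_script references {{partition}}).

# jupyterhub_config.py -- illustrative sketch only; values are placeholders.
# The config object `c` is provided by JupyterHub when it loads this file.
c.JupyterHub.spawner_class = "batchspawner.SlurmSpawner"

# req_* traits become {{...}} substitution variables in batch_script
c.SlurmSpawner.req_runtime = "8:00:00"  # -> {{runtime}}, sbatch --time
c.SlurmSpawner.req_memory = "4G"  # -> {{memory}}, sbatch --mem
c.SlurmSpawner.req_nprocs = "2"  # -> {{nprocs}}, sbatch --cpus-per-task
c.SlurmSpawner.req_partition = "interactive"  # -> {{partition}}, assumed trait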