├── tests ├── pytest.ini ├── mock-slurm │ └── bin │ │ ├── squeue │ │ └── sacct ├── cluster-config.yaml ├── Snakefile_issue49 ├── docker-compose.yaml ├── test_sidecar.py ├── test_issues.py ├── Snakefile ├── test_cookie.py ├── test_slurm.py ├── test_utils.py ├── deploystack.sh ├── wrapper.py └── conftest.py ├── pytest.ini ├── {{cookiecutter.profile_name}} ├── slurm-jobscript.sh ├── settings.json ├── config.yaml ├── CookieCutter.py ├── slurm-status.py ├── slurm-submit.py ├── slurm-sidecar.py └── slurm_utils.py ├── .gitignore ├── requirements.txt ├── .flake8 ├── test-environment.yml ├── .pre-commit-config.yaml ├── cookiecutter.json ├── LICENSE ├── .github └── workflows │ └── slurm.yaml ├── ChangeLog.md ├── README.md └── conda-linux-64.lock /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -rasxX 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --keep-baked-projects -v -s 3 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/slurm-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | {exec_job} 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest 3 | 4 | # pycharm 5 | .idea/ 6 | 7 | # text editors 8 | *.sw? 
9 | *~ 10 | -------------------------------------------------------------------------------- /tests/mock-slurm/bin/squeue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | cat <<"EOF" 4 | JOBID,STATE 5 | 1044785,RUNNING 6 | 1044875,RUNNING 7 | EOF -------------------------------------------------------------------------------- /tests/mock-slurm/bin/sacct: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | cat <<"EOF" 4 | 1044785|RUNNING|0:0 5 | 1044785.extern|RUNNING|0:0 6 | 1044785.0|RUNNING|0:0 7 | EOF -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cookiecutter 2 | 3 | # Testing 4 | docker==3.* 5 | snakemake==5.10.* 6 | pytest 7 | pytest-runner 8 | pep8 9 | pytest-cookies 10 | pyflakes 11 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "SBATCH_DEFAULTS": "{{cookiecutter.sbatch_defaults}}", 3 | "CLUSTER_NAME": "{{cookiecutter.cluster_name}}", 4 | "CLUSTER_CONFIG": "{{cookiecutter.cluster_config}}" 5 | } 6 | -------------------------------------------------------------------------------- /tests/cluster-config.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | constraint: mem500MB 3 | 4 | memory_with_constraint: 5 | constraint: mem800MB 6 | 7 | short_queue: 8 | partition: debug 9 | 10 | simemory: 11 | mem: 1G 12 | 13 | set_partition_in_cc: 14 | partition: debug 15 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Based directly on Black's
recommendations: 3 | # https://black.readthedocs.io/en/stable/the_black_code_style.html#line-length 4 | max-line-length = 81 5 | select = A,C,E,F,W,B,B950 6 | #B305 doesn't like `.next()` that is a key Tree method. 7 | ignore = E203, E501, W503, B305 8 | -------------------------------------------------------------------------------- /test-environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - default 5 | dependencies: 6 | - python 7 | - pip 8 | - pytest 9 | - pytest-runner 10 | - docker-py >=3.0 11 | - cookiecutter 12 | - pep8 13 | - pytest-cookies >=0.4.0 14 | - pytest-timeout 15 | - pyflakes 16 | - snakemake 17 | - urllib3 18 | - cryptography 19 | -------------------------------------------------------------------------------- /tests/Snakefile_issue49: -------------------------------------------------------------------------------- 1 | # -*- snakemake -*- 2 | rule long_running_rule: 3 | output: 4 | "{sample}.delayOutput.txt", 5 | resources: 6 | time = 2 7 | shell: 8 | """ 9 | echo "" > {output} 10 | 11 | for i in $(seq 1 60) 12 | do 13 | echo $i >> {output} 14 | sleep 1 15 | done 16 | """ 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: mixed-line-ending 7 | - id: check-case-conflict 8 | - id: check-added-large-files 9 | - id: check-yaml 10 | - id: end-of-file-fixer 11 | - id: trailing-whitespace 12 | - repo: https://gitlab.com/pycqa/flake8 13 | rev: 3.8.3 14 | hooks: 15 | - id: flake8 16 | args: [--config=.flake8] 17 | additional_dependencies: ["flake8-bugbear==20.1.4", "flake8-builtins==1.5.2"] 18 | -------------------------------------------------------------------------------- 
/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "profile_name": "slurm", 3 | "use_singularity": [false, true], 4 | "use_conda": [false, true], 5 | "jobs": 500, 6 | "restart_times": 0, 7 | "max_status_checks_per_second": 10, 8 | "max_jobs_per_second": 10, 9 | "latency_wait": 5, 10 | "print_shell_commands": [false, true], 11 | "sbatch_defaults": "", 12 | "cluster_sidecar_help": "Use cluster sidecar. NB! Requires snakemake >= 7.0! Enter to continue...", 13 | "cluster_sidecar": ["yes", "no"], 14 | "cluster_name": "", 15 | "cluster_jobname": "%r_%w", 16 | "cluster_logpath": "logs/slurm/%r/%j", 17 | "cluster_config_help": "The use of cluster-config is discouraged. Rather, set snakemake CLI options in the profile configuration file (see snakemake documentation on best practices). Enter to continue...", 18 | "cluster_config": "" 19 | } 20 | -------------------------------------------------------------------------------- /tests/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | snakemake: 4 | image: quay.io/biocontainers/snakemake:7.30.1--hdfd78af_0 5 | hostname: slurmctl 6 | command: /bin/bash 7 | deploy: 8 | resources: 9 | limits: 10 | cpus: '0.5' 11 | memory: 1000M 12 | reservations: 13 | cpus: '0.5' 14 | memory: 1000M 15 | tty: true 16 | stdin_open: true 17 | working_dir: /tmp 18 | volumes: 19 | - usr:/usr/ 20 | slurm: 21 | image: giovtorres/docker-centos7-slurm:20.11.8 22 | hostname: slurmctl 23 | stdin_open: true 24 | tty: true 25 | working_dir: /tmp 26 | environment: 27 | PATH: "/opt/local/bin:$PATH" 28 | SNAKEMAKE_PATH: "/opt/local/bin" 29 | LC_ALL: en_US.UTF-8 30 | LANG: en_US.UTF-8 31 | volumes: 32 | # Mount snakemake image usr volume to opt 33 | - usr:/opt/ 34 | - /tmp:/tmp 35 | 36 | volumes: 37 | usr: 38 | -------------------------------------------------------------------------------- /tests/test_sidecar.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import signal 5 | import subprocess 6 | import tempfile 7 | import time 8 | 9 | import pytest 10 | 11 | 12 | @pytest.mark.slow 13 | @pytest.mark.timeout(60) 14 | def test_cluster_sidecar_smoke(): 15 | env = dict(os.environ) 16 | env["PATH"] = ( 17 | os.path.realpath(os.path.dirname(__file__) + "/mock-slurm/bin") + ":" + env.get("PATH") 18 | ) 19 | path_sidecar_py = os.path.realpath( 20 | os.path.dirname(__file__) + "/../{{cookiecutter.profile_name}}/slurm-sidecar.py" 21 | ) 22 | with tempfile.TemporaryFile("w+t") as tmpf: 23 | with subprocess.Popen(["python", path_sidecar_py], env=env, text=True, stdout=tmpf) as proc: 24 | time.sleep(2) 25 | os.kill(proc.pid, signal.SIGTERM) 26 | tmpf.seek(0) 27 | stdout = tmpf.read() 28 | the_vars = json.loads(stdout.splitlines()[0]) 29 | assert "server_port" in the_vars 30 | assert "server_secret" in the_vars 31 | assert proc.returncode == 0 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Snakemake-Profiles 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/config.yaml: -------------------------------------------------------------------------------- 1 | {%- if cookiecutter.cluster_sidecar == "yes" %} 2 | cluster-sidecar: "slurm-sidecar.py" 3 | {%- endif %} 4 | cluster-cancel: "scancel" 5 | restart-times: "{{cookiecutter.restart_times}}" 6 | jobscript: "slurm-jobscript.sh" 7 | cluster: "slurm-submit.py" 8 | cluster-status: "slurm-status.py" 9 | max-jobs-per-second: "{{cookiecutter.max_jobs_per_second}}" 10 | max-status-checks-per-second: "{{cookiecutter.max_status_checks_per_second}}" 11 | local-cores: 1 12 | latency-wait: "{{cookiecutter.latency_wait}}" 13 | use-conda: "{{cookiecutter.use_conda}}" 14 | use-singularity: "{{cookiecutter.use_singularity}}" 15 | jobs: "{{cookiecutter.jobs}}" 16 | printshellcmds: "{{cookiecutter.print_shell_commands}}" 17 | 18 | # Example resource configuration 19 | # default-resources: 20 | # - runtime=100 21 | # - mem_mb=6000 22 | # - disk_mb=1000000 23 | # # set-threads: map rule names to threads 24 | # set-threads: 25 | # - single_core_rule=1 26 | # - multi_core_rule=10 27 | # # set-resources: map rule names to resources in general 28 | # set-resources: 29 | # - high_memory_rule:mem_mb=12000 30 | # - long_running_rule:runtime=1200 31 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/CookieCutter.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Based on lsf CookieCutter.py 3 | # 4 | import os 5 | import json 6 | 7 | d = os.path.dirname(__file__) 8 | with open(os.path.join(d, "settings.json")) as fh: 9 | settings = json.load(fh) 10 | 11 | 12 | def from_entry_or_env(values, key): 13 | """Return value from ``values`` and override with environment variables.""" 14 | if key in os.environ: 15 | return os.environ[key] 16 | else: 17 | return values[key] 18 | 19 | 20 | class CookieCutter: 21 | 22 | SBATCH_DEFAULTS = from_entry_or_env(settings, "SBATCH_DEFAULTS") 23 | CLUSTER_NAME = from_entry_or_env(settings, "CLUSTER_NAME") 24 | CLUSTER_CONFIG = from_entry_or_env(settings, "CLUSTER_CONFIG") 25 | 26 | @staticmethod 27 | def get_cluster_option() -> str: 28 | cluster = CookieCutter.CLUSTER_NAME 29 | if cluster != "": 30 | return f"--cluster={cluster}" 31 | return "" 32 | 33 | @staticmethod 34 | def get_cluster_logpath() -> str: 35 | return "{{cookiecutter.cluster_logpath}}" 36 | 37 | @staticmethod 38 | def get_cluster_jobname() -> str: 39 | return "{{cookiecutter.cluster_jobname}}" 40 | -------------------------------------------------------------------------------- /tests/test_issues.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pytest 3 | import time 4 | import re 5 | import py 6 | 7 | @pytest.fixture 8 | def issue49(cookie_factory, datafile): 9 | p = datafile("Snakefile_issue49", "Snakefile") 10 | cookie_factory() 11 | d = py.path.local(p.dirname) 12 | config = d.join("slurm").join("config.yaml") 13 | lines = re.sub("restart-times: 3", "restart-times: 0", 14 | config.read()) 15 | config.write(lines) 16 | 17 | 18 | @pytest.mark.xfail 19 | def test_issue49(smk_runner, issue49): 20 | """https://github.com/Snakemake-Profiles/slurm/issues/49 21 | 22 | Cancelling a slurm job leaves an incomplete file that causes 23 | Snakemake to incorrectly state that 
there is 'Nothing to be done'. 24 | 25 | """ 26 | smk_runner.make_target("foo.delayOutput.txt", asynchronous=True) 27 | time.sleep(5) 28 | while smk_runner.check_jobstatus("RUNNING", verbose=False) is None: 29 | time.sleep(5) 30 | print("Job is running") 31 | time.sleep(20) 32 | jid = smk_runner.external_jobid[0] 33 | print(f"Cancelling job {jid}") 34 | smk_runner.exec_run(f"scancel {jid}") 35 | while smk_runner.check_jobstatus("CANCELLED", verbose=True) is None: 36 | time.sleep(5) 37 | print("Job has been cancelled; resubmitting") 38 | smk_runner.make_target("foo.delayOutput.txt") 39 | time.sleep(10) 40 | assert "Nothing to be done" in print(smk_runner.output) 41 | -------------------------------------------------------------------------------- /tests/Snakefile: -------------------------------------------------------------------------------- 1 | rule timeout: 2 | resources: 3 | runtime = lambda wildcards, attempt: attempt 4 | output: "timeout.txt" 5 | threads: lambda wildcards, attempt: attempt 6 | log: "timeout.log" 7 | shell: 8 | """ 9 | for i in $(seq 1 80) 10 | do 11 | echo $i >> {output} 12 | sleep 1 13 | done 14 | exit 0 15 | """ 16 | 17 | 18 | rule bar: 19 | resources: 20 | runtime = 1 21 | output: "bar.txt" 22 | shell: 23 | "echo bar > {output}" 24 | 25 | 26 | rule excessive_runtime: 27 | resources: 28 | runtime = 10000 29 | output: "runtime.txt" 30 | shell: 31 | "echo {resources} > {output}" 32 | 33 | 34 | rule excessive_memory: 35 | resources: 36 | mem_mb = 2000 37 | output: "memory.txt" 38 | shell: 39 | "echo {resources} > {output}" 40 | 41 | 42 | rule memory_with_constraint: 43 | resources: 44 | mem_mb = 800 45 | output: "memory_with_constraint.txt" 46 | shell: 47 | "echo {resources} > {output}" 48 | 49 | 50 | rule short_queue: 51 | output: "short_queue.txt" 52 | shell: "touch {output}" 53 | 54 | 55 | rule group_job1: 56 | output: "group_job.1.txt" 57 | group: "groupjob" 58 | shell: "echo group_job1 > {output}" 59 | 60 | 61 | rule group_job2: 62 | 
input: "{wc}.1.txt" 63 | output: "{wc}.2.txt" 64 | group: "groupjob" 65 | shell: "cat {input} > {output}; echo group_job2 >> {output}" 66 | 67 | 68 | rule wildcard_job: 69 | output: "{prefix}.wc.txt" 70 | params: 71 | label = "wcj_params" 72 | wildcard_constraints: 73 | prefix = "wildcard" 74 | shell: "echo {wildcards.prefix} > {output}" 75 | 76 | 77 | rule simemory: 78 | output: "siunit.txt" 79 | shell: "touch {output}" 80 | 81 | 82 | rule set_partition_in_cc: 83 | output: "partition.cc.txt" 84 | shell: 85 | "echo {resources} > {output}" 86 | 87 | rule set_partition_in_resources: 88 | output: "partition.resources.txt" 89 | resources: 90 | partition = "debug" 91 | shell: 92 | "echo {resources} > {output}" 93 | -------------------------------------------------------------------------------- /tests/test_cookie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import pytest 4 | from unittest.mock import patch 5 | 6 | 7 | @pytest.mark.parametrize("sidecar", ["yes", "no"]) 8 | def test_bake_project(cookies, sidecar): 9 | result = cookies.bake(template=str(pytest.cookie_template), 10 | extra_context={"cluster_sidecar": sidecar}) 11 | cfg = result.project_path / "config.yaml" 12 | if sidecar == "yes": 13 | assert "cluster-sidecar: \"slurm-sidecar.py\"\n" in cfg.read_text() 14 | else: 15 | assert "cluster-sidecar: \"slurm-sidecar.py\"" not in cfg.read_text() 16 | assert result.exit_code == 0 17 | assert result.exception is None 18 | assert result.project_path.name == "slurm" 19 | assert result.project_path.is_dir() 20 | 21 | 22 | def test_cookiecutter(cookies, monkeypatch): 23 | result = cookies.bake(template=str(pytest.cookie_template)) 24 | assert result.exit_code == 0 25 | assert result.exception is None 26 | 27 | assert result.project_path.name == "slurm" 28 | assert result.project_path.is_dir() 29 | with patch.dict(sys.modules): 30 | if "CookieCutter" in sys.modules: 31 | del 
sys.modules["CookieCutter"] 32 | monkeypatch.syspath_prepend(str(result.project_path)) 33 | from CookieCutter import CookieCutter 34 | assert CookieCutter.SBATCH_DEFAULTS == "" 35 | assert CookieCutter.CLUSTER_NAME == "" 36 | assert CookieCutter.CLUSTER_CONFIG == "" 37 | assert CookieCutter.get_cluster_option() == "" 38 | 39 | 40 | def test_cookiecutter_extra_context(cookies, monkeypatch): 41 | result = cookies.bake(template=str(pytest.cookie_template), 42 | extra_context={"sbatch_defaults": "account=foo", 43 | "cluster_name": "dusk", 44 | "cluster_config": "slurm.yaml"}) 45 | assert result.exit_code == 0 46 | assert result.exception is None 47 | 48 | assert result.project_path.name == "slurm" 49 | assert result.project_path.is_dir() 50 | with patch.dict(sys.modules): 51 | if "CookieCutter" in sys.modules: 52 | del sys.modules["CookieCutter"] 53 | monkeypatch.syspath_prepend(str(result.project_path)) 54 | from CookieCutter import CookieCutter 55 | assert CookieCutter.SBATCH_DEFAULTS == "account=foo" 56 | assert CookieCutter.CLUSTER_NAME == "dusk" 57 | assert CookieCutter.CLUSTER_CONFIG == "slurm.yaml" 58 | assert CookieCutter.get_cluster_option() == "--cluster=dusk" 59 | -------------------------------------------------------------------------------- /.github/workflows/slurm.yaml: -------------------------------------------------------------------------------- 1 | name: Test SnakemakeProfiles/slurm 2 | env: 3 | SNAKEMAKE_IMAGE: quay.io/biocontainers/snakemake:7.30.1--hdfd78af_0 4 | SLURM_IMAGE: giovtorres/docker-centos7-slurm:20.11.8 5 | DOCKER_COMPOSE: tests/docker-compose.yaml 6 | 7 | on: [push, pull_request] 8 | 9 | jobs: 10 | slurmtest: 11 | name: Test slurm profile in docker containers 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 30 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v3 17 | 18 | - run: mkdir -p ~/image-cache 19 | 20 | - name: cache conda environment 21 | uses: actions/cache@v3 22 | env: 23 | CACHE_NUMBER: 0 24 | 
with: 25 | path: ~/conda_pkgs_dir 26 | key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda-linux-64.lock') }} 27 | 28 | - name: cache images 29 | uses: actions/cache@v3 30 | id: cache-images 31 | env: 32 | CACHE_NUMBER: 0 33 | with: 34 | path: ~/image-cache 35 | key: image-cache-${{ runner.os }}-${{ env.CACHE_NUMBER }}-${{ env.SNAKEMAKE_IMAGE }}-${{ env.SLURM_IMAGE }} 36 | 37 | - name: install miniconda 38 | uses: conda-incubator/setup-miniconda@v2 39 | with: 40 | miniconda-version: "latest" 41 | environment-file: conda-linux-64.lock 42 | use-only-tar-bz2: true 43 | 44 | - name: docker swarm init 45 | run: docker swarm init 46 | 47 | - if: steps.cache-images.outputs.cache-hit == 'true' 48 | run: docker load -i ~/image-cache/snakemake.tar 49 | 50 | - if: steps.cache-images.outputs.cache-hit == 'true' 51 | run: docker load -i ~/image-cache/slurm.tar 52 | 53 | - name: docker deploy 54 | shell: bash -l {0} 55 | env: 56 | DOCKER_COMPOSE: ${{ env.DOCKER_COMPOSE }} 57 | SNAKEMAKE_IMAGE: ${{ env.SNAKEMAKE_IMAGE }} 58 | SLURM_IMAGE: ${{ env.SLURM_IMAGE }} 59 | run: ./tests/deploystack.sh 60 | 61 | - if: steps.cache-images.outputs.cache-hit != 'true' 62 | run: docker save -o ~/image-cache/snakemake.tar ${{ env.SNAKEMAKE_IMAGE }} 63 | 64 | - if: steps.cache-images.outputs.cache-hit != 'true' 65 | run: docker save -o ~/image-cache/slurm.tar ${{ env.SLURM_IMAGE }} 66 | 67 | - name: run tests 68 | shell: bash -l {0} 69 | run: | 70 | pytest -v -s tests/test_cookie.py 71 | pytest -v -s tests/test_utils.py 72 | pytest -v -s tests/test_slurm.py --slow 73 | pytest -v -s tests/test_sidecar.py 74 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | # ChangeLog 2 | 3 | ## 2022-05-18 4 | 5 | ### Added 6 | - Human-friendly time formatting 7 | - Common snakemake options to cookiecutter JSON [[#80][80]] 8 | - `--use-singularity` 9 | - 
`--use-conda` 10 | - `--printshellcmds` 11 | - `--latency-wait` 12 | - `--max-jobs-per-second` 13 | - `--max-status-checks-per-second` 14 | - `--restart-times` 15 | - Support for passing extra options to `sbatch` via the `slurm` parameter in a rule's `resources` [[#86][86]] 16 | - More control over job names and log paths with patterns 17 | - Print the job log path along with the job ID 18 | 19 | ## 2022-05-12 20 | 21 | ### Changes 22 | 23 | - deprecate advanced argument conversion (#91, PR #93) 24 | - add support for sidecar (PR #85) 25 | 26 | ## 2021-03-10 27 | 28 | ### Issues 29 | 30 | - serialize cookiecutter settings in json file and add CookieCutter 31 | class to access config (fixes #63) 32 | - demote pandas import to function (addresses #64) 33 | - add preliminary support for suffixes when specifying memory (fixes #62) 34 | 35 | ## 2020-10-23 36 | 37 | This is a major rewrite of the testing framework, in an attempt to 38 | make it more accessible to users. In particular, it is now possible to 39 | run tests on an HPC. 40 | 41 | ### Changes 42 | 43 | - the new ShellContainer class emulates container output and allows 44 | the tests to be executed on a HPC running SLURM 45 | - slurm-submit.py now submits sbatch jobs with the --parsable option 46 | - add profile option 'cluster\_name' - some HPCs define multiple SLURM 47 | clusters. Simply adding --cluster=cluster\_name to SBATCH_DEFAULTS 48 | will not suffice as slurm-status.py also needs to check status in 49 | the queue corresponding to the chosen cluster. 50 | - the advanced argument conversion has been much simplified and 51 | improved 52 | 53 | ### Issues 54 | 55 | - options without arguments can now be set in SBATCH_DEFAULTS (fixes #52) 56 | 57 | 58 | ## 2020-09-11 59 | 60 | Move CI infrastructure from circleCI to github actions. 
61 | 62 | ## 2020-04-15 63 | 64 | - process string patterns in snakemake style (replace keywords in braces) 65 | 66 | ## 2020-03-31 67 | 68 | - map threads to `--cpus-per-task` (#35) 69 | - rewrite some tests to address changes 70 | 71 | ## 2020-02-29 72 | 73 | - major rewrite and merge of the `slurm-submit.py` script to support any sbatch argument 74 | - parse any argument via the `sbatch_defaults` option and 75 | - enable per-profile cluster (YAML/JSON) config file 76 | - make experimental sbatch argument adjustments optional via the `advanced_argument_conversion` option 77 | 78 | ## 2019-09-03 79 | 80 | - add qos option 81 | 82 | ## 2019-08-21 83 | 84 | - replace pytest_namespace with pytest_configure 85 | - make days optional (#18) 86 | 87 | ## 2018-10-18 88 | 89 | - add cookiecutter options to set sbatch output and error defaults 90 | 91 | ## 2018-10-09 92 | 93 | - add support for mem_mb in resources 94 | - add support for cluster configuration file 95 | - add advanced slurm-submit file 96 | - adjust resource requirements if they exceed partition configuration 97 | settings (#11) 98 | 99 | 100 | [80]: https://github.com/Snakemake-Profiles/slurm/issues/80 101 | [86]: https://github.com/Snakemake-Profiles/slurm/issues/86 -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/slurm-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import re 5 | import requests 6 | import subprocess as sp 7 | import shlex 8 | import sys 9 | import time 10 | import logging 11 | from CookieCutter import CookieCutter 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | STATUS_ATTEMPTS = 20 16 | SIDECAR_VARS = os.environ.get("SNAKEMAKE_CLUSTER_SIDECAR_VARS", None) 17 | DEBUG = bool(int(os.environ.get("SNAKEMAKE_SLURM_DEBUG", "0"))) 18 | 19 | if DEBUG: 20 | logging.basicConfig(level=logging.DEBUG) 21 | 
logger.setLevel(logging.DEBUG) 22 | 23 | 24 | def get_status_direct(jobid): 25 | """Get status directly from sacct/scontrol""" 26 | cluster = CookieCutter.get_cluster_option() 27 | for i in range(STATUS_ATTEMPTS): 28 | try: 29 | sacct_res = sp.check_output(shlex.split(f"sacct {cluster} -P -b -j {jobid} -n")) 30 | res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} 31 | break 32 | except sp.CalledProcessError as e: 33 | logger.error("sacct process error") 34 | logger.error(e) 35 | except IndexError as e: 36 | logger.error(e) 37 | pass 38 | # Try getting job with scontrol instead in case sacct is misconfigured 39 | try: 40 | sctrl_res = sp.check_output(shlex.split(f"scontrol {cluster} -o show job {jobid}")) 41 | m = re.search(r"JobState=(\w+)", sctrl_res.decode()) 42 | res = {jobid: m.group(1)} 43 | break 44 | except sp.CalledProcessError as e: 45 | logger.error("scontrol process error") 46 | logger.error(e) 47 | if i >= STATUS_ATTEMPTS - 1: 48 | print("failed") 49 | exit(0) 50 | else: 51 | time.sleep(1) 52 | 53 | return res[jobid] or "" 54 | 55 | 56 | def get_status_sidecar(jobid): 57 | """Get status from cluster sidecar""" 58 | sidecar_vars = json.loads(SIDECAR_VARS) 59 | url = "http://localhost:%d/job/status/%s" % (sidecar_vars["server_port"], jobid) 60 | headers = {"Authorization": "Bearer %s" % sidecar_vars["server_secret"]} 61 | try: 62 | resp = requests.get(url, headers=headers) 63 | if resp.status_code == 404: 64 | return "" # not found yet 65 | logger.debug("sidecar returned: %s" % resp.json()) 66 | resp.raise_for_status() 67 | return resp.json().get("status") or "" 68 | except requests.exceptions.ConnectionError as e: 69 | logger.warning("slurm-status.py: could not query side car: %s", e) 70 | logger.info("slurm-status.py: falling back to direct query") 71 | return get_status_direct(jobid) 72 | 73 | 74 | jobid = sys.argv[1] 75 | 76 | if SIDECAR_VARS: 77 | logger.debug("slurm-status.py: querying sidecar") 78 | status = 
get_status_sidecar(jobid) 79 | else: 80 | logger.debug("slurm-status.py: direct query") 81 | status = get_status_direct(jobid) 82 | 83 | logger.debug("job status: %s", repr(status)) 84 | 85 | if status == "BOOT_FAIL": 86 | print("failed") 87 | elif status == "OUT_OF_MEMORY": 88 | print("failed") 89 | elif status.startswith("CANCELLED"): 90 | print("failed") 91 | elif status == "COMPLETED": 92 | print("success") 93 | elif status == "DEADLINE": 94 | print("failed") 95 | elif status == "FAILED": 96 | print("failed") 97 | elif status == "NODE_FAIL": 98 | print("failed") 99 | elif status == "PREEMPTED": 100 | print("failed") 101 | elif status == "TIMEOUT": 102 | print("failed") 103 | elif status == "SUSPENDED": 104 | print("running") 105 | else: 106 | print("running") 107 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/slurm-submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Snakemake SLURM submit script. 
4 | """ 5 | import json 6 | import logging 7 | import os 8 | 9 | import requests 10 | from snakemake.utils import read_job_properties 11 | 12 | import slurm_utils 13 | from CookieCutter import CookieCutter 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | SIDECAR_VARS = os.environ.get("SNAKEMAKE_CLUSTER_SIDECAR_VARS", None) 18 | DEBUG = bool(int(os.environ.get("SNAKEMAKE_SLURM_DEBUG", "0"))) 19 | 20 | if DEBUG: 21 | logging.basicConfig(level=logging.DEBUG) 22 | logger.setLevel(logging.DEBUG) 23 | 24 | 25 | def register_with_sidecar(jobid): 26 | if SIDECAR_VARS is None: 27 | return 28 | sidecar_vars = json.loads(SIDECAR_VARS) 29 | url = "http://localhost:%d/job/register/%s" % (sidecar_vars["server_port"], jobid) 30 | logger.debug("POST to %s", url) 31 | headers = {"Authorization": "Bearer %s" % sidecar_vars["server_secret"]} 32 | requests.post(url, headers=headers) 33 | 34 | 35 | # cookiecutter arguments 36 | SBATCH_DEFAULTS = CookieCutter.SBATCH_DEFAULTS 37 | CLUSTER = CookieCutter.get_cluster_option() 38 | CLUSTER_CONFIG = CookieCutter.CLUSTER_CONFIG 39 | 40 | RESOURCE_MAPPING = { 41 | "time": ("time", "runtime", "walltime"), 42 | "mem": ("mem", "mem_mb", "ram", "memory"), 43 | "mem-per-cpu": ("mem-per-cpu", "mem_per_cpu", "mem_per_thread"), 44 | "nodes": ("nodes", "nnodes"), 45 | "partition": ("partition", "queue"), 46 | } 47 | 48 | # parse job 49 | jobscript = slurm_utils.parse_jobscript() 50 | job_properties = read_job_properties(jobscript) 51 | 52 | sbatch_options = {} 53 | cluster_config = slurm_utils.load_cluster_config(CLUSTER_CONFIG) 54 | 55 | # 1) sbatch default arguments and cluster 56 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(SBATCH_DEFAULTS)) 57 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(CLUSTER)) 58 | 59 | # 2) cluster_config defaults 60 | sbatch_options.update(cluster_config["__default__"]) 61 | 62 | # 3) Convert resources (no unit conversion!) 
and threads 63 | sbatch_options.update(slurm_utils.convert_job_properties(job_properties, RESOURCE_MAPPING)) 64 | 65 | # 4) cluster_config for particular rule 66 | sbatch_options.update(cluster_config.get(job_properties.get("rule"), {})) 67 | 68 | # 5) cluster_config options 69 | sbatch_options.update(job_properties.get("cluster", {})) 70 | 71 | # convert human-friendly time - leaves slurm format time as is 72 | if "time" in sbatch_options: 73 | duration = str(sbatch_options["time"]) 74 | sbatch_options["time"] = str(slurm_utils.Time(duration)) 75 | 76 | # 6) Format pattern in snakemake style 77 | sbatch_options = slurm_utils.format_values(sbatch_options, job_properties) 78 | 79 | # 7) create output and error filenames and paths 80 | joblog = slurm_utils.JobLog(job_properties) 81 | log = "" 82 | if "output" not in sbatch_options and CookieCutter.get_cluster_logpath(): 83 | outlog = joblog.outlog 84 | log = outlog 85 | sbatch_options["output"] = outlog 86 | 87 | if "error" not in sbatch_options and CookieCutter.get_cluster_logpath(): 88 | errlog = joblog.errlog 89 | log = errlog 90 | sbatch_options["error"] = errlog 91 | 92 | # ensure sbatch output dirs exist 93 | for o in ("output", "error"): 94 | slurm_utils.ensure_dirs_exist(sbatch_options[o]) if o in sbatch_options else None 95 | 96 | # 9) Set slurm job name 97 | if "job-name" not in sbatch_options and "job_name" not in sbatch_options: 98 | sbatch_options["job-name"] = joblog.jobname 99 | 100 | # submit job and echo id back to Snakemake (must be the only stdout) 101 | jobid = slurm_utils.submit_job(jobscript, **sbatch_options) 102 | logger.debug("Registering %s with sidecar...", jobid) 103 | register_with_sidecar(jobid) 104 | logger.debug("... 
done registering with sidecar") 105 | print(jobid) 106 | -------------------------------------------------------------------------------- /tests/test_slurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pytest 3 | import time 4 | 5 | 6 | @pytest.fixture 7 | def profile(cookie_factory, data, request): 8 | cookie_factory() 9 | 10 | 11 | @pytest.fixture(params=["yes", "no"]) 12 | def sidecar_profile(cookie_factory, data, request): 13 | cookie_factory(cluster_sidecar=request.param) 14 | 15 | 16 | @pytest.mark.slow 17 | @pytest.mark.skipci 18 | def test_no_timeout(smk_runner, sidecar_profile): 19 | """Test that rule that updates runtime doesn't timeout""" 20 | smk_runner.make_target("timeout.txt") 21 | assert "Trying to restart" in smk_runner.output 22 | smk_runner.wait_for_status("COMPLETED") 23 | assert "Finished job" in smk_runner.output 24 | 25 | 26 | @pytest.mark.slow 27 | def test_timeout(smk_runner, sidecar_profile): 28 | """Test that rule excessive runtime resources times out""" 29 | opts = ( 30 | f'--cluster "sbatch --parsable -p {smk_runner.partition} {pytest.account} ' 31 | '-c 1 -t {resources.runtime}" --attempt 1' 32 | ) 33 | smk_runner.make_target("timeout.txt", options=opts, profile=None, asynchronous=True) 34 | # Discount queueing time 35 | smk_runner.wait_for_status("RUNNING") 36 | smk_runner.wait_while_status("RUNNING", tdelta=20, timeout=90) 37 | assert smk_runner.check_jobstatus("TIMEOUT|NODE_FAIL") 38 | 39 | 40 | def test_profile_status_running(smk_runner, sidecar_profile): 41 | """Test that slurm-status.py catches RUNNING status""" 42 | opts = ( 43 | f'--cluster "sbatch --parsable -p {smk_runner.partition}' 44 | f' {pytest.account} -c 1 -t 1"' 45 | ) 46 | smk_runner.make_target( 47 | "timeout.txt", options=opts, profile=None, asynchronous=True 48 | ) # noqa: E501 49 | smk_runner.wait_for_status("RUNNING", tdelta=5) 50 | jid = smk_runner.external_jobid[0] 51 | _, output = 
smk_runner.exec_run( 52 | cmd=f"{smk_runner.slurm_status} {jid}", stream=False 53 | ) 54 | assert output.decode().strip() == "running" 55 | smk_runner.cancel_slurm_job(jid) 56 | 57 | 58 | @pytest.mark.timeout(60) 59 | def test_slurm_submit(smk_runner, profile): 60 | """Test that slurm-submit.py works""" 61 | jobscript = smk_runner.script("jobscript.sh") 62 | jobscript.write( 63 | ( 64 | "#!/bin/bash\n" 65 | '# properties = {"cluster": {"job-name": "sm-job"},' 66 | '"input": [], "output": [], "wildcards": {}, "params": {},' 67 | '"rule": "slurm_submit"}\n' 68 | ) 69 | ) 70 | _, output = smk_runner.exec_run( 71 | cmd=f"{smk_runner.slurm_submit} {jobscript}", stream=False 72 | ) 73 | jobid = int(output.decode().strip()) 74 | time.sleep(5) 75 | assert smk_runner.check_jobstatus( 76 | "sm-job", options="--format=jobname", jobid=jobid) 77 | smk_runner.cancel_slurm_job(jobid) 78 | 79 | 80 | @pytest.mark.timeout(60) 81 | @pytest.mark.skipci 82 | def test_group_job(smk_runner, profile): 83 | """Test that group job properties formatted as expected""" 84 | smk_runner.make_target("group_job.2.txt", stream=False) 85 | smk_runner.wait_for_status("COMPLETED", tdelta=5) 86 | assert "Submitted group job" in smk_runner.output 87 | assert "2 of 2 steps" in smk_runner.output 88 | 89 | 90 | @pytest.mark.timeout(60) 91 | @pytest.mark.skipci 92 | def test_wildcard_job(smk_runner, profile): 93 | """Test that wildcard job properties formatted as expected""" 94 | smk_runner.make_target("wildcard.wc.txt") 95 | assert "Finished job" in smk_runner.output 96 | 97 | 98 | def test_si_units(smk_runner, profile): 99 | """Test that setting memory with si units works""" 100 | _, output = smk_runner.make_target( 101 | "siunit.txt", 102 | options=f"--cluster-config {smk_runner.cluster_config}", 103 | stream=False 104 | ) 105 | assert "Memory specification can not be satisfied" in smk_runner.output 106 | assert "--mem=1000" in smk_runner.output 107 | 108 | 109 | 
@pytest.mark.parametrize("cluster_config", [True, False]) 110 | def test_partition(smk_runner, profile, cluster_config): 111 | options = f"--cluster-config {smk_runner.cluster_config}" if cluster_config else "" 112 | target = "partition.cc.txt" if cluster_config else "partition.resources.txt" 113 | smk_runner.make_target(target, options=options, stream=False) 114 | assert smk_runner.check_jobstatus("debug", "-n -o Partition") 115 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import re 4 | import sys 5 | import subprocess 6 | import pytest 7 | from docker.models.containers import Container 8 | 9 | sys.path.append( 10 | os.path.join(os.path.dirname(__file__), os.pardir, "{{cookiecutter.profile_name}}") 11 | ) 12 | from CookieCutter import CookieCutter # noqa: E402 13 | import slurm_utils # noqa: E402 14 | from slurm_utils import Time, InvalidTimeUnitError # noqa: E402 15 | 16 | 17 | def test_time_to_minutes(): 18 | minutes = slurm_utils.time_to_minutes("foo") 19 | assert minutes is None 20 | minutes = slurm_utils.time_to_minutes("10-00:00:10") 21 | assert minutes == 14401 22 | minutes = slurm_utils.time_to_minutes("10:00:00") 23 | assert minutes == 600 24 | minutes = slurm_utils.time_to_minutes("100:00") 25 | assert minutes == 100 26 | minutes = slurm_utils.time_to_minutes("20") 27 | assert minutes == 20 28 | 29 | 30 | def test_si_units(): 31 | m = slurm_utils._convert_units_to_mb(1000) 32 | assert m == 1000 33 | m = slurm_utils._convert_units_to_mb("1000K") 34 | assert m == 1 35 | m = slurm_utils._convert_units_to_mb("1000M") 36 | assert m == 1000 37 | m = slurm_utils._convert_units_to_mb("1000G") 38 | assert m == 1e6 39 | m = slurm_utils._convert_units_to_mb("1000T") 40 | assert m == 1e9 41 | with pytest.raises(SystemExit): 42 | m = slurm_utils._convert_units_to_mb("1000E") 43 | 44 | 
class TestTime:
    """Tests for slurm_utils.Time: parse human-friendly durations into SLURM time strings."""

    def test_parse_time_seconds(self):
        # seconds-only value
        assert str(Time("4s")) == "0:00:04"

    def test_parse_time_minutes(self):
        assert str(Time("4m")) == "0:04:00"

    def test_parse_time_hours_in_minutes(self):
        # minute counts above an hour carry into the hours field
        assert str(Time("400m")) == "6:40:00"

    def test_parse_time_hours(self):
        # uppercase unit letters are accepted
        assert str(Time("3H")) == "3:00:00"

    def test_parse_time_hours_and_minutes(self):
        assert str(Time("3h46m")) == "3:46:00"

    def test_parse_time_hours_and_minutes_with_space(self):
        # whitespace between components is ignored
        assert str(Time("3h 46m")) == "3:46:00"

    def test_parse_time_days_and_seconds(self):
        # days fold into the hours field
        assert str(Time("1d4s")) == "24:00:04"

    def test_parse_time_days_and_seconds_order_not_important(self):
        assert str(Time("4s1d")) == "24:00:04"

    def test_parse_time_weeks_and_minutes(self):
        assert str(Time("2w4m")) == "336:04:00"

    def test_parse_time_slurm_format_no_parsing(self):
        # values already in SLURM syntax pass through untouched
        assert str(Time("3:45")) == "3:45"

    def test_parse_time_no_units(self):
        # a bare number is left as-is
        assert str(Time("3")) == "3"

    def test_parse_time_zero(self):
        assert str(Time("0")) == "0"

141 | def test_parse_time_float_is_supported(self): 142 | s = "1.5d" 143 | 144 | actual = str(Time(s)) 145 | expected = "36:00:00" 146 | 147 | assert actual == expected 148 | 149 | def test_parse_time_missing_unit_ignores_value_with_no_unit(self): 150 | s = "5m3" 151 | 152 | actual = str(Time(s)) 153 | expected = "0:05:00" 154 | 155 | assert actual == expected 156 | 157 | def test_parse_time_unknown_unit(self): 158 | s = "5x" 159 | 160 | with pytest.raises(InvalidTimeUnitError): 161 | actual = str(Time(s)) 162 | -------------------------------------------------------------------------------- /tests/deploystack.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # 3 | # Deploy docker stack 4 | # 5 | # Compose file 6 | DOCKER_COMPOSE=${DOCKER_COMPOSE:=docker-compose.yaml} 7 | 8 | # Images 9 | SNAKEMAKE_IMAGE=${SNAKEMAKE_IMAGE:=quay.io/biocontainers/snakemake:7.30.1--hdfd78af_0} 10 | SLURM_IMAGE=${SLURM_IMAGE:=giovtorres/docker-centos7-slurm:20.11.8} 11 | 12 | docker pull $SNAKEMAKE_IMAGE 13 | docker pull $SLURM_IMAGE 14 | 15 | # Stack and service config 16 | STACK_NAME=cookiecutter-slurm 17 | SLURM_SERVICE=${STACK_NAME}_slurm 18 | SNAKEMAKE_SERVICE=${STACK_NAME}_snakemake 19 | LOCAL_USER_ID=$(id -u) 20 | 21 | ############################## 22 | ## Functions 23 | ############################## 24 | ## Add slurm user to container 25 | function add_slurm_user { 26 | user=$1 27 | container=$2 28 | # check if user exists 29 | docker exec $container /bin/bash -c "id $user" > /dev/null 30 | if [ $? -eq 1 ]; then 31 | echo "Adding user $user to docker container" 32 | docker exec $container /bin/bash -c "useradd --shell /bin/bash -u $user -o -c \"\" -m -g slurm user" 33 | if [ $? 
-eq 1 ]; then 34 | echo "Failed to add user $user" 35 | exit 1; 36 | fi 37 | fi 38 | } 39 | 40 | 41 | SLURM_CONF=$(cat <> $slurmconf ; " 71 | # Need to be sure slurmdb is available for sacctmgr to work 72 | database_up $container 73 | # Restart services; needed for sacct; see https://github.com/giovtorres/docker-centos7-slurm/issues/3 74 | echo " restarting slurm services..." 75 | docker exec $container /bin/bash -c 'sacctmgr --immediate add cluster name=linux' 76 | docker exec $container supervisorctl restart slurmdbd 77 | docker exec $container supervisorctl restart slurmctld 78 | docker exec $container /bin/bash -c "sacctmgr --immediate add account none,test Description=\"none\" Organization=\"none\"" 79 | docker exec $container sinfo 80 | fi 81 | } 82 | 83 | 84 | ### Check if database is up 85 | function database_up { 86 | COUNT=1 87 | MAXCOUNT=10 88 | 89 | container=$1 90 | docker exec $container mysqladmin status 2> /dev/null 91 | database_up=$? 92 | 93 | until [ $database_up -eq 0 ]; do 94 | echo "$COUNT: database unavailable" 95 | sleep 5 96 | docker exec $container mysqladmin status 2> /dev/null 97 | database_up=$? 98 | if [ $COUNT -eq $MAXCOUNT ]; then 99 | echo "database connection failed" 100 | return 101 | fi 102 | COUNT=$((COUNT+1)) 103 | done 104 | 105 | echo "database up!" 106 | } 107 | 108 | 109 | ### Check if service is up 110 | function service_up { 111 | SERVICE=$1 112 | COUNT=1 113 | MAXCOUNT=30 114 | 115 | docker service ps $SERVICE --format "{{.CurrentState}}" 2>/dev/null | grep Running 116 | service_up=$? 117 | 118 | until [ $service_up -eq 0 ]; do 119 | echo "$COUNT: service $SERVICE unavailable" 120 | sleep 5 121 | docker service ps $SERVICE --format "{{.CurrentState}}" 2>/dev/null | grep Running 122 | service_up=$? 123 | if [ $COUNT -eq $MAXCOUNT ]; then 124 | echo "service $SERVICE not found; giving up" 125 | exit 1 126 | fi 127 | COUNT=$((COUNT+1)) 128 | done 129 | 130 | echo "service $SERVICE up!" 
131 | } 132 | 133 | 134 | ############################## 135 | ## Deploy stack 136 | ############################## 137 | 138 | # Check if docker stack has been deployed 139 | docker service ps $SLURM_SERVICE --format "{{.CurrentState}}" 2>/dev/null | grep Running 140 | service_up=$? 141 | 142 | if [ $service_up -eq 1 ]; then 143 | docker stack deploy --with-registry-auth -c $DOCKER_COMPOSE $STACK_NAME; 144 | fi 145 | 146 | service_up $SLURM_SERVICE 147 | service_up $SNAKEMAKE_SERVICE 148 | CONTAINER=$(docker ps | grep cookiecutter-slurm_slurm | awk '{print $1}') 149 | 150 | 151 | # Add local user id as user to container 152 | add_slurm_user $LOCAL_USER_ID $CONTAINER 153 | 154 | # Fix snakemake header to point to /opt/local/bin 155 | docker exec $CONTAINER /bin/bash -c "head -1 /opt/local/bin/snakemake" | grep -q "/usr/local/bin" 156 | if [ $? -eq 0 ]; then 157 | echo "Rewriting snakemake header to point to /opt/local/bin" 158 | docker exec $CONTAINER /bin/bash -c 'sed -i -e "s:/usr:/opt:" /opt/local/bin/snakemake' 159 | fi 160 | 161 | # Rewrite slurm config 162 | modify_slurm_conf $CONTAINER 163 | 164 | # Add pandas to snakemake 165 | CONTAINER=$(docker ps | grep cookiecutter-slurm_snakemake | awk '{print $1}') 166 | docker exec $CONTAINER pip install pandas 167 | 168 | # Make sure sacct is function properly 169 | CONTAINER=$(docker ps | grep cookiecutter-slurm_slurm | awk '{print $1}') 170 | jobid=$(docker exec $CONTAINER sbatch --parsable --wrap "sleep 1" --job-name check-sacct) 171 | sleep 5 172 | docker exec $CONTAINER sacct -o JobName -p | grep check-sacct -q 173 | if [ $? 
-eq 1 ]; then 174 | echo "sacct not working properly; tests will fail" 175 | exit 1 176 | fi 177 | docker exec $CONTAINER scancel $jobid 178 | -------------------------------------------------------------------------------- /tests/wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import re 5 | import sys 6 | import logging 7 | import time 8 | import subprocess as sp 9 | from docker.models.resource import Model 10 | from docker.models.containers import ExecResult 11 | from docker.errors import DockerException 12 | 13 | STDOUT = sys.stdout 14 | 15 | 16 | class ShellContainer(Model): 17 | """Class wrapper to emulate docker container but for shell calls""" 18 | 19 | _exit_code = None 20 | 21 | def __init__(self, attrs=None, client=None, collection=None): 22 | super().__init__(attrs, client, collection) 23 | 24 | @property 25 | def short_id(self): 26 | return self.id 27 | 28 | def exec_run(self, cmd, stream=False, detach=False, **kwargs): 29 | stdout = kwargs.pop("stdout", sp.PIPE) 30 | stderr = kwargs.pop("stderr", sp.STDOUT) 31 | close_fds = sys.platform != "win32" 32 | executable = os.environ.get("SHELL", None) 33 | proc = sp.Popen( 34 | cmd, 35 | bufsize=-1, 36 | shell=True, 37 | stdout=stdout, 38 | stderr=stderr, 39 | close_fds=close_fds, 40 | executable=executable, 41 | ) 42 | 43 | def iter_stdout(proc): 44 | for line in proc.stdout: 45 | yield line[:-1] 46 | 47 | if detach: 48 | return ExecResult(None, "") 49 | 50 | if stream: 51 | return ExecResult(None, iter_stdout(proc)) 52 | 53 | output = proc.communicate() 54 | return ExecResult(proc.returncode, output[0]) 55 | 56 | 57 | class SnakemakeRunner: 58 | """Class wrapper to run snakemake jobs in container""" 59 | 60 | _snakemake = "snakemake" 61 | _snakefile = "Snakefile" 62 | _directory = None 63 | _jobid_regex = "|".join( 64 | [ 65 | r"Submitted batch job (\d+)", 66 | r"Submitted job \d+ with 
external jobid '(\d+)'.", 67 | r"Submitted group job \S+ with external jobid '(\d+)'." 68 | # Missing resubmitted case 69 | ] 70 | ) 71 | 72 | _process_args = {} 73 | _process_prefix = "" 74 | 75 | @classmethod 76 | def executable(cls, cmd): 77 | if os.path.split(cmd)[-1] == "bash": 78 | cls._process_prefix = "set -euo pipefail" 79 | cls._process_args["executable"] = cmd 80 | 81 | @classmethod 82 | def prefix(cls, prefix): 83 | cls._process_prefix = prefix 84 | 85 | def __init__(self, container, data, jobname, partition="normal", account=None): 86 | self._container = container 87 | self._data = data 88 | self._jobname = re.sub("test_", "", jobname) 89 | self._output = [] 90 | self._pp = self._process_prefix 91 | self._cmd = "" 92 | self._num_cores = 1 93 | self._logger = logging.getLogger(str(self)) 94 | self._external_jobid = [] 95 | self._partition = partition 96 | self._account = account 97 | self._profile = self._data.join("slurm") 98 | 99 | def exec_run(self, cmd, stream=False, **kwargs): 100 | return self._container.exec_run(cmd, stream=stream, **kwargs) 101 | 102 | def make_target(self, target, stream=True, asynchronous=False, **kwargs): 103 | """Wrapper to make snakemake target""" 104 | self._snakefile = kwargs.pop("snakefile", self._snakefile) 105 | options = kwargs.pop("options", "") 106 | profile = kwargs.pop("profile", str(self.profile)) 107 | jobname = kwargs.pop("jobname", str(self.jobname)) 108 | force = "-F" if kwargs.pop("force", False) else "" 109 | verbose = kwargs.pop("verbose", True) 110 | self._directory = "-d {}".format(kwargs.pop("dir", self.snakefile.dirname)) 111 | prof = "" if profile is None else f"--profile {profile}" 112 | jn = "" if jobname is None else f"--jn {jobname}-{{jobid}}" 113 | self._external_jobid = [] 114 | 115 | cmd = ( 116 | f"{self.exe} -c '{self.pp} && " 117 | + f"{self.snakemake} -s {self.snakefile} " 118 | + f"{options} --nolock --default-resources mem_mb=100 " 119 | + f"-j {self._num_cores} {self.workdir} {force} 
{target} {prof} {jn}'" 120 | ) 121 | 122 | try: 123 | (exit_code, output) = self.exec_run(cmd, stream=stream, detach=asynchronous) 124 | except Exception as e: 125 | raise e 126 | if stream: 127 | for x in output: 128 | if isinstance(x, bytes): 129 | x = x.decode() 130 | if verbose: 131 | print(x) 132 | self._output.append(x) 133 | else: 134 | if isinstance(output, bytes): 135 | output = output.decode() 136 | self._output = [output] 137 | return ExecResult(exit_code, output) 138 | 139 | @property 140 | def jobname(self): 141 | return self._jobname 142 | 143 | @property 144 | def profile(self): 145 | return self._profile 146 | 147 | @property 148 | def snakefile(self): 149 | return self._data.join(self._snakefile) 150 | 151 | @property 152 | def snakemake(self): 153 | return self._snakemake 154 | 155 | @property 156 | def account(self): 157 | return self._account 158 | 159 | @property 160 | def partition(self): 161 | return self._partition 162 | 163 | @property 164 | def workdir(self): 165 | if self._directory is None: 166 | self._directory = self.snakefile.dirname 167 | return self._directory 168 | 169 | @property 170 | def cluster_config(self): 171 | return self._data.join("cluster-config.yaml") 172 | 173 | @property 174 | def slurm_submit(self): 175 | return self.profile.join("slurm-submit.py") 176 | 177 | @property 178 | def slurm_status(self): 179 | return self.profile.join("slurm-status.py") 180 | 181 | @property 182 | def exe(self): 183 | return self._process_args["executable"] 184 | 185 | @property 186 | def pp(self): 187 | return self._pp 188 | 189 | def script(self, script): 190 | return self._data.join(script) 191 | 192 | @property 193 | def output(self): 194 | if isinstance(self._output, list): 195 | return "\n".join(self._output) 196 | return self._output 197 | 198 | @property 199 | def external_jobid(self): 200 | if len(self._external_jobid) == 0: 201 | try: 202 | m = re.findall(self._jobid_regex, self.output) 203 | if m is not None: 204 | 
self._external_jobid = [int(x) for y in m for x in y if x] 205 | except Exception as e: 206 | print(e) 207 | finally: 208 | (_, out) = self.exec_run('squeue -h -o "%.50j,%.10i"', stream=False) 209 | try: 210 | for res in out.decode().split("\n"): 211 | if self.jobname in res: 212 | self._external_jobid.append( 213 | re.search(r" (\d+)$", res.strip()).group(1) 214 | ) 215 | except Exception as e: 216 | print(e) 217 | 218 | return self._external_jobid 219 | 220 | def wait_while_status(self, status, timeout=60, tdelta=10, verbose=False): 221 | """Wait for status to change""" 222 | t = 0 223 | while self.check_jobstatus(status, verbose=verbose): 224 | time.sleep(tdelta) 225 | t = t + tdelta 226 | if t >= timeout: 227 | self._logger.error(f"waiting while status '{status}' timed out") 228 | break 229 | 230 | def wait_for_status(self, status, timeout=60, tdelta=10, verbose=False): 231 | """Wait until status is achieved""" 232 | t = 0 233 | while not self.check_jobstatus(status, verbose=verbose): 234 | time.sleep(tdelta) 235 | t = t + tdelta 236 | if t >= timeout: 237 | self._logger.error(f"waiting for status '{status}' timed out") 238 | break 239 | 240 | def cancel_slurm_job(self, jobid): 241 | """Cancel job in slurm queue""" 242 | self.exec_run(f"scancel {jobid}") 243 | 244 | def check_jobstatus(self, regex, options="", jobid=None, which=0, verbose=True): 245 | """Use sacct to check jobstatus""" 246 | if len(self.external_jobid) == 0 and jobid is None: 247 | return False 248 | if jobid is None: 249 | jobid = str(self.external_jobid[which]).strip() 250 | cmd = f"sacct --parsable2 -b {options} -j {jobid}" 251 | (exit_code, output) = self.exec_run(cmd, stream=False) 252 | if exit_code != 0: 253 | raise DockerException(output.decode()) 254 | m = re.search(regex, output.decode()) 255 | if m is None and verbose: 256 | self._logger.warning(f"{cmd}\n{output.decode()}") 257 | return m 258 | 259 | def __str__(self): 260 | return f"{self._jobname}" 261 | 262 | 263 | if "SHELL" in 
os.environ: 264 | SnakemakeRunner.executable(os.environ["SHELL"]) 265 | # Try falling back on /bin/bash 266 | else: 267 | SnakemakeRunner.executable("/bin/bash") 268 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from os.path import join as pjoin 4 | import re 5 | import py 6 | import pytest 7 | import docker 8 | from docker.models.containers import Container 9 | import shutil 10 | import logging 11 | from pytest_cookies.plugin import Cookies 12 | from wrapper import SnakemakeRunner, ShellContainer 13 | 14 | 15 | def pytest_addoption(parser): 16 | group = parser.getgroup("slurm") 17 | group.addoption( 18 | "--partition", 19 | action="store", 20 | default="normal", 21 | help="partition to run tests on", 22 | ) 23 | group.addoption("--account", action="store", default=None, help="slurm account") 24 | group.addoption( 25 | "--slow", action="store_true", help="include slow tests", default=False 26 | ) 27 | group.addoption("--cluster", action="store", default=None, help="slurm cluster") 28 | 29 | 30 | def pytest_configure(config): 31 | pytest.local_user_id = os.getuid() 32 | pytest.dname = os.path.dirname(__file__) 33 | pytest.cookie_template = py.path.local(pytest.dname).join(os.pardir) 34 | config.addinivalue_line("markers", "slow: mark tests as slow") 35 | config.addinivalue_line("markers", "docker: mark tests as docker tests only") 36 | config.addinivalue_line("markers", "sbatch: mark tests as sbatch shell tests only") 37 | config.addinivalue_line("markers", "skipci: skip tests on ci") 38 | setup_logging(config.getoption("--log-level")) 39 | pytest.partition = config.getoption("--partition") 40 | pytest.account = "" 41 | if config.getoption("--account"): 42 | pytest.account = "--account={}".format(config.getoption("--account")) 43 | pytest.cluster = config.getoption("--cluster") 44 | if 
shutil.which("sbatch") is not None and config.getoption("--basetemp") is None: 45 | config.option.basetemp = "./.pytest" 46 | 47 | 48 | def setup_logging(level): 49 | if level is None: 50 | level = logging.WARN 51 | elif re.match(r"\d+", level): 52 | level = int(level) 53 | logging.basicConfig(level=level) 54 | logging.getLogger("urllib3").setLevel(level) 55 | logging.getLogger("docker").setLevel(level) 56 | logging.getLogger("poyo").setLevel(level) 57 | logging.getLogger("binaryornot").setLevel(level) 58 | 59 | 60 | @pytest.fixture 61 | def datadir(tmpdir_factory): 62 | """Setup base data directory for a test""" 63 | p = tmpdir_factory.mktemp("data") 64 | return p 65 | 66 | 67 | @pytest.fixture 68 | def datafile(datadir): 69 | """Add a datafile to the datadir. 70 | 71 | By default, look for a source (src) input file located in the 72 | tests directory (pytest.dname). Custom data can be added by 73 | pointing a file 'dname / src'. The contents of src are copied to 74 | the file 'dst' in the test data directory 75 | 76 | Args: 77 | src (str): source file name 78 | dst (str): destination file name. Defaults to src. 79 | dname (str): directory where src is located. 80 | 81 | """ 82 | 83 | def _datafile(src, dst=None, dname=pytest.dname): 84 | dst = src if dst is None else dst 85 | src = py.path.local(pjoin(dname, src)) 86 | dst = datadir.join(dst) 87 | src.copy(dst) 88 | return dst 89 | 90 | return _datafile 91 | 92 | 93 | @pytest.fixture 94 | def cookie_factory(tmpdir_factory, _cookiecutter_config_file, datadir): 95 | """Cookie factory fixture. 96 | 97 | Cookie factory fixture to create a slurm profile in the test data 98 | directory. 
99 | 100 | Args: 101 | sbatch_defaults (str): sbatch defaults for cookie 102 | cluster_sidecar (str): use sidecar to monitor job status 103 | cluster_name (str): set cluster name 104 | cluster_config (str): cluster configuration file 105 | yamlconfig (dict): dictionary of snakemake options with values 106 | 107 | """ 108 | 109 | logging.getLogger("cookiecutter").setLevel(logging.INFO) 110 | _sbatch_defaults = ( 111 | f"--partition={pytest.partition} {pytest.account} " 112 | "--output=logs/slurm-%j.out --error=logs/slurm-%j.err" 113 | ) 114 | _yamlconfig_default = { 115 | 'restart-times': 1 116 | } 117 | 118 | def _cookie_factory( 119 | sbatch_defaults=_sbatch_defaults, 120 | cluster_sidecar="yes", 121 | cluster_name=None, 122 | cluster_config=None, 123 | yamlconfig=_yamlconfig_default, 124 | ): 125 | cookie_template = pjoin(os.path.abspath(pytest.dname), os.pardir) 126 | output_factory = tmpdir_factory.mktemp 127 | c = Cookies(cookie_template, output_factory, _cookiecutter_config_file) 128 | c._new_output_dir = lambda: str(datadir) 129 | extra_context = { 130 | "sbatch_defaults": sbatch_defaults, 131 | "cluster_sidecar": cluster_sidecar, 132 | } 133 | if cluster_name is not None: 134 | extra_context["cluster_name"] = cluster_name 135 | if cluster_config is not None: 136 | extra_context["cluster_config"] = cluster_config 137 | c.bake(extra_context=extra_context) 138 | config = datadir.join("slurm").join("config.yaml") 139 | config_d = dict( 140 | [tuple(line.split(":")) for line in config.read().split("\n") if re.search("^[a-z]", line)] 141 | ) 142 | config_d.update(**yamlconfig) 143 | config.write("\n".join(f"{k}: {v}" for k, v in config_d.items())) 144 | return _cookie_factory 145 | 146 | 147 | @pytest.fixture 148 | def data(tmpdir_factory, request, datafile): 149 | """Setup base data consisting of a Snakefile and cluster configuration file""" 150 | datafile("Snakefile") 151 | ccfile = datafile("cluster-config.yaml") 152 | return py.path.local(ccfile.dirname) 153 
| 154 | 155 | @pytest.fixture(scope="session") 156 | def slurm(request): 157 | """Slurm fixture 158 | 159 | Return relevant container depending on environment. First look for 160 | sbatch command to determine whether we are on a system running the 161 | SLURM scheduler. Second, try deploying a docker stack to run slurm 162 | locally. 163 | 164 | Skip slurm tests if the above actions fail. 165 | 166 | """ 167 | if shutil.which("sbatch") is not None: 168 | return ShellContainer() 169 | else: 170 | client = docker.from_env() 171 | container_list = client.containers.list( 172 | filters={"name": "cookiecutter-slurm_slurm"} 173 | ) 174 | container = container_list[0] if len(container_list) > 0 else None 175 | if container: 176 | return container 177 | 178 | msg = ( 179 | "no sbatch or docker stack 'cookiecutter-slurm' running;" 180 | " skipping slurm-based tests." 181 | " Either run tests on a slurm HPC or deploy a docker stack with" 182 | f" {os.path.dirname(__file__)}/deploystack.sh" 183 | ) 184 | 185 | pytest.skip(msg) 186 | 187 | 188 | def teardown(request): 189 | """Shutdown snakemake processes that are waiting for slurm 190 | 191 | On nsf systems, stale snakemake log files may linger in the test 192 | directory, which prevents reruns of pytest. The teardown function 193 | calls 'lsof' to identify and terminate the processes using these 194 | files. 
195 | 196 | """ 197 | 198 | logging.info(f"\n\nTearing down test '{request.node.name}'") 199 | basetemp = request.config.getoption("basetemp") 200 | from subprocess import Popen, PIPE 201 | import psutil 202 | 203 | for root, _, files in os.walk(basetemp, topdown=False): 204 | for name in files: 205 | if not root.endswith(".snakemake/log"): 206 | continue 207 | try: 208 | fn = os.path.join(root, name) 209 | proc = Popen(["lsof", "-F", "p", fn], stdout=PIPE, stderr=PIPE) 210 | pid = proc.communicate()[0].decode().strip().strip("p") 211 | if pid: 212 | p = psutil.Process(int(pid)) 213 | logging.info(f"Killing process {p.pid} related to {fn}") 214 | p.kill() 215 | except psutil.NoSuchProcess as e: 216 | logging.warning(e) 217 | except ValueError as e: 218 | logging.warning(e) 219 | 220 | 221 | @pytest.fixture 222 | def smk_runner(slurm, datadir, request): 223 | """smk_runner fixture 224 | 225 | Setup a wrapper.SnakemakeRunner instance that runs the snakemake 226 | tests. Skip tests where the partition doesn't exist on the system. 227 | Some tests also only run in docker. 
228 | 229 | """ 230 | 231 | _, partitions = slurm.exec_run('sinfo -h -o "%P"', stream=False) 232 | plist = [p.strip("*") for p in partitions.decode().split("\n") if p != ""] 233 | markers = [m.name for m in request.node.iter_markers()] 234 | slow = request.config.getoption("--slow") 235 | 236 | if pytest.partition not in plist: 237 | plist = ",".join(plist) 238 | pytest.skip( 239 | ( 240 | f"partition '{pytest.partition}' not in cluster partitions '{plist}';" 241 | " use the --partition option" 242 | ) 243 | ) 244 | 245 | if isinstance(slurm, ShellContainer): 246 | if "docker" in markers: 247 | pytest.skip(f"'{request.node.name}' only runs in docker container") 248 | if pytest.account == "": 249 | pytest.skip( 250 | "HPC slurm tests require setting the account; use the --account option" 251 | ) 252 | 253 | if isinstance(slurm, Container): 254 | if "sbatch" in markers: 255 | pytest.skip(f"'{request.node.name}' only runs with sbatch in shell") 256 | 257 | if not slow and "slow" in markers: 258 | pytest.skip(f"'{request.node.name}' is a slow test; activate with --slow flag") 259 | 260 | if os.getenv("CI") is not None and "skipci" in markers: 261 | pytest.skip(f"skip '{request.node.name}' on CI; test fails on CI only") 262 | 263 | yield SnakemakeRunner(slurm, datadir, request.node.name, pytest.partition) 264 | 265 | if isinstance(slurm, ShellContainer): 266 | teardown(request) 267 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/slurm-sidecar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Run a Snakemake v7+ sidecar process for Slurm 3 | 4 | This sidecar process will poll ``squeue --user [user] --format='%i,%T'`` 5 | every 60 seconds by default (use environment variable 6 | ``SNAKEMAKE_SLURM_SQUEUE_WAIT`` for adjusting this). 7 | 8 | Note that you have to adjust the value to fit to your ``MinJobAge`` Slurm 9 | configuration. 
Jobs remain at least ``MinJobAge`` seconds known to the 10 | Slurm controller (default of 300 seconds). If you query ``squeue`` every 11 | 60 seconds then this is plenty and you will observe all relevant job status 12 | states as they are relevant for Snakemake. 13 | 14 | If the environment variable ``SNAKEMAKE_CLUSTER_SIDECAR_VARS`` is set then 15 | the ``slurm-status.py`` of the slurm profile will attempt to query this 16 | sidecar process via HTTP. As the sidecar process does not update its 17 | cache in real-time, setting ``SNAKEMAKE_SLURM_SQUEUE_WAIT`` too large might 18 | lead to Snakemake missing the "done" job state. The defaults of 19 | ``SNAKEMAKE_SLURM_SQUEUE_WAIT=60`` and Slurm's ``MinJobAge=600`` work well 20 | together and you will see all relevant job statuses. 21 | 22 | If the sidecar is queried for a job ID that it has not seen yet then it will 23 | perform a query to ``sacct`` such that it works well if Snakemake "resume 24 | external job" feature. The ``slurm-submit.py`` script of the Snakemake profile 25 | will register all jobs via POST with this sidecar. 26 | """ 27 | 28 | import http.server 29 | import json 30 | import logging 31 | import os 32 | import subprocess 33 | import sys 34 | import signal 35 | import time 36 | import threading 37 | import uuid 38 | 39 | from CookieCutter import CookieCutter 40 | 41 | 42 | #: Enables debug messages for slurm sidecar. 43 | DEBUG = bool(int(os.environ.get("SNAKEMAKE_SLURM_DEBUG", "0"))) 44 | #: Enables HTTP request logging in sidecar. 45 | LOG_REQUESTS = bool(int(os.environ.get("SNAKEMAKE_SLURM_LOG_REQUESTS", "0"))) 46 | #: Command to call when calling squeue 47 | SQUEUE_CMD = os.environ.get("SNAKEMAKE_SLURM_SQUEUE_CMD", "squeue") 48 | #: Number of seconds to wait between ``squeue`` calls. 
SQUEUE_WAIT = int(os.environ.get("SNAKEMAKE_SLURM_SQUEUE_WAIT", "60"))

logger = logging.getLogger(__name__)
if DEBUG:
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)


class PollSqueueThread(threading.Thread):
    """Thread that polls ``squeue`` until stopped by ``stop()``

    Maintains a cache (``self.states``) mapping job id strings to the job
    state string most recently reported by ``squeue``.  Job ids the cache
    has never seen are looked up once via ``sacct`` so that Snakemake's
    "resume external job" feature works.
    """

    def __init__(
        self,
        squeue_wait,
        squeue_cmd,
        squeue_timeout=2,
        sleep_time=0.01,
        max_tries=3,
        *args,
        **kwargs
    ):
        super().__init__(target=self._work, *args, **kwargs)
        #: Time to wait between squeue calls.
        self.squeue_wait = squeue_wait
        #: Command to call squeue with.
        self.squeue_cmd = squeue_cmd
        #: Whether or not the thread should stop.
        self.stopped = threading.Event()
        #: Previous call to ``squeue``
        self.prev_call = 0.0
        #: Time to sleep between iterations in seconds. Thread can only be
        #: terminated after this interval when waiting.
        self.sleep_time = sleep_time
        #: Maximal running time to accept for call to ``squeue``.
        self.squeue_timeout = squeue_timeout
        #: Maximal number of tries if call to ``squeue`` fails.
        self.max_tries = max_tries
        #: Dict mapping the job id to the job state string.
        self.states = {}
        #: Make at least one call to squeue, must not fail.
        logger.debug("initializing thread")
        self._call_squeue(allow_failure=False)
        self.prev_call = time.time()

    def _work(self):
        """Execute the thread's action: poll squeue every ``squeue_wait`` seconds."""
        while not self.stopped.is_set():
            now = time.time()
            if now - self.prev_call > self.squeue_wait:
                self._call_squeue()
                self.prev_call = now
            # short sleep so stop() is honored promptly between polls
            time.sleep(self.sleep_time)

    def get_state(self, jobid):
        """Return the job state for the given jobid.

        Job ids never seen before are looked up once via ``sacct``; if that
        lookup fails the sentinel ``__not_seen_yet__`` is returned.
        """
        jobid = str(jobid)
        if jobid not in self.states:
            try:
                self.states[jobid] = self._get_state_sacct(jobid)
            except Exception:
                # sacct unavailable or job unknown; report "not seen" and let
                # a later squeue poll pick the job up.
                return "__not_seen_yet__"
        # NOTE(review): a job registered via register_job() but not yet seen
        # by squeue has state None here -- confirm downstream handles that.
        return self.states.get(jobid, "__not_seen_yet__")

    def register_job(self, jobid):
        """Register job with the given ID (state unknown until next poll)."""
        self.states.setdefault(jobid, None)

    def _get_state_sacct(self, jobid):
        """Implement retrieving state via sacct for resuming jobs.

        Retries up to ``self.max_tries`` times with a linear back-off and
        raises ``Exception`` if no attempt yields a parseable state.
        """
        cluster = CookieCutter.get_cluster_option()
        cmd = ["sacct", "-P", "-b", "-j", jobid, "-n"]
        if cluster:
            cmd.append(cluster)
        try_num = 0
        while try_num < self.max_tries:
            try_num += 1
            try:
                logger.debug("Calling %s (try %d)", cmd, try_num)
                output = subprocess.check_output(cmd, timeout=self.squeue_timeout, text=True)
            except subprocess.TimeoutExpired:
                logger.warning("Call to %s timed out (try %d of %d)", cmd, try_num, self.max_tries)
                continue
            except subprocess.CalledProcessError:
                logger.warning("Call to %s failed (try %d of %d)", cmd, try_num, self.max_tries)
                continue
            try:
                parsed = {x.split("|")[0]: x.split("|")[1] for x in output.strip().split("\n")}
                logger.debug("Returning state of %s as %s", jobid, parsed[jobid])
                return parsed[jobid]
            except (IndexError, KeyError):
                # KeyError added: accounting can lag behind submission, so the
                # job id may be missing from the output; retry like a parse error.
                logger.warning("Could not parse %s (try %d of %d)", repr(output), try_num, self.max_tries)
            secs = try_num / 2.0
            # fixed: was 'loger.info', a NameError on this retry path
            logger.info("Sleeping %f seconds", secs)
            time.sleep(secs)
        raise Exception("Problem with call to %s" % cmd)

    def stop(self):
        """Flag thread to stop execution"""
        logger.debug("stopping thread")
        self.stopped.set()

    def _call_squeue(self, allow_failure=True):
        """Run the call to ``squeue`` and update the state cache.

        Args:
            allow_failure: when False, a timeout or non-zero exit status is
                re-raised instead of merely logged (used by ``__init__`` so a
                broken squeue setup fails fast).
        """
        cluster = CookieCutter.get_cluster_option()
        try_num = 0
        cmd = [SQUEUE_CMD, "--user={}".format(os.environ.get("USER")), "--format=%i,%T", "--state=all"]
        if cluster:
            cmd.append(cluster)
        output = None
        while try_num < self.max_tries:
            try_num += 1
            try:
                logger.debug("Calling %s (try %d)", cmd, try_num)
                output = subprocess.check_output(cmd, timeout=self.squeue_timeout, text=True)
                logger.debug("Output is:\n---\n%s\n---", output)
                break
            except subprocess.TimeoutExpired:
                if not allow_failure:
                    raise
                logger.debug("Call to %s timed out (try %d of %d)", cmd, try_num, self.max_tries)
            except subprocess.CalledProcessError:
                if not allow_failure:
                    raise
                logger.debug("Call to %s failed (try %d of %d)", cmd, try_num, self.max_tries)
        # fixed: the old check 'try_num >= self.max_tries' discarded output
        # that was successfully fetched on the final attempt
        if output is None:
            logger.debug("Giving up for this round")
        else:
            logger.debug("parsing output")
            self._parse_output(output)

    def _parse_output(self, output):
        """Parse output of ``squeue`` call.

        Skips everything up to and including the header line (starts with
        ``JOBID``); every following ``id,STATE`` row updates the cache.
        """
        header = None
        for line in output.splitlines():
            line = line.strip()
            arr = line.split(",")
            if not header:
                if not line.startswith("JOBID"):
                    continue  # skip leader
                header = arr
            else:
                if len(arr) < 2:
                    continue  # guard against blank/malformed rows
                logger.debug("Updating state of %s to %s", arr[0], arr[1])
                self.states[arr[0]] = arr[1]


class JobStateHttpHandler(http.server.BaseHTTPRequestHandler):
    """HTTP handler class that responds to ``/job/status/${jobid}/`` GET requests"""

    def do_GET(self):
        """Only to ``/job/status/${job_id}/?``"""
logger.debug("--- BEGIN GET") 199 | # Remove trailing slashes from path. 200 | path = self.path 201 | while path.endswith("/"): 202 | path = path[:-1] 203 | # Ensure that /job/status was requested 204 | if not self.path.startswith("/job/status/"): 205 | self.send_response(400) 206 | self.end_headers() 207 | return 208 | # Ensure authentication bearer is correct 209 | auth_required = "Bearer %s" % self.server.http_secret 210 | auth_header = self.headers.get("Authorization") 211 | logger.debug( 212 | "Authorization header is %s, required: %s" % (repr(auth_header), repr(auth_required)) 213 | ) 214 | if auth_header != auth_required: 215 | self.send_response(403) 216 | self.end_headers() 217 | return 218 | # Otherwise, query job ID status 219 | job_id = self.path[len("/job/status/") :] 220 | try: 221 | job_id=job_id.split("%20")[3] 222 | except IndexError: 223 | pass 224 | logger.debug("Querying for job ID %s" % repr(job_id)) 225 | status = self.server.poll_thread.get_state(job_id) 226 | logger.debug("Status: %s" % status) 227 | if not status: 228 | self.send_response(404) 229 | self.end_headers() 230 | else: 231 | self.send_response(200) 232 | self.send_header("Content-type", "application/json") 233 | self.end_headers() 234 | output = json.dumps({"status": status}) 235 | logger.debug("Sending %s" % repr(output)) 236 | self.wfile.write(output.encode("utf-8")) 237 | logger.debug("--- END GET") 238 | 239 | def do_POST(self): 240 | """Handle POSTs (only to ``/job/register/${job_id}/?``)""" 241 | logger.debug("--- BEGIN POST") 242 | # Remove trailing slashes from path. 
243 | path = self.path 244 | while path.endswith("/"): 245 | path = path[:-1] 246 | # Ensure that /job/register was requested 247 | if not self.path.startswith("/job/register/"): 248 | self.send_response(400) 249 | self.end_headers() 250 | return 251 | # Ensure authentication bearer is correct 252 | auth_required = "Bearer %s" % self.server.http_secret 253 | auth_header = self.headers.get("Authorization") 254 | logger.debug( 255 | "Authorization header is %s, required: %s", repr(auth_header), repr(auth_required) 256 | ) 
if auth_header != auth_required: 
self.send_response(403) 
self.end_headers() 
return 
257 | # Otherwise, register job ID 258 | job_id = self.path[len("/job/register/") :] 259 | self.server.poll_thread.register_job(job_id) 260 | self.send_response(200) 261 | self.end_headers() 262 | logger.debug("--- END POST") 263 | 264 | def log_request(self, *args, **kwargs): 265 | if LOG_REQUESTS: 266 | super().log_request(*args, **kwargs) 267 | 268 | 269 | class JobStateHttpServer(http.server.HTTPServer): 270 | """The HTTP server class""" 271 | 272 | allow_reuse_address = False 273 | 274 | def __init__(self, poll_thread): 275 | """Initialize thread and print the ``SNAKEMAKE_CLUSTER_SIDECAR_VARS`` to stdout, then flush.""" 276 | super().__init__(("0.0.0.0", 0), JobStateHttpHandler) 277 | #: The ``PollSqueueThread`` with the state dictionary. 278 | self.poll_thread = poll_thread 279 | #: The secret to use. 280 | self.http_secret = str(uuid.uuid4()) 281 | sidecar_vars = { 282 | "server_port": self.server_port, 283 | "server_secret": self.http_secret, 284 | "pid": os.getpid(), 285 | } 286 | logger.debug(json.dumps(sidecar_vars)) 287 | sys.stdout.write(json.dumps(sidecar_vars) + "\n") 288 | sys.stdout.flush() 289 | 290 | def log_message(self, *args, **kwargs): 291 | """Log messages are printed if ``DEBUG`` is ``True``.""" 292 | if DEBUG: 293 | super().log_message(*args, **kwargs) 294 | 295 | 296 | def main(): 297 | # Start thread to poll ``squeue`` in a controlled fashion. 
298 | poll_thread = PollSqueueThread(SQUEUE_WAIT, SQUEUE_CMD, name="poll-squeue") 299 | poll_thread.start() 300 | 301 | # Initialize HTTP server that makes available the output of ``squeue --user [user]`` 302 | # in a controlled fashion. 303 | http_server = JobStateHttpServer(poll_thread) 304 | http_thread = threading.Thread(name="http-server", target=http_server.serve_forever) 305 | http_thread.start() 306 | 307 | # Allow for graceful shutdown of poll thread and HTTP server. 308 | def signal_handler(signum, frame): 309 | """Handler for Unix signals. Shuts down http_server and poll_thread.""" 310 | logger.info("Shutting down squeue poll thread and HTTP server...") 311 | # from remote_pdb import set_trace 312 | # set_trace() 313 | poll_thread.stop() 314 | http_server.shutdown() 315 | logger.info("... HTTP server and poll thread shutdown complete.") 316 | for thread in threading.enumerate(): 317 | logger.info("ACTIVE %s", thread.name) 318 | 319 | signal.signal(signal.SIGINT, signal_handler) 320 | signal.signal(signal.SIGTERM, signal_handler) 321 | 322 | # Actually run the server. 
323 | poll_thread.join() 324 | logger.debug("poll_thread done") 325 | http_thread.join() 326 | logger.debug("http_thread done") 327 | 328 | 329 | if __name__ == "__main__": 330 | sys.exit(int(main() or 0)) 331 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/slurm_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import math 4 | import os 5 | import re 6 | import subprocess as sp 7 | import sys 8 | from datetime import timedelta 9 | from os.path import dirname 10 | from time import time as unix_time 11 | from typing import Union 12 | from uuid import uuid4 13 | import shlex 14 | from io import StringIO 15 | 16 | from CookieCutter import CookieCutter 17 | from snakemake import io 18 | from snakemake.exceptions import WorkflowError 19 | from snakemake.io import Wildcards 20 | from snakemake.logging import logger 21 | from snakemake.utils import AlwaysQuotedFormatter 22 | from snakemake.utils import QuotedFormatter 23 | from snakemake.utils import SequenceFormatter 24 | 25 | 26 | def _convert_units_to_mb(memory): 27 | """If memory is specified with SI unit, convert to MB""" 28 | if isinstance(memory, int) or isinstance(memory, float): 29 | return int(memory) 30 | siunits = {"K": 1e-3, "M": 1, "G": 1e3, "T": 1e6} 31 | regex = re.compile(r"(\d+)({})$".format("|".join(siunits.keys()))) 32 | m = regex.match(memory) 33 | if m is None: 34 | logger.error( 35 | (f"unsupported memory specification '{memory}';" " allowed suffixes: [K|M|G|T]") 36 | ) 37 | sys.exit(1) 38 | factor = siunits[m.group(2)] 39 | return int(int(m.group(1)) * factor) 40 | 41 | 42 | def parse_jobscript(): 43 | """Minimal CLI to require/only accept single positional argument.""" 44 | p = argparse.ArgumentParser(description="SLURM snakemake submit script") 45 | p.add_argument("jobscript", help="Snakemake jobscript with job properties.") 46 | return 
p.parse_args().jobscript 47 | 48 | 49 | def parse_sbatch_defaults(parsed): 50 | """Unpack SBATCH_DEFAULTS.""" 51 | d = shlex.split(parsed) if type(parsed) == str else parsed 52 | args = {} 53 | for keyval in [a.split("=") for a in d]: 54 | k = keyval[0].strip().strip("-") 55 | v = keyval[1].strip() if len(keyval) == 2 else None 56 | args[k] = v 57 | return args 58 | 59 | 60 | def load_cluster_config(path): 61 | """Load config to dict 62 | 63 | Load configuration to dict either from absolute path or relative 64 | to profile dir. 65 | """ 66 | if path: 67 | path = os.path.join(dirname(__file__), os.path.expandvars(path)) 68 | dcc = io.load_configfile(path) 69 | else: 70 | dcc = {} 71 | if "__default__" not in dcc: 72 | dcc["__default__"] = {} 73 | return dcc 74 | 75 | 76 | # adapted from format function in snakemake.utils 77 | def format(_pattern, _quote_all=False, **kwargs): # noqa: A001 78 | """Format a pattern in Snakemake style. 79 | This means that keywords embedded in braces are replaced by any variable 80 | values that are available in the current namespace. 81 | """ 82 | fmt = SequenceFormatter(separator=" ") 83 | if _quote_all: 84 | fmt.element_formatter = AlwaysQuotedFormatter() 85 | else: 86 | fmt.element_formatter = QuotedFormatter() 87 | try: 88 | return fmt.format(_pattern, **kwargs) 89 | except KeyError as ex: 90 | raise NameError( 91 | f"The name {ex} is unknown in this context. Please " 92 | "make sure that you defined that variable. 
" 93 | "Also note that braces not used for variable access " 94 | "have to be escaped by repeating them " 95 | ) 96 | 97 | 98 | # adapted from Job.format_wildcards in snakemake.jobs 99 | def format_wildcards(string, job_properties): 100 | """Format a string with variables from the job.""" 101 | 102 | class Job(object): 103 | def __init__(self, job_properties): 104 | for key in job_properties: 105 | setattr(self, key, job_properties[key]) 106 | 107 | job = Job(job_properties) 108 | if "params" in job_properties: 109 | job._format_params = Wildcards(fromdict=job_properties["params"]) 110 | else: 111 | job._format_params = None 112 | if "wildcards" in job_properties: 113 | job._format_wildcards = Wildcards(fromdict=job_properties["wildcards"]) 114 | else: 115 | job._format_wildcards = None 116 | _variables = dict() 117 | _variables.update(dict(params=job._format_params, wildcards=job._format_wildcards)) 118 | if hasattr(job, "rule"): 119 | _variables.update(dict(rule=job.rule)) 120 | try: 121 | return format(string, **_variables) 122 | except NameError as ex: 123 | raise WorkflowError("NameError with group job {}: {}".format(job.jobid, str(ex))) 124 | except IndexError as ex: 125 | raise WorkflowError("IndexError with group job {}: {}".format(job.jobid, str(ex))) 126 | 127 | 128 | # adapted from ClusterExecutor.cluster_params function in snakemake.executor 129 | def format_values(dictionary, job_properties): 130 | formatted = dictionary.copy() 131 | for key, value in list(formatted.items()): 132 | if key == "mem": 133 | value = str(_convert_units_to_mb(value)) 134 | if isinstance(value, str): 135 | try: 136 | formatted[key] = format_wildcards(value, job_properties) 137 | except NameError as e: 138 | msg = "Failed to format cluster config " "entry for job {}.".format( 139 | job_properties["rule"] 140 | ) 141 | raise WorkflowError(msg, e) 142 | return formatted 143 | 144 | 145 | def convert_job_properties(job_properties, resource_mapping=None): 146 | options = {} 147 | 
if resource_mapping is None: 148 | resource_mapping = {} 149 | resources = job_properties.get("resources", {}) 150 | for k, v in resource_mapping.items(): 151 | options.update({k: resources[i] for i in v if i in resources}) 152 | 153 | if "threads" in job_properties: 154 | options["cpus-per-task"] = job_properties["threads"] 155 | 156 | slurm_opts = resources.get("slurm", "") 157 | if not isinstance(slurm_opts, str): 158 | raise ValueError( 159 | "The `slurm` argument to resources must be a space-separated string" 160 | ) 161 | 162 | for opt in slurm_opts.split(): 163 | kv = opt.split("=", maxsplit=1) 164 | k = kv[0] 165 | v = None if len(kv) == 1 else kv[1] 166 | options[k.lstrip("-").replace("_", "-")] = v 167 | 168 | return options 169 | 170 | 171 | def ensure_dirs_exist(path): 172 | """Ensure output folder for Slurm log files exist.""" 173 | di = dirname(path) 174 | if di == "": 175 | return 176 | if not os.path.exists(di): 177 | os.makedirs(di, exist_ok=True) 178 | return 179 | 180 | 181 | def format_sbatch_options(**sbatch_options): 182 | """Format sbatch options""" 183 | options = [] 184 | for k, v in sbatch_options.items(): 185 | val = "" 186 | if v is not None: 187 | val = f"={v}" 188 | options.append(f"--{k}{val}") 189 | return options 190 | 191 | 192 | def submit_job(jobscript, **sbatch_options): 193 | """Submit jobscript and return jobid.""" 194 | options = format_sbatch_options(**sbatch_options) 195 | try: 196 | cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] 197 | res = sp.check_output(cmd) 198 | except sp.CalledProcessError as e: 199 | raise e 200 | # Get jobid 201 | res = res.decode() 202 | try: 203 | jobid = re.search(r"(\d+)", res).group(1) 204 | except Exception as e: 205 | raise e 206 | return jobid 207 | 208 | 209 | timeformats = [ 210 | re.compile(r"^(?P<days>\d+)-(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)$"), 211 | re.compile(r"^(?P<days>\d+)-(?P<hours>\d+):(?P<minutes>\d+)$"), 212 | re.compile(r"^(?P<days>\d+)-(?P<hours>\d+)$"), 213 | re.compile(r"^(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)$"), 214 | 
re.compile(r"^(?P<minutes>\d+):(?P<seconds>\d+)$"), 215 | re.compile(r"^(?P<minutes>\d+)$"), 216 | ] 217 | 218 | 219 | def time_to_minutes(time): 220 | """Convert time string to minutes. 221 | 222 | According to slurm: 223 | 224 | Acceptable time formats include "minutes", "minutes:seconds", 225 | "hours:minutes:seconds", "days-hours", "days-hours:minutes" 226 | and "days-hours:minutes:seconds". 227 | 228 | """ 229 | if not isinstance(time, str): 230 | time = str(time) 231 | d = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0} 232 | regex = list(filter(lambda regex: regex.match(time) is not None, timeformats)) 233 | if len(regex) == 0: 234 | return 235 | assert len(regex) == 1, "multiple time formats match" 236 | m = regex[0].match(time) 237 | d.update(m.groupdict()) 238 | minutes = ( 239 | int(d["days"]) * 24 * 60 240 | + int(d["hours"]) * 60 241 | + int(d["minutes"]) 242 | + math.ceil(int(d["seconds"]) / 60) 243 | ) 244 | assert minutes > 0, "minutes has to be greater than 0" 245 | return minutes 246 | 247 | 248 | class InvalidTimeUnitError(Exception): 249 | pass 250 | 251 | 252 | class Time: 253 | _nanosecond_size = 1 254 | _microsecond_size = 1000 * _nanosecond_size 255 | _millisecond_size = 1000 * _microsecond_size 256 | _second_size = 1000 * _millisecond_size 257 | _minute_size = 60 * _second_size 258 | _hour_size = 60 * _minute_size 259 | _day_size = 24 * _hour_size 260 | _week_size = 7 * _day_size 261 | units = { 262 | "s": _second_size, 263 | "m": _minute_size, 264 | "h": _hour_size, 265 | "d": _day_size, 266 | "w": _week_size, 267 | } 268 | pattern = re.compile(rf"(?P<val>\d+(\.\d*)?|\.\d+)(?P<unit>[a-zA-Z])") 269 | 270 | def __init__(self, duration: str): 271 | self.duration = Time._from_str(duration) 272 | 273 | def __str__(self) -> str: 274 | return Time._timedelta_to_slurm(self.duration) 275 | 276 | def __repr__(self): 277 | return str(self) 278 | 279 | @staticmethod 280 | def _timedelta_to_slurm(delta: Union[timedelta, str]) -> str: 281 | if isinstance(delta, timedelta): 282 | d = 
dict() 283 | d["hours"], rem = divmod(delta.seconds, 3600) 284 | d["minutes"], d["seconds"] = divmod(rem, 60) 285 | d["hours"] += delta.days * 24 286 | return "{hours}:{minutes:02d}:{seconds:02d}".format(**d) 287 | elif isinstance(delta, str): 288 | return delta 289 | else: 290 | raise ValueError("Time is in an unknown format '{}'".format(delta)) 291 | 292 | @staticmethod 293 | def _from_str(duration: str) -> Union[timedelta, str]: 294 | """Parse a duration string to a datetime.timedelta""" 295 | 296 | matches = Time.pattern.finditer(duration) 297 | 298 | total = 0 299 | n_matches = 0 300 | for m in matches: 301 | n_matches += 1 302 | value = m.group("val") 303 | unit = m.group("unit").lower() 304 | if unit not in Time.units: 305 | raise InvalidTimeUnitError( 306 | "Unknown unit '{}' in time {}".format(unit, duration) 307 | ) 308 | 309 | total += float(value) * Time.units[unit] 310 | 311 | if n_matches == 0: 312 | return duration 313 | 314 | microseconds = total / Time._microsecond_size 315 | return timedelta(microseconds=microseconds) 316 | 317 | 318 | class JobLog: 319 | def __init__(self, job_props: dict): 320 | self.job_properties = job_props 321 | self.uid = str(uuid4()) 322 | 323 | @property 324 | def wildcards(self) -> dict: 325 | return self.job_properties.get("wildcards", dict()) 326 | 327 | @property 328 | def wildcards_str(self) -> str: 329 | return ( 330 | ".".join("{}={}".format(k, v) for k, v in self.wildcards.items()) 331 | or "unique" 332 | ) 333 | 334 | @property 335 | def rule_name(self) -> str: 336 | if not self.is_group_jobtype: 337 | return self.job_properties.get("rule", "nameless_rule") 338 | return self.groupid 339 | 340 | @property 341 | def groupid(self) -> str: 342 | return self.job_properties.get("groupid", "group") 343 | 344 | @property 345 | def is_group_jobtype(self) -> bool: 346 | return self.job_properties.get("type", "") == "group" 347 | 348 | @property 349 | def short_uid(self) -> str: 350 | return self.uid.split("-")[0] 351 | 352 
| def pattern_replace(self, s: str) -> str: 353 | """ 354 | %r - rule name. If group job, will use the group ID instead 355 | %i - snakemake job ID 356 | %w - wildcards. e.g., wildcards A and B will be concatenated as 'A=<val>.B=<val>' 357 | %U - a random universally unique identifier 358 | %S - shortened version of %U 359 | %T - Unix time, aka seconds since epoch (rounded to an integer) 360 | """ 361 | replacement = { 362 | "%r": self.rule_name, 363 | "%i": self.jobid, 364 | "%w": self.wildcards_str, 365 | "%U": self.uid, 366 | "%T": str(int(unix_time())), 367 | "%S": self.short_uid, 368 | } 369 | for old, new in replacement.items(): 370 | s = s.replace(old, new) 371 | 372 | return s 373 | 374 | @property 375 | def jobname(self) -> str: 376 | jobname_pattern = CookieCutter.get_cluster_jobname() 377 | if not jobname_pattern: 378 | return "" 379 | 380 | return self.pattern_replace(jobname_pattern) 381 | 382 | @property 383 | def jobid(self) -> str: 384 | """The snakemake jobid""" 385 | if self.is_group_jobtype: 386 | return self.job_properties.get("jobid", "").split("-")[0] 387 | return str(self.job_properties.get("jobid")) 388 | 389 | @property 390 | def logpath(self) -> str: 391 | logpath_pattern = CookieCutter.get_cluster_logpath() 392 | if not logpath_pattern: 393 | return "" 394 | 395 | return self.pattern_replace(logpath_pattern) 396 | 397 | @property 398 | def outlog(self) -> str: 399 | return self.logpath + ".out" 400 | 401 | @property 402 | def errlog(self) -> str: 403 | return self.logpath + ".err" 404 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Test 2 | SnakemakeProfiles/slurm](https://github.com/Snakemake-Profiles/slurm/workflows/Test%20SnakemakeProfiles/slurm/badge.svg) 3 | 4 | > [!IMPORTANT] 5 | > 6 | > ## Snakemake version 8 currently unsupported 7 | > 8 | > Snakemake version 8 introduces breaking changes with respect to the 
Snakemake cookiecutter profile. One new 9 | > feature is that execution of jobs on a cluster or the cloud is handled by 10 | > [executor plugins](https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html#cluster-or-cloud-execution). 11 | > For instance, there is a 12 | > [dedicated executor plugin for SLURM](https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/slurm.html) that 13 | > provides much of the functionality that was previously catered for by this snakemake profile. Profiles are still needed, but their 14 | > role changes somewhat to that of fine-tuning executor plugins for particular sites, only requiring the yaml configuration 15 | > file. 16 | > 17 | > With this in mind, support for Snakemake version 8 has currently been put on hold and I recommend that users consult 18 | > the executor plugin documentation for slurm. 19 | > 20 | 21 | 22 | # Contents 23 | 24 | - [Introduction](#introduction) 25 | - [Alternatives](#alternatives) 26 | - [Quickstart](#quickstart) 27 | - [Examples](#examples) 28 | - [Example 1: project setup to use specific slurm 29 | account](#example-1-project-setup-to-use-specific-slurm-account) 30 | - [Example 2: project setup using a specified 31 | cluster](#example-2-project-setup-using-a-specified-cluster) 32 | - [Profile details](#profile-details) 33 | - [Cookiecutter options](#cookiecutter-options) 34 | - [Default snakemake arguments](#default-snakemake-arguments) 35 | - [Parsing arguments to SLURM (sbatch) and resource 36 | configuration](#parsing-arguments-to-slurm-sbatch-and-resource-configuration) 37 | - [Rule specific resource 38 | configuration](#rule-specific-resource-configuration) 39 | - [Cluster configuration file](#cluster-configuration-file) 40 | - [Tests](#tests) 41 | - [Testing on a HPC running 42 | SLURM](#testing-on-a-hpc-running-slurm) 43 | - [Testing on machine without 44 | SLURM](#testing-on-machine-without-slurm) 45 | - [Baking cookies](#baking-cookies) 46 | - [Anatomy of the 
tests (WIP)](#anatomy-of-the-tests-wip) 47 | - [Adding new tests (WIP)](#adding-new-tests-wip) 48 | 49 | ## Introduction 50 | 51 | This cookiecutter provides a template Snakemake profile for configuring 52 | Snakemake to run on the [SLURM Workload 53 | Manager](https://slurm.schedmd.com/) for **Snakemake version <=7**. The profile defines the following scripts 54 | 55 | 1. `slurm-submit.py` - submits a jobscript to slurm 56 | 2. `slurm-jobscript.sh` - a template jobscript 57 | 3. `slurm-status.py` - checks the status of jobs in slurm 58 | 4. `slurm-sidecar.py` - run a Snakemake cluster sidecar for caching queries to Slurm's controller/database daemons 59 | 60 | and a configuration file `config.yaml` that defines default values for 61 | snakemake command line arguments. 62 | 63 | Given an installed profile `profile_name`, when snakemake is run with 64 | `--profile profile_name`, the configuration keys and values from 65 | `config.yaml` are passed to snakemake - plus any additional options to 66 | snakemake that the user has applied. 67 | 68 | Note that the use of option `--cluster-config` is discouraged, but the 69 | profile still provides support for backwards compatibility. 
The default 70 | configuration file therefore contains a commented section with examples 71 | of resource configuration (see also [snakemake best 72 | practices](https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html?highlight=set-resources#best-practices)): 73 | 74 | # Example resource configuration 75 | # default-resources: 76 | # - runtime=100 77 | # - mem_mb=6000 78 | # - disk_mb=1000000 79 | # # set-threads: map rule names to threads 80 | # set-threads: 81 | # - single_core_rule=1 82 | # - multi_core_rule=10 83 | # # set-resources: map rule names to resources in general 84 | # set-resources: 85 | # - high_memory_rule:mem_mb=12000 86 | # - long_running_rule:runtime=1200 87 | 88 | See the [snakemake documentation on 89 | profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html?highlight=profile#profiles) 90 | for more information. 91 | 92 | ## Alternatives 93 | 94 | For a more light-weight alternative, see the excellent repo 95 | [smk-simple-slurm](https://github.com/jdblischak/smk-simple-slurm) by 96 | @jdblischak. In particular, it can handle larger amounts of jobs than 97 | this profile (see [issue 98 | #79](https://github.com/Snakemake-Profiles/slurm/issues/79)). 99 | 100 | ## Quickstart 101 | 102 | To create a slurm profile from the 103 | [cookiecutter](https://github.com/cookiecutter/cookiecutter), simply run 104 | 105 | # create config directory that snakemake searches for profiles (or use something else) 106 | profile_dir="${HOME}/.config/snakemake" 107 | mkdir -p "$profile_dir" 108 | # use cookiecutter to create the profile in the config directory 109 | template="gh:Snakemake-Profiles/slurm" 110 | cookiecutter --output-dir "$profile_dir" "$template" 111 | 112 | You will be prompted to set some values for your profile (here assumed 113 | to be called `profile_name`), after which the profile scripts and 114 | configuration file will be installed in `$profile_dir` as 115 | `profile_name/`. 
Then you can run Snakemake with 116 | 117 | snakemake --profile profile_name ... 118 | 119 | Note that the `--profile` argument can be either a relative or absolute 120 | path. In addition, snakemake will search for a corresponding folder 121 | `profile_name` in `/etc/xdg/snakemake` and `$HOME/.config/snakemake`, 122 | where globally accessible profiles can be placed. 123 | 124 | ## Examples 125 | 126 | ### Example 1: project setup to use specific slurm account 127 | 128 | One typical use case is to setup a profile to use a specific slurm 129 | account: 130 | 131 | $ cookiecutter --output-dir "$profile_dir" "$template" 132 | profile_name [slurm]: slurm.my_account 133 | sbatch_defaults []: account=my_account no-requeue exclusive 134 | cluster_sidecar_help: [Use cluster sidecar. NB! Requires snakemake >= 7.0! Enter to continue...] 135 | Select cluster_sidecar: 136 | 1 - yes 137 | 2 - no 138 | Choose from 1, 2 [1]: 139 | cluster_name []: 140 | cluster_config_help: [The use of cluster-config is discouraged. Rather, set snakemake CLI options in the profile configuration file (see snakemake documentation on best practices). Enter to continue...] 141 | cluster_config []: 142 | 143 | The command `snakemake --profile slurm.my_account ...` will submit jobs 144 | with `sbatch --parsable --account=my_account --no-requeue --exclusive`. 145 | Note that the option `--parsable` is always added. 146 | 147 | ### Example 2: project setup using a specified cluster 148 | 149 | It is possible to install multiple profiles in a project directory. 150 | Assuming our HPC defines a [multi-cluster 151 | environment](https://slurm.schedmd.com/multi_cluster.html), we can 152 | create a profile that uses a specified cluster: 153 | 154 | $ cookiecutter slurm 155 | profile_name [slurm]: slurm.dusk 156 | sbatch_defaults []: account=my_account 157 | cluster_sidecar_help: [Use cluster sidecar. NB! Requires snakemake >= 7.0! Enter to continue...] 
158 | Select cluster_sidecar: 159 | 1 - yes 160 | 2 - no 161 | Choose from 1, 2 [1]: 162 | cluster_name []: dusk 163 | cluster_config_help: [The use of cluster-config is discouraged. Rather, set snakemake CLI options in the profile configuration file (see snakemake documentation on best practices). Enter to continue...] 164 | cluster_config []: 165 | 166 | (Note that once a cookiecutter has been installed, we can reuse it 167 | without using the github URL). 168 | 169 | The command `snakemake --profile slurm.dusk ...` will now submit jobs 170 | with `sbatch --parsable --account=my_account --cluster=dusk`. In 171 | addition, the `slurm-status.py` script will check for jobs in the `dusk` 172 | cluster job queue. 173 | 174 | ## Profile details 175 | 176 | ### Cookiecutter options 177 | 178 | - `profile_name` : A name to address the profile via the `--profile` 179 | Snakemake option. 180 | - `use_singularity`: This sets the default `--use-singularity` 181 | parameter. Default is not to use (`false`). 182 | - `use_conda`: This sets the default `--use-conda` parameter. Default 183 | is not to use (`false`). 184 | - `jobs`: This sets the default `--cores`/`--jobs`/`-j` parameter. 185 | - `restart_times`: This sets the default `--restart-times`/`-T` 186 | parameter. 187 | - `max_status_checks_per_second`: This sets the default 188 | `--max-status-checks-per-second` parameter. 189 | - `max_jobs_per_second`: This sets the default `--max-jobs-per-second` 190 | parameter. 191 | - `latency_wait`: This sets the default 192 | `--latency-wait`/`--output-wait`/`-w` parameter. 193 | - `print_shell_commands`: This sets the default 194 | `--printshellcmds`/`-p` parameter. 195 | - `sbatch_defaults` : List of (space-separated) default arguments to 196 | sbatch, e.g.: `qos=short time=60`. Note, we support [human-friendly 197 | time specification](#human-friendly-time). 
198 | - `cluster_sidecar`: Whether to use the [cluster sidecar](https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html?highlight=sidecar#using-cluster-sidecar) feature. (Requires Snakemake version of at least 7.0) 199 | - `cluster_name` : some HPCs define multiple SLURM clusters. Set the 200 | cluster name, leave empty to use the default. This will add the 201 | `--cluster` string to the sbatch defaults, and adjust 202 | `slurm-status.py` to check status on the relevant cluster. 203 | - `cluster_jobname`: A pattern to use for naming Slurm 204 | jobs ([`--job-name`](https://slurm.schedmd.com/sbatch.html#OPT_job-name)). 205 | See [Patterns](#patterns) below. Set to `"""` (i.e., blank) to use the slurm default. 206 | - `cluster_logpath`: A pattern to use for setting 207 | the [`--output`](https://slurm.schedmd.com/sbatch.html#OPT_output) 208 | and [`--error`](https://slurm.schedmd.com/sbatch.html#OPT_error) log files. You can 209 | use [slurm filename patterns](https://slurm.schedmd.com/sbatch.html#lbAH) 210 | and [Patterns](#patterns). Set to `"""` (i.e., blank) to use the slurm default. For 211 | example, `logs/slurm/%r_%w` creates logs named `%r_%w.out` and `%r_%w.err` in the 212 | directory `logs/slurm`. 213 | - `cluster_config` (NB: discouraged): Path to a YAML or JSON 214 | configuration file analogues to the Snakemake [`--cluster-config` 215 | option](https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#cluster-configuration-deprecated) 216 | . 217 | Path may be relative to the profile directory or absolute including 218 | environment variables (e.g. 219 | `$PROJECT_ROOT/config/slurm_defaults.yaml`). 220 | 221 | #### Patterns 222 | 223 | For job name and log paths we provide a custom pattern syntax. 224 | 225 | - `%r`: Rule name. If it is a group job, the group ID will be used instead. 226 | - `%i`: Snakemake job ID. 227 | - `%w`: Wildcards string. 
e.g., wildcards A and B will be concatenated as `A=<val>.B=<val>` 228 | - `%U`: A [random universally unique identifier](https://docs.python.org/3/library/uuid.html#uuid.uuid4) (UUID). 229 | - `%S`: A shortened version of `%U`. For example, `16fd2706-8baf-433b-82eb-8c7fada847da` would become `16fd2706`. 230 | - `%T`: The [Unix timestamp](https://docs.python.org/3/library/time.html#time.time) (rounded to an integer). 231 | 232 | ### Default snakemake arguments 233 | 234 | Other default arguments to `snakemake` may be adjusted in the resulting 235 | `<profile_name>/config.yaml` file. 236 | 237 | ### Parsing arguments to SLURM (sbatch) and resource configuration 238 | 239 | NB!!! As previously pointed out, the use of cluster-config is 240 | discouraged. Rule specific resource configuration is better handled by 241 | snakemake's CLI arguments (see [snakemake best 242 | practices](https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html?highlight=set-resources#best-practices)) 243 | which can be put in the profile configuration file. 244 | 245 | Arguments are set and overridden in the following order and must be 246 | named according to [sbatch long option 247 | names](https://slurm.schedmd.com/sbatch.html): 248 | 249 | 1) `sbatch_defaults` cookiecutter option 250 | 2) Profile `cluster_config` file `__default__` entries 251 | 3) Snakefile threads and resources (time, mem) 252 | 4) Profile `cluster_config` file `<rulename>` entries 253 | 5) `--cluster-config` parsed to Snakemake (deprecated since Snakemake 254 | 5.10) 255 | 6) Snakemake CLI resource configuration in profile configuration file 256 | 257 | ### Rule specific resource configuration 258 | 259 | In addition to Snakemake CLI resource configuration, resources can be 260 | specified in Snakefile rules and must all be in the correct unit/format 261 | as expected by `sbatch` ([except time](#human-friendly-time)). 
The 262 | implemented resource names are given (and may be adjusted) in 263 | `slurm-submit.py`'s variable `RESOURCE_MAPPING`. This is intended for 264 | system agnostic resources such as time and memory. Currently supported 265 | resources are `time`, `mem`, `mem-per-cpu`, `nodes`, and `partition`. An 266 | example rule resources configuration follows: 267 | 268 | rule bwa_mem: 269 | resources: 270 | time = "00:10:00", 271 | mem = 12000, 272 | partition = "debug" 273 | 274 | Resources not listed in `RESOURCE_MAPPING` can also be specified with 275 | the special `slurm` parameter to resources. For example, to specify 276 | a specific QoS and 2 GPU resources for a rule `gpu_stuff` 277 | 278 | rule gpu_stuff: 279 | resources: 280 | time="12:00:00", 281 | mem_mb=8000, 282 | partition="gpu", 283 | slurm="qos=gpuqos gres=gpu:2" 284 | 285 | Note: `slurm` **must** be a space-separated string of the form `