├── .flake8 ├── .github └── workflows │ └── htcondor.yaml ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── cookiecutter.json ├── test-environment.yml ├── tests ├── Snakefile ├── conftest.py ├── deploystack.sh ├── docker-compose.yaml ├── test_cookie.py ├── test_submit.py └── wrapper.py └── {{cookiecutter.profile_name}} ├── config.v8+.yaml ├── config.yaml ├── grid-jobscript.sh ├── grid-status.py └── grid-submit.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 90 3 | 4 | -------------------------------------------------------------------------------- /.github/workflows/htcondor.yaml: -------------------------------------------------------------------------------- 1 | name: Test SnakemakeProfiles/htcondor 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | htcondortest: 6 | name: Test htcondor profile in docker containers 7 | runs-on: ubuntu-latest 8 | timeout-minutes: 30 9 | strategy: 10 | matrix: 11 | snakemake_image: 12 | - "quay.io/biocontainers/snakemake:7.32.4--hdfd78af_1" 13 | - "quay.io/biocontainers/snakemake:8.9.0--hdfd78af_0" 14 | htcondor_image: 15 | - "htcondor/mini:23.5.2-el8" 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - run: mkdir -p ~/image-cache 20 | - name: cache-conda 21 | uses: actions/cache@v4 22 | env: 23 | CACHE_NUMBER: 0 24 | with: 25 | path: ~/conda_pkgs_dir 26 | key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('test-environment.yml') }} 27 | 28 | - uses: actions/cache@v4 29 | id: cache-images 30 | env: 31 | CACHE_NUMBER: 0 32 | with: 33 | path: ~/image-cache 34 | key: image-cache-${{ runner.os }}-${{ env.CACHE_NUMBER }}-${{ matrix.snakemake_image }}-${{ matrix.htcondor_image }} 35 | 36 | - name: install miniconda 37 | uses: conda-incubator/setup-miniconda@v2 38 | with: 39 | channels: conda-forge,bioconda,defaults 40 | channel-priority: true 41 | environment-file: test-environment.yml 42 | 43 | - name: docker swarm init 44 | run: 
docker swarm init 45 | 46 | - if: steps.cache-images.outputs.cache-hit == 'true' 47 | run: docker load -i ~/image-cache/snakemake.tar 48 | 49 | - if: steps.cache-images.outputs.cache-hit == 'true' 50 | run: docker load -i ~/image-cache/htcondor.tar 51 | 52 | - name: docker deploy 53 | shell: bash -l {0} 54 | env: 55 | DOCKER_COMPOSE: "tests/docker-compose.yaml" 56 | SNAKEMAKE_IMAGE: ${{ matrix.snakemake_image }} 57 | HTCONDOR_IMAGE: ${{ matrix.htcondor_image }} 58 | run: ./tests/deploystack.sh 59 | 60 | - if: steps.cache-images.outputs.cache-hit != 'true' 61 | run: docker save -o ~/image-cache/snakemake.tar ${{ matrix.snakemake_image }} 62 | 63 | - if: steps.cache-images.outputs.cache-hit != 'true' 64 | run: docker save -o ~/image-cache/htcondor.tar ${{ matrix.htcondor_image }} 65 | 66 | - name: run tests 67 | shell: bash -l {0} 68 | run: | 69 | pytest -v -s tests/test_cookie.py 70 | pytest -v -s tests/test_submit.py 71 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | - repo: https://gitlab.com/pycqa/flake8 7 | rev: 3.9.2 8 | hooks: 9 | - id: flake8 10 | 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Snakemake-Profiles 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTCondor Snakemake profile 2 | 3 | This profile configures Snakemake to submit jobs to a HTCondor cluster. 4 | 5 | ### Prerequisites 6 | The profile makes use of the HTCondor python bindings (and `snakemake-executor-plugin-cluster-generic` for snakemake > 8) which can be installed with 7 | 8 | pip install --user htcondor snakemake-executor-plugin-cluster-generic 9 | 10 | or using Anaconda with 11 | 12 | conda install -c conda-forge -c bioconda python-htcondor snakemake-executor-plugin-cluster-generic 13 | 14 | ### Deploy profile 15 | 16 | To deploy this profile run 17 | 18 | mkdir -p ~/.config/snakemake 19 | cookiecutter --output-dir ~/.config/snakemake gh:Snakemake-Profiles/htcondor 20 | 21 | You will be asked for the name of the profile and for a path where the HTCondor logs will be stored. 22 | The logs will be used to update the status of submitted jobs (as recommended in the [documentation of the HTCondor Python bindings](https://htcondor.readthedocs.io/en/latest/apis/python-bindings/tutorials/Scalable-Job-Tracking.html)). 23 | 24 | Then, you can run Snakemake with 25 | 26 | snakemake --profile htcondor ... 
channels:
  - conda-forge
  - bioconda
  - defaults
def setup_logging(level):
    """Configure the root logger and silence chatty third-party loggers.

    *level* may be None (falls back to WARN), a purely numeric string
    (converted to its integer log level), or anything else accepted by
    ``Logger.setLevel`` (e.g. a level name).
    """
    if level is None:
        level = logging.WARN
    elif re.match(r"\d+", level):
        level = int(level)
    logging.basicConfig(level=level)
    # Apply the same level to the noisiest dependency loggers.
    for noisy in ("urllib3", "docker", "poyo", "binaryornot"):
        logging.getLogger(noisy).setLevel(level)
datadir. 45 | 46 | By default, look for a source (src) input file located in the 47 | tests directory (pytest.dname). Custom data can be added by 48 | pointing a file 'dname / src'. The contents of src are copied to 49 | the file 'dst' in the test data directory 50 | 51 | Args: 52 | src (str): source file name 53 | dst (str): destination file name. Defaults to src. 54 | dname (str): directory where src is located. 55 | 56 | """ 57 | 58 | def _datafile(src, dst=None, dname=pytest.dname): 59 | dst = src if dst is None else dst 60 | src = py.path.local(pjoin(dname, src)) 61 | dst = datadir.join(dst) 62 | src.copy(dst) 63 | return dst 64 | 65 | return _datafile 66 | 67 | 68 | @pytest.fixture 69 | def cookie_factory(tmpdir_factory, _cookiecutter_config_file, datadir): 70 | """Cookie factory fixture. 71 | 72 | Cookie factory fixture to create a slurm profile in the test data 73 | directory. 74 | 75 | """ 76 | 77 | logging.getLogger("cookiecutter").setLevel(logging.INFO) 78 | 79 | _yamlconfig_default = {"restart-times": 1} 80 | 81 | def _cookie_factory( 82 | log_dir="log", 83 | yamlconfig=_yamlconfig_default, 84 | ): 85 | cookie_template = pjoin(os.path.abspath(pytest.dname), os.pardir) 86 | output_factory = tmpdir_factory.mktemp 87 | c = Cookies(cookie_template, output_factory, _cookiecutter_config_file) 88 | c._new_output_dir = lambda: str(datadir) 89 | profile_name = "htcondor" 90 | extra_context = { 91 | "profile_name": profile_name, 92 | "htcondor_log_dir": log_dir, 93 | } 94 | c.bake(extra_context=extra_context) 95 | config = datadir.join(profile_name).join("config.yaml") 96 | config_d = dict( 97 | [ 98 | tuple(line.split(":")) 99 | for line in config.read().split("\n") 100 | if re.search("^[a-z]", line) 101 | ] 102 | ) 103 | config_d.update(**yamlconfig) 104 | config.write("\n".join(f"{k}: {v}" for k, v in config_d.items())) 105 | 106 | return _cookie_factory 107 | 108 | 109 | @pytest.fixture 110 | def data(tmpdir_factory, request, datafile): 111 | """Setup base 
data""" 112 | dfile = datafile("Snakefile") 113 | return py.path.local(dfile.dirname) 114 | 115 | 116 | @pytest.fixture(scope="session") 117 | def htcondor(request): 118 | """HTCondor fixture 119 | 120 | Return relevant container depending on environment. First look for 121 | condor_q command to determine whether we are on a system running the 122 | HTCondor scheduler. Second, try deploying a docker stack to run htcondor 123 | locally. 124 | 125 | Skip htcondor tests if the above actions fail. 126 | 127 | """ 128 | if shutil.which("condor_q") is not None: 129 | return ShellContainer() 130 | else: 131 | client = docker.from_env() 132 | container_list = client.containers.list( 133 | filters={"name": "cookiecutter-htcondor_htcondor"} 134 | ) 135 | container = container_list[0] if len(container_list) > 0 else None 136 | if container: 137 | return container 138 | 139 | msg = ( 140 | "no condor_q or docker stack 'cookiecutter-htcondor_htcondor' running;" 141 | " skipping HTCondor-based tests." 142 | " Either run tests on a HTCondor HPC or deploy a docker stack with" 143 | f" {os.path.dirname(__file__)}/deploystack.sh" 144 | ) 145 | 146 | pytest.skip(msg) 147 | 148 | 149 | def teardown(request): 150 | """Shutdown snakemake processes that are waiting for HTCondor 151 | 152 | On nsf systems, stale snakemake log files may linger in the test 153 | directory, which prevents reruns of pytest. The teardown function 154 | calls 'lsof' to identify and terminate the processes using these 155 | files. 
#!/bin/bash
#
# Deploy docker stack
#
# Compose file
DOCKER_COMPOSE=${DOCKER_COMPOSE:=docker-compose.yaml}

# Images
SNAKEMAKE_IMAGE=${SNAKEMAKE_IMAGE:=quay.io/biocontainers/snakemake:7.32.4--hdfd78af_1}
# FIX: the CI workflow (htcondor.yaml) and docker-compose.yaml export
# HTCONDOR_IMAGE; the previous HTCONDORIMAGE spelling ignored the matrix
# value and always pulled the hard-coded default image.
HTCONDOR_IMAGE=${HTCONDOR_IMAGE:=htcondor/mini:23.5.2-el8}

docker pull $SNAKEMAKE_IMAGE
docker pull $HTCONDOR_IMAGE

# Stack and service config
STACK_NAME=cookiecutter-htcondor
HTCONDOR_SERVICE=${STACK_NAME}_htcondor
SNAKEMAKE_SERVICE=${STACK_NAME}_snakemake
LOCAL_USER_ID=$(id -u)

##############################
## Functions
##############################

### Poll until the given swarm service reports a Running task; give up
### (and exit 1) after MAXCOUNT attempts.
function service_up {
    SERVICE=$1
    COUNT=1
    MAXCOUNT=30

    docker service ps $SERVICE --format "{{.CurrentState}}" 2>/dev/null | grep Running
    service_up=$?

    until [ $service_up -eq 0 ]; do
        echo "$COUNT: service $SERVICE unavailable"
        sleep 5
        docker service ps $SERVICE --format "{{.CurrentState}}" 2>/dev/null | grep Running
        service_up=$?
        if [ $COUNT -eq $MAXCOUNT ]; then
            echo "service $SERVICE not found; giving up"
            exit 1
        fi
        COUNT=$((COUNT+1))
    done

    echo "service $SERVICE up!"
}


##############################
## Deploy stack
##############################

# Check if docker stack has been deployed; only deploy when not running.
docker service ps $HTCONDOR_SERVICE --format "{{.CurrentState}}" 2>/dev/null | grep Running
service_up=$?

if [ $service_up -eq 1 ]; then
    docker stack deploy --with-registry-auth -c $DOCKER_COMPOSE $STACK_NAME;
fi

service_up $HTCONDOR_SERVICE
service_up $SNAKEMAKE_SERVICE
CONTAINER=$(docker ps | grep cookiecutter-htcondor_htcondor | awk '{print $1}')

# Fix snakemake header to point to /opt/local/bin (the snakemake image's
# /usr is mounted at /opt in the htcondor container).
docker exec $CONTAINER /bin/bash -c "head -1 /opt/local/bin/snakemake" | grep -q "/usr/local/bin"
if [ $? -eq 0 ]; then
    echo "Rewriting snakemake header to point to /opt/local/bin"
    docker exec $CONTAINER /bin/bash -c 'sed -i -e "s:/usr:/opt:" /opt/local/bin/snakemake'
fi

# Add htcondor to snakemake
CONTAINER=$(docker ps | grep cookiecutter-htcondor_snakemake | awk '{print $1}')
docker exec $CONTAINER pip install htcondor==23.5.2 snakemake-executor-plugin-cluster-generic
#!/usr/bin/env python3
"""Submission tests that run against a live HTCondor scheduler."""

import pytest
import os


@pytest.fixture
def profile(cookie_factory, data, request):
    # Bake the htcondor profile into the test data directory.
    cookie_factory()


def test_htcondor_submit(smk_runner, profile):
    """A trivial rule runs to completion and writes its output file."""
    target = "test_submit.txt"
    smk_runner.make_target(target)
    assert "Finished job" in smk_runner.output
    assert os.path.isfile(os.path.join(smk_runner._data, target))


@pytest.mark.timeout(30)
def test_resources_mem(smk_runner, profile):
    """A rule's mem_mb resource is forwarded as HTCondor RequestMemory."""
    smk_runner.make_target("resources_mem.txt", asynchronous=True)
    smk_runner.wait_until_job_exists()
    assert all(ji["RequestMemory"] == 99 for ji in smk_runner.external_jobinfo)
    smk_runner.kill_job()


@pytest.mark.timeout(30)
def test_resources_disk(smk_runner, profile):
    """A rule's disk_mb resource is forwarded as HTCondor RequestDisk."""
    smk_runner.make_target("resources_disk.txt", asynchronous=True)
    smk_runner.wait_until_job_exists()
    assert all(ji["RequestDisk"] == 99 for ji in smk_runner.external_jobinfo)
    smk_runner.kill_job()
class ShellContainer(Model):
    """Emulate the docker container API with local shell calls.

    Lets SnakemakeRunner treat a submit node (where condor_q exists)
    exactly like a docker container by providing ``exec_run``.
    """

    _exit_code = None

    def __init__(self, attrs=None, client=None, collection=None):
        super().__init__(attrs, client, collection)

    @property
    def short_id(self):
        return self.id

    def exec_run(self, cmd, stream=False, detach=False, **kwargs):
        out = kwargs.pop("stdout", sp.PIPE)
        err = kwargs.pop("stderr", sp.STDOUT)
        proc = sp.Popen(
            cmd,
            bufsize=-1,
            shell=True,
            stdout=out,
            stderr=err,
            # close_fds is unsupported with redirected std handles on Windows
            close_fds=sys.platform != "win32",
            executable=os.environ.get("SHELL", None),
        )

        def _lines(p):
            # Yield output lines with the trailing newline stripped.
            for raw in p.stdout:
                yield raw[:-1]

        if detach:
            # Fire-and-forget: caller polls the scheduler for results.
            return ExecResult(None, "")

        if stream:
            return ExecResult(None, _lines(proc))

        stdout_data, _ = proc.communicate()
        return ExecResult(proc.returncode, stdout_data)
71 | # Missing resubmitted case 72 | ] 73 | ) 74 | ) 75 | 76 | _process_args = {} 77 | _process_prefix = "" 78 | 79 | @classmethod 80 | def executable(cls, cmd): 81 | if os.path.split(cmd)[-1] == "bash": 82 | cls._process_prefix = "set -euo pipefail" 83 | cls._process_args["executable"] = cmd 84 | 85 | @classmethod 86 | def prefix(cls, prefix): 87 | cls._process_prefix = prefix 88 | 89 | def __init__(self, container, data, jobname, partition="normal", account=None): 90 | self._container = container 91 | self._data = data 92 | self._jobname = re.sub("test_", "", jobname) 93 | self._output = [] 94 | self._pp = self._process_prefix 95 | self._cmd = "" 96 | self._num_cores = 1 97 | self._logger = logging.getLogger(str(self)) 98 | self._external_jobid = [] 99 | self._external_jobinfo = [] 100 | self._partition = partition 101 | self._account = account 102 | self._profile = self._data.join("htcondor") 103 | 104 | def exec_run(self, cmd, stream=False, **kwargs): 105 | return self._container.exec_run(cmd, stream=stream, **kwargs) 106 | 107 | def make_target(self, target, stream=True, asynchronous=False, **kwargs): 108 | """Wrapper to make snakemake target""" 109 | self._snakefile = kwargs.pop("snakefile", self._snakefile) 110 | options = kwargs.pop("options", "") 111 | profile = kwargs.pop("profile", str(self.profile)) 112 | jobname = kwargs.pop("jobname", str(self.jobname)) 113 | force = "-F" if kwargs.pop("force", False) else "" 114 | verbose = kwargs.pop("verbose", True) 115 | self._directory = "-d {}".format(kwargs.pop("dir", self.snakefile.dirname)) 116 | prof = "" if profile is None else f"--profile {profile}" 117 | jn = "" if jobname is None else f"--jn {jobname}-{{jobid}}" 118 | self._external_jobid = [] 119 | self._external_jobinfo = [] 120 | 121 | cmd = ( 122 | f"{self.exe} -c '{self.pp} && " 123 | + f"{self.snakemake} -s {self.snakefile} " 124 | + f"{options} --nolock --default-resources mem_mb=100 " 125 | + f"-j {self._num_cores} {self.workdir} {force} {target} 
{prof} {jn}'" 126 | ) 127 | 128 | try: 129 | sp.run( 130 | f"chmod 777 -fR {os.path.dirname(os.path.dirname(self._data))}", 131 | shell=True, 132 | ) 133 | except Exception as e: 134 | raise e 135 | 136 | try: 137 | (exit_code, output) = self.exec_run( 138 | cmd, stream=stream, detach=asynchronous, user="submituser" 139 | ) 140 | except Exception as e: 141 | raise e 142 | if stream: 143 | for x in output: 144 | if isinstance(x, bytes): 145 | x = x.decode() 146 | if verbose: 147 | print(x) 148 | self._output.append(x) 149 | else: 150 | if isinstance(output, bytes): 151 | output = output.decode() 152 | self._output = [output] 153 | return ExecResult(exit_code, output) 154 | 155 | @property 156 | def jobname(self): 157 | return self._jobname 158 | 159 | @property 160 | def profile(self): 161 | return self._profile 162 | 163 | @property 164 | def snakefile(self): 165 | return self._data.join(self._snakefile) 166 | 167 | @property 168 | def snakemake(self): 169 | return self._snakemake 170 | 171 | @property 172 | def account(self): 173 | return self._account 174 | 175 | @property 176 | def partition(self): 177 | return self._partition 178 | 179 | @property 180 | def workdir(self): 181 | if self._directory is None: 182 | self._directory = self.snakefile.dirname 183 | return self._directory 184 | 185 | @property 186 | def cluster_config(self): 187 | return self._data.join("config.yaml") 188 | 189 | @property 190 | def slurm_submit(self): 191 | return self.profile.join("grid-submit.py") 192 | 193 | @property 194 | def slurm_status(self): 195 | return self.profile.join("grid-status.py") 196 | 197 | @property 198 | def exe(self): 199 | return self._process_args["executable"] 200 | 201 | @property 202 | def pp(self): 203 | return self._pp 204 | 205 | def script(self, script): 206 | return self._data.join(script) 207 | 208 | @property 209 | def output(self): 210 | if isinstance(self._output, list): 211 | return "\n".join(self._output) 212 | return self._output 213 | 214 | def 
    def check_jobstatus(
        self,
        regex,
        options="",
        jobid=None,
        which=0,
        verbose=True,
    ):
        """Check job status by matching *regex* against accounting output.

        NOTE(review): this shells out to ``sacct``, a SLURM command, even
        though this profile targets HTCondor — presumably a leftover from
        the slurm profile these tests were adapted from; confirm whether
        any test still exercises this path on an HTCondor system.

        Args:
            regex: pattern searched for in the accounting output.
            options: extra command-line options forwarded to sacct.
            jobid: explicit job id; defaults to a cached external job id.
            which: index into the cached external job id list.
            verbose: warn when the pattern does not match.

        Returns:
            re.Match or None from the search, or False when no job id is
            known yet.

        Raises:
            DockerException: if the accounting command exits non-zero.
        """
        # No job has been recorded yet: report "no match" instead of failing.
        if len(self.external_jobid) == 0 and jobid is None:
            return False
        if jobid is None:
            jobid = str(self.external_jobid[which]).strip()
        cmd = f"sacct --parsable2 -b {options} -j {jobid}"
        (exit_code, output) = self.exec_run(cmd, stream=False)
        if exit_code != 0:
            raise DockerException(output.decode())
        m = re.search(regex, output.decode())
        if m is None and verbose:
            self._logger.warning(f"{cmd}\n{output.decode()}")
        return m
json.decoder.JSONDecodeError: 278 | return [] 279 | for job in jobinfos: 280 | if job["Iwd"] != self._data: 281 | continue 282 | self._external_jobid.append(job["ClusterId"]) 283 | self._external_jobinfo.append(job) 284 | 285 | return self._external_jobid 286 | 287 | def wait_until_job_exists(self): 288 | while not self.external_jobid: 289 | sleep(1) 290 | 291 | def wait_until_file_exists(self, fn): 292 | while not os.path.isfile(fn): 293 | sleep(1) 294 | 295 | def kill_job(self): 296 | for ext_jobid in self.external_jobid: 297 | cmd = f"condor_rm {ext_jobid}" 298 | try: 299 | (exit_code, output) = self.exec_run( 300 | cmd, stream=False, detach=False, user="submituser" 301 | ) 302 | except Exception as e: 303 | raise e 304 | 305 | @property 306 | def external_jobinfo(self): 307 | if len(self._external_jobinfo) == 0: 308 | for ext_jobid in self.external_jobid: 309 | cmd = f"condor_q --json {self.condor_jobid}" 310 | try: 311 | (exit_code, output) = self.exec_run( 312 | cmd, stream=False, detach=False, user="submituser" 313 | ) 314 | except Exception as e: 315 | raise e 316 | self._external_jobinfo.append(output) 317 | 318 | return self._external_jobinfo 319 | 320 | 321 | if "SHELL" in os.environ: 322 | SnakemakeRunner.executable(os.environ["SHELL"]) 323 | # Try falling back on /bin/bash 324 | else: 325 | SnakemakeRunner.executable("/bin/bash") 326 | -------------------------------------------------------------------------------- /{{cookiecutter.profile_name}}/config.v8+.yaml: -------------------------------------------------------------------------------- 1 | jobscript: "grid-jobscript.sh" 2 | executor: "cluster-generic" 3 | cluster-generic-status-cmd: "grid-status.py" 4 | cluster-generic-submit-cmd: "grid-submit.py" 5 | max-jobs-per-second: 100 6 | max-status-checks-per-second: 100 7 | restart-times: 5 8 | local-cores: 10 9 | jobs: 5000 10 | verbose: false 11 | -------------------------------------------------------------------------------- 
#!/usr/bin/env python
"""Report job status (success/failed/running) to Snakemake.

Reads the HTCondor event log written for the job id given in argv[1]
(format: <jobid>_<UUID>_<clusterID>, as printed by grid-submit.py) and
prints a single status word, as required by cluster-status scripts.
"""

import sys
import htcondor
from htcondor import JobEventType
from os.path import join


def print_and_exit(s):
    """Print status *s* and stop immediately."""
    print(s)
    # FIX: use sys.exit() — the bare exit() builtin comes from the site
    # module and is not guaranteed in non-interactive interpreters.
    sys.exit()


jobID, UUID, clusterID = sys.argv[1].split("_")

jobDir = "{{cookiecutter.htcondor_log_dir}}/{}_{}".format(jobID, UUID)
jobLog = join(jobDir, "condor.log")

# Event types that mean the job will not finish successfully.
failed_states = [
    JobEventType.JOB_HELD,
    JobEventType.JOB_ABORTED,
    JobEventType.EXECUTABLE_ERROR,
]

try:
    # jobLog is already a full path; the redundant join() wrapper was dropped.
    jel = htcondor.JobEventLog(jobLog)
    # stop_after bounds the wait so the status check stays fast.
    for event in jel.events(stop_after=5):
        if event.type in failed_states:
            print_and_exit("failed")
        if event.type is JobEventType.JOB_TERMINATED:
            if event["ReturnValue"] == 0:
                print_and_exit("success")
            print_and_exit("failed")
except OSError as e:
    # Missing/unreadable log: report failure rather than crashing Snakemake.
    print_and_exit("failed: {}".format(e))

print_and_exit("running")
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import htcondor 5 | from os import makedirs 6 | from os.path import join 7 | from uuid import uuid4 8 | 9 | from snakemake.utils import read_job_properties 10 | 11 | 12 | jobscript = sys.argv[1] 13 | job_properties = read_job_properties(jobscript) 14 | 15 | UUID = uuid4() # random UUID 16 | jobDir = "{{cookiecutter.htcondor_log_dir}}/{}_{}".format(job_properties["jobid"], UUID) 17 | makedirs(jobDir, exist_ok=True) 18 | 19 | sub = htcondor.Submit( 20 | { 21 | "executable": "/bin/bash", 22 | "arguments": jobscript, 23 | "max_retries": "5", 24 | "log": join(jobDir, "condor.log"), 25 | "output": join(jobDir, "condor.out"), 26 | "error": join(jobDir, "condor.err"), 27 | "getenv": "True", 28 | "request_cpus": str(job_properties["threads"]), 29 | } 30 | ) 31 | 32 | request_memory = job_properties["resources"].get("mem_mb", None) 33 | if request_memory is not None: 34 | sub["request_memory"] = str(request_memory) 35 | 36 | request_disk = job_properties["resources"].get("disk_mb", None) 37 | if request_disk is not None: 38 | sub["request_disk"] = str(request_disk) 39 | 40 | {%- if cookiecutter.location_cern %} 41 | 42 | # Add kerberos credentials 43 | # c.f. https://batchdocs.web.cern.ch/local/pythonapi.html 44 | col = htcondor.Collector() 45 | credd = htcondor.Credd() 46 | credd.add_user_cred(htcondor.CredTypes.Kerberos, None) 47 | sub["MY.SendCredential"] = "True" 48 | {%- endif %} 49 | 50 | schedd = htcondor.Schedd() 51 | clusterID = schedd.submit(sub) 52 | 53 | # print jobid for use in Snakemake 54 | print("{}_{}_{}".format(job_properties["jobid"], UUID, clusterID)) 55 | --------------------------------------------------------------------------------