├── requirements.txt
├── yaspi
│   ├── misc
│   │   ├── mnist.png
│   │   ├── cpu-proc.png
│   │   └── gpu-proc.png
│   ├── templates
│   │   ├── cpu-proc
│   │   │   ├── master.sh
│   │   │   └── template.sh
│   │   ├── gpu-proc
│   │   │   ├── master.sh
│   │   │   └── template.sh
│   │   └── ray
│   │       ├── start-ray-head-node.sh
│   │       ├── start-ray-worker-node.sh
│   │       ├── ray-master.sh
│   │       └── ray-sbatch.sh
│   └── yaspi.py
├── .gitignore
├── yaspi_test
│   ├── misc
│   │   ├── hello_world.py
│   │   └── dummy_yaspi_config.json
│   └── test_yaspi.py
├── examples
│   ├── mnist_hyperparams.json
│   ├── yaspi_settings.json
│   ├── minimal_ray_example.py
│   └── train_mnist.py
├── .github
│   └── workflows
│       └── ci-workflow.yml
├── setup.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
1 | watchlogs
2 | beartype
--------------------------------------------------------------------------------
/yaspi/misc/mnist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albanie/yaspi/HEAD/yaspi/misc/mnist.png
--------------------------------------------------------------------------------
/yaspi/misc/cpu-proc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albanie/yaspi/HEAD/yaspi/misc/cpu-proc.png
--------------------------------------------------------------------------------
/yaspi/misc/gpu-proc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albanie/yaspi/HEAD/yaspi/misc/gpu-proc.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | data
3 | .vscode
4 | *__pycache__
5 | build
6 | dist
7 | *.egg-info
8 | .coverage
--------------------------------------------------------------------------------
/yaspi_test/misc/hello_world.py:
--------------------------------------------------------------------------------
1 | """Test module for yaspi.
2 | """ 3 | 4 | if __name__ == "__main__": 5 | print(f"Hello world") -------------------------------------------------------------------------------- /yaspi/templates/cpu-proc/master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Launch the sbatch job 4 | job_id=$(sbatch --parsable {{sbatch_path}}) 5 | echo $job_id -------------------------------------------------------------------------------- /yaspi/templates/gpu-proc/master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Launch the sbatch job 4 | job_id=$(sbatch --parsable {{sbatch_path}}) 5 | echo $job_id -------------------------------------------------------------------------------- /examples/mnist_hyperparams.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"lr": 0.1, "gamma": 0.7}, 3 | {"lr": 0.01, "gamma": 0.7}, 4 | {"lr": 0.001, "gamma": 0.8} 5 | ] -------------------------------------------------------------------------------- /yaspi/templates/ray/start-ray-head-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is responsible for initialising the ray head node on whichever 4 | # machine slurm has assigned to it 5 | 6 | # Setup the environment for Ray 7 | echo "setting up environment for ray head node" 8 | {{env_setup}} 9 | 10 | echo "starting ray head node" 11 | # Launch the head node 12 | ray start --head --redis-port=6379 --include-webui {{ray_args}} 13 | echo "started ray head node" 14 | 15 | # Prevent the slurm scheduler from releasing the machine 16 | sleep infinity 17 | echo "Ray head node was stopped" -------------------------------------------------------------------------------- /yaspi/templates/ray/start-ray-worker-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is responsible for initialising a ray worker node on whichever 4 | # machine slurm has assigned to it 5 | 6 | # parse the arguments 7 | PID=$$ 8 | redis_address=$1 9 | worker_id=$2 10 | 11 | # Setup the environment for Ray 12 | {{env_setup}} 13 | 14 | # Launch the worker node 15 | cmd="ray start --redis-address=${redis_address} {{ray_args}}" 16 | echo "running cmd: ${cmd}" 17 | eval $cmd 18 | 19 | # Prevent the slurm scheduler from releasing the machine 20 | sleep infinity 21 | echo "Worker ${worker_id} stopped" -------------------------------------------------------------------------------- /yaspi/templates/ray/ray-master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generate a temporary text file to share state between the workers. This must be 4 | # somewhere on the home folder (i.e. not in $TMPDIR). Its sole purpose is to pass the 5 | # address of the head node to the worker nodes. 
6 | TMPDIR="${HOME}/tmp/ray-scripts"
7 | mkdir -p "${TMPDIR}"
8 | tmpfile=$(mktemp "${TMPDIR}/ray-scheduler.XXXXXX")
9 | echo "created tmpfile at $tmpfile to share ray meta data"
10 | 
11 | # sleep to ensure the temporary file has time to propagate over NFS
12 | sleep {{nfs_update_secs}}
13 | 
14 | # Launch the ray job
15 | sbatch --export=all,tmpfile=$tmpfile {{ray_sbatch_path}}
--------------------------------------------------------------------------------
/examples/yaspi_settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "recipe": "gpu-proc",
3 |     "partition": "gpu",
4 |     "time_limit": "24:00:00",
5 |     "gen_script_dir": "data/slurm-gen-scripts",
6 |     "mem": "24G",
7 |     "gpus_per_task": 1,
8 |     "cpus_per_task": 2,
9 |     "throttle_array": 20,
10 |     "ssh_forward": "",
11 |     "log_dir": "data/slurm-logs",
12 |     "use_custom_ray_tmp_dir": false,
13 |     "refresh_logs": false,
14 |     "exclude": "",
15 |     "constraint_str": "",
16 |     "prep": "",
17 |     "env_setup": "export PYTHONPATH=\"${BASE}\":$PYTHONPATH; export PATH=\"${HOME}\"/local/miniconda3/condabin/:$PATH; source ~/local/miniconda3/etc/profile.d/conda.sh; conda activate py37"
18 | }
--------------------------------------------------------------------------------
/yaspi_test/misc/dummy_yaspi_config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "recipe": "cpu-proc",
3 |     "partition": "compute",
4 |     "time_limit": "12:00:00",
5 |     "gen_script_dir": "data/slurm-gen-scripts",
6 |     "mem": "32G",
7 |     "cpus_per_task": 2,
8 |     "gpus_per_task": 0,
9 |     "throttle_array": 2,
10 |     "ssh_forward": "",
11 |     "log_dir": "data/slurm-logs",
12 |     "use_custom_ray_tmp_dir": false,
13 |     "refresh_logs": false,
14 |     "exclude": "",
15 |     "constraint_str": "",
16 |     "prep": "",
17 |     "custom_directives": "#SBATCH --comment \"a harmless comment\"\n#SBATCH --mail-type=END,FAIL\n#SBATCH --mail-user=username@email.com",
18 |     "env_setup": "export PYTHONPATH=\"${BASE}\":$PYTHONPATH;"
19 | }
--------------------------------------------------------------------------------
/yaspi/templates/cpu-proc/template.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name={{job_name}}
3 | #SBATCH --mem={{mem|ordeleteline}}
4 | #SBATCH --array={{array}}
5 | #SBATCH --time={{time_limit|ordeleteline}}
6 | #SBATCH --output={{log_path}}
7 | #SBATCH --partition={{partition|ordeleteline}}
8 | #SBATCH --cpus-per-task={{cpus_per_task|ordeleteline}}
9 | {{sbatch_resources|ordeleteline}}
10 | {{exclude_nodes}}
11 | {{custom_directives}}
12 | 
13 | # -------------------------------
14 | 
15 | # enable terminal stdout logging
16 | echo "linking job logs to terminal"
17 | echo "=================================================================="
18 | 
19 | {{env_setup}}
20 | 
21 | # Run the loop of runs for this task.
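# (Note added for clarity: SLURM_ARRAY_TASK_ID is 1-indexed, so the line below
# converts it to a 0-indexed worker id; this id is appended to the user command
# via the --worker_id flag so each array task can pick out its share of the work.)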
22 | worker_id=$((SLURM_ARRAY_TASK_ID - 1))
23 | echo "($HOSTNAME) This is SLURM task $SLURM_ARRAY_TASK_ID, worker id $worker_id"
24 | 
25 | # handle potential ipython issues with history
26 | export IPYTHONDIR=/tmp
27 | 
28 | prep="{{prep}}"
29 | echo "running prep cmd $prep"
30 | eval "${prep}"
31 | 
32 | cmd="{{cmd}}"
33 | cmd="srun ${cmd} --slurm --worker_id ${worker_id}"
34 | echo "running cmd $cmd"
35 | eval "${cmd}"
36 | 
--------------------------------------------------------------------------------
/.github/workflows/ci-workflow.yml:
--------------------------------------------------------------------------------
1 | name: yaspi-ci
2 | 
3 | on: [push]
4 | 
5 | jobs:
6 |   build:
7 | 
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       matrix:
11 |         python-version: [3.7]
12 | 
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python ${{ matrix.python-version }}
16 |       uses: actions/setup-python@v1
17 |       with:
18 |         python-version: ${{ matrix.python-version }}
19 |     - name: Install dependencies
20 |       run: |
21 |         python -m pip install --upgrade pip
22 |         pip install -r requirements.txt
23 |     - name: Lint with flake8
24 |       run: |
25 |         pip install flake8
26 |         # stop the build if there are Python syntax errors or undefined names
27 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
28 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
29 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
30 | 
31 |     - name: Test with pytest
32 |       run: |
33 |         pip install pytest coverage
34 |         coverage run --source=. -m pytest
--------------------------------------------------------------------------------
/examples/minimal_ray_example.py:
--------------------------------------------------------------------------------
1 | """A minimal working example for Ray usage with yaspi.
2 | 
3 | See the official documentation for examples and tutorials:
4 | https://ray.readthedocs.io/en/latest/
5 | """
6 | import ray
7 | import time
8 | import argparse
9 | from datetime import datetime
10 | 
11 | 
12 | @ray.remote
13 | def remote_function():
14 |     time.sleep(5)
15 |     return datetime.now()
16 | 
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser(description="Minimal Ray example")
20 |     parser.add_argument("--local_mode", action="store_true", help="run on local machine")
21 |     parser.add_argument("--ray_address", help="address of the ray head node")
22 |     args = parser.parse_args()
23 | 
24 |     # initialise the server
25 |     ray.init(
26 |         address=args.ray_address,
27 |         local_mode=args.local_mode,
28 |         ignore_reinit_error=True,
29 |     )
30 | 
31 |     # execute functions
32 |     timestamps = [remote_function.remote() for _ in range(4)]
33 |     for worker_timestamp in timestamps:
34 |         print(f"timestamp from worker: {ray.get(worker_timestamp)}")
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 | 
--------------------------------------------------------------------------------
/yaspi/templates/gpu-proc/template.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name={{job_name}}
3 | #SBATCH --mem={{mem|ordeleteline}}
4 | #SBATCH --array={{array}}
5 | #SBATCH --time={{time_limit|ordeleteline}}
6 | #SBATCH --output={{log_path}}
7 | #SBATCH --partition={{partition|ordeleteline}}
8 | #SBATCH --cpus-per-task={{cpus_per_task|ordeleteline}}
9 | {{sbatch_resources}}
10 | {{exclude_nodes}}
11 | {{custom_directives}}
12 | 
13 | # -------------------------------
14 | 
15 | # enable terminal stdout logging
16 | echo "linking job logs to terminal"
17 | echo "=================================================================="
18 | 
19 | {{env_setup}}
20 | 
21 | # Run the loop of runs for this task.
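# (Note added for clarity: as in the cpu-proc template, the 1-indexed
# SLURM_ARRAY_TASK_ID is converted to a 0-indexed worker id; here it indexes
# into the custom_args_queue array below, so each worker receives its own set
# of flags from the --job_queue option.)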
22 | worker_id=$((SLURM_ARRAY_TASK_ID - 1))
23 | echo "($HOSTNAME) This is SLURM task $SLURM_ARRAY_TASK_ID, worker id $worker_id"
24 | declare -a custom_args_queue=({{job_queue}})
25 | 
26 | # handle potential ipython issues with history
27 | export IPYTHONDIR=/tmp
28 | 
29 | prep="{{prep}}"
30 | echo "running prep cmd $prep"
31 | eval "${prep}"
32 | 
33 | cmd="{{cmd}}"
34 | cmd="srun --unbuffered ${cmd} ${custom_args_queue[${worker_id}]}"
35 | echo "running cmd $cmd"
36 | eval "${cmd}"
37 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Yaspi setup.py
3 | 
4 | Build/upload commands:
5 |     coverage run -m pytest --capture=tee-sys yaspi_test
6 |     python3 setup.py sdist bdist_wheel
7 |     twine upload --skip-existing dist/*
8 | """
9 | from pathlib import Path
10 | 
11 | import setuptools
12 | 
13 | with open("README.md", "r") as f:
14 |     long_description = f.read()
15 | 
16 | 
17 | # Ensure that extra data (example scripts and recipe templates) are included
18 | package_dir = "yaspi"
19 | extra_package_patterns = ["misc/*.py", "templates/**/*.sh"]
20 | extra_package_files = []
21 | for pattern in extra_package_patterns:
22 |     paths = Path(package_dir).glob(pattern)
23 |     rel_paths = [str(x.relative_to(package_dir)) for x in paths]
24 |     extra_package_files.extend(rel_paths)
25 | 
26 | 
27 | setuptools.setup(
28 |     name="yaspi",
29 |     version="0.0.8",
30 |     entry_points={
31 |         "console_scripts": [
32 |             "yaspi=yaspi.yaspi:main",
33 |         ],
34 |     },
35 |     author="Samuel Albanie",
36 |     description="Yet Another Slurm Python Interface",
37 |     long_description=long_description,
38 |     long_description_content_type="text/markdown",
39 |     url="https://github.com/albanie/yaspi",
40 |     packages=["yaspi"],
41 |     package_dir={"yaspi": package_dir},
42 |     package_data={"yaspi": extra_package_files},
43 |     install_requires=[
44 |         "watchlogs",
45 |         "beartype>=0.7.1"
46 |     ],
47 |     python_requires=">=3.7",
48 |     classifiers=[
49 |         "Programming Language :: Python :: 3.7",
50 |         "License :: OSI Approved :: MIT License",
51 |         'Operating System :: POSIX :: Linux',
52 |     ],
53 | )
54 | 
--------------------------------------------------------------------------------
/yaspi/templates/ray/ray-sbatch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name={{job_name}}
3 | #SBATCH --mem={{mem|ordeleteline}}
4 | #SBATCH --array={{array}}
5 | #SBATCH --time={{time_limit|ordeleteline}}
6 | #SBATCH --output={{log_path}}
7 | #SBATCH --partition={{partition|ordeleteline}}
8 | #SBATCH --cpus-per-task={{cpus_per_task|ordeleteline}}
9 | {{sbatch_resources}}
10 | {{exclude_nodes}}
11 | # -------------------------------
12 | 
13 | # This script is a modification to the implementation suggested by gregSchwartz18 here:
14 | # https://github.com/ray-project/ray/issues/826#issuecomment-522116599
15 | 
16 | worker_id=$((SLURM_ARRAY_TASK_ID - 1))
17 | echo "($HOSTNAME) This is SLURM job: $SLURM_ARRAY_JOB_ID, worker id $worker_id"
18 | 
19 | # define a length of time (in seconds) for workers to wait while either the head node
20 | # (or other workers) initialise ray servers
21 | approx_ray_init_time_in_secs={{approx_ray_init_time_in_secs}}
22 | 
23 | # The first array worker is responsible for managing the head-node. All remaining array
24 | # members will be used as ray workers.
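# (Overview of the control flow below, added for clarity: worker 0 resolves its
# own IP address, appends "<ip>:6379" to the shared tmpfile and launches the
# head-node init script; every other worker sleeps, reads the head address back
# from the tmpfile and attaches to it as a ray worker. Worker 0 then runs the
# user command against the head node and cancels the array job once it returns.)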
25 | if [ $worker_id -eq 0 ]; then
26 | 
27 |     # find the ip address of the machine that will be the head node and write it to disk
28 |     # so that the other workers can connect to it
29 |     ip_prefix=$(srun --ntasks=1 hostname --ip-address)
30 |     echo "ip_prefix: ${ip_prefix}"
31 | 
32 |     # We will run the head node on the standard Redis port - we also write this
33 |     # information to disk so that it can be accessed by the workers
34 |     suffix=':6379'
35 |     ip_head=$ip_prefix$suffix
36 |     echo "Writing values to ${tmpfile}"
37 |     echo $ip_head >> $tmpfile
38 | 
39 |     # run the head-node initialisation script
40 |     head_init_script={{head_init_script}}
41 |     srun -u --export=ALL --ntasks=1 "${head_init_script}" &
42 |     echo "launched ${head_init_script}"
43 | else
44 |     # For each non-head worker, we first sleep to allow the head worker to write its
45 |     # details to disk
46 |     sleep ${approx_ray_init_time_in_secs}
47 | 
48 |     echo "====================================================="
49 |     echo "running ray worker ${worker_id}"
50 |     echo "====================================================="
51 |     echo "reading head node information from: ${tmpfile}"
52 |     readarray -t head_node_config < ${tmpfile}
53 |     ip_head=${head_node_config[0]}
54 |     echo "attaching to head node ip_head: ${ip_head}"
55 | 
56 |     worker_init_script={{worker_init_script}}
57 |     srun -u --export=ALL --ntasks=1 ${worker_init_script} $ip_head $worker_id
58 |     echo "launched ${worker_init_script}"
59 | fi
60 | 
61 | if [ $worker_id -eq 0 ]; then
62 |     {{env_setup}}
63 |     {{ssh_forward}}
64 |     cmd="{{cmd}}"
65 |     echo "Launching ${cmd} on head node in ${approx_ray_init_time_in_secs} secs"
66 |     sleep ${approx_ray_init_time_in_secs}
67 |     {{cmd}} --ray_address $ip_head
68 |     echo "cancelling $SLURM_ARRAY_JOB_ID"
69 |     scancel $SLURM_ARRAY_JOB_ID
70 | fi
71 | 
--------------------------------------------------------------------------------
/yaspi_test/test_yaspi.py:
--------------------------------------------------------------------------------
1 | """Minimal tests to validate syntax.
2 | 
3 | It would be possible to CI test SLURM launches by adding docker + slurmd to the
4 | github workflow, but github doesn't give me enough free testing minutes for that :)
5 | As a workaround, tests that involve slurm submissions are only run for known
6 | hostnames.
7 | """ 8 | 9 | import json 10 | import socket 11 | from pathlib import Path 12 | from yaspi.yaspi import Yaspi 13 | 14 | 15 | PATH_ARGS = {"gen_script_dir", "log_dir", "template_dir"} 16 | HOSTS_WITH_SLURM = ["login1.triton.cluster"] 17 | 18 | 19 | def test_yaspi_object_creation(): 20 | with open("yaspi_test/misc/dummy_yaspi_config.json", "r") as f: 21 | yaspi_defaults = json.load(f) 22 | for key, val in yaspi_defaults.items(): 23 | if key in PATH_ARGS: 24 | yaspi_defaults[key] = Path(val) 25 | cmd = "python yaspi_test/misc/hello_world.py" 26 | job_name = "test_yaspi" 27 | job_queue = "" 28 | yaspi = Yaspi( 29 | cmd=cmd, 30 | job_name=job_name, 31 | job_queue=job_queue, 32 | job_array_size=1, 33 | **yaspi_defaults, 34 | ) 35 | print(f"Test yaspi object: {yaspi}") 36 | if socket.gethostname() in HOSTS_WITH_SLURM: 37 | yaspi.submit() 38 | 39 | 40 | def test_yaspi_object_creation_with_code_snapshot_dir(): 41 | with open("yaspi_test/misc/dummy_yaspi_config.json", "r") as f: 42 | yaspi_defaults = json.load(f) 43 | for key, val in yaspi_defaults.items(): 44 | if key in PATH_ARGS: 45 | yaspi_defaults[key] = Path(val) 46 | cmd = "python yaspi_test/misc/hello_world.py" 47 | yaspi_defaults["code_snapshot_dir"] = Path("data/code_snapshot_dir") 48 | job_name = "test_yaspi" 49 | job_queue = "" 50 | yaspi = Yaspi( 51 | cmd=cmd, 52 | job_name=job_name, 53 | job_queue=job_queue, 54 | job_array_size=1, 55 | **yaspi_defaults, 56 | ) 57 | print(f"Test yaspi object: {yaspi}") 58 | if socket.gethostname() in HOSTS_WITH_SLURM: 59 | yaspi.submit() 60 | 61 | 62 | def test_yaspi_object_line_deletion(): 63 | with open("yaspi_test/misc/dummy_yaspi_config.json", "r") as f: 64 | yaspi_defaults = json.load(f) 65 | for key, val in yaspi_defaults.items(): 66 | if key in PATH_ARGS: 67 | yaspi_defaults[key] = Path(val) 68 | cmd = "python yaspi_test/misc/hello_world.py" 69 | job_name = "test_yaspi" 70 | job_queue = "" 71 | 72 | # Check that yaspi only includes sbatch directives for values that 73 | # are not None when OR_DELETE_LINE is specified in the sbatch template. 
74 |     # This test uses the "constraint_str" flag as an example of a directive
75 |     # that should be None by default
76 | 
77 |     # First, check that supplying a yaspi key-value pair ensures it is present
78 |     yaspi_defaults["constraint_str"] = "p40"
79 |     sbatch_directive = "#SBATCH --constraint"
80 |     yaspi = Yaspi(
81 |         cmd=cmd,
82 |         job_name=job_name,
83 |         job_queue=job_queue,
84 |         job_array_size=1,
85 |         **yaspi_defaults,
86 |     )
87 |     # Read the template that was written to disk
88 |     with open("data/slurm-gen-scripts/cpu-proc/template.sh", "r") as f:
89 |         template_contents = f.read()
90 |     assert sbatch_directive in template_contents, (
91 |         f"Expected to find {sbatch_directive} in template contents"
92 |     )
93 | 
94 |     # Check that supplying a None-valued yaspi key-value pair ensures it is not present
95 |     yaspi_defaults["constraint_str"] = None
96 |     yaspi = Yaspi(
97 |         cmd=cmd,
98 |         job_name=job_name,
99 |         job_queue=job_queue,
100 |         job_array_size=1,
101 |         **yaspi_defaults,
102 |     )
103 |     # Read the template that was written to disk
104 |     with open("data/slurm-gen-scripts/cpu-proc/template.sh", "r") as f:
105 |         template_contents = f.read()
106 |     assert sbatch_directive not in template_contents, (
107 |         f"Expected not to find {sbatch_directive} in template contents"
108 |     )
109 |     if socket.gethostname() in HOSTS_WITH_SLURM:
110 |         yaspi.submit()
111 | 
112 | 
113 | if __name__ == "__main__":
114 |     test_yaspi_object_creation()
115 |     test_yaspi_object_line_deletion()
116 |     test_yaspi_object_creation_with_code_snapshot_dir()
117 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### yaspi - yet another slurm python interface
2 | 
3 | The goal of `yaspi` is to provide an interface to submitting [slurm](https://slurm.schedmd.com/documentation.html) jobs, thereby obviating the joys of sbatch files. It does so through `recipes` - these are collections of templates and rules for generating sbatch scripts.
4 | 
5 | ![yaspi-ci](https://github.com/albanie/yaspi/workflows/yaspi-ci/badge.svg)
6 | 
7 | ### Installation
8 | 
9 | Install via `pip install yaspi`. If you prefer to hack around with the source code, it's a [single python file](yaspi/yaspi.py).
10 | 
11 | It should be considered (highly) experimental.
12 | 
13 | ### Implementation
14 | 
15 | `yaspi` makes heavy usage of slurm [job arrays](https://slurm.schedmd.com/job_array.html).
16 | 
17 | ### Supported recipes:
18 | 
19 | * `cpu-proc` - a generic recipe for submitting CPU jobs via a job array.
20 | * `gpu-proc` - a generic recipe for submitting GPU jobs via a job array.
21 | * `ray` - job submissions for the [ray scheduler](https://github.com/ray-project/ray).
22 | 
23 | ### Dependencies
24 | 
25 | * `Python >= 3.7`
26 | * `watchlogs`, `beartype`
27 | 
28 | ### Requirements:
29 | 
30 | `yaspi` has been tested on CentOS Linux release 7.7.1908 with slurm 18.08.7 and Python 3.7. YMMV on other platforms.
31 | 
32 | ### Usage and outputs
33 | 
34 | `yaspi` can be used either from the command-line or directly from a python program. Command-line usage is shown in the following examples (the effect of each argument is documented in the [implementation](yaspi/yaspi.py)).
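
**Code - a minimal sketch of the Python interface**: the snippet below mirrors the usage in [yaspi_test/test_yaspi.py](yaspi_test/test_yaspi.py); `my_script.py` is a placeholder, and the settings file supplies the remaining required constructor arguments, as in [examples/yaspi_settings.json](examples/yaspi_settings.json). A fuller example is described in the *using yaspi directly from python* section below.
```
import json
from yaspi.yaspi import Yaspi

# SLURM settings (partition, memory, cpus_per_task, ...) live in a json file
with open("examples/yaspi_settings.json", "r") as f:
    yaspi_defaults = json.load(f)

job = Yaspi(
    cmd="python my_script.py",  # placeholder command to run on each worker
    job_name="example",
    job_queue="",               # no per-worker flags in this sketch
    job_array_size=1,
    **yaspi_defaults,
)
job.submit(watch=True)  # streams the SLURM logs until the job completes
```
Note that `submit()` writes the generated sbatch scripts to `gen_script_dir` and launches them immediately, so this needs to run on a machine with access to SLURM.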
35 | 
36 | **Code - scheduling a slurm job array with CPUs**:
37 | ```
38 | prep_command='echo \"optional preparation cmd\"'
39 | command='echo \"I am running on a CPU node\"'
40 | yaspi --job_name=example \
41 |     --job_array_size=2 \
42 |     --cpus_per_task=5 \
43 |     --cmd="$command" \
44 |     --prep="$prep_command" \
45 |     --recipe=cpu-proc \
46 |     --mem=10G
47 | ```
48 | **Effect**: This will run the `command` value on two workers as part of a slurm [job array](https://slurm.schedmd.com/job_array.html). Each worker will be allocated 5 CPUs and 10G of memory by the scheduler. Each worker will also be passed two extra flags, `--slurm` (without options) and `--worker_id` (which will be given the 0-indexed value of the current worker index in the job array) which can be used to assign tasks to the worker. The `--prep` flag is optional, and will run a command prior to the main job (e.g. to change into an appropriate code directory). The effect of the command will be to produce the following:
49 | 
50 | ```
51 | # run on CPU job array worker 0
52 | optional preparation cmd
53 | I am running on a CPU node --slurm --worker_id 0
54 | 
55 | # run on CPU job array worker 1
56 | optional preparation cmd
57 | I am running on a CPU node --slurm --worker_id 1
58 | ```
59 | 
60 | When launched, a slightly more verbose (and colourized) output will be produced by [watchlogs](https://github.com/albanie/watchlogs) (this assumes your terminal supports color sequences):
61 | 
62 | ![cpu-proc-output](yaspi/misc/cpu-proc.png)
63 | 
64 | 
65 | **Code - scheduling a slurm job array with GPUs**:
66 | ```
67 | prep_command='echo \"optional preparation cmd\"'
68 | job_queue="\"flags for worker 0\" \"flags for worker 1\""
69 | command='echo \"I am running on a GPU node\"'
70 | yaspi --job_name=example \
71 |     --job_array_size=2 \
72 |     --cpus_per_task=5 \
73 |     --gpus_per_task=1 \
74 |     --prep="$prep_command" \
75 |     --cmd="$command" \
76 |     --recipe=gpu-proc \
77 |     --job_queue="$job_queue" \
78 |     --mem=10G
79 | ```
80 | **Effect**: This command is similar to the `cpu-proc` recipe described above. Again, the `command` will be run on two workers as part of a slurm [job array](https://slurm.schedmd.com/job_array.html). Each worker will be allocated 5 CPUs and 10G of memory by the scheduler, as well as one GPU. One further difference is that `gpu-proc` also takes a `job_queue` option that can be used to pass options to each GPU worker separately.
81 | 
82 | ![gpu-proc-output](yaspi/misc/gpu-proc.png)
83 | 
84 | **Extras - custom directives**:
85 | 
86 | The previous example can be extended with custom directives. For example, suppose
87 | you wish to add an extra directive to your `sbatch` of the form
88 | `#SBATCH --comment "a harmless comment goes here"`. You can do this as follows:
89 | ```
90 | prep_command='echo \"optional preparation cmd\"'
91 | job_queue="\"flags for worker 0\" \"flags for worker 1\""
92 | command='echo \"I am running on a GPU node\"'
93 | custom_directive='#SBATCH --comment "a harmless comment goes here"'
94 | yaspi --job_name=example \
95 |     --job_array_size=2 \
96 |     --cpus_per_task=5 \
97 |     --gpus_per_task=1 \
98 |     --prep="$prep_command" \
99 |     --cmd="$command" \
100 |     --recipe=gpu-proc \
101 |     --job_queue="$job_queue" \
102 |     --custom_directives="$custom_directive" \
103 |     --mem=10G
104 | ```
105 | 
106 | Custom directives can also be added to json config. For example, to receive emails from slurm, add a `"custom_directives"` flag:
107 | 
108 | ```
109 | {
110 |     ...
111 | "custom_directives": "#SBATCH --mail-type=END,FAIL\n#SBATCH --mail-user=your_email_address", 112 | } 113 | ``` 114 | 115 | **Code - scheduling a job with the [ray](https://ray.readthedocs.io/en/latest/index.html) framework:** 116 | 117 | ``` 118 | yaspi_dir=$(yaspi --install_location) 119 | command="python $yaspi_dir/misc/minimal_ray_example.py" 120 | yaspi --job_name=example \ 121 | --cmd="$command" \ 122 | --job_array_size=3 \ 123 | --cpus_per_task=2 \ 124 | --gpus_per_task=1 \ 125 | --mem=10G \ 126 | --recipe=ray 127 | ``` 128 | 129 | **Effect**: Scheduling jobs with the ray framework operates in a slightly different manner to the previous two examples (both of which assume [embarrasingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel) processing i.e. no communication between the workers). The ray receipe similarly launches a slurm job array, but assigns the job at index 0 to be the *master*, and all other nodes as *worker* nodes. The command is run *only* on the master node, which then uses ray to allocate tasks to the worker nodes. The command above will launch a slurm job, with the name "example", that: (1) initialises a ray head node and a set of 2 ray workers via a SLURM array job; (2) launches `$command` from the head node. It will produce an output similar to the following: 130 | 131 | 132 | ``` 133 | started ray head node 134 | timestamp from worker: 2020-02-17 06:40:44.861733 135 | timestamp from worker: 2020-02-17 06:40:44.861793 136 | timestamp from worker: 2020-02-17 06:40:45.062484 137 | timestamp from worker: 2020-02-17 06:40:45.065494 138 | ``` 139 | 140 | **Code - using yaspi directly from python**: 141 | 142 | 143 | An example for training multiple MNIST runs is given in [train_mnist.py](examples/train_mnist.py). Running this file should launch three jobs on SLURM, each with different hyperparameters, producing the output below: 144 | 145 | mnist-output 146 | 147 | 148 | **Modifying your code to use Yaspi**: 149 | 150 | To run an existing piece of code with yaspi requires two things: 151 | 1. A json file containing SLURM settings (e.g. these [yaspi_settings](examples/yaspi_settings.json)). This file will set the options that you would normally set in an SBATCH script (e.g. number of GPUS, total job duration etc.) together with any bash commands you would usually run to set up your job environment (these are supplied via the `"env_estup"` option) 152 | 2. A small block of logic somewhere in your script (visible for the MNIST example [here](https://github.com/albanie/yaspi/blob/master/examples/train_mnist.py#L120-L165)) which sets the job name and calls the Yaspi `submit()` function. 153 | 154 | **Using code snapshot directories**: 155 | 156 | One downside of launching a yaspi job directly from a source code folder is that if you edit your code after submitting your jobs to slurm but the jobs haven't yet launched, the code edits will affect the jobs. Since this is (typically) undesirable behaviour, you can supply extra flags to yaspi so that it copies the source code in your current folder to a new "snapshot" directory and launches from there. As a consequence, any local code changes you make after launching with yaspi will not affect the queued jobs. 
157 | ```
158 | --code_snapshot_dir snapshot_dir  # is the location where the snapshot of your code will be stored
159 | --code_snapshot_filter_patterns patterns  # are a set of glob-patterns to determine which source code is copied
160 | ```
--------------------------------------------------------------------------------
/examples/train_mnist.py:
--------------------------------------------------------------------------------
1 | """Yaspi example:
2 | Aims to be a minimal modification to the PyTorch MNIST example given here:
3 | https://github.com/pytorch/examples/blob/master/mnist/main.py
4 | 
5 | Example usage
6 | ---------------
7 | 
8 | - Standard training:
9 | python train_mnist.py
10 | 
11 | - Yaspi training:
12 | python train_mnist.py --hyperparams mnist_hyperparams.json --yaspify
13 | (this launches one run for each experiment config defined in mnist_hyperparams.json)
14 | """
15 | import argparse
16 | import torch
17 | import torch.nn as nn
18 | import torch.nn.functional as F
19 | import torch.optim as optim
20 | from torchvision import datasets, transforms
21 | from torch.optim.lr_scheduler import StepLR
22 | import sys
23 | import json
24 | from yaspi.yaspi import Yaspi
25 | 
26 | 
27 | class Net(nn.Module):
28 |     def __init__(self):
29 |         super(Net, self).__init__()
30 |         self.conv1 = nn.Conv2d(1, 32, 3, 1)
31 |         self.conv2 = nn.Conv2d(32, 64, 3, 1)
32 |         self.dropout1 = nn.Dropout(0.25)
33 |         self.dropout2 = nn.Dropout(0.5)
34 |         self.fc1 = nn.Linear(9216, 128)
35 |         self.fc2 = nn.Linear(128, 10)
36 | 
37 |     def forward(self, x):
38 |         x = self.conv1(x)
39 |         x = F.relu(x)
40 |         x = self.conv2(x)
41 |         x = F.relu(x)
42 |         x = F.max_pool2d(x, 2)
43 |         x = self.dropout1(x)
44 |         x = torch.flatten(x, 1)
45 |         x = self.fc1(x)
46 |         x = F.relu(x)
47 |         x = self.dropout2(x)
48 |         x = self.fc2(x)
49 |         output = F.log_softmax(x, dim=1)
50 |         return output
51 | 
52 | 
53 | def train(args, model, device, train_loader, optimizer, epoch):
54 |     model.train()
55 |     for batch_idx, (data, target) in enumerate(train_loader):
56 |         data, target = data.to(device), target.to(device)
57 |         optimizer.zero_grad()
58 |         output = model(data)
59 |         loss = F.nll_loss(output, target)
60 |         loss.backward()
61 |         optimizer.step()
62 |         if batch_idx % args.log_interval == 0:
63 |             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
64 |                 epoch, batch_idx * len(data), len(train_loader.dataset),
65 |                 100. * batch_idx / len(train_loader), loss.item()))
66 |             if args.dry_run:
67 |                 break
68 | 
69 | 
70 | def test(model, device, test_loader):
71 |     model.eval()
72 |     test_loss = 0
73 |     correct = 0
74 |     with torch.no_grad():
75 |         for data, target in test_loader:
76 |             data, target = data.to(device), target.to(device)
77 |             output = model(data)
78 |             test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
79 |             pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
80 |             correct += pred.eq(target.view_as(pred)).sum().item()
81 | 
82 |     test_loss /= len(test_loader.dataset)
83 | 
84 |     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
85 |         test_loss, correct, len(test_loader.dataset),
86 |         100. * correct / len(test_loader.dataset)))
87 | 
88 | 
89 | def main():
90 |     # Training settings
91 |     parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
92 |     parser.add_argument('--batch-size', type=int, default=64, metavar='N',
93 |                         help='input batch size for training (default: 64)')
94 |     parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
95 |                         help='input batch size for testing (default: 1000)')
96 |     parser.add_argument('--epochs', type=int, default=14, metavar='N',
97 |                         help='number of epochs to train (default: 14)')
98 |     parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
99 |                         help='learning rate (default: 1.0)')
100 |     parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
101 |                         help='Learning rate step gamma (default: 0.7)')
102 |     parser.add_argument('--no-cuda', action='store_true', default=False,
103 |                         help='disables CUDA training')
104 |     parser.add_argument('--dry-run', action='store_true', default=False,
105 |                         help='quickly check a single pass')
106 |     parser.add_argument('--seed', type=int, default=1, metavar='S',
107 |                         help='random seed (default: 1)')
108 |     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
109 |                         help='how many batches to wait before logging training status')
110 |     parser.add_argument('--save-model', action='store_true', default=False,
111 |                         help='For Saving the current Model')
112 | 
113 |     # Additional flags used by yaspi
114 |     parser.add_argument("--yaspify", action="store_true")
115 |     parser.add_argument("--yaspi_settings", default="yaspi_settings.json",
116 |                         help="file of SLURM specific options (e.g. number of GPUS)")
117 |     parser.add_argument("--hyperparams", default="mnist_hyperparams.json")
118 |     args = parser.parse_args()
119 | 
120 |     if args.yaspify:
121 |         # --------------------------------------------------------------------
122 |         # This section contains the logic for launching multiple runs
123 |         # --------------------------------------------------------------------
124 |         # The command that will be launched on each worker will be identical to the
125 |         # python command used to launch this script (including all flags), except:
126 |         # 1. The --yaspify flag will be removed
127 |         # 2. Flags from hyperparams will be inserted
128 |         # -------------------------------------------------------------------------
129 | 
130 |         # load the hyperparameters
131 |         with open(args.hyperparams, "r") as f:
132 |             hyperparams = json.load(f)
133 |         exp_flags = []
134 |         for exp in hyperparams:
135 |             exp_flags.append(" ".join([f"--{key} {val}" for key, val in exp.items()]))
136 | 
137 |         # Select a name for your jobs (this is what will be visible via the `squeue`
138 |         # SLURM command)
139 |         num_jobs = len(exp_flags)
140 |         job_name = f"train-mnist-{num_jobs}-jobs"
141 | 
142 |         # Provide the arguments to each SLURM worker as space-separated quoted strings
143 |         job_queue = " ".join([f'"{flags}"' for flags in exp_flags])
144 | 
145 |         # remove the yaspify flag
146 |         cmd_args = sys.argv
147 |         cmd_args.remove("--yaspify")
148 | 
149 |         # construct the final command that will run each worker, together with job_queue
150 |         base_cmd = f"python {' '.join(cmd_args)}"
151 | 
152 |         # load SLURM specific settings
153 |         with open(args.yaspi_settings, "r") as f:
154 |             yaspi_defaults = json.load(f)
155 | 
156 |         # Launch the jobs over SLURM
157 |         job = Yaspi(
158 |             cmd=base_cmd,
159 |             job_queue=job_queue,
160 |             job_name=job_name,
161 |             job_array_size=num_jobs,
162 |             **yaspi_defaults,
163 |         )
164 |         # The `watch` argument will keep the process alive, streaming the SLURM logs until all jobs complete
165 |         job.submit(watch=True, conserve_resources=5)
166 |     else:
167 |         # --------------------------------------------------------------------
168 |         # This section contains the original, unmodified code
169 |         # --------------------------------------------------------------------
170 |         use_cuda = not args.no_cuda and torch.cuda.is_available()
171 |         torch.manual_seed(args.seed)
172 | 
173 |         device = torch.device("cuda" if use_cuda else "cpu")
174 | 
175 |         train_kwargs = {'batch_size': args.batch_size}
176 |         test_kwargs = {'batch_size': args.test_batch_size}
177 |         if use_cuda:
178 |             cuda_kwargs = {'num_workers': 1,
179 |                            'pin_memory': True,
180 |                            'shuffle': True}
181 |             train_kwargs.update(cuda_kwargs)
182 |             test_kwargs.update(cuda_kwargs)
183 | 
184 |         transform = transforms.Compose([
185 |             transforms.ToTensor(),
186 |             transforms.Normalize((0.1307,), (0.3081,))
187 |         ])
188 |         dataset1 = datasets.MNIST('../data', train=True, download=True,
189 |                                   transform=transform)
190 |         dataset2 = datasets.MNIST('../data', train=False,
191 |                                   transform=transform)
192 |         train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
193 |         test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
194 | 
195 |         model = Net().to(device)
196 |         optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
197 | 
198 |         scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
199 |         for epoch in range(1, args.epochs + 1):
200 |             train(args, model, device, train_loader, optimizer, epoch)
201 |             test(model, device, test_loader)
202 |             scheduler.step()
203 | 
204 |         if args.save_model:
205 |             torch.save(model.state_dict(), "mnist_cnn.pt")
206 | 
207 | if __name__ == '__main__':
208 |     main()
209 | 
--------------------------------------------------------------------------------
/yaspi/yaspi.py:
--------------------------------------------------------------------------------
1 | """YASPI - yet another python slurm interface.
2 | """ 3 | 4 | import re 5 | import shutil 6 | import argparse 7 | import subprocess 8 | from typing import List, Union 9 | from pathlib import Path 10 | from datetime import datetime 11 | from itertools import zip_longest 12 | 13 | from beartype import beartype 14 | from beartype.cave import NoneTypeOr 15 | from watchlogs.watchlogs import Watcher 16 | 17 | 18 | class Yaspi: 19 | 20 | @beartype 21 | def __init__( 22 | self, 23 | job_name: str, 24 | cmd: str, 25 | prep: str, 26 | recipe: str, 27 | gen_script_dir: Union[Path, str], 28 | log_dir: Union[Path, str], 29 | partition: NoneTypeOr[str], 30 | job_array_size: int, 31 | cpus_per_task: NoneTypeOr[int], 32 | gpus_per_task: NoneTypeOr[int], 33 | refresh_logs: bool, 34 | exclude: str, 35 | use_custom_ray_tmp_dir: bool, 36 | ssh_forward: str, 37 | time_limit: NoneTypeOr[str], 38 | throttle_array: int, 39 | mem: str, 40 | constraint_str: NoneTypeOr[str], 41 | custom_directives: str = "", 42 | template_dir: Path = Path(__file__).parent / "templates", 43 | job_queue: NoneTypeOr[str] = None, 44 | env_setup: NoneTypeOr[str] = None, 45 | code_snapshot_dir: NoneTypeOr[Path, str] = None, 46 | code_snapshot_filter_patterns: List[str] = ["**/*.py", "symlinks"], 47 | ): 48 | self.cmd = cmd 49 | self.mem = mem 50 | self.prep = prep 51 | self.recipe = recipe 52 | self.exclude = exclude 53 | self.job_name = job_name 54 | self.partition = partition 55 | self.time_limit = time_limit 56 | self.env_setup = env_setup 57 | self.job_queue = job_queue 58 | self.ssh_forward = ssh_forward 59 | self.refresh_logs = refresh_logs 60 | self.template_dir = Path(template_dir) 61 | self.cpus_per_task = cpus_per_task 62 | self.gpus_per_task = gpus_per_task 63 | self.constraint_str = constraint_str 64 | self.throttle_array = throttle_array 65 | self.job_array_size = job_array_size 66 | self.use_custom_ray_tmp_dir = use_custom_ray_tmp_dir 67 | self.custom_directives = custom_directives 68 | self.gen_script_dir = Path(gen_script_dir) 69 | if code_snapshot_dir is not None: 70 | self.code_snapshot_dir = Path(code_snapshot_dir).resolve() 71 | else: 72 | self.code_snapshot_dir = None 73 | self.code_snapshot_filter_patterns = code_snapshot_filter_patterns 74 | self.slurm_logs = None 75 | # SLURM expects the logfiles to be absolute paths 76 | self.log_dir = Path(log_dir).resolve() 77 | self.generate_scripts() 78 | 79 | def generate_scripts(self): 80 | gen_dir = self.gen_script_dir 81 | if self.env_setup is None: 82 | self.env_setup = ( 83 | 'export PYTHONPATH="${BASE}":$PYTHONPATH\n' 84 | 'export PATH="${HOME}/local/anaconda3/condabin/:$PATH"\n' 85 | 'source ~/local/anaconda3/etc/profile.d/conda.sh\n' 86 | 'conda activate pt37' 87 | ) 88 | 89 | # set up logging 90 | ts = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S") 91 | self.log_path = str(Path(self.log_dir) / self.job_name / ts / "%4a-log.txt") 92 | 93 | if self.code_snapshot_dir is not None: 94 | supported_recipes = {"cpu-proc", "gpu-proc"} 95 | assert self.recipe in supported_recipes, ( 96 | f"For now, `code_snapshot_dir` is only supported for {supported_recipes}" 97 | f" ({self.recipe} is not yet supported)" 98 | ) 99 | code_snapshot_dir = self.code_snapshot_dir / self.job_name / ts 100 | self.copy_to_snapshot_dir(code_snapshot_dir=code_snapshot_dir) 101 | # modify the srun command to first move to the code snapshot directory before 102 | # the user command is launched 103 | self.cmd = f"cd {code_snapshot_dir} ; {self.cmd}" 104 | 105 | if self.recipe == "ray": 106 | # TODO(Samuel): configure this more sensibly 107 | 
107 |             template_paths = {
108 |                 "master": "ray/ray-master.sh",
109 |                 "sbatch": "ray/ray-sbatch.sh",
110 |                 "head-node": "ray/start-ray-head-node.sh",
111 |                 "worker-node": "ray/start-ray-worker-node.sh",
112 |             }
113 |             # NOTE: Due to unix max socket length (108 characters) it's best if this is
114 |             # short and absolute
115 |             if self.use_custom_ray_tmp_dir:
116 |                 ray_tmp_dir = Path.home() / "data/sock"
117 |                 ray_tmp_dir.mkdir(exist_ok=True, parents=True)
118 |                 ray_args = f"--temp-dir={ray_tmp_dir}"
119 |             else:
120 |                 ray_args = ""
121 |             array_str = f"1-{self.job_array_size}"
122 |             if self.throttle_array:
123 |                 array_str = f"{array_str}%{self.throttle_array}"
124 |             rules = {
125 |                 "master": {
126 |                     "nfs_update_secs": 1,
127 |                     "ray_sbatch_path": str(gen_dir / template_paths["sbatch"]),
128 |                 },
129 |                 "sbatch": {
130 |                     "cmd": self.cmd,
131 |                     "mem": self.mem,
132 |                     "log_path": self.log_path,
133 |                     "job_name": self.job_name,
134 |                     "partition": self.partition,
135 |                     "time_limit": self.time_limit,
136 |                     "env_setup": self.env_setup,
137 |                     "array": array_str,
138 |                     "cpus_per_task": self.cpus_per_task,
139 |                     "approx_ray_init_time_in_secs": 10,
140 |                     "exclude_nodes": f"#SBATCH --exclude={self.exclude}",
141 |                     "head_init_script": str(gen_dir / template_paths["head-node"]),
142 |                     "worker_init_script": str(gen_dir / template_paths["worker-node"]),
143 |                     "ssh_forward": self.ssh_forward,
144 |                 },
145 |                 "head-node": {
146 |                     "ray_args": ray_args,
147 |                     "env_setup": self.env_setup,
148 |                 },
149 |                 "worker-node": {
150 |                     "ray_args": ray_args,
151 |                     "env_setup": self.env_setup,
152 |                 },
153 |             }
154 |             self._add_batch_resources(rules)
155 |         elif self.recipe in {"cpu-proc", "gpu-proc"}:
156 |             if self.env_setup is None:
157 |                 # TODO(Samuel): configure this more sensibly
158 |                 self.env_setup = (
159 |                     'export PYTHONPATH="${BASE}":$PYTHONPATH\n'
160 |                     'export PATH="${HOME}/local/anaconda3/condabin/:$PATH"\n'
161 |                     'source ~/local/anaconda3/etc/profile.d/conda.sh\n'
162 |                     'conda activate pt14'
163 |                 )
164 |             template_paths = {
165 |                 "master": f"{self.recipe}/master.sh",
166 |                 "sbatch": f"{self.recipe}/template.sh",
167 |             }
168 |             array_str = f"1-{self.job_array_size}"
169 |             if self.throttle_array:
170 |                 array_str = f"{array_str}%{self.throttle_array}"
171 |             rules = {
172 |                 "master": {
173 |                     "sbatch_path": str(gen_dir / template_paths["sbatch"]),
174 |                 },
175 |                 "sbatch": {
176 |                     "cmd": self.cmd,
177 |                     "mem": self.mem,
178 |                     "prep": self.prep,
179 |                     "array": array_str,
180 |                     "log_path": self.log_path,
181 |                     "job_name": self.job_name,
182 |                     "job_queue": self.job_queue,
183 |                     "env_setup": self.env_setup,
184 |                     "partition": self.partition,
185 |                     "time_limit": self.time_limit,
186 |                     "cpus_per_task": self.cpus_per_task,
187 |                     "exclude_nodes": f"#SBATCH --exclude={self.exclude}",
188 |                     "custom_directives": self.custom_directives,
189 |                     "sbatch_resources": None,
190 |                 },
191 |             }
192 |             self._add_batch_resources(rules, self.recipe == "gpu-proc")
193 |         else:
194 |             raise ValueError(f"template: {self.recipe} unrecognised")
195 | 
196 |         template_paths = {key: Path(self.template_dir) / val
197 |                           for key, val in template_paths.items()}
198 | 
199 |         self.gen_scripts = {}
200 |         for key, template_path in template_paths.items():
201 |             gen = self.fill_template(template_path=template_path, rules=rules[key])
202 |             dest_path = gen_dir / Path(template_path).relative_to(self.template_dir)
203 |             self.gen_scripts[key] = dest_path
204 |             dest_path.parent.mkdir(exist_ok=True, parents=True)
205 |             with open(str(dest_path), "w") as f:
206 |                 print(f"Writing slurm script ({key}) to {dest_path}")
207 |                 f.write(gen)
208 |             dest_path.chmod(0o755)
209 | 
210 |     def _add_batch_resources(self, rules, allow_gpu=True):
211 |         resource_strs = []
212 |         if self.constraint_str:
213 |             resource_strs.append(f"#SBATCH --constraint={self.constraint_str}")
214 |         if self.gpus_per_task and allow_gpu:
215 |             resource_strs.append(f"#SBATCH --gres=gpu:{self.gpus_per_task}")
216 |         rules["sbatch"]["sbatch_resources"] = "\n".join(resource_strs)
217 | 
218 |     @beartype
219 |     def copy_to_snapshot_dir(self, code_snapshot_dir: Path):
220 |         src_files_to_copy = set()
221 |         for pattern in self.code_snapshot_filter_patterns:
222 |             if pattern == "symlinks":
223 |                 candidates = Path(".").glob("**/*")
224 |                 src_files = [x for x in candidates if x.is_symlink()]
225 |             else:
226 |                 src_files = list(Path(".").glob(pattern))
227 |             src_files_to_copy.update(src_files)
228 | 
229 |         print(f"Copying {len(src_files_to_copy)} src files to {code_snapshot_dir}")
230 |         for src_file in src_files_to_copy:
231 |             dest_path = code_snapshot_dir / src_file
232 |             dest_path.parent.mkdir(exist_ok=True, parents=True)
233 |             shutil.copyfile(str(src_file), str(dest_path), follow_symlinks=False)
234 | 
235 |     def get_log_paths(self):
236 |         watched_logs = []
237 |         for idx in range(self.job_array_size):
238 |             if self.recipe == "ray" and idx > 0:
239 |                 # for ray jobs, we only need to watch the log from the headnode
240 |                 break
241 |             slurm_id = idx + 1
242 |             watched_log = Path(str(self.log_path).replace("%4a", f"{slurm_id:04d}"))
243 |             watched_log.parent.mkdir(exist_ok=True, parents=True)
244 |             if self.refresh_logs:
245 |                 if watched_log.exists():
246 |                     watched_log.unlink()
247 |                 # We also remove Pygtail files
248 |                 pygtail_file = watched_log.with_suffix(".txt.offset")
249 |                 if pygtail_file.exists():
250 |                     pygtail_file.unlink()
251 |             # We must make sure that the log file exists to enable monitoring
252 |             if not watched_log.exists():
253 |                 print(f"Creating watch log: {watched_log} for the first time")
254 |                 watched_log.touch()
255 |             watched_logs.append(str(watched_log.resolve()))
256 |         return watched_logs
257 | 
258 |     @beartype
259 |     def submit(self, watch: bool = True, conserve_resources: int = 5):
260 |         if watch:
261 |             watched_logs = self.get_log_paths()
262 |         submission_cmd = f"bash {self.gen_scripts['master']}"
263 |         print(f"Submitting job with command: {submission_cmd}")
264 |         print(f"using command:\n{self.cmd}")
265 |         out = subprocess.check_output(submission_cmd.split())
266 |         job_id = out.decode("utf-8").rstrip()
267 | 
268 |         def halting_condition():
269 |             job_state = f"scontrol show job {job_id}"
270 |             out = subprocess.check_output(job_state.split())
271 |             regex = "JobState=[A-Z]+"
272 |             completed = True
273 |             for match in re.finditer(regex, out.decode("utf-8").rstrip()):
274 |                 status = match.group().replace("JobState=", "")
275 |                 if status != "COMPLETED":
276 |                     return False
277 |             return completed
278 | 
279 |         if watch:
280 |             Watcher(
281 |                 heartbeat=True,
282 |                 watched_logs=watched_logs,
283 |                 halting_condition=halting_condition,
284 |                 conserve_resources=conserve_resources,
285 |             ).run()
286 |             print("Job completed")
287 | 
288 |     def __repr__(self):
289 |         """Produce a human-readable string representation of the Yaspi object.
290 | 
291 |         Returns:
292 |             (str): a summary of the object settings.
293 | """ 294 | summary = "Yaspi object\n========================\n" 295 | kwargs = sorted(self.__dict__.items(), key=lambda x: len(str(x[0]) + str(x[1]))) 296 | for key, val in kwargs: 297 | summary += f"{key}: {val}\n" 298 | return summary 299 | 300 | @beartype 301 | def fill_template(self, template_path: Path, rules: dict) -> str: 302 | """Transform a template according to a given set of rules. 303 | 304 | Args: 305 | template_path: location of the template to be filled. 306 | rules (dict[str:object]): a key, value mapping between template keys 307 | and their target values. 308 | 309 | Returns: 310 | A single string represnting the transformed contents of the template 311 | file. 312 | """ 313 | generated = [] 314 | with open(template_path, "r") as f: 315 | template = f.read().splitlines() 316 | 317 | # A template key use to denote sbatch directives that can be removed 318 | # if no value is specified 319 | OR_DELETE_LINE = "|ordeleteline" 320 | 321 | for row in template: 322 | skip_row = False 323 | edits = [] 324 | regex = r"\{\{(.*?)\}\}" 325 | for match in re.finditer(regex, row): 326 | groups = match.groups() 327 | assert len(groups) == 1, "expected single group" 328 | key = groups[0] 329 | ordeleteline = False 330 | if key.endswith(OR_DELETE_LINE): 331 | ordeleteline = True 332 | key = key[:-len(OR_DELETE_LINE)] 333 | token = rules[key] 334 | if ordeleteline and token is None: 335 | skip_row = True 336 | break 337 | edits.append((match.span(), token)) 338 | if skip_row: 339 | continue 340 | if edits: 341 | # invert the spans 342 | spans = [(None, 0)] + [x[0] for x in edits] + [(len(row), None)] 343 | inverse_spans = [(x[1], y[0]) for x, y in zip(spans, spans[1:])] 344 | tokens = [row[start:stop] for start, stop in inverse_spans] 345 | urls = [str(x[1]) for x in edits] 346 | new_row = "" 347 | for token, url in zip_longest(tokens, urls, fillvalue=""): 348 | new_row += token + url 349 | row = new_row 350 | generated.append(row) 351 | return "\n".join(generated) 352 | 353 | 354 | def main(): 355 | parser = argparse.ArgumentParser(description="Yaspi Tool") 356 | parser.add_argument("--install_location", action="store_true", 357 | help="if given, report the install location of yaspi") 358 | parser.add_argument("--job_name", default="yaspi-test", 359 | help="the name that slurm will give to the job") 360 | parser.add_argument("--recipe", default="ray", 361 | help="the SLURM recipe to use to generate scripts") 362 | parser.add_argument("--template_dir", 363 | type=Path, 364 | help="if given, override directory containing SLURM templates") 365 | parser.add_argument("--partition", default=None, 366 | help="The name of the SLURM partition used to run the job") 367 | parser.add_argument("--time_limit", default=None, 368 | help="The maximum amount of time allowed to run the job") 369 | parser.add_argument("--gen_script_dir", default="data/slurm-gen-scripts", 370 | type=Path, 371 | help="directory in which generated slurm scripts will be stored") 372 | parser.add_argument("--cmd", default='echo "hello"', 373 | help="single command (or comma separated commands) to run") 374 | parser.add_argument("--mem", default=None, 375 | help="the memory to be requested for each SLURM worker") 376 | parser.add_argument("--prep", default="", help="a command to be run before srun") 377 | parser.add_argument("--job_array_size", type=int, default=2, 378 | help="The number of SLURM array workers") 379 | parser.add_argument("--cpus_per_task", type=int, default=None, 380 | help="the number of cpus requested for 
each SLURM task") 381 | parser.add_argument("--gpus_per_task", type=int, default=None, 382 | help="the number of gpus requested for each SLURM task") 383 | parser.add_argument("--throttle_array", type=int, default=0, 384 | help="limit the number of array workers running at once") 385 | parser.add_argument("--env_setup", help="setup string for a custom environment") 386 | parser.add_argument("--ssh_forward", 387 | default="ssh -N -f -R 8080:localhost:8080 triton.robots.ox.ac.uk", 388 | help="setup string for a custom environment") 389 | parser.add_argument("--log_dir", default="data/slurm-logs", type=str, 390 | help="location where SLURM logs will be stored") 391 | parser.add_argument("--use_custom_ray_tmp_dir", action="store_true") 392 | parser.add_argument("--refresh_logs", action="store_true") 393 | parser.add_argument("--watch", type=int, default=1, 394 | help="whether to watch the generated SLURM logs") 395 | parser.add_argument("--exclude", default="", 396 | help="comma separated list of nodes to exclude") 397 | parser.add_argument("--constraint_str", help="SLURM --constraint string") 398 | parser.add_argument("--job_queue", default="", 399 | help="a queue of jobs to pass to a yaspi recipe") 400 | parser.add_argument("--custom_directives", default="", 401 | help=('Add any extra directives here, separated by newlines' 402 | 'e.g. "#SBATCH -A account-name\n#SBATCH --mem 10G"')) 403 | parser.add_argument("--code_snapshot_dir", type=Path, 404 | help=("if this argument is supplied, yaspi will make a snapshot " 405 | "of the codebase (starting from the current root directory)," 406 | "copy the snapshot to `code_snapshot_dir`, and then launch" 407 | "the command from there. Currently, only supported for the" 408 | "'cpu-proc' and 'gpu-proc' recipes")) 409 | parser.add_argument("--code_snapshot_filter_patterns", nargs="+", 410 | default=["**/*.py", "symlinks"], 411 | help=("if `--code_snapshot_dir` is supplied, then " 412 | "`--code_snapshot_filter_patterns` is used as a glob pattern " 413 | "to select which files will be included in the snapshot. If " 414 | "`symlinks` is included as a filter pattern, it is treated " 415 | "as a special pattern that mimics symlinks in the original " 416 | "code dir")) 417 | args = parser.parse_args() 418 | 419 | if args.install_location: 420 | print(Path(__file__).parent) 421 | return 422 | 423 | # Certain properties use defaults set by the Yaspi class, rather than argparse, to 424 | # ensure that users of the Python interface (i.e. 
425 |     # can also benefit from these defaults
426 |     prop_keys = {"template_dir", "custom_directives"}
427 |     prop_kwargs = {key: getattr(args, key) for key in prop_keys if getattr(args, key)}
428 | 
429 |     job = Yaspi(
430 |         cmd=args.cmd,
431 |         mem=args.mem,
432 |         prep=args.prep,
433 |         recipe=args.recipe,
434 |         log_dir=args.log_dir,
435 |         exclude=args.exclude,
436 |         job_name=args.job_name,
437 |         job_queue=args.job_queue,
438 |         partition=args.partition,
439 |         time_limit=args.time_limit,
440 |         env_setup=args.env_setup,
441 |         ssh_forward=args.ssh_forward,
442 |         refresh_logs=args.refresh_logs,
443 |         cpus_per_task=args.cpus_per_task,
444 |         gpus_per_task=args.gpus_per_task,
445 |         gen_script_dir=args.gen_script_dir,
446 |         constraint_str=args.constraint_str,
447 |         job_array_size=args.job_array_size,
448 |         use_custom_ray_tmp_dir=args.use_custom_ray_tmp_dir,
449 |         throttle_array=args.throttle_array,
450 |         code_snapshot_dir=args.code_snapshot_dir,
451 |         code_snapshot_filter_patterns=args.code_snapshot_filter_patterns,
452 |         **prop_kwargs,
453 |     )
454 |     job.submit(watch=bool(args.watch))
455 | 
456 | 
457 | if __name__ == "__main__":
458 |     main()
459 | 
--------------------------------------------------------------------------------