├── requirements.txt
├── yaspi
│   ├── misc
│   │   ├── mnist.png
│   │   ├── cpu-proc.png
│   │   └── gpu-proc.png
│   ├── templates
│   │   ├── cpu-proc
│   │   │   ├── master.sh
│   │   │   └── template.sh
│   │   ├── gpu-proc
│   │   │   ├── master.sh
│   │   │   └── template.sh
│   │   └── ray
│   │       ├── start-ray-head-node.sh
│   │       ├── start-ray-worker-node.sh
│   │       ├── ray-master.sh
│   │       └── ray-sbatch.sh
│   └── yaspi.py
├── .gitignore
├── yaspi_test
│   ├── misc
│   │   ├── hello_world.py
│   │   └── dummy_yaspi_config.json
│   └── test_yaspi.py
├── examples
│   ├── mnist_hyperparams.json
│   ├── yaspi_settings.json
│   ├── minimal_ray_example.py
│   └── train_mnist.py
├── .github
│   └── workflows
│       └── ci-workflow.yml
├── setup.py
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | watchlogs
2 | beartype
--------------------------------------------------------------------------------
/yaspi/misc/mnist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albanie/yaspi/HEAD/yaspi/misc/mnist.png
--------------------------------------------------------------------------------
/yaspi/misc/cpu-proc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albanie/yaspi/HEAD/yaspi/misc/cpu-proc.png
--------------------------------------------------------------------------------
/yaspi/misc/gpu-proc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albanie/yaspi/HEAD/yaspi/misc/gpu-proc.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | data
3 | .vscode
4 | *__pycache__
5 | build
6 | dist
7 | *.egg-info
8 | .coverage
9 |
--------------------------------------------------------------------------------
/yaspi_test/misc/hello_world.py:
--------------------------------------------------------------------------------
1 | """Test module for yaspi.
2 | """
3 |
4 | if __name__ == "__main__":
5 |     print("Hello world")
--------------------------------------------------------------------------------
/yaspi/templates/cpu-proc/master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Launch the sbatch job
4 | job_id=$(sbatch --parsable {{sbatch_path}})
5 | echo $job_id
--------------------------------------------------------------------------------
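
Note: the `--parsable` flag makes `sbatch` print only the job id, which this master script echoes back to its caller. A minimal sketch of how that id can be consumed (this mirrors what `Yaspi.submit` in `yaspi/yaspi.py` does; the script path assumes the default `gen_script_dir`):

```python
import subprocess

# run the generated master script and capture the job id it echoes
out = subprocess.check_output(["bash", "data/slurm-gen-scripts/cpu-proc/master.sh"])
job_id = out.decode("utf-8").rstrip()

# poll the scheduler for the job state (Yaspi.submit does this in a loop
# via its halting_condition helper)
state = subprocess.check_output(["scontrol", "show", "job", job_id]).decode("utf-8")
print(f"job {job_id} completed: {'JobState=COMPLETED' in state}")
```
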
/yaspi/templates/gpu-proc/master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Launch the sbatch job
4 | job_id=$(sbatch --parsable {{sbatch_path}})
5 | echo $job_id
--------------------------------------------------------------------------------
/examples/mnist_hyperparams.json:
--------------------------------------------------------------------------------
1 | [
2 |     {"lr": 0.1, "gamma": 0.7},
3 |     {"lr": 0.01, "gamma": 0.7},
4 |     {"lr": 0.001, "gamma": 0.8}
5 | ]
--------------------------------------------------------------------------------
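
Note: each entry in this list configures one SLURM array worker. A short sketch of how `examples/train_mnist.py` (included later in this listing) flattens the entries into the quoted flag strings that form the yaspi `job_queue`:

```python
import json

# flatten each hyperparameter dict into a flag string, then quote each string
# so that every array worker receives exactly one entry
with open("examples/mnist_hyperparams.json", "r") as f:
    hyperparams = json.load(f)
exp_flags = [" ".join(f"--{key} {val}" for key, val in exp.items()) for exp in hyperparams]
job_queue = " ".join(f'"{flags}"' for flags in exp_flags)
print(job_queue)  # "--lr 0.1 --gamma 0.7" "--lr 0.01 --gamma 0.7" "--lr 0.001 --gamma 0.8"
```
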
/yaspi/templates/ray/start-ray-head-node.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script is responsible for initialising the ray head node on whichever
4 | # machine slurm has assigned to it
5 |
6 | # Setup the environment for Ray
7 | echo "setting up environment for ray head node"
8 | {{env_setup}}
9 |
10 | echo "starting ray head node"
11 | # Launch the head node
12 | ray start --head --redis-port=6379 --include-webui {{ray_args}}
13 | echo "started ray head node"
14 |
15 | # Prevent the slurm scheduler from releasing the machine
16 | sleep infinity
17 | echo "Ray head node was stopped"
--------------------------------------------------------------------------------
/yaspi/templates/ray/start-ray-worker-node.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script is responsible for initialising a ray worker node on whichever
4 | # machine slurm has assigned to it
5 |
6 | # parse the arguments
7 | PID=$$
8 | redis_address=$1
9 | worker_id=$2
10 |
11 | # Setup the environment for Ray
12 | {{env_setup}}
13 |
14 | # Launch the worker node
15 | cmd="ray start --redis-address=${redis_address} {{ray_args}}"
16 | echo "running cmd: ${cmd}"
17 | eval $cmd
18 |
19 | # Prevent the slurm scheduler from releasing the machine
20 | sleep infinity
21 | echo "Worker ${worker_id} stopped"
--------------------------------------------------------------------------------
/yaspi/templates/ray/ray-master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Generate a temporary text file to share state between the workers. This must be
4 | # somewhere in the home folder (i.e. not in $TMPDIR). Its sole purpose is to pass the
5 | # address of the head node to the worker nodes.
6 | TMPDIR="${HOME}/tmp/ray-scripts"
7 | mkdir -p "${TMPDIR}"
8 | tmpfile=$(mktemp "${TMPDIR}/ray-scheduler.XXXXXX")
9 | echo "created tmpfile at $tmpfile to share ray meta data"
10 |
11 | # sleep to ensure the temporary file has time to propagate over NFS
12 | sleep {{nfs_update_secs}}
13 |
14 | # Launch the ray job
15 | sbatch --export=all,tmpfile=$tmpfile {{ray_sbatch_path}}
--------------------------------------------------------------------------------
/examples/yaspi_settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe": "gpu-proc",
3 | "partition": "gpu",
4 | "time_limit": "24:00:00",
5 | "gen_script_dir": "data/slurm-gen-scripts",
6 | "mem": "24G",
7 | "gpus_per_task": 1,
8 | "cpus_per_task": 2,
9 | "throttle_array": 20,
10 | "ssh_forward": "",
11 | "log_dir": "data/slurm-logs",
12 | "use_custom_ray_tmp_dir": false,
13 | "refresh_logs": false,
14 | "exclude": "",
15 | "constraint_str": "",
16 | "prep": "",
17 | "env_setup": "export PYTHONPATH=\"${BASE}\":$PYTHONPATH; export PATH=\"${HOME}\"/local/miniconda3/condabin/:$PATH; source ~/local/miniconda3/etc/profile.d/conda.sh; conda activate py37"
18 | }
19 |
--------------------------------------------------------------------------------
/yaspi_test/misc/dummy_yaspi_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe": "cpu-proc",
3 | "partition": "compute",
4 | "time_limit": "12:00:00",
5 | "gen_script_dir": "data/slurm-gen-scripts",
6 | "mem": "32G",
7 | "cpus_per_task": 2,
8 | "gpus_per_task": 0,
9 | "throttle_array": 2,
10 | "ssh_forward": "",
11 | "log_dir": "data/slurm-logs",
12 | "use_custom_ray_tmp_dir": false,
13 | "refresh_logs": false,
14 | "exclude": "",
15 | "constraint_str": "",
16 | "prep": "",
17 | "custom_directives": "#SBATCH --comment \"a harmless comment\"\n#SBATCH --mail-type=END,FAIL\n#SBATCH --mail-user=username@email.com",
18 | "env_setup": "export PYTHONPATH=\"${BASE}\":$PYTHONPATH;"
19 | }
20 |
--------------------------------------------------------------------------------
/yaspi/templates/cpu-proc/template.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name={{job_name}}
3 | #SBATCH --mem={{mem|ordeleteline}}
4 | #SBATCH --array={{array}}
5 | #SBATCH --time={{time_limit|ordeleteline}}
6 | #SBATCH --output={{log_path}}
7 | #SBATCH --partition={{partition|ordeleteline}}
8 | #SBATCH --cpus-per-task={{cpus_per_task|ordeleteline}}
9 | {{sbatch_resources}}
10 | {{exclude_nodes}}
11 | {{custom_directives}}
12 | # -------------------------------
13 | 
14 | # enable terminal stdout logging
15 | echo "linking job logs to terminal"
16 | echo "=================================================================="
17 | 
18 | {{env_setup}}
19 | 
20 | # Run the loop of runs for this task.
21 | worker_id=$((SLURM_ARRAY_TASK_ID - 1))
22 | echo "($HOSTNAME) This is SLURM task $SLURM_ARRAY_TASK_ID, worker id $worker_id"
23 | 
24 | # handle potential ipython issues with history
25 | export IPYTHONDIR=/tmp
26 | 
27 | prep="{{prep}}"
28 | echo "running prep cmd $prep"
29 | eval "${prep}"
30 | 
31 | cmd="{{cmd}}"
32 | cmd="srun ${cmd} --slurm --worker_id ${worker_id}"
33 | echo "running cmd $cmd"
34 | eval "${cmd}"
35 | 
--------------------------------------------------------------------------------
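
Note: the `{{...}}` placeholders above are filled by `Yaspi.fill_template` (defined in `yaspi/yaspi.py` at the end of this listing); a `|ordeleteline` suffix drops the whole line when the corresponding value is `None`. A simplified sketch of that behaviour, assuming one placeholder per line:

```python
import re

# simplified version of the fill_template substitution rules: {{key}} is
# replaced by rules[key]; {{key|ordeleteline}} deletes the line entirely
# when rules[key] is None
rules = {"job_name": "example", "mem": None}
template = ["#SBATCH --job-name={{job_name}}", "#SBATCH --mem={{mem|ordeleteline}}"]
generated = []
for row in template:
    match = re.search(r"\{\{(.*?)\}\}", row)
    key, _, modifier = match.group(1).partition("|")
    if modifier == "ordeleteline" and rules[key] is None:
        continue  # drop the whole sbatch directive
    generated.append(row[:match.start()] + str(rules[key]) + row[match.end():])
print("\n".join(generated))  # only the --job-name directive survives
```
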
/.github/workflows/ci-workflow.yml:
--------------------------------------------------------------------------------
1 | name: yaspi-ci
2 |
3 | on: [push]
4 |
5 | jobs:
6 |   build:
7 | 
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       matrix:
11 |         python-version: [3.7]
12 | 
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python ${{ matrix.python-version }}
16 |       uses: actions/setup-python@v1
17 |       with:
18 |         python-version: ${{ matrix.python-version }}
19 |     - name: Install dependencies
20 |       run: |
21 |         python -m pip install --upgrade pip
22 |         pip install -r requirements.txt
23 |     - name: Lint with flake8
24 |       run: |
25 |         pip install flake8
26 |         # stop the build if there are Python syntax errors or undefined names
27 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
28 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
29 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
30 | 
31 |     - name: Test with pytest
32 |       run: |
33 |         pip install pytest coverage
34 |         coverage run --source=. -m pytest
--------------------------------------------------------------------------------
/examples/minimal_ray_example.py:
--------------------------------------------------------------------------------
1 | """A minimal working example for Ray usage with yaspi.
2 |
3 | See the official documentation for examples and tutorials:
4 | https://ray.readthedocs.io/en/latest/
5 | """
6 | import ray
7 | import time
8 | import argparse
9 | from datetime import datetime
10 |
11 |
12 | @ray.remote
13 | def remote_function():
14 |     time.sleep(5)
15 |     return datetime.now()
16 | 
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser(description="Minimal Ray example")
20 |     parser.add_argument("--local_mode", action="store_true", help="run on local machine")
21 |     parser.add_argument("--ray_address", help="address of the ray head node")
22 |     args = parser.parse_args()
23 | 
24 |     # initialise the server
25 |     ray.init(
26 |         address=args.ray_address,
27 |         local_mode=args.local_mode,
28 |         ignore_reinit_error=True,
29 |     )
30 | 
31 |     # execute functions
32 |     timestamps = [remote_function.remote() for _ in range(4)]
33 |     for worker_timestamp in timestamps:
34 |         print(f"timestamp from worker: {ray.get(worker_timestamp)}")
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 |
--------------------------------------------------------------------------------
/yaspi/templates/gpu-proc/template.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name={{job_name}}
3 | #SBATCH --mem={{mem|ordeleteline}}
4 | #SBATCH --array={{array}}
5 | #SBATCH --time={{time_limit|ordeleteline}}
6 | #SBATCH --output={{log_path}}
7 | #SBATCH --partition={{partition|ordeleteline}}
8 | #SBATCH --cpus-per-task={{cpus_per_task|ordeleteline}}
9 | {{sbatch_resources}}
10 | {{exclude_nodes}}
11 | {{custom_directives}}
12 | # -------------------------------
13 | 
14 | # enable terminal stdout logging
15 | echo "linking job logs to terminal"
16 | echo "=================================================================="
17 | 
18 | {{env_setup}}
19 | 
20 | # Run the loop of runs for this task.
21 | worker_id=$((SLURM_ARRAY_TASK_ID - 1))
22 | echo "($HOSTNAME) This is SLURM task $SLURM_ARRAY_TASK_ID, worker id $worker_id"
23 | declare -a custom_args_queue=({{job_queue}})
24 | 
25 | # handle potential ipython issues with history
26 | export IPYTHONDIR=/tmp
27 | 
28 | prep="{{prep}}"
29 | echo "running prep cmd $prep"
30 | eval "${prep}"
31 | 
32 | cmd="{{cmd}}"
33 | cmd="srun --unbuffered ${cmd} ${custom_args_queue[${worker_id}]}"
34 | echo "running cmd $cmd"
35 | eval "${cmd}"
36 | 
--------------------------------------------------------------------------------
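
Note: `{{job_queue}}` expands into the `custom_args_queue` bash array above, so array worker `i` (zero-indexed after the `SLURM_ARRAY_TASK_ID` shift) picks up the `i`-th quoted string as its extra flags. A sketch of the mapping, using `shlex` to mimic bash word-splitting (the flag strings are illustrative):

```python
import shlex

# mimic the bash array expansion: each quoted string in the job_queue becomes
# one array element, indexed by the zero-based worker id
job_queue = '"flags for worker 0" "flags for worker 1"'
custom_args_queue = shlex.split(job_queue)
for worker_id, extra_flags in enumerate(custom_args_queue):
    print(f"worker {worker_id}: srun --unbuffered <cmd> {extra_flags}")
```
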
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Yaspi setup.py
3 |
4 | Build/upload commands:
5 |     coverage run -m pytest --capture=tee-sys yaspi_test
6 |     python3 setup.py sdist bdist_wheel
7 |     twine upload --skip-existing dist/*
8 | """
9 | from pathlib import Path
10 | 
11 | import setuptools
12 | 
13 | with open("README.md", "r") as f:
14 |     long_description = f.read()
15 | 
16 | 
17 | # Ensure that extra data (example scripts and recipe templates) are included
18 | package_dir = "yaspi"
19 | extra_package_patterns = ["misc/*.py", "templates/**/*.sh"]
20 | extra_package_files = []
21 | for pattern in extra_package_patterns:
22 |     paths = Path(package_dir).glob(pattern)
23 |     rel_paths = [str(x.relative_to(package_dir)) for x in paths]
24 |     extra_package_files.extend(rel_paths)
25 | 
26 | 
27 | setuptools.setup(
28 |     name="yaspi",
29 |     version="0.0.8",
30 |     entry_points={
31 |         "console_scripts": [
32 |             "yaspi=yaspi.yaspi:main",
33 |         ],
34 |     },
35 |     author="Samuel Albanie",
36 |     description="Yet Another Slurm Python Interface",
37 |     long_description=long_description,
38 |     long_description_content_type="text/markdown",
39 |     url="https://github.com/albanie/yaspi",
40 |     packages=["yaspi"],
41 |     package_dir={"yaspi": package_dir},
42 |     package_data={"yaspi": extra_package_files},
43 |     install_requires=[
44 |         "watchlogs",
45 |         "beartype>=0.7.1",
46 |     ],
47 |     python_requires=">=3.7",
48 |     classifiers=[
49 |         "Programming Language :: Python :: 3.7",
50 |         "License :: OSI Approved :: MIT License",
51 |         "Operating System :: POSIX :: Linux",
52 |     ],
53 | )
54 |
--------------------------------------------------------------------------------
/yaspi/templates/ray/ray-sbatch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name={{job_name}}
3 | #SBATCH --mem={{mem|ordeleteline}}
4 | #SBATCH --array={{array}}
5 | #SBATCH --time={{time_limit|ordeleteline}}
6 | #SBATCH --output={{log_path}}
7 | #SBATCH --partition={{partition|ordeleteline}}
8 | #SBATCH --cpus-per-task={{cpus_per_task|ordeleteline}}
9 | {{sbatch_resources}}
10 | {{exclude_nodes}}
11 | # -------------------------------
12 |
13 | # This script is a modification of the implementation suggested by gregSchwartz18 here:
14 | # https://github.com/ray-project/ray/issues/826#issuecomment-522116599
15 | 
16 | worker_id=$((SLURM_ARRAY_TASK_ID - 1))
17 | echo "($HOSTNAME) This is SLURM job: $SLURM_ARRAY_JOB_ID, worker id $worker_id"
18 | 
19 | # define a length of time (in seconds) for workers to wait while either the head node
20 | # (or other workers) initialise ray servers
21 | approx_ray_init_time_in_secs={{approx_ray_init_time_in_secs}}
22 | 
23 | # The first array worker is responsible for managing the head-node. All remaining array
24 | # members will be used as ray workers.
25 | if [ $worker_id -eq 0 ]; then
26 | 
27 |     # find the ip address of the machine that will be the head node and write it to disk
28 |     # so that the other workers can connect to it
29 |     ip_prefix=$(srun --ntasks=1 hostname --ip-address)
30 |     echo "ip_prefix: ${ip_prefix}"
31 | 
32 |     # We will run the head node on the standard Redis port - we also write this
33 |     # information to disk so that it can be accessed by the workers
34 |     suffix=':6379'
35 |     ip_head=$ip_prefix$suffix
36 |     echo "Writing values to ${tmpfile}"
37 |     echo $ip_head >> $tmpfile
38 | 
39 |     # run the head-node initialisation script
40 |     head_init_script={{head_init_script}}
41 |     srun -u --export=ALL --ntasks=1 "${head_init_script}" &
42 |     echo "launched ${head_init_script}"
43 | else
44 |     # For each non-head worker, we first sleep to allow the head worker to write its
45 |     # details to disk
46 |     sleep ${approx_ray_init_time_in_secs}
47 | 
48 |     echo "====================================================="
49 |     echo "running ray worker ${worker_id}"
50 |     echo "====================================================="
51 |     echo "reading head node information from: ${tmpfile}"
52 |     readarray -t head_node_config < ${tmpfile}
53 |     ip_head=${head_node_config[0]}
54 |     echo "attaching to head node ip_head: ${ip_head}"
55 | 
56 |     worker_init_script={{worker_init_script}}
57 |     srun -u --export=ALL --ntasks=1 ${worker_init_script} $ip_head $worker_id
58 |     echo "launched ${worker_init_script}"
59 | fi
60 | 
61 | if [ $worker_id -eq 0 ]; then
62 |     {{env_setup}}
63 |     {{ssh_forward}}
64 |     cmd="{{cmd}}"
65 |     echo "Launching ${cmd} on head node in ${approx_ray_init_time_in_secs} secs"
66 |     sleep ${approx_ray_init_time_in_secs}
67 |     {{cmd}} --ray_address $ip_head
68 |     echo "cancelling $SLURM_ARRAY_JOB_ID"
69 |     scancel $SLURM_ARRAY_JOB_ID
70 | fi
71 |
--------------------------------------------------------------------------------
/yaspi_test/test_yaspi.py:
--------------------------------------------------------------------------------
1 | """Minimal tests to validate syntax.
2 |
3 | It would be possible to CI test SLURM launches by adding docker + slurmd to the
4 | github workflow, but github doesn't give me enough free testing minutes for that :)
5 | As a workaround, tests that involve slurm submissions are only run for known
6 | hostnames.
7 | """
8 |
9 | import json
10 | import socket
11 | from pathlib import Path
12 | from yaspi.yaspi import Yaspi
13 |
14 |
15 | PATH_ARGS = {"gen_script_dir", "log_dir", "template_dir"}
16 | HOSTS_WITH_SLURM = ["login1.triton.cluster"]
17 |
18 |
19 | def test_yaspi_object_creation():
20 |     with open("yaspi_test/misc/dummy_yaspi_config.json", "r") as f:
21 |         yaspi_defaults = json.load(f)
22 |     for key, val in yaspi_defaults.items():
23 |         if key in PATH_ARGS:
24 |             yaspi_defaults[key] = Path(val)
25 |     cmd = "python yaspi_test/misc/hello_world.py"
26 |     job_name = "test_yaspi"
27 |     job_queue = ""
28 |     yaspi = Yaspi(
29 |         cmd=cmd,
30 |         job_name=job_name,
31 |         job_queue=job_queue,
32 |         job_array_size=1,
33 |         **yaspi_defaults,
34 |     )
35 |     print(f"Test yaspi object: {yaspi}")
36 |     if socket.gethostname() in HOSTS_WITH_SLURM:
37 |         yaspi.submit()
38 | 
39 | 
40 | def test_yaspi_object_creation_with_code_snapshot_dir():
41 |     with open("yaspi_test/misc/dummy_yaspi_config.json", "r") as f:
42 |         yaspi_defaults = json.load(f)
43 |     for key, val in yaspi_defaults.items():
44 |         if key in PATH_ARGS:
45 |             yaspi_defaults[key] = Path(val)
46 |     cmd = "python yaspi_test/misc/hello_world.py"
47 |     yaspi_defaults["code_snapshot_dir"] = Path("data/code_snapshot_dir")
48 |     job_name = "test_yaspi"
49 |     job_queue = ""
50 |     yaspi = Yaspi(
51 |         cmd=cmd,
52 |         job_name=job_name,
53 |         job_queue=job_queue,
54 |         job_array_size=1,
55 |         **yaspi_defaults,
56 |     )
57 |     print(f"Test yaspi object: {yaspi}")
58 |     if socket.gethostname() in HOSTS_WITH_SLURM:
59 |         yaspi.submit()
60 | 
61 | 
62 | def test_yaspi_object_line_deletion():
63 |     with open("yaspi_test/misc/dummy_yaspi_config.json", "r") as f:
64 |         yaspi_defaults = json.load(f)
65 |     for key, val in yaspi_defaults.items():
66 |         if key in PATH_ARGS:
67 |             yaspi_defaults[key] = Path(val)
68 |     cmd = "python yaspi_test/misc/hello_world.py"
69 |     job_name = "test_yaspi"
70 |     job_queue = ""
71 | 
72 |     # Check that yaspi only includes sbatch directives for values that
73 |     # are not None when OR_DELETE_LINE is specified in the sbatch template.
74 |     # This test uses the "constraint_str" flag as an example of a directive
75 |     # that should be None by default
76 | 
77 |     # First, check that supplying a yaspi key-value pair ensures it is present
78 |     yaspi_defaults["constraint_str"] = "p40"
79 |     sbatch_directive = "#SBATCH --constraint"
80 |     yaspi = Yaspi(
81 |         cmd=cmd,
82 |         job_name=job_name,
83 |         job_queue=job_queue,
84 |         job_array_size=1,
85 |         **yaspi_defaults,
86 |     )
87 |     # Read the template that was written to disk
88 |     with open("data/slurm-gen-scripts/cpu-proc/template.sh", "r") as f:
89 |         template_contents = f.read()
90 |     assert sbatch_directive in template_contents, (
91 |         f"Expected to find {sbatch_directive} in template contents"
92 |     )
93 | 
94 |     # Check that supplying a None-valued yaspi key-value pair ensures it is not present
95 |     yaspi_defaults["constraint_str"] = None
96 |     yaspi = Yaspi(
97 |         cmd=cmd,
98 |         job_name=job_name,
99 |         job_queue=job_queue,
100 |         job_array_size=1,
101 |         **yaspi_defaults,
102 |     )
103 |     # Read the template that was written to disk
104 |     with open("data/slurm-gen-scripts/cpu-proc/template.sh", "r") as f:
105 |         template_contents = f.read()
106 |     assert sbatch_directive not in template_contents, (
107 |         f"Expected not to find {sbatch_directive} in template contents"
108 |     )
109 |     if socket.gethostname() in HOSTS_WITH_SLURM:
110 |         yaspi.submit()
111 | 
112 | 
113 | if __name__ == "__main__":
114 |     test_yaspi_object_creation()
115 |     test_yaspi_object_line_deletion()
116 |     test_yaspi_object_creation_with_code_snapshot_dir()
117 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### yaspi - yet another slurm python interface
2 |
3 | The goal of `yaspi` is to provide an interface for submitting [slurm](https://slurm.schedmd.com/documentation.html) jobs, thereby obviating the joys of sbatch files. It does so through `recipes` - these are collections of templates and rules for generating sbatch scripts.
4 |
5 | 
6 |
7 | ### Installation
8 |
9 | Install via `pip install yaspi`. If you prefer to hack around with the source code, it's a [single python file](yaspi/yaspi.py).
10 |
11 | It should be considered (highly) experimental.
12 |
13 | ### Implementation
14 |
15 | `yaspi` makes heavy use of slurm [job arrays](https://slurm.schedmd.com/job_array.html).
16 |
17 | ### Supported recipes:
18 |
19 | * `cpu-proc` - a generic recipe for submitting CPU jobs via a job array.
20 | * `gpu-proc` - a generic recipe for submitting GPU jobs via a job array.
21 | * `ray` - job submissions for the [ray scheduler](https://github.com/ray-project/ray).
22 |
23 | ### Dependencies
24 |
25 | * `Python >= 3.7`
26 | * `watchlogs`
27 |
28 | ### Requirements:
29 |
30 | `yaspi` has been tested on CentOS Linux release 7.7.1908 with slurm 18.08.7 and Python 3.7. YMMV on other platforms.
31 |
32 | ### Usage and outputs
33 |
34 | `yaspi` can be used either from the command-line or directly from a python program. Command-line usage is shown in the following examples (the effect of each argument is documented in the [implementation](yaspi/yaspi.py)).
35 |
36 | **Code - scheduling a slurm job array with CPUs**:
37 | ```
38 | prep_command='echo \"optional preparation cmd\"'
39 | command='echo \"I am running on a CPU node\"'
40 | yaspi --job_name=example \
41 |       --job_array_size=2 \
42 |       --cpus_per_task=5 \
43 |       --cmd="$command" \
44 |       --prep="$prep_command" \
45 |       --recipe=cpu-proc \
46 |       --mem=10G
47 | ```
48 | **Effect**: This will run the `command` value on two workers as part of a slurm [job array](https://slurm.schedmd.com/job_array.html). Each worker will be allocated 5 CPUs and 10G of memory by the scheduler. Each worker will also be passed two extra flags: `--slurm` (without options) and `--worker_id` (which is given the 0-indexed position of the worker in the job array), which can be used to assign tasks to the worker. The `--prep` flag is optional, and will run a command prior to the main job (e.g. to change into an appropriate code directory). The effect of the command will be to produce the following:
49 |
50 | ```
51 | # run on CPU job array worker 0
52 | optional preparation cmd
53 | I am running on a CPU node --slurm --worker_id 0
54 |
55 | # run on CPU job array worker 1
56 | optional preparation cmd
57 | I am running on a CPU node --slurm --worker_id 1
58 | ```
59 |
60 | When launched, a slightly more verbose (and colourized) output will be produced by [watchlogs](https://github.com/albanie/watchlogs) (this assumes your terminal supports color sequences):
61 |
62 | 
63 |
64 |
65 | **Code - scheduling a slurm job array with GPUs**:
66 | ```
67 | prep_command='echo \"optional preparation cmd\"'
68 | job_queue="\"flags for worker 0\" \"flags for worker 1\""
69 | command='echo \"I am running on a GPU node\"'
70 | yaspi --job_name=example \
71 |       --job_array_size=2 \
72 |       --cpus_per_task=5 \
73 |       --gpus_per_task=1 \
74 |       --prep="$prep_command" \
75 |       --cmd="$command" \
76 |       --recipe=gpu-proc \
77 |       --job_queue="$job_queue" \
78 |       --mem=10G
79 | ```
80 | **Effect**: This command is similar to the `cpu-proc` recipe described above. Again, the `command` will be run on two workers as part of a slurm [job array](https://slurm.schedmd.com/job_array.html). Each worker will be allocated 5 CPUs and 10G of memory by the scheduler, as well as one GPU. One further difference is that `gpu-proc` also takes a `job_queue` option that can be used to pass options to each GPU worker separately.
81 |
82 | 
83 |
84 | **Extras - custom directives**:
85 |
86 | The previous example can be extended with custom directives. For example, suppose
87 | you wish to add an extra directive to your `sbatch` of the form
88 | `#SBATCH --comment "a harmless comment goes here"`. You can do this as follows:
89 | ```
90 | prep_command='echo \"optional preparation cmd\"'
91 | job_queue="\"flags for worker 0\" \"flags for worker 1\""
92 | command='echo \"I am running on a GPU node\"'
93 | custom_directive='#SBATCH --comment "a harmless comment goes here"'
94 | yaspi --job_name=example \
95 |       --job_array_size=2 \
96 |       --cpus_per_task=5 \
97 |       --gpus_per_task=1 \
98 |       --prep="$prep_command" \
99 |       --cmd="$command" \
100 |       --recipe=gpu-proc \
101 |       --job_queue="$job_queue" \
102 |       --custom_directives="$custom_directive" \
103 |       --mem=10G
104 | ```
105 |
106 | Custom directives can also be added to json config. For example, to receive emails from slurm, add a `"custom_directives"` flag:
107 |
108 | ```
109 | {
110 |     ...
111 |     "custom_directives": "#SBATCH --mail-type=END,FAIL\n#SBATCH --mail-user=your_email_address",
112 | }
113 | ```
114 |
115 | **Code - scheduling a job with the [ray](https://ray.readthedocs.io/en/latest/index.html) framework:**
116 |
117 | ```
118 | yaspi_dir=$(yaspi --install_location)
119 | command="python $yaspi_dir/misc/minimal_ray_example.py"
120 | yaspi --job_name=example \
121 |       --cmd="$command" \
122 |       --job_array_size=3 \
123 |       --cpus_per_task=2 \
124 |       --gpus_per_task=1 \
125 |       --mem=10G \
126 |       --recipe=ray
127 | ```
128 |
129 | **Effect**: Scheduling jobs with the ray framework operates in a slightly different manner to the previous two examples (both of which assume [embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel) processing i.e. no communication between the workers). The ray recipe similarly launches a slurm job array, but assigns the worker at index 0 to be the *master*, and all other nodes to be *worker* nodes. The command is run *only* on the master node, which then uses ray to allocate tasks to the worker nodes. The command above will launch a slurm job, with the name "example", that: (1) initialises a ray head node and a set of 2 ray workers via a SLURM array job; (2) launches `$command` from the head node. It will produce an output similar to the following:
130 |
131 |
132 | ```
133 | started ray head node
134 | timestamp from worker: 2020-02-17 06:40:44.861733
135 | timestamp from worker: 2020-02-17 06:40:44.861793
136 | timestamp from worker: 2020-02-17 06:40:45.062484
137 | timestamp from worker: 2020-02-17 06:40:45.065494
138 | ```
139 |
140 | **Code - using yaspi directly from python**:
141 |
142 |
143 | An example for training multiple MNIST runs is given in [train_mnist.py](examples/train_mnist.py). Running this file should launch three jobs on SLURM, each with different hyperparameters, producing the output below:
144 |
145 | 
146 |
147 |
148 | **Modifying your code to use Yaspi**:
149 |
150 | To run an existing piece of code with yaspi requires two things:
151 | 1. A json file containing SLURM settings (e.g. these [yaspi_settings](examples/yaspi_settings.json)). This file sets the options that you would normally set in an SBATCH script (e.g. number of GPUs, total job duration etc.) together with any bash commands you would usually run to set up your job environment (these are supplied via the `"env_setup"` option)
152 | 2. A small block of logic somewhere in your script (visible for the MNIST example [here](https://github.com/albanie/yaspi/blob/master/examples/train_mnist.py#L120-L165)) which sets the job name and calls the Yaspi `submit()` function (a minimal sketch is given below, after this README).
153 |
154 | **Using code snapshot directories**:
155 |
156 | One downside of launching a yaspi job directly from a source code folder is that if you edit your code after submitting your jobs to slurm but before they have launched, the edits will affect the queued jobs. Since this is (typically) undesirable behaviour, you can supply extra flags to yaspi so that it copies the source code in your current folder to a new "snapshot" directory and launches from there. As a consequence, any local code changes you make after launching with yaspi will not affect the queued jobs. The flags to pass are:
157 | ```
158 | --code_snapshot_dir snapshot_dir # is the location where the snapshot of your code will be stored
159 | --code_snapshot_filter_patterns patterns # are a set of glob-patterns to determine which source code is copied
160 | ```
--------------------------------------------------------------------------------
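
Note: for reference, a minimal sketch of the "small block of logic" described above, following the pattern used in `yaspi_test/test_yaspi.py` and `examples/train_mnist.py` (the command, job name and seeds here are illustrative):

```python
import json
from yaspi.yaspi import Yaspi

# load SLURM settings from a json file (see examples/yaspi_settings.json)
with open("examples/yaspi_settings.json", "r") as f:
    yaspi_defaults = json.load(f)

job = Yaspi(
    cmd="python my_script.py",  # hypothetical entry point
    job_name="my-job",
    job_queue='"--seed 0" "--seed 1"',  # one quoted flag string per worker
    job_array_size=2,
    **yaspi_defaults,
)
job.submit(watch=True)  # stream worker logs until the jobs complete
```
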
/examples/train_mnist.py:
--------------------------------------------------------------------------------
1 | """Yaspi example:
2 | Aims to be a minimal modification to the PyTorch MNIST example given here:
3 | https://github.com/pytorch/examples/blob/master/mnist/main.py
4 |
5 | Example usage
6 | ---------------
7 |
8 | - Standard training:
9 |     python train_mnist.py
10 | 
11 | - Yaspi training:
12 |     python train_mnist.py --hyperparams mnist_hyperparams.json --yaspify
13 |     (this launches one run for each experiment config defined in mnist_hyperparams.json)
14 | """
15 | import argparse
16 | import torch
17 | import torch.nn as nn
18 | import torch.nn.functional as F
19 | import torch.optim as optim
20 | from torchvision import datasets, transforms
21 | from torch.optim.lr_scheduler import StepLR
22 | import sys
23 | import json
24 | from yaspi.yaspi import Yaspi
25 |
26 |
27 | class Net(nn.Module):
28 |     def __init__(self):
29 |         super(Net, self).__init__()
30 |         self.conv1 = nn.Conv2d(1, 32, 3, 1)
31 |         self.conv2 = nn.Conv2d(32, 64, 3, 1)
32 |         self.dropout1 = nn.Dropout(0.25)
33 |         self.dropout2 = nn.Dropout(0.5)
34 |         self.fc1 = nn.Linear(9216, 128)
35 |         self.fc2 = nn.Linear(128, 10)
36 | 
37 |     def forward(self, x):
38 |         x = self.conv1(x)
39 |         x = F.relu(x)
40 |         x = self.conv2(x)
41 |         x = F.relu(x)
42 |         x = F.max_pool2d(x, 2)
43 |         x = self.dropout1(x)
44 |         x = torch.flatten(x, 1)
45 |         x = self.fc1(x)
46 |         x = F.relu(x)
47 |         x = self.dropout2(x)
48 |         x = self.fc2(x)
49 |         output = F.log_softmax(x, dim=1)
50 |         return output
51 | 
52 | 
53 | def train(args, model, device, train_loader, optimizer, epoch):
54 |     model.train()
55 |     for batch_idx, (data, target) in enumerate(train_loader):
56 |         data, target = data.to(device), target.to(device)
57 |         optimizer.zero_grad()
58 |         output = model(data)
59 |         loss = F.nll_loss(output, target)
60 |         loss.backward()
61 |         optimizer.step()
62 |         if batch_idx % args.log_interval == 0:
63 |             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
64 |                 epoch, batch_idx * len(data), len(train_loader.dataset),
65 |                 100. * batch_idx / len(train_loader), loss.item()))
66 |             if args.dry_run:
67 |                 break
68 | 
69 | 
70 | def test(model, device, test_loader):
71 |     model.eval()
72 |     test_loss = 0
73 |     correct = 0
74 |     with torch.no_grad():
75 |         for data, target in test_loader:
76 |             data, target = data.to(device), target.to(device)
77 |             output = model(data)
78 |             test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
79 |             pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
80 |             correct += pred.eq(target.view_as(pred)).sum().item()
81 | 
82 |     test_loss /= len(test_loader.dataset)
83 | 
84 |     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
85 |         test_loss, correct, len(test_loader.dataset),
86 |         100. * correct / len(test_loader.dataset)))
87 | 
88 | 
89 | def main():
90 |     # Training settings
91 |     parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
92 |     parser.add_argument('--batch-size', type=int, default=64, metavar='N',
93 |                         help='input batch size for training (default: 64)')
94 |     parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
95 |                         help='input batch size for testing (default: 1000)')
96 |     parser.add_argument('--epochs', type=int, default=14, metavar='N',
97 |                         help='number of epochs to train (default: 14)')
98 |     parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
99 |                         help='learning rate (default: 1.0)')
100 |     parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
101 |                         help='Learning rate step gamma (default: 0.7)')
102 |     parser.add_argument('--no-cuda', action='store_true', default=False,
103 |                         help='disables CUDA training')
104 |     parser.add_argument('--dry-run', action='store_true', default=False,
105 |                         help='quickly check a single pass')
106 |     parser.add_argument('--seed', type=int, default=1, metavar='S',
107 |                         help='random seed (default: 1)')
108 |     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
109 |                         help='how many batches to wait before logging training status')
110 |     parser.add_argument('--save-model', action='store_true', default=False,
111 |                         help='For Saving the current Model')
112 | 
113 |     # Additional flags used by yaspi
114 |     parser.add_argument("--yaspify", action="store_true")
115 |     parser.add_argument("--yaspi_settings", default="yaspi_settings.json",
116 |                         help="file of SLURM specific options (e.g. number of GPUs)")
117 |     parser.add_argument("--hyperparams", default="mnist_hyperparams.json")
118 |     args = parser.parse_args()
119 | 
120 |     if args.yaspify:
121 |         # --------------------------------------------------------------------
122 |         # This section contains the logic for launching multiple runs
123 |         # --------------------------------------------------------------------
124 |         # The command that will be launched on each worker will be identical to the
125 |         # python command used to launch this script (including all flags), except:
126 |         # 1. The --yaspify flag will be removed
127 |         # 2. Flags from hyperparams will be inserted
128 |         # -------------------------------------------------------------------------
129 | 
130 |         # load the hyperparameters
131 |         with open(args.hyperparams, "r") as f:
132 |             hyperparams = json.load(f)
133 |         exp_flags = []
134 |         for exp in hyperparams:
135 |             exp_flags.append(" ".join([f"--{key} {val}" for key, val in exp.items()]))
136 | 
137 |         # Select a name for your jobs (this is what will be visible via the `sinfo`
138 |         # SLURM command)
139 |         num_jobs = len(exp_flags)
140 |         job_name = f"train-mnist-{num_jobs}-jobs"
141 | 
142 |         # Provide the arguments to each SLURM worker as space-separated quoted strings
143 |         job_queue = " ".join([f'"{flags}"' for flags in exp_flags])
144 | 
145 |         # remove the yaspify flag
146 |         cmd_args = sys.argv
147 |         cmd_args.remove("--yaspify")
148 | 
149 |         # construct the final command that will run each worker, together with job_queue
150 |         base_cmd = f"python {' '.join(cmd_args)}"
151 | 
152 |         # load SLURM specific settings
153 |         with open(args.yaspi_settings, "r") as f:
154 |             yaspi_defaults = json.load(f)
155 | 
156 |         # Launch the jobs over SLURM
157 |         job = Yaspi(
158 |             cmd=base_cmd,
159 |             job_queue=job_queue,
160 |             job_name=job_name,
161 |             job_array_size=num_jobs,
162 |             **yaspi_defaults,
163 |         )
164 |         # The `watch` argument keeps this process alive, streaming worker logs until the jobs complete
165 |         job.submit(watch=True, conserve_resources=5)
166 |     else:
167 |         # --------------------------------------------------------------------
168 |         # This section contains the original, unmodified code
169 |         # --------------------------------------------------------------------
170 |         use_cuda = not args.no_cuda and torch.cuda.is_available()
171 |         torch.manual_seed(args.seed)
172 | 
173 |         device = torch.device("cuda" if use_cuda else "cpu")
174 | 
175 |         train_kwargs = {'batch_size': args.batch_size}
176 |         test_kwargs = {'batch_size': args.test_batch_size}
177 |         if use_cuda:
178 |             cuda_kwargs = {'num_workers': 1,
179 |                            'pin_memory': True,
180 |                            'shuffle': True}
181 |             train_kwargs.update(cuda_kwargs)
182 |             test_kwargs.update(cuda_kwargs)
183 | 
184 |         transform = transforms.Compose([
185 |             transforms.ToTensor(),
186 |             transforms.Normalize((0.1307,), (0.3081,))
187 |         ])
188 |         dataset1 = datasets.MNIST('../data', train=True, download=True,
189 |                                   transform=transform)
190 |         dataset2 = datasets.MNIST('../data', train=False,
191 |                                   transform=transform)
192 |         train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
193 |         test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
194 | 
195 |         model = Net().to(device)
196 |         optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
197 | 
198 |         scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
199 |         for epoch in range(1, args.epochs + 1):
200 |             train(args, model, device, train_loader, optimizer, epoch)
201 |             test(model, device, test_loader)
202 |             scheduler.step()
203 | 
204 |         if args.save_model:
205 |             torch.save(model.state_dict(), "mnist_cnn.pt")
206 | 
207 | if __name__ == '__main__':
208 |     main()
209 |
--------------------------------------------------------------------------------
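
Note: for concreteness, a sketch reconstructing the per-worker commands that the `--yaspify` branch above schedules (one per entry in `mnist_hyperparams.json`):

```python
import json

# rebuild the command each SLURM array worker ends up running: the original
# invocation (minus --yaspify) plus one hyperparameter flag string per worker
base_cmd = "python train_mnist.py --hyperparams mnist_hyperparams.json"
with open("examples/mnist_hyperparams.json", "r") as f:
    hyperparams = json.load(f)
for worker_id, exp in enumerate(hyperparams):
    flags = " ".join(f"--{key} {val}" for key, val in exp.items())
    print(f"worker {worker_id}: {base_cmd} {flags}")
```
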
/yaspi/yaspi.py:
--------------------------------------------------------------------------------
1 | """YASPI - yet another python slurm interface.
2 | """
3 |
4 | import re
5 | import shutil
6 | import argparse
7 | import subprocess
8 | from typing import List, Union
9 | from pathlib import Path
10 | from datetime import datetime
11 | from itertools import zip_longest
12 |
13 | from beartype import beartype
14 | from beartype.cave import NoneTypeOr
15 | from watchlogs.watchlogs import Watcher
16 |
17 |
18 | class Yaspi:
19 |
20 |     @beartype
21 |     def __init__(
22 |             self,
23 |             job_name: str,
24 |             cmd: str,
25 |             prep: str,
26 |             recipe: str,
27 |             gen_script_dir: Union[Path, str],
28 |             log_dir: Union[Path, str],
29 |             partition: NoneTypeOr[str],
30 |             job_array_size: int,
31 |             cpus_per_task: NoneTypeOr[int],
32 |             gpus_per_task: NoneTypeOr[int],
33 |             refresh_logs: bool,
34 |             exclude: str,
35 |             use_custom_ray_tmp_dir: bool,
36 |             ssh_forward: str,
37 |             time_limit: NoneTypeOr[str],
38 |             throttle_array: int,
39 |             mem: str,
40 |             constraint_str: NoneTypeOr[str],
41 |             custom_directives: str = "",
42 |             template_dir: Path = Path(__file__).parent / "templates",
43 |             job_queue: NoneTypeOr[str] = None,
44 |             env_setup: NoneTypeOr[str] = None,
45 |             code_snapshot_dir: NoneTypeOr[Path, str] = None,
46 |             code_snapshot_filter_patterns: List[str] = ["**/*.py", "symlinks"],
47 |     ):
48 |         self.cmd = cmd
49 |         self.mem = mem
50 |         self.prep = prep
51 |         self.recipe = recipe
52 |         self.exclude = exclude
53 |         self.job_name = job_name
54 |         self.partition = partition
55 |         self.time_limit = time_limit
56 |         self.env_setup = env_setup
57 |         self.job_queue = job_queue
58 |         self.ssh_forward = ssh_forward
59 |         self.refresh_logs = refresh_logs
60 |         self.template_dir = Path(template_dir)
61 |         self.cpus_per_task = cpus_per_task
62 |         self.gpus_per_task = gpus_per_task
63 |         self.constraint_str = constraint_str
64 |         self.throttle_array = throttle_array
65 |         self.job_array_size = job_array_size
66 |         self.use_custom_ray_tmp_dir = use_custom_ray_tmp_dir
67 |         self.custom_directives = custom_directives
68 |         self.gen_script_dir = Path(gen_script_dir)
69 |         if code_snapshot_dir is not None:
70 |             self.code_snapshot_dir = Path(code_snapshot_dir).resolve()
71 |         else:
72 |             self.code_snapshot_dir = None
73 |         self.code_snapshot_filter_patterns = code_snapshot_filter_patterns
74 |         self.slurm_logs = None
75 |         # SLURM expects the logfiles to be absolute paths
76 |         self.log_dir = Path(log_dir).resolve()
77 |         self.generate_scripts()
78 | 
79 |     def generate_scripts(self):
80 |         gen_dir = self.gen_script_dir
81 |         if self.env_setup is None:
82 |             self.env_setup = (
83 |                 'export PYTHONPATH="${BASE}":$PYTHONPATH\n'
84 |                 'export PATH="${HOME}/local/anaconda3/condabin/:$PATH"\n'
85 |                 'source ~/local/anaconda3/etc/profile.d/conda.sh\n'
86 |                 'conda activate pt37'
87 |             )
88 | 
89 |         # set up logging
90 |         ts = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S")
91 |         self.log_path = str(Path(self.log_dir) / self.job_name / ts / "%4a-log.txt")
92 | 
93 |         if self.code_snapshot_dir is not None:
94 |             supported_recipes = {"cpu-proc", "gpu-proc"}
95 |             assert self.recipe in supported_recipes, (
96 |                 f"For now, `code_snapshot_dir` is only supported for {supported_recipes}"
97 |                 f" ({self.recipe} is not yet supported)"
98 |             )
99 |             code_snapshot_dir = self.code_snapshot_dir / self.job_name / ts
100 |             self.copy_to_snapshot_dir(code_snapshot_dir=code_snapshot_dir)
101 |             # modify the srun command to first move to the code snapshot directory before
102 |             # the user command is launched
103 |             self.cmd = f"cd {code_snapshot_dir} ; {self.cmd}"
104 | 
105 |         if self.recipe == "ray":
106 |             # TODO(Samuel): configure this more sensibly
107 |             template_paths = {
108 |                 "master": "ray/ray-master.sh",
109 |                 "sbatch": "ray/ray-sbatch.sh",
110 |                 "head-node": "ray/start-ray-head-node.sh",
111 |                 "worker-node": "ray/start-ray-worker-node.sh",
112 |             }
113 |             # NOTE: Due to unix max socket length (108 characters) it's best if this is
114 |             # short and absolute
115 |             if self.use_custom_ray_tmp_dir:
116 |                 ray_tmp_dir = Path.home() / "data/sock"
117 |                 ray_tmp_dir.mkdir(exist_ok=True, parents=True)
118 |                 ray_args = f"--temp-dir={ray_tmp_dir}"
119 |             else:
120 |                 ray_args = ""
121 |             array_str = f"1-{self.job_array_size}"
122 |             if self.throttle_array:
123 |                 array_str = f"{array_str}%{self.throttle_array}"
124 |             rules = {
125 |                 "master": {
126 |                     "nfs_update_secs": 1,
127 |                     "ray_sbatch_path": str(gen_dir / template_paths["sbatch"]),
128 |                 },
129 |                 "sbatch": {
130 |                     "cmd": self.cmd,
131 |                     "mem": self.mem,
132 |                     "log_path": self.log_path,
133 |                     "job_name": self.job_name,
134 |                     "partition": self.partition,
135 |                     "time_limit": self.time_limit,
136 |                     "env_setup": self.env_setup,
137 |                     "array": array_str,
138 |                     "cpus_per_task": self.cpus_per_task,
139 |                     "approx_ray_init_time_in_secs": 10,
140 |                     "exclude_nodes": f"#SBATCH --exclude={self.exclude}",
141 |                     "head_init_script": str(gen_dir / template_paths["head-node"]),
142 |                     "worker_init_script": str(gen_dir / template_paths["worker-node"]),
143 |                     "ssh_forward": self.ssh_forward,
144 |                 },
145 |                 "head-node": {
146 |                     "ray_args": ray_args,
147 |                     "env_setup": self.env_setup,
148 |                 },
149 |                 "worker-node": {
150 |                     "ray_args": ray_args,
151 |                     "env_setup": self.env_setup,
152 |                 },
153 |             }
154 |             self._add_batch_resources(rules)
155 |         elif self.recipe in {"cpu-proc", "gpu-proc"}:
156 |             if self.env_setup is None:
157 |                 # TODO(Samuel): configure this more sensibly
158 |                 self.env_setup = (
159 |                     'export PYTHONPATH="${BASE}":$PYTHONPATH\n'
160 |                     'export PATH="${HOME}/local/anaconda3/condabin/:$PATH"\n'
161 |                     'source ~/local/anaconda3/etc/profile.d/conda.sh\n'
162 |                     'conda activate pt14'
163 |                 )
164 |             template_paths = {
165 |                 "master": f"{self.recipe}/master.sh",
166 |                 "sbatch": f"{self.recipe}/template.sh",
167 |             }
168 |             array_str = f"1-{self.job_array_size}"
169 |             if self.throttle_array:
170 |                 array_str = f"{array_str}%{self.throttle_array}"
171 |             rules = {
172 |                 "master": {
173 |                     "sbatch_path": str(gen_dir / template_paths["sbatch"]),
174 |                 },
175 |                 "sbatch": {
176 |                     "cmd": self.cmd,
177 |                     "mem": self.mem,
178 |                     "prep": self.prep,
179 |                     "array": array_str,
180 |                     "log_path": self.log_path,
181 |                     "job_name": self.job_name,
182 |                     "job_queue": self.job_queue,
183 |                     "env_setup": self.env_setup,
184 |                     "partition": self.partition,
185 |                     "time_limit": self.time_limit,
186 |                     "cpus_per_task": self.cpus_per_task,
187 |                     "exclude_nodes": f"#SBATCH --exclude={self.exclude}",
188 |                     "custom_directives": self.custom_directives,
189 |                     "sbatch_resources": None,
190 |                 },
191 |             }
192 |             self._add_batch_resources(rules, self.recipe == "gpu-proc")
193 |         else:
194 |             raise ValueError(f"template: {self.recipe} unrecognised")
195 | 
196 |         template_paths = {key: Path(self.template_dir) / val
197 |                           for key, val in template_paths.items()}
198 | 
199 |         self.gen_scripts = {}
200 |         for key, template_path in template_paths.items():
201 |             gen = self.fill_template(template_path=template_path, rules=rules[key])
202 |             dest_path = gen_dir / Path(template_path).relative_to(self.template_dir)
203 |             self.gen_scripts[key] = dest_path
204 |             dest_path.parent.mkdir(exist_ok=True, parents=True)
205 |             with open(str(dest_path), "w") as f:
206 |                 print(f"Writing slurm script ({key}) to {dest_path}")
207 |                 f.write(gen)
208 |             dest_path.chmod(0o755)
209 | 
210 |     def _add_batch_resources(self, rules, allow_gpu=True):
211 |         resource_strs = []
212 |         if self.constraint_str:
213 |             resource_strs.append(f"#SBATCH --constraint={self.constraint_str}")
214 |         if self.gpus_per_task and allow_gpu:
215 |             resource_strs.append(f"#SBATCH --gres=gpu:{self.gpus_per_task}")
216 |         rules["sbatch"]["sbatch_resources"] = "\n".join(resource_strs)
217 | 
218 |     @beartype
219 |     def copy_to_snapshot_dir(self, code_snapshot_dir: Path):
220 |         src_files_to_copy = set()
221 |         for pattern in self.code_snapshot_filter_patterns:
222 |             if pattern == "symlinks":
223 |                 candidates = Path(".").glob("**/*")
224 |                 src_files = [x for x in candidates if x.is_symlink()]
225 |             else:
226 |                 src_files = list(Path(".").glob(pattern))
227 |             src_files_to_copy.update(src_files)
228 | 
229 |         print(f"Copying {len(src_files_to_copy)} src files to {code_snapshot_dir}")
230 |         for src_file in src_files_to_copy:
231 |             dest_path = code_snapshot_dir / src_file
232 |             dest_path.parent.mkdir(exist_ok=True, parents=True)
233 |             shutil.copyfile(str(src_file), str(dest_path), follow_symlinks=False)
234 | 
235 |     def get_log_paths(self):
236 |         watched_logs = []
237 |         for idx in range(self.job_array_size):
238 |             if self.recipe == "ray" and idx > 0:
239 |                 # for ray jobs, we only need to watch the log from the headnode
240 |                 break
241 |             slurm_id = idx + 1
242 |             watched_log = Path(str(self.log_path).replace("%4a", f"{slurm_id:04d}"))
243 |             watched_log.parent.mkdir(exist_ok=True, parents=True)
244 |             if self.refresh_logs:
245 |                 if watched_log.exists():
246 |                     watched_log.unlink()
247 |                 # We also remove Pygtail files
248 |                 pygtail_file = watched_log.with_suffix(".txt.offset")
249 |                 if pygtail_file.exists():
250 |                     pygtail_file.unlink()
251 |             # We must make sure that the log file exists to enable monitoring
252 |             if not watched_log.exists():
253 |                 print(f"Creating watch log: {watched_log} for the first time")
254 |                 watched_log.touch()
255 |             watched_logs.append(str(watched_log.resolve()))
256 |         return watched_logs
257 | 
258 |     @beartype
259 |     def submit(self, watch: bool = True, conserve_resources: int = 5):
260 |         if watch:
261 |             watched_logs = self.get_log_paths()
262 |         submission_cmd = f"bash {self.gen_scripts['master']}"
263 |         print(f"Submitting job with command: {submission_cmd}")
264 |         print(f"using command:\n{self.cmd}")
265 |         out = subprocess.check_output(submission_cmd.split())
266 |         job_id = out.decode("utf-8").rstrip()
267 | 
268 |         def halting_condition():
269 |             job_state = f"scontrol show job {job_id}"
270 |             out = subprocess.check_output(job_state.split())
271 |             regex = "JobState=[A-Z]+"
272 |             completed = True
273 |             for match in re.finditer(regex, out.decode("utf-8").rstrip()):
274 |                 status = match.group().replace("JobState=", "")
275 |                 if status != "COMPLETED":
276 |                     return False
277 |             return completed
278 | 
279 |         if watch:
280 |             Watcher(
281 |                 heartbeat=True,
282 |                 watched_logs=watched_logs,
283 |                 halting_condition=halting_condition,
284 |                 conserve_resources=conserve_resources,
285 |             ).run()
286 |             print("Job completed")
287 | 
288 |     def __repr__(self):
289 |         """Produce a human-readable string representation of the Yaspi object.
290 | 
291 |         Returns:
292 |             (str): a summary of the object settings.
293 |         """
294 |         summary = "Yaspi object\n========================\n"
295 |         kwargs = sorted(self.__dict__.items(), key=lambda x: len(str(x[0]) + str(x[1])))
296 |         for key, val in kwargs:
297 |             summary += f"{key}: {val}\n"
298 |         return summary
299 | 
300 |     @beartype
301 |     def fill_template(self, template_path: Path, rules: dict) -> str:
302 |         """Transform a template according to a given set of rules.
303 | 
304 |         Args:
305 |             template_path: location of the template to be filled.
306 |             rules (dict[str:object]): a key, value mapping between template keys
307 |                 and their target values.
308 | 
309 |         Returns:
310 |             A single string representing the transformed contents of the template
311 |             file.
312 |         """
313 |         generated = []
314 |         with open(template_path, "r") as f:
315 |             template = f.read().splitlines()
316 | 
317 |         # A template key used to denote sbatch directives that can be removed
318 |         # if no value is specified
319 |         OR_DELETE_LINE = "|ordeleteline"
320 | 
321 |         for row in template:
322 |             skip_row = False
323 |             edits = []
324 |             regex = r"\{\{(.*?)\}\}"
325 |             for match in re.finditer(regex, row):
326 |                 groups = match.groups()
327 |                 assert len(groups) == 1, "expected single group"
328 |                 key = groups[0]
329 |                 ordeleteline = False
330 |                 if key.endswith(OR_DELETE_LINE):
331 |                     ordeleteline = True
332 |                     key = key[:-len(OR_DELETE_LINE)]
333 |                 token = rules[key]
334 |                 if ordeleteline and token is None:
335 |                     skip_row = True
336 |                     break
337 |                 edits.append((match.span(), token))
338 |             if skip_row:
339 |                 continue
340 |             if edits:
341 |                 # invert the spans
342 |                 spans = [(None, 0)] + [x[0] for x in edits] + [(len(row), None)]
343 |                 inverse_spans = [(x[1], y[0]) for x, y in zip(spans, spans[1:])]
344 |                 tokens = [row[start:stop] for start, stop in inverse_spans]
345 |                 values = [str(x[1]) for x in edits]
346 |                 new_row = ""
347 |                 for token, value in zip_longest(tokens, values, fillvalue=""):
348 |                     new_row += token + value
349 |                 row = new_row
350 |             generated.append(row)
351 |         return "\n".join(generated)
352 | 
353 | 
354 | def main():
355 |     parser = argparse.ArgumentParser(description="Yaspi Tool")
356 |     parser.add_argument("--install_location", action="store_true",
357 |                         help="if given, report the install location of yaspi")
358 |     parser.add_argument("--job_name", default="yaspi-test",
359 |                         help="the name that slurm will give to the job")
360 |     parser.add_argument("--recipe", default="ray",
361 |                         help="the SLURM recipe to use to generate scripts")
362 |     parser.add_argument("--template_dir",
363 |                         type=Path,
364 |                         help="if given, override directory containing SLURM templates")
365 |     parser.add_argument("--partition", default=None,
366 |                         help="The name of the SLURM partition used to run the job")
367 |     parser.add_argument("--time_limit", default=None,
368 |                         help="The maximum amount of time allowed to run the job")
369 |     parser.add_argument("--gen_script_dir", default="data/slurm-gen-scripts",
370 |                         type=Path,
371 |                         help="directory in which generated slurm scripts will be stored")
372 |     parser.add_argument("--cmd", default='echo "hello"',
373 |                         help="single command (or comma separated commands) to run")
374 |     parser.add_argument("--mem", default=None,
375 |                         help="the memory to be requested for each SLURM worker")
376 |     parser.add_argument("--prep", default="", help="a command to be run before srun")
377 |     parser.add_argument("--job_array_size", type=int, default=2,
378 |                         help="The number of SLURM array workers")
379 |     parser.add_argument("--cpus_per_task", type=int, default=None,
380 |                         help="the number of cpus requested for each SLURM task")
381 |     parser.add_argument("--gpus_per_task", type=int, default=None,
382 |                         help="the number of gpus requested for each SLURM task")
383 |     parser.add_argument("--throttle_array", type=int, default=0,
384 |                         help="limit the number of array workers running at once")
385 |     parser.add_argument("--env_setup", help="setup string for a custom environment")
386 |     parser.add_argument("--ssh_forward",
387 |                         default="ssh -N -f -R 8080:localhost:8080 triton.robots.ox.ac.uk",
388 |                         help="ssh port-forwarding command to run from the head node")
389 |     parser.add_argument("--log_dir", default="data/slurm-logs", type=str,
390 |                         help="location where SLURM logs will be stored")
391 |     parser.add_argument("--use_custom_ray_tmp_dir", action="store_true")
392 |     parser.add_argument("--refresh_logs", action="store_true")
393 |     parser.add_argument("--watch", type=int, default=1,
394 |                         help="whether to watch the generated SLURM logs")
395 |     parser.add_argument("--exclude", default="",
396 |                         help="comma separated list of nodes to exclude")
397 |     parser.add_argument("--constraint_str", help="SLURM --constraint string")
398 |     parser.add_argument("--job_queue", default="",
399 |                         help="a queue of jobs to pass to a yaspi recipe")
400 |     parser.add_argument("--custom_directives", default="",
401 |                         help=('Add any extra directives here, separated by newlines, '
402 |                               'e.g. "#SBATCH -A account-name\n#SBATCH --mem 10G"'))
403 |     parser.add_argument("--code_snapshot_dir", type=Path,
404 |                         help=("if this argument is supplied, yaspi will make a snapshot "
405 |                               "of the codebase (starting from the current root directory), "
406 |                               "copy the snapshot to `code_snapshot_dir`, and then launch "
407 |                               "the command from there. Currently, only supported for the "
408 |                               "'cpu-proc' and 'gpu-proc' recipes"))
409 |     parser.add_argument("--code_snapshot_filter_patterns", nargs="+",
410 |                         default=["**/*.py", "symlinks"],
411 |                         help=("if `--code_snapshot_dir` is supplied, then "
412 |                               "`--code_snapshot_filter_patterns` is used as a glob pattern "
413 |                               "to select which files will be included in the snapshot. If "
414 |                               "`symlinks` is included as a filter pattern, it is treated "
415 |                               "as a special pattern that mimics symlinks in the original "
416 |                               "code dir"))
417 |     args = parser.parse_args()
418 | 
419 |     if args.install_location:
420 |         print(Path(__file__).parent)
421 |         return
422 | 
423 |     # Certain properties use defaults set by the Yaspi class, rather than argparse, to
424 |     # ensure that users of the Python interface (i.e. directly creating Yaspi objects)
425 |     # can also benefit from these defaults
426 |     prop_keys = {"template_dir", "custom_directives"}
427 |     prop_kwargs = {key: getattr(args, key) for key in prop_keys if getattr(args, key)}
428 | 
429 |     job = Yaspi(
430 |         cmd=args.cmd,
431 |         mem=args.mem,
432 |         prep=args.prep,
433 |         recipe=args.recipe,
434 |         log_dir=args.log_dir,
435 |         exclude=args.exclude,
436 |         job_name=args.job_name,
437 |         job_queue=args.job_queue,
438 |         partition=args.partition,
439 |         time_limit=args.time_limit,
440 |         env_setup=args.env_setup,
441 |         ssh_forward=args.ssh_forward,
442 |         refresh_logs=args.refresh_logs,
443 |         cpus_per_task=args.cpus_per_task,
444 |         gpus_per_task=args.gpus_per_task,
445 |         gen_script_dir=args.gen_script_dir,
446 |         constraint_str=args.constraint_str,
447 |         job_array_size=args.job_array_size,
448 |         use_custom_ray_tmp_dir=args.use_custom_ray_tmp_dir,
449 |         throttle_array=args.throttle_array,
450 |         code_snapshot_dir=args.code_snapshot_dir,
451 |         code_snapshot_filter_patterns=args.code_snapshot_filter_patterns,
452 |         **prop_kwargs,
453 |     )
454 |     job.submit(watch=bool(args.watch))
455 | 
456 | 
457 | if __name__ == "__main__":
458 |     main()
459 |
--------------------------------------------------------------------------------