├── .github ├── dependabot.yaml └── workflows │ ├── pypi.yaml │ └── tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CHANGES.md ├── DesignNotes.md ├── LICENSE ├── Readme.md ├── disbatch ├── __init__.py ├── __main__.py ├── dbMon.py ├── dbUtil.template.sh ├── disBatch.py └── kvsstcp │ ├── .gitignore │ ├── Readme.md │ ├── __init__.py │ ├── kvsclient.py │ ├── kvscommon.py │ └── kvsstcp.py ├── exampleTaskFiles ├── 4KChecks ├── 4KTasksRep ├── DBtasksOneBadOneLeaky ├── DCPTTasks ├── GPUTasks ├── WayTooLongTask ├── barrierCheckFail ├── dberTest.py ├── dberTest.submit ├── emptyTaskFile └── latePETask ├── pyproject.toml ├── tests ├── test_slurm │ ├── Tasks │ ├── Tasks_failfast │ └── run.sh └── test_ssh │ ├── Tasks │ ├── Tasks_failfast │ └── run.sh └── uv.lock /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Dist 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build: 9 | name: Build 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Install uv 15 | uses: astral-sh/setup-uv@v6 16 | with: 17 | version: "0.4.30" 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version-file: ".python-version" 23 | 24 | - name: Build 25 | run: uv build 26 | 27 | - name: Test wheels 28 | run: | 29 | uv venv --no-project testwhl 30 | . testwhl/bin/activate 31 | uv pip install dist/*.whl 32 | cd tests/test_ssh 33 | ./run.sh 34 | 35 | - name: Test sdist 36 | run: | 37 | uv venv --no-project testsdist 38 | . 
testsdist/bin/activate 39 | uv pip install dist/*.tar.gz 40 | cd tests/test_ssh 41 | ./run.sh 42 | 43 | - name: Upload dist artifacts 44 | uses: actions/upload-artifact@v4 45 | with: 46 | name: dists 47 | path: dist/* 48 | 49 | upload: 50 | name: Upload 51 | needs: [build] 52 | runs-on: ubuntu-latest 53 | environment: pypi 54 | permissions: 55 | id-token: write 56 | steps: 57 | - name: Install uv 58 | uses: astral-sh/setup-uv@v6 59 | with: 60 | version: "0.4.30" 61 | 62 | - name: Download dist artifacts 63 | uses: actions/download-artifact@v4 64 | with: 65 | name: dists 66 | path: dist 67 | 68 | - name: Publish 69 | run: uv publish 70 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 12 | os: [ubuntu-latest] 13 | include: 14 | - os: macos-latest 15 | python-version: "3.13" 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v6 22 | with: 23 | version: "0.4.30" 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install the project 31 | run: | 32 | uv sync --all-extras --dev 33 | 34 | - name: Run local-mode (ssh) test 35 | working-directory: ./tests/test_ssh 36 | run: | 37 | uv run ./run.sh 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /exampleTaskFiles/*.log 3 | /exampleTaskFiles/*_disBatch_* 4 | /exampleTaskFiles/slurm*out 5 | /exampleTaskFiles/slurm*resize.*sh 6 | /exampleTaskFiles/dbTestOutputDir 7 | /tmp 8 | /tests/test_*/disbatch-test.* 9 | __pycache__/ 10 | *.egg-info/ 11 | /disbatch/_version.py 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: monthly 3 | 4 | repos: 5 | - repo: https://github.com/astral-sh/ruff-pre-commit 6 | rev: v0.11.12 7 | hooks: 8 | - id: ruff 9 | args: [ --fix ] 10 | - id: ruff-format 11 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 3.0.0 (2024-12-05) 4 | 5 | https://github.com/flatironinstitute/disBatch/pull/32 6 | 7 | ### Breaking changes 8 | - The Python package has been renamed `disbatch` from `disbatchc` 9 | - Removed the `disBatch` script from the repo root. Instead, a `disBatch` command will be placed on the `PATH` upon installation, or see the [installation instructions](Readme.md#Installation) for other options. 10 | 11 | ### Fixes 12 | - disBatch now installs all the necessary helper files so out-of-place installs work 13 | - Bugs (e.g. 
misspelled variables) in less common code paths fixed 14 | 15 | ### Enhancements 16 | - PEP518 compliant build system 17 | - More robust discovery of disBatch installation by worker processes 18 | - Initial release on PyPI 19 | - uvx and pipx support 20 | - Set up linting and formatting 21 | - The executable can be invoked as `disbatch` or `disBatch` 22 | - Refreshed the readme 23 | - Added `disbatch --version` and `disbatch.__version__` 24 | - Added MacOS test 25 | - Added `--fail-fast` option [https://github.com/flatironinstitute/disBatch/pull/38] 26 | - Gracefully handle empty task list [https://github.com/flatironinstitute/disBatch/pull/38] 27 | 28 | ### Changes 29 | - `kvsstcp` submodule is now vendored 30 | -------------------------------------------------------------------------------- /DesignNotes.md: -------------------------------------------------------------------------------- 1 | Basic Design 2 | ============ 3 | 4 | With version **2**, disBatch consists of three major components: 5 | 6 | * A driver (aka _controller_) that maintains the state of the task processing. 7 | * An execution context that encapsulates one or more engines running on one or more nodes. A disBatch run may have multiple contexts. 8 | * An engine that is a collection of cylinder threads. Each cylinder runs a loop that waits for a task from the controller, spawns a sub-process to evaluate it, waits for the sub-process to exit, and then sends a report to the controller. 9 | 10 | 11 | Driver 12 | ----- 13 | 14 | In normal operation, the driver spawns a couple of threads. One implements the KVS service. Another is the task feeder. This takes tasks from a task generator and hands them off to the controller via KVS. 15 | 16 | Each task has an age, which reflects the number of synchronization events that preceded it. Synchronization events are barriers and per engine tasks. Per engine tasks are posted to KVS. A barrier is tracked by the controller. When all tasks prior to the barrier have been completed, the barrier is satisfied, a per engine event to this effect is posted to KVS and the controller's age is increased. The controller notifies the task feeder thread of the new age. The task feeder will not issue a task to the controller unless the controller's age is equal to the task's age. This interplay ensures no task is available for execution until all previous barriers (and thus in turn, all previous tasks) have been accounted for. Something akin to this takes place between an engine and its cylinders to implement per engine synchronization. See below. 17 | 18 | The controller executes a main loop that waits for a controller event to arrive from KVS. These events include a new task from the task feeder, a completed task report from a cylinder, the registration of an execution context or an engine, a cylinder start, the notification that a context, engine or cylinder has stopped, requests to shut down a context or a cylinder, and a few other events. 19 | 20 | Each pass through the loop, the controller: 21 | 22 | - Accepts a controller message from KVS. These may lead it to alter its internal state (say add a new cylinder) or execute an operation like sending a shutdown message to an engine. Of particular interest are messages providing a new task to execute, which causes that task to be added to a list of tasks with the same age, and messages reporting the completion of a task, which causes the cylinder it was assigned to to be marked available again and the finished task id to be recorded.
23 | - Checks to see if all necessary tasks have been completed to satisfy a barrier. If so the age is advanced, and other barriers iteratively checked---that is the completion of one task could in effect satisfy a series of successive barriers. 24 | - If there are tasks for the current age and available cylinders, assign tasks to the individual cylinders until we run out of one or the other. **Note:** If we record the assignments (including the full task), it should be straightforward to reissue tasks upon engine "failure". 25 | - Update overall status info that is kept in KVS. This is used by `dbmon.py` to provide quasi-realtime info about the state of a disBatch run. 26 | 27 | As noted, the driver receives messages informing it of new contexts, engines and cylinders. A portion of this information is incorporated in the status report placed in KVS. It is also used to implement task limits for contexts. Once the controller has assigned the cylinder(s) of the engine(s) of a context a total number of tasks equal to the task limit specified for the context, it sends a shutdown request to every engine in the context. 28 | 29 | Execution context 30 | ----------------- 31 | 32 | A context is responsible for interfacing between a collection of computational resources and a controller. Currently two kinds are supported: 33 | 34 | * SLURM: This context makes use of environment variables set by SLURM to identify the allocated nodes and uses `srun` to start engines. The code here could serve as a model for implementing contexts for other batch queuing systems. 35 | * SSH: The nodes to be used are passed via the command line option (`-s`) or the environment variable `DISBATCH_SSH_NODELIST`. Engines are started via `ssh`. 36 | 37 | Each context monitors its engines and invokes a retirement method, if provided, when an engine exits. 38 | 39 | A context is also a logical shutdown unit. The user can, for example via `dbmon.py`, request that a context be shutdown. This is implemented by sending a shutdown request to each of the context's engines. **Note:** Such a request waits politely for all cylinders to complete any currently assigned tasks before stopping the engine. 40 | 41 | 42 | Engine 43 | ------ 44 | 45 | An engine is a collection of N+1 cylinder threads, where N is the number of allowable concurrently executing tasks specified for the engine. The extra cylinder handles the per-engine tasks. Per-engine tasks are maintained as an ordered queue in KVS: engines `view` values using a key with an index, stepping the index each time. Thus an engine joining at any given time can "replay" all the per engine activity. As it does so, it updates its internal age, and notifies each of its cylinders of the current age. A cylinder will not execute an assigned task until the engine has reached that task's age. 46 | 47 | 48 | Use modes 49 | --------- 50 | 51 | With the exception of some reporting details, the "standard" case should be the same as with version **1**. 52 | 53 | With version **2**, a user can invoke `disBatch` with `-S`, which starts a disBatch "service"---effectively just the controller. In this case, the name of a utility script is displayed. This script (always created by version **2**), can be submitted via sbatch to add an execution context. One could even submit this with a job array specification, and so add nodes on the fly to the disBatch run. The same script can be invoked with `-s` to add some ssh hosts to the mix, e.g., the user's own workstation. 
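A minimal sketch of this flow (the script name here is illustrative; the actual name carries the run's prefix):

```
disBatch -S Tasks            # start the controller only; it reports the name of the utility script
sbatch -n 40 dbUtil.sh       # add an execution context through the batch system
./dbUtil.sh -s localhost:4   # add an ssh-based context, e.g. your own workstation
```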
54 | 55 | The script can be invoked with `--mon` to start up a simple ASCII-UI to monitor progress and request shutdown of an engine or a context. 56 | 57 | Comments 58 | -------- 59 | 1. The controller is supposed to be the only single point of failure, nothing else (in the disBatch system) should be (assuming non malicious failure). Barriers (including an implicit one at the end), of course, might not be satisfied, but that aside a disBatch run can keep going even if a context or engine dies (if all engines died, more would have to be added to make more progress). 60 | 61 | 2. Idempotency and task reissue. 62 | 63 | 3. cli version of dbmon.py. 64 | 65 | 4. Job array demo. (Theory vs practice.) 66 | 67 | 5. Add option to insert `timeout`? 68 | 69 | 6. Add heartbeat as a failure detection mechanism? 70 | 71 | 7. pernode vs perengine 72 | 73 | 8. Remove delay for explicitly started engines? Probably not ... 74 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # disBatch 2 | 3 | Distributed processing of a batch of tasks. 
4 | 5 | [![Tests](https://github.com/flatironinstitute/disBatch/actions/workflows/tests.yaml/badge.svg)](https://github.com/flatironinstitute/disBatch/actions/workflows/tests.yaml) 6 | 7 | ## Quickstart 8 | 9 | Install with pip: 10 | 11 | pip install disbatch 12 | 13 | Create a file called `Tasks` with a list of commands you want to run. These should be Bash commands as one would run on the command line: 14 | 15 | myprog arg0 &> myprog_0.log 16 | myprog arg1 &> myprog_1.log 17 | ... 18 | myprog argN &> myprog_N.log 19 | 20 | This file can have as many tasks (lines) as you like. The `...` is just a stand-in and wouldn't literally be in the task file. 21 | 22 | Then, to run 5 tasks at a time in parallel on your local machine, run: 23 | 24 | disBatch -s localhost:5 Tasks 25 | 26 | `disBatch` will start the first five running concurrently. When one finishes, the next will be started until all are done. 27 | 28 | Or, to distribute this work on a Slurm cluster, run: 29 | 30 | sbatch -n 5 disBatch Tasks 31 | 32 | You may need to provide additional arguments specific to your cluster to specify a partition, time limit, etc. 33 | 34 | ## Overview 35 | 36 | One common usage pattern for distributed computing involves processing a 37 | long list of commands (aka *tasks*): 38 | 39 | myprog -a 0 -b 0 -c 0 40 | myprog -a 0 -b 0 -c 1 41 | ... 42 | myprog -a 9 -b 9 -c 9 43 | 44 | One could run this by submitting 1,000 separate jobs to a cluster, but that may 45 | present problems for the queuing system and can behave badly if the 46 | system is configured to handle jobs in a simple first-come, first-served 47 | fashion. For short tasks, the job launch overhead may dominate the runtime, too. 48 | 49 | One could simplify this by using, e.g., Slurm job arrays, but each job in a job 50 | array is an independent Slurm job, so this suffers from the same per-job overheads 51 | as if you submitted 1000 independent jobs. Furthermore, if nodes are being allocated 52 | exclusively (i.e. the nodes that are allocated to your job are not shared by other jobs), 53 | then the job array approach can hugely underutilize the compute resources unless each 54 | task is using a full node's worth of resources. 55 | 56 | And what if you don't have a cluster available, but do have a collection of networked computers? Or you just want to make use of multiple cores on your own computer? 57 | 58 | In any event, when processing such a list of tasks, it is helpful to 59 | acquire metadata about the execution of each task: where it ran, how 60 | long it took, its exit code, etc. 61 | 62 | disBatch has been designed to support this usage in a simple and 63 | portable way, as well as to provide the sort of metadata that can be 64 | helpful for debugging and reissuing failed tasks. 65 | 66 | It can take as input a file, each of whose lines is a task in the form of a 67 | Bash command. For example, the file could consist of the 1000 commands listed above. It launches the tasks one 68 | after the other until all specified execution resources are in use. Then as one 69 | executing task exits, the next task in the file is launched. This repeats until all 70 | the lines in the file have been processed. 71 | 72 | Each task is run in a new shell; i.e. all lines are independent of one another.
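Because a task file is just plain text, it is often easiest to generate it with a short script rather than write it by hand. Here is a minimal sketch, using the hypothetical `myprog` from above, that writes one task line per parameter combination:

```
# Generate 1000 task lines, one per (a, b, c) combination, each logging to its own file.
for a in {0..9}; do
  for b in {0..9}; do
    for c in {0..9}; do
      echo "myprog -a $a -b $b -c $c &> task_${a}_${b}_${c}.log"
    done
  done
done > Tasks
```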
73 | 74 | Here's a more complicated example, demonstrating how to control the execution environment and capture the output of the tasks: 75 | 76 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 0 -b 0 -c 0 ) &> task_0_0_0.log 77 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 0 -b 0 -c 1 ) &> task_0_0_1.log 78 | ... 79 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 9 -b 9 -c 8 ) &> task_9_9_8.log 80 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 9 -b 9 -c 9 ) &> task_9_9_9.log 81 | 82 | Each line uses standard Bash syntax. Let's break it down: 83 | 84 | 1. the `( ... ) &> task_0_0_0.log` captures all output (stdout and stderr) from any command in the parentheses and writes it to `task_0_0_0.log`; 85 | 2. `cd /path/to/workdir` changes the working directory; 86 | 3. `source SetupEnv` executes a script called `SetupEnv`, which could contain commands like `export PATH=...` or `module load ...` to set up the environment; 87 | 4. `myprog -a 0 -b 0 -c 0` is the command you want to run. 88 | 89 | The semicolons between the last 3 statements are Bash syntax to run a series of commands on the same line. 90 | 91 | You can simplify this kind of task file with the `#DISBATCH PREFIX` and `#DISBATCH SUFFIX` directives. See the [#DISBATCH directives](#disbatch-directives) section for full details, but here's how that could look: 92 | 93 | #DISBATCH PREFIX ( cd /path/to/workdir ; source SetupEnv ; myprog 94 | #DISBATCH SUFFIX ) &> task_${DISBATCH_TASKID}.log 95 | -a 0 -b 0 -c 0 96 | -a 0 -b 0 -c 1 97 | ... 98 | -a 9 -b 9 -c 9 99 | 100 | 101 | Note that for a simple environment setup, you don't need a `source SetupEnv`. You can just set an environment variable directly in the task line, as you can in Bash: 102 | 103 | export LD_LIBRARY_PATH=/d0/d1/d2:$LD_LIBRARY_PATH ; rest ; of ; command ; sequence 104 | 105 | For more complex setups, command sequences and input/output redirection requirements, you could place everything in a small shell script with appropriate arguments for the parts that vary from task to task, say `RunMyprog.sh`: 106 | 107 | #!/bin/bash 108 | 109 | id=$1 110 | shift 111 | cd /path/to/workdir 112 | module purge 113 | module load gcc openblas python3 114 | 115 | export LD_LIBRARY_PATH=/d0/d1/d2:$LD_LIBRARY_PATH 116 | myprog "$@" > results/${id}.out 2> logs/${id}.log 117 | 118 | The task file would then contain: 119 | 120 | ./RunMyprog.sh 0_0_0 -a 0 -b 0 -c 0 121 | ./RunMyprog.sh 0_0_1 -a 0 -b 0 -c 1 122 | ... 123 | ./RunMyprog.sh 9_9_8 -a 9 -b 9 -c 8 124 | ./RunMyprog.sh 9_9_9 -a 9 -b 9 -c 9 125 | 126 | See [#DISBATCH directives](#disbatch-directives) for more ways to simplify task lines. disBatch also sets some environment variables that can be used in your commands as arguments or to generate task-specific file names: 127 | 128 | * `DISBATCH_JOBID`: A name disBatch creates that should be unique to the job 129 | * `DISBATCH_NAMETASKS`: The basename of the task file 130 | * `DISBATCH_REPEAT_INDEX`: See the repeat construct in [\#DISBATCH directives](#disbatch-directives) 131 | * `DISBATCH_STREAM_INDEX`: The 1-based line number of the line from the task file that generated the task 132 | * `DISBATCH_TASKID`: 0-based sequential counter value that uniquely identifies each task 133 | 134 | Appending `_ZP` to any of the last three will produce a 0-padded value (to six places). If these variables are used to create file names, 0-padding will result in file names that sort correctly. 135 | 136 | Once you have created the task file, running disBatch is straightforward.
For example, working with a cluster managed by Slurm, 137 | all that needs to be done is to submit a job like the following: 138 | 139 | sbatch -n 20 -c 4 disBatch TaskFileName 140 | 141 | This particular invocation will allocate sufficient resources to process 142 | 20 tasks at a time, each of which needs 4 cores. 143 | disBatch will use environment variables initialized by Slurm to determine the execution resources to use for the run. 144 | This invocation assumes an appropriately installed disBatch is in your PATH; see [installation](#installation) for details. 145 | 146 | disBatch also allows the pool of execution resources to be increased or decreased during the course of a run: 147 | 148 | sbatch -n 10 -c 4 ./TaskFileName_dbUtil.sh 149 | 150 | will add enough resources to run 10 more tasks concurrently. `TaskFileName_dbUtil.sh` is a utility script created by `disBatch` when the run starts (the actual name is a little more complex, see [startup](#user-content-startup)). 151 | 152 | Various log files will be created as the run unfolds: 153 | 154 | * `TaskFileName_*_status.txt`: status of every task (details below). `*` elides a unique identifier disBatch creates to distinguish one run from another. This is the most important output file and we recommend checking it after every run. 155 | * `TaskFileName_*_[context|driver|engine].log`: 156 | The disBatch driver log file contains details mostly of interest in case of a 157 | problem with disBatch itself. (The driver log file name can be changed with `--logfile`). It can generally be ignored by end 158 | users (but keep it around in the event that something did go 159 | wrong—it will aid debugging). The `*_[context|engine].log` files contain similar information for the disBatch components that manage execution resources. 160 | * `disBatch_*_kvsinfo.txt`: TCP address of invoked KVS server if any (for additional advanced status monitoring) 161 | 162 | > [!TIP] 163 | > The `*_status.txt` file is the most important disBatch output file and we recommend checking it after every run. 164 | 165 | While disBatch is a Python 3 application, it can run tasks from any language environment—anything you can run from a shell can be run as a task. 166 | 167 | ### Status file 168 | 169 | The status file is the most important disBatch output file and we recommend checking it after every run. The filename is `TaskFileName_*_status.txt`. It contains tab-delimited lines of the form: 170 | 171 | 314 315 -1 worker032 8016 0 10.0486528873 1458660919.78 1458660929.83 0 "" 0 "" cd /path/to/workdir ; myprog -a 3 -b 1 -c 4 > task_3_1_4.log 2>&1 172 | 173 | These fields are: 174 | 175 | 1. Flags: The first field, blank in this case, may contain `E`, `O`, `R`, `B`, or `S` flags. 176 | Each program/task should be invoked in such a way that standard error 177 | and standard output end up in appropriate files. If that is not the case 178 | `E` or `O` flags will be raised. `R` indicates that the task 179 | returned a non-zero exit code. `B` indicates a [barrier](#disbatch-directives). `S` indicates the job was skipped (this may happen during "resume" runs). 180 | 1. Task ID: The `314` is the 0-based index of the task (starting from the beginning of the task file, incremented for each task, including repeats). 181 | 1. Line number: The `315` is the 1-based line from the task file. Blank lines, comments, directives and repeats may cause this to drift considerably from the value of Task ID. 182 | 1. 
Repeat index: The `-1` is the repeat index (as in this example, `-1` indicates this task was not part of a repeat directive). 183 | 1. Node: `worker032` identifies the node on which the task ran. 184 | 1. PID: `8016` is the PID of the bash shell used to run the task. 185 | 1. Exit code: `0` is the exit code returned. 186 | 1. Elapsed time: `10.0486528873` (seconds), 187 | 1. Start time:`1458660919.78` (Unix epoch based), 188 | 1. Finish time: `1458660929.83` (Unix epoch based). 189 | 1. Bytes of *leaked* output (not redirected to a file), 190 | 1. Output snippet (up to 80 bytes consisting of the prefix and suffix of the output), 191 | 1. Bytes of leaked error output, 192 | 1. Error snippet, 193 | 1. Command: `cd ...` is the text of the task (repeated from the task file, but subject to modification by [directives](#disbatch-directives)). 194 | 195 | 196 | ## Installation 197 | 198 | **Users of Flatiron clusters: disBatch is available via the module system. You can run `module load disBatch` instead of installing it.** 199 | 200 | There are several ways to get disBatch: 201 | 202 | 1. installation with pip; 203 | 1. direct invocation with pipx or uvx; 204 | 1. cloning the repo. 205 | 206 | Most users can install via pip. Direct invocation with uvx may be of particular interest for users on systems without a modern Python, as uvx will bootstrap Python for you. 207 | 208 | ### Installation with pip 209 | You can use pip to install disbatch just like a normal Python package: 210 | 211 | 1. from PyPI: `pip install disbatch` 212 | 2. from GitHub: `pip install git+https://github.com/flatironinstitute/disBatch.git` 213 | 214 | These should be run in a venv. Installing with `pip install --user disbatch` may work instead, but as a general practice is discouraged. 215 | 216 | After installation, disBatch will be available via the `disbatch` and `disBatch` executables on the `PATH` so long as the venv is activated. Likewise, disBatch can be run as a module with `python -m disbatch`. 217 | 218 |
219 | Click here for a complete example using pip and venv 220 | 221 | You'll need a modern Python to install disBatch this way. We recommend the uvx installation method below if you don't have one, as uv will bootstrap Python for you. 222 | 223 | ``` 224 | python -m venv venv 225 | . venv/bin/activate 226 | pip install disbatch 227 | disbatch TaskFile 228 | ``` 229 |
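If the installation worked, a quick check like the following should print the installed version (assuming the venv is still active):

```
disBatch --version
# or, equivalently:
python -m disbatch --version
```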
230 | 231 | ### Direct invocation with pipx or uvx 232 | 233 | [pipx](https://pipx.pypa.io/stable/) and [uvx](https://docs.astral.sh/uv/guides/tools/) are two tools that will create an isolated venv, download and install disbatch into that venv, and run it all in a single command: 234 | 235 | 1. `pipx run disbatch TaskFile` 236 | 1. `uvx disbatch TaskFile` 237 | 238 | pipx already requires a somewhat modern Python, so for disbatch's purposes it just saves you the step of creating and activating a venv and installing disBatch. 239 | 240 | uvx, on the other hand, will download a modern Python for you if you don't have one available locally. It requires [installing uv](https://docs.astral.sh/uv/getting-started/installation/), which is straightforward and portable. 241 | 242 | Here's a complete example of running disbatch on a system without a modern Python: 243 | 244 | ``` 245 | curl -LsSf https://astral.sh/uv/install.sh | sh 246 | source $HOME/.local/bin/env 247 | uvx disbatch TaskFile 248 | ``` 249 | 250 | Afterwards, disbatch will always be available as `uvx disbatch`. 251 | 252 | For Slurm users, note that the above will install disbatch into the user's default cache directory. If this directory is not visible to all nodes on the cluster, then disbatch jobs will fail. One can specify a different cache directory with `uvx --cache-dir=...`, but the simplest fix is to do a `tool install`: 253 | 254 | ``` 255 | uv tool install disbatch 256 | sbatch disbatch TaskFile 257 | ``` 258 | 259 | This places `disbatch` on the `PATH` in a persistent location; no need to use `uvx` anymore. 260 | 261 | 262 | ### Cloning the repo 263 | Users or developers who want to work on the code should clone the repo and then do an editable install into a venv: 264 | 265 | ``` 266 | git clone https://github.com/flatironinstitute/disBatch.git 267 | pip install -e ./disBatch 268 | ``` 269 | 270 | Setting `PYTHONPATH` may also work, but as a general practice is discouraged. If you don't have a modern Python available, [uv](https://docs.astral.sh/uv/getting-started/installation/) can bootstrap one for you. 271 | 272 | ## Execution Environments 273 | disBatch is designed to support a variety of execution environments, from your own desktop, to a local collection of workstations, to large clusters managed by job schedulers. 274 | It currently supports Slurm and can be executed from `sbatch`, but it is architected to make it simple to add support for other resource managers. 275 | 276 | You can also run directly on one or more machines by setting an environment variable: 277 | 278 | DISBATCH_SSH_NODELIST=localhost:7,otherhost:3 279 | 280 | or specifying an invocation argument: 281 | 282 | -s localhost:7,otherhost:3 283 | 284 | This allows execution directly on your `localhost` and via ssh for remote hosts without the need for a resource management system. 285 | In this example, disBatch is told it can use seven CPUs on your local host and three on `otherhost`. Assuming the default mapping of one task to one CPU applies in this example, seven tasks could be in progress at any given time on `localhost`, and three on `otherhost`. Note that `localhost` is an actual name you can use to refer to the machine on which you are currently working. `otherhost` is fictitious. 286 | Hosts used via ssh must be set up to allow ssh to work without a password and must share the working directory for the disBatch run. 287 | 288 | disBatch refers to a collection of execution resources as a *context* and the resources proper as *engines*. 
So the Slurm example `sbatch -n 20 -c 4`, run on a cluster with 16-core nodes, might create one context with five engines (one each for five 16-core nodes, capable of running four concurrent 4-core tasks each), while the SSH example creates one context with two engines (capable of running seven and three concurrent tasks, respectively). 289 | 290 | ## Invocation 291 | ``` 292 | usage: disbatch [-h] [-e] [--force-resume] [--kvsserver [HOST:PORT]] [--logfile FILE] 293 | [--loglevel {CRITICAL,ERROR,WARNING,INFO,DEBUG}] [--mailFreq N] [--mailTo ADDR] [-p PATH] 294 | [-r STATUSFILE] [-R] [-S] [--status-header] [--use-address HOST:PORT] [-w] [-f] 295 | [--taskcommand COMMAND] [--taskserver [HOST:PORT]] [--version] [-C TASK_LIMIT] [-c N] [--fill] 296 | [--no-retire] [-l COMMAND] [--retire-cmd COMMAND] [-s HOST:CORECOUNT] [-t N] 297 | [taskfile] 298 | 299 | Use batch resources to process a file of tasks, one task per line. 300 | 301 | positional arguments: 302 | taskfile File with tasks, one task per line ("-" for stdin) 303 | 304 | options: 305 | -h, --help show this help message and exit 306 | -e, --exit-code When any task fails, exit with non-zero status (default: only if disBatch itself fails) 307 | --force-resume With -r, proceed even if task commands/lines are different. 308 | --kvsserver [HOST:PORT] 309 | Use a running KVS server. 310 | --logfile FILE Log file. 311 | --loglevel {CRITICAL,ERROR,WARNING,INFO,DEBUG} 312 | Logging level (default: INFO). 313 | --mailFreq N Send email every N task completions (default: 1). "--mailTo" must be given. 314 | --mailTo ADDR Mail address for task completion notification(s). 315 | -p PATH, --prefix PATH 316 | Path for log, dbUtil, and status files (default: "."). If ends with non-directory component, 317 | use as prefix for these files names (default: _disBatch__). 318 | -r STATUSFILE, --resume-from STATUSFILE 319 | Read the status file from a previous run and skip any completed tasks (may be specified 320 | multiple times). 321 | -R, --retry With -r, also retry any tasks which failed in previous runs (non-zero return). 322 | -S, --startup-only Startup only the disBatch server (and KVS server if appropriate). Use "dbUtil..." script to 323 | add execution contexts. Incompatible with "--ssh-node". 324 | --status-header Add header line to status file. 325 | --use-address HOST:PORT 326 | Specify hostname and port to use for this run. 327 | -w, --web Enable web interface. 328 | -f, --fail-fast Exit on first task failure. Running tasks will be interrupted and disBatch will exit with a 329 | non-zero exit code. 330 | --taskcommand COMMAND 331 | Tasks will come from the command specified via the KVS server (passed in the environment). 332 | --taskserver [HOST:PORT] 333 | Tasks will come from the KVS server. 334 | --version Print the version and exit 335 | -C TASK_LIMIT, --context-task-limit TASK_LIMIT 336 | Shutdown after running COUNT tasks (0 => no limit). 337 | -c N, --cpusPerTask N 338 | Number of cores used per task; may be fractional (default: 1). 339 | --fill Try to use extra cores if allocated cores exceeds requested cores. 340 | --no-retire Don't retire nodes from the batch system (e.g., if running as part of a larger job). 341 | -l COMMAND, --label COMMAND 342 | Label for this context. Should be unique. 343 | --retire-cmd COMMAND Shell command to run to retire a node (environment includes $NODE being retired, remaining 344 | $ACTIVE node list, $RETIRED node list; default based on batch system). Incompatible with "-- 345 | ssh-node". 
346 | -s HOST:CORECOUNT, --ssh-node HOST:CORECOUNT 347 | Run tasks over SSH on the given nodes (can be specified multiple times for additional hosts; 348 | equivalent to setting DISBATCH_SSH_NODELIST) 349 | -t N, --tasksPerNode N 350 | Maximum concurrently executing tasks per node (up to cores/cpusPerTask). 351 | ``` 352 | 353 | The options for mail will only work if your computing environment permits processes to access mail via SMTP. 354 | 355 | A value for `-c` < 1 effectively allows you to run more tasks concurrently than CPUs specified for the run. This is somewhat unusual, and generally not recommended, but could be appropriate in some cases. 356 | 357 | The `--no-retire` and `--retire-cmd` flags allow you to control what disBatch does when a node is no longer needed to run jobs. 358 | When running under Slurm, disBatch will by default run the command: 359 | 360 | scontrol update JobId="$SLURM_JOBID" NodeList="${DRIVER_NODE:+$DRIVER_NODE,}$ACTIVE" 361 | 362 | which will tell Slurm to release any nodes no longer being used. 363 | You can set this to run a different command, or nothing at all. 364 | While running this command, the following environment variables will be set: `NODE` (the node that is no longer needed), `ACTIVE` (a comma-delimited list of nodes that are still active), `RETIRED` (a comma-delimited list of nodes that are no longer active, including `$NODE`), and possibly `DRIVER_NODE` (the node still running the main disBatch script, if it's not in `ACTIVE`). 365 | 366 | `-S` Startup only mode. In this mode, `disBatch` starts up the task management system and then waits for execution resources to be added. 367 | At startup, `disBatch` always generates a script `<prefix>_dbUtil.sh`, where `<prefix>` refers to the `-p` option or default, see above. We'll call this simply `dbUtil.sh` here, 368 | but remember to include `<prefix>_` in actual use. You can add execution resources by doing one or more of the following multiple times: 369 | 1. Submit `dbUtil.sh` as a job, e.g.: 370 | 371 | `sbatch -n 40 dbUtil.sh` 372 | 373 | 2. Use ssh, e.g.: 374 | 375 | `./dbUtil.sh -s localhost:4,friendlyNeighbor:5` 376 | 377 | Each of these creates an execution context, which contains one or more execution engines (if using, for example, 8-core nodes, then five for the first; two in the second). 378 | An engine can run one or more tasks concurrently. In the first example, each of the five engines will run up to eight tasks concurrently, while in the 379 | second example, the engine on `localhost` will run up to four tasks concurrently and the engine on `friendlyNeighbor` will run up to five. 380 | `./dbUtil.sh --mon` will start a simple ASCII-based monitor that tracks the overall state of the disBatch run, and the activity of the individual 381 | contexts and engines. By cursoring over an engine, you can send a shutdown signal to the engine or its context. This signal is *soft*, triggering 382 | a graceful shutdown that will occur only after currently assigned tasks are complete. Other execution resources are unaffected. 383 | 384 | When a context is started, you can also supply the argument `--context-task-limit N`. This will shut down the context and all associated engines 385 | after it has run `N` tasks. 
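As a concrete illustration, a session that chains several allocations together might look like the following sketch (`dbUtil.sh` again stands for the generated `<prefix>_dbUtil.sh`; the Slurm options and task counts are only illustrative):

```
# In a screen/tmux session on the login node: start just the controller.
disBatch -S TaskFileName

# As allocations become available, add execution contexts.
# Each context shuts itself down after completing 5000 tasks.
sbatch -n 40 dbUtil.sh --context-task-limit 5000

# Add a workstation over ssh as well.
./dbUtil.sh -s friendlyNeighbor:8

# Watch progress and, if desired, request soft shutdowns.
./dbUtil.sh --mon
```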
386 | 387 | Taken together, these mechanisms enable disBatch to run on a dynamic pool of execution resources, so you can "borrow" a colleague's workstation overnight, or 388 | claim a large chunk of a currently idle partition, but return some if demand picks up, or chain together a series of time-limited allocations to 389 | accomplish a long run. When using this mode, keep in mind two caveats: (i) The time quantum is determined by your task duration. If any given task might 390 | run for hours or days, then the utility of this is limited. You can still use standard means (kill, scancel) to terminate contexts and engines, but 391 | you will likely have incomplete tasks to 392 | reckon with; (ii) The task management system must itself be run in a setting where a long-lived process is OK. Say in a `screen` or `tmux` session on 393 | the login node of a cluster, or on your personal workstation (assuming it has the appropriate connectivity to reach the other resources you plan to use). 394 | 395 | 396 | `-r` uses the status file of a previous run to determine what tasks to run during this disBatch invocation. Only those tasks that haven't yet run (or with `-R`, those that haven't run or did but returned a non-zero exit code) are run this time. By default, the numeric task identifier and the text of the command are used to determine if a current task is the same as one found in the status file. `--force-resume` restricts the comparison to just the numeric identifier. 397 | 398 | `--use-address HOST:PORT` can be used if disBatch is not able to determine the correct hostname for the machine it is running on (or you need to override what was detected). This is often the case when running on a personal laptop without a "real" network configuration. In this case `--use-address=localhost:0` will generally be sufficient. 399 | 400 | `--kvsserver`, `--taskcommand`, and `--taskserver` implement advanced functionality (placing disBatch in an existing shared key store context and allowing for a programmatic rather than textual task interface). Contact the authors for more details. 401 | 402 | 403 | ### Considerations for large runs 404 | 405 | If you do submit jobs with on the order of 10,000 or more tasks, you should 406 | carefully consider how you want to organize the output (and error) files 407 | produced by each of the tasks. It is generally a bad idea to have more 408 | than a few thousand files in any one directory, so you will probably 409 | want to introduce at least one extra level of directory hierarchy so 410 | that the files can be divided into smaller groups. Intermediate 411 | directory `13`, say, might hold all the files for tasks 13000 to 412 | 13999. 413 | 414 | ## #DISBATCH directives 415 | 416 | ### PREFIX and SUFFIX 417 | 418 | In order to simplify task files, disBatch supports a couple of 419 | directives to specify common task prefix strings and suffix strings. As noted above, it 420 | also sets environment variables to identify various aspects of the 421 | submission. Here's an example: 422 | 423 | # Note there is a space at the end of the next line. 424 | #DISBATCH PREFIX ( cd /path/to/workdir ; source SetupEnv ; 425 | #DISBATCH SUFFIX ) &> ${DISBATCH_NAMETASKS}_${DISBATCH_JOBID}_${DISBATCH_TASKID_ZP}.log 426 | 427 | These are textually prepended and appended, respectively, to the text of 428 | each subsequent task line. If the suffix includes redirection and a task is a proper command sequence (a series of 429 | commands joined by `;`), then the task should be wrapped in `( ... 
)`, as in this example, so that the standard error and standard output of the whole sequence 430 | will be redirected to the log file. If this is not done, only standard 431 | error and standard output for the last component of the command sequence 432 | will be captured. This is probably not what you want unless you have 433 | redirected these outputs for the previous individual parts of the 434 | command sequence. 435 | 436 | Using these, the above commands could be replaced with: 437 | 438 | myprog -a 0 -b 0 -c 0 439 | myprog -a 0 -b 0 -c 1 440 | ... 441 | myprog -a 9 -b 9 -c 8 442 | myprog -a 9 -b 9 -c 9 443 | 444 | Note: the log files will have a different naming scheme, but there will still be one per task. 445 | 446 | Later occurrences of `#DISBATCH PREFIX` or `#DISBATCH SUFFIX` in a task 447 | file simply replace previous ones. When these are used, the tasks 448 | reported in the status file include the prefix and suffix in 449 | force at the time the task was launched. 450 | 451 | ### BARRIER 452 | 453 | If your tasks fall into groups where a later group should only begin 454 | after all tasks of the previous group have completely finished, you can 455 | use this directive: 456 | 457 | #DISBATCH BARRIER 458 | 459 | When disBatch encounters this directive, it will not launch another task 460 | until all tasks in progress have completed. The following form: 461 | 462 | #DISBATCH BARRIER CHECK 463 | 464 | checks the exit status of the tasks done since the last barrier (or 465 | start of the run). If any task had a non-zero exit status, the run 466 | will exit once this barrier is met. 467 | 468 | ### REPEAT 469 | 470 | For those problems that are easily handled via a job-array-like approach: 471 | 472 | #DISBATCH REPEAT 5 myprog file${DISBATCH_REPEAT_INDEX} 473 | 474 | will expand into five tasks, each with the environment variable 475 | `DISBATCH_REPEAT_INDEX` set to one of 0, 1, 2, 3 or 4. 476 | 477 | The starting index and step size can also be changed: 478 | 479 | #DISBATCH REPEAT 5 start 100 step 50 myprog file${DISBATCH_REPEAT_INDEX} 480 | 481 | This will result in indices 100, 150, 200, 250, and 300. `start` defaults 482 | to 0, and `step` to 1. 483 | 484 | The command is actually optional; one might want to omit the command 485 | if a prefix and/or suffix are in place. Returning to our earlier example, the task file 486 | could be: 487 | 488 | #DISBATCH PREFIX a=$((DISBATCH_REPEAT_INDEX/100)) b=$(((DISBATCH_REPEAT_INDEX%100)/10 )) c=$((DISBATCH_REPEAT_INDEX%10)) ; ( cd /path/to/workdir ; source SetupEnv ; myprog -a $a -b $b -c $c ) &> task_${a}_${b}_${c}.log 489 | #DISBATCH REPEAT 1000 490 | 491 | This is not a model of clarity, but it does illustrate that the repeat construct can be relatively powerful. Many users may find it more convenient to use the tool of their choice to generate a text file with 1000 invocations explicitly written out. 492 | 493 | ### PERENGINE 494 | 495 | #DISBATCH PERENGINE START { command ; sequence ; } &> engine_start_${DISBATCH_ENGINE_RANK}.log 496 | #DISBATCH PERENGINE STOP { command ; sequence ; } &> engine_stop_${DISBATCH_ENGINE_RANK}.log 497 | 498 | Use these to specify commands that should run at the time an engine joins a disBatch run or at the time the engine leaves the disBatch run, respectively. 499 | You could, for example, use these to bulk copy some heavily referenced read-only data to the engine's local storage area before any tasks are run, and then delete that data when the engine shuts down. 
500 | You can use the environment variable DISBATCH_ENGINE_RANK to distinguish one engine from another; for example, it is used here to keep log files separate. 501 | 502 | These directives must come before any other tasks. 503 | 504 | ## Embedded disBatch 505 | 506 | You can start disBatch from within a python script by instantiating a "DisBatcher" object. 507 | 508 | See `exampleTaskFiles/dberTest.py` for an example. 509 | 510 | The "DisBatcher" class (defined in `disbatch/disBatch.py`) illustrates how to interact with disBatch via KVS. This approach could be used to enable similar functionality in other language settings. 511 | 512 | ## License 513 | 514 | Copyright 2024 Simons Foundation 515 | 516 | Licensed under the Apache License, Version 2.0 (the "License"); 517 | you may not use this file except in compliance with the License. 518 | You may obtain a copy of the License at 519 | 520 | http://www.apache.org/licenses/LICENSE-2.0 521 | 522 | Unless required by applicable law or agreed to in writing, software 523 | distributed under the License is distributed on an "AS IS" BASIS, 524 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 525 | See the License for the specific language governing permissions and 526 | limitations under the License. 527 | -------------------------------------------------------------------------------- /disbatch/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['main', '__version__'] 2 | 3 | from .disBatch import main 4 | 5 | try: 6 | from ._version import __version__ 7 | except Exception: 8 | # TODO: hatch-vcs doesn't seem to work well with editable installs 9 | # We could switch back to setuptools, but maybe we just wait for the uv build backend... 10 | __version__ = 'editable' 11 | -------------------------------------------------------------------------------- /disbatch/__main__.py: -------------------------------------------------------------------------------- 1 | from . import main 2 | 3 | main() 4 | -------------------------------------------------------------------------------- /disbatch/dbMon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import curses 4 | import json 5 | import os 6 | import sys 7 | import time 8 | from queue import Queue 9 | from threading import Thread 10 | 11 | from disbatch.kvsstcp import KVSClient 12 | 13 | # Connect to the disBatch communication service for this run. 14 | try: 15 | kvscStatus = KVSClient(os.environ['DISBATCH_KVSSTCP_HOST']) 16 | kvscDisplay = kvscStatus.clone() 17 | except Exception: 18 | print('Cannot contact the disBatch server. 
This usally means the run has ended.', file=sys.stderr) 19 | sys.exit(1) 20 | 21 | uniqueId = sys.argv[1] 22 | uniqueIdName = os.path.split(uniqueId)[-1] 23 | 24 | curses.initscr() 25 | curses.start_color() 26 | curses.init_pair(1, curses.COLOR_CYAN, curses.COLOR_BLACK) 27 | curses.init_pair(2, curses.COLOR_GREEN, curses.COLOR_BLACK) 28 | curses.init_pair(3, curses.COLOR_BLACK, curses.COLOR_RED) 29 | curses.init_pair(4, curses.COLOR_YELLOW, curses.COLOR_BLACK) 30 | curses.init_pair(5, curses.COLOR_RED, curses.COLOR_BLACK) 31 | curses.init_pair(6, curses.COLOR_BLACK, curses.COLOR_BLACK) 32 | curses.init_pair(7, curses.COLOR_WHITE, curses.COLOR_WHITE) 33 | curses.curs_set(False) 34 | 35 | CPCB, CPGB, CPBR, CPYB, CPRB, CPBB, CPWW = (curses.color_pair(x) for x in range(1, 8)) 36 | 37 | Diamond = curses.ACS_DIAMOND 38 | Horizontal, Vertical = curses.ACS_HLINE, curses.ACS_VLINE 39 | CornerUL, CornerUR, CornerLL, CornerLR = ( 40 | curses.ACS_ULCORNER, 41 | curses.ACS_URCORNER, 42 | curses.ACS_LLCORNER, 43 | curses.ACS_LRCORNER, 44 | ) 45 | TeeD, TeeU, TeeR, TeeL = curses.ACS_TTEE, curses.ACS_BTEE, curses.ACS_LTEE, curses.ACS_RTEE 46 | 47 | # TODO: Come up with a better way to set these based on the actual 48 | # layout encoded in dbStatus. 49 | HeaderLength = 6 50 | FooterLength = 1 51 | Width = 85 52 | 53 | MinLines, MinCols = HeaderLength + FooterLength + 10, Width + 2 54 | 55 | 56 | # Thread that periodically checks for status updates from the disBatch 57 | # controller. Puts formatted results and auxillary data on the shared 58 | # queue. 59 | def dbStatus(kvsc, outq): 60 | while True: 61 | try: 62 | j = kvsc.view('DisBatch status') 63 | except Exception: 64 | outq.put(('stop', None)) 65 | break 66 | 67 | if j != b'': 68 | statusd = json.loads(j) 69 | 70 | now = time.time() 71 | 72 | # convert keys back to ints after json transform. 73 | engines = {int(k): v for k, v in statusd['engines'].items()} 74 | contexts = {int(k): v for k, v in statusd['contexts'].items()} 75 | ee = engines.values() 76 | statusd['slots'] = sum([e['active'] for e in ee if e['status'] == 'running']) 77 | statusd['finished'] = sum([e['finished'] for e in ee]) 78 | statusd['failed'] = sum([e['failed'] for e in ee]) 79 | header = [] 80 | tuin = uniqueIdName if len(uniqueIdName) <= 40 else (uniqueIdName[:17] + '...' 
+ uniqueIdName[-20:]) 81 | label = f'Run label: {tuin:<40s} Status: {statusd["more"]:15s}' 82 | header.append(([CornerUL] + [Horizontal] * Width + [CornerUR], CPCB)) 83 | header.append(([Vertical] + [label + ' ' * (Width - len(label))] + [Vertical], CPCB)) 84 | header.append( 85 | ( 86 | [Vertical] 87 | + [ 88 | 'Slots {slots:5d} Tasks: Finished {finished:7d} Failed{failed:5d} Barrier{barriers:3d}'.format( 89 | **statusd 90 | ) 91 | ] 92 | + [Vertical], 93 | CPCB, 94 | ) 95 | ) 96 | header.append(([TeeR] + [Horizontal] * Width + [TeeL], CPCB)) 97 | # '01234 012345678901 01234567890123456789 0123456 0123456 0123456789 0123456789 0123456' 98 | header.append( 99 | ( 100 | [Vertical] 101 | + ['Rank Context Host Last Avail Assigned Finished Failed'] 102 | + [Vertical], 103 | CPCB, 104 | ) 105 | ) 106 | header.append(([CornerLL] + [Horizontal] * Width + [CornerLR], CPCB)) 107 | assert len(header) == HeaderLength 108 | 109 | ee = sorted(engines.items()) 110 | content = [] 111 | for rank, engine in ee: 112 | if engine['status'] == 'stopped': 113 | continue 114 | engine['delay'] = now - engine['last'] 115 | engine['cLabel'] = contexts[engine['cRank']]['label'] 116 | content.append( 117 | ( 118 | rank, 119 | '{rank:5d} {cLabel:12.12s} {hostname:20.20s} {delay:6.0f}s {active:7d} {assigned:10d} {finished:10d} {failed:7d}'.format( 120 | **engine 121 | ), 122 | ) 123 | ) 124 | outq.put(('status', (engines, contexts, header, content))) 125 | time.sleep(3) 126 | 127 | 128 | # Utility to pop up a Yes/No/Cancel dialog. Read reply from shared 129 | # queue, return first acceptable response. 130 | def popYNC(msg, parent, inq, title='Confirm'): 131 | ph, pw = parent.getmaxyx() 132 | h = int(ph * 0.75) 133 | w = int(pw * 0.85) 134 | ro, co = int((ph - h) * 0.5), int((pw - w) * 0.5) 135 | 136 | # Wrap msg to fit in pop up. 137 | L, msgw = '', [] 138 | for word in msg.split(): 139 | if len(word) > w: 140 | word = word[: w - 3] + '...' 141 | if len(L) + 1 + len(word) > w: 142 | msgw.append(L) 143 | L = word 144 | else: 145 | L = L + (' ' if L else '') + word 146 | msgw.append(L) 147 | if len(msgw) > h: 148 | missing = 1 + len(msgw) - h 149 | msgw = msgw[: h - 1] 150 | msgw.append(f'{missing:d} lines elided.') 151 | 152 | nw = curses.newwin(h + 2, w + 2, ro, co) 153 | nw.border() 154 | nw.addstr(0, int((w - len(title)) * 0.5), title) 155 | for r, L in enumerate(msgw): 156 | nw.addstr(r + 1, 1, L) 157 | nw.addstr(r + 2, int((w - 19) * 0.5), '[Y]es/[N]o/[C]ancel', curses.A_REVERSE) 158 | nw.refresh() 159 | 160 | # Acceptable responses. Treat a resize event as "cancel". 161 | resp = { 162 | ord('y'): 'Y', 163 | ord('Y'): 'Y', 164 | ord('n'): 'N', 165 | ord('N'): 'N', 166 | ord('c'): 'C', 167 | ord('C'): 'C', 168 | curses.KEY_RESIZE: 'C', 169 | } 170 | while True: 171 | tag, k = inq.get() 172 | if tag == 'key' and k in resp: 173 | break 174 | # TODO: If tag isn't key raise exception? 175 | 176 | parent.redrawwin() 177 | parent.refresh() 178 | return resp[k] 179 | 180 | 181 | # Thread that paints the display and responds to user input. Reads status 182 | # updates and keyboard input (including resize events) from the shared queue. 
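# Messages on the shared queue are (tag, payload) pairs:
#   ('key', keycode)                                  -- keyboard/resize input forwarded by main()
#   ('status', (engines, contexts, header, content))  -- a formatted update from dbStatus()
#   ('stop', None)                                    -- the disBatch controller is no longer reachable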
183 | def display(S, kvsc, inq): 184 | content = [] 185 | lenContent = len(content) 186 | 187 | header = [(' ', CPBB)] * 4 188 | 189 | tooSmall = curses.LINES < MinLines or curses.COLS < MinCols 190 | displayLines = curses.LINES - (HeaderLength + FooterLength) 191 | 192 | engines = None # TODO: we may be relying on this being populated before it's referenced below 193 | localEngineStatus = {} 194 | 195 | contentCursor, contentFirst, done = 0, 0, False 196 | msg = '' 197 | while True: 198 | S.clear() 199 | 200 | if tooSmall: 201 | S.addstr(0, 0, f'Screen must be at least {MinLines:d}X{MinCols:d}', CPRB) 202 | else: 203 | # Header 204 | for r, (L, cp) in enumerate(header): 205 | S.move(r, 0) 206 | for e in L: 207 | if type(e) is int: 208 | S.addch(e, cp) 209 | else: 210 | S.addstr(e, cp) 211 | 212 | # Footer 213 | if msg or done: 214 | if done: 215 | msg = '[disBatch controller has exited]' + (' ' if msg else '') + msg 216 | S.addstr(curses.LINES - 1, 0, msg, CPBR) 217 | 218 | # Main content 219 | if content: 220 | # Adjust window to ensure cursor displays. 221 | if contentCursor < contentFirst: 222 | # move window so last line corresponds to cursor, i.e.: 223 | # contentCursor == contentFirst + (displayLines-1) 224 | contentFirst = max(0, contentCursor - (displayLines - 1)) 225 | elif contentCursor >= (contentFirst + displayLines): 226 | # move window so first line corresponds to cursor. 227 | contentFirst = contentCursor 228 | # ensure window is as full as possible. 229 | contentLast = min(contentFirst + displayLines, lenContent) 230 | contentFirst = max(0, contentLast - displayLines) 231 | for r, (rank, L) in enumerate(content[contentFirst:contentLast]): 232 | if len(L) > curses.COLS - 1: 233 | L = L[: curses.COLS - 4] + '...' 234 | cp = CPGB 235 | if engines[rank]['status'] == 'stopping': 236 | cp = CPRB 237 | elif localEngineStatus.get(rank, '') == 'requesting shutdown': 238 | cp = CPYB 239 | S.addstr(HeaderLength + r, 1, L, cp) 240 | 241 | # Scroll indicator and cursor 242 | regionStart = (displayLines * contentFirst) // lenContent 243 | regionEnd = (displayLines * contentLast + lenContent - 1) // lenContent 244 | S.addch(HeaderLength + regionStart, 0, TeeD, CPYB) 245 | for r in range(regionStart + 1, regionEnd - 1): 246 | S.addch(HeaderLength + r, 0, Vertical, CPYB) 247 | S.addch(HeaderLength + regionEnd - 1, 0, TeeU, CPYB) 248 | S.addch(HeaderLength + (contentCursor - contentFirst), 0, Diamond, CPCB) 249 | else: 250 | S.addstr(HeaderLength, 0, '', CPRB) 251 | 252 | S.refresh() 253 | 254 | tag, o = inq.get() 255 | if tag == 'key': 256 | msg = '' 257 | k = o 258 | if k == curses.KEY_RESIZE: 259 | curses.update_lines_cols() 260 | if curses.LINES < MinLines or curses.COLS < MinCols: 261 | tooSmall = True 262 | continue 263 | tooSmall = False 264 | 265 | displayLines = curses.LINES - (HeaderLength + FooterLength) 266 | if displayLines > (lenContent - contentCursor): 267 | contentFirst = max(0, lenContent - displayLines) 268 | else: 269 | contentFirst = max(0, contentCursor - displayLines // 2) 270 | 271 | S.clear() 272 | S.refresh() 273 | continue 274 | 275 | if k == ord('u') or k == curses.KEY_UP: 276 | contentCursor = max(0, contentCursor - 1) 277 | elif k == ord('d') or k == curses.KEY_DOWN: 278 | contentCursor = min(max(0, lenContent - 1), contentCursor + 1) 279 | elif k == ord('q'): 280 | break 281 | elif k in [ord('h'), ord('?')]: 282 | msg = 'C: Shutdown context; E: Shutdown engine; q: quit' 283 | elif k in [ord('C'), ord('E')]: 284 | if not done: 285 | target = 
content[contentCursor][0] 286 | if target is not None: 287 | if k == ord('C'): 288 | cRank = engines[target]['cRank'] 289 | r = popYNC('Stopping context {cLabel:s} ({cRank:d})'.format(**engines[target]), S, inq) 290 | if r == 'Y': 291 | try: 292 | msg = f'Asking controller to stop context {cRank!r}' 293 | kvsc.put('.controller', ('stop context', cRank)) 294 | for rank, e in engines.items(): 295 | if e['cRank'] == cRank: 296 | localEngineStatus[rank] = 'requesting shutdown' 297 | except OSError: 298 | pass 299 | elif k == ord('E'): 300 | r = popYNC( 301 | 'Stopping engine {rank:d} ({hostname:s}, {pid:d})'.format(**engines[target]), S, inq 302 | ) 303 | if r == 'Y': 304 | try: 305 | msg = f'Asking controller to stop engine {target!r}' 306 | kvsc.put('.controller', ('stop engine', target)) 307 | localEngineStatus[target] = 'requesting shutdown' 308 | except OSError: 309 | pass 310 | else: 311 | msg = f'Got unrecognized key: {k:d}' 312 | elif tag == 'status': 313 | engines, contexts, header, content = o 314 | # Adjust cursor location if needed. 315 | oldLen, lenContent = lenContent, len(content) 316 | if oldLen > lenContent: 317 | f = contentCursor / oldLen 318 | contentCursor = int(f * lenContent) 319 | elif tag == 'stop': 320 | done = True 321 | else: 322 | msg = f'Unrecognized tag: "{tag}",' 323 | 324 | 325 | # (Wrapped) main. 326 | # Creates a shared queue, sets up status and display threads, and then waits for 327 | # keyboard events and writes them to the shared queue. Intercepts "q" to quit. 328 | # 329 | # It appears that getch() needs to be called from the main processes. 330 | def main(S): 331 | S.bkgdset(CPBB) 332 | S.clear() 333 | S.refresh() 334 | 335 | inq = Queue() 336 | gc = Thread(target=display, args=(S, kvscDisplay, inq)) 337 | gc.daemon = True 338 | gc.start() 339 | db = Thread(target=dbStatus, args=(kvscStatus, inq)) 340 | db.daemon = True 341 | db.start() 342 | 343 | while True: 344 | k = S.getch() 345 | if k == ord('q'): 346 | break 347 | inq.put(('key', k)) 348 | 349 | 350 | curses.wrapper(main) 351 | -------------------------------------------------------------------------------- /disbatch/dbUtil.template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DISBATCH_KVSSTCP_HOST={kvsserver:s} 4 | 5 | if [[ $1 == '--mon' ]] 6 | then 7 | exec {DisBatchPython} -m disbatch.dbMon {uniqueId:s} 8 | elif [[ $1 == '--engine' ]] 9 | then 10 | exec {DisBatchPython} -m disbatch "$@" 11 | else 12 | exec {DisBatchPython} -m disbatch --context {DbUtilPath:} "$@" < /dev/null 1> {uniqueId:s}_${{BASHPID-$$}}_context_launch.log 13 | fi 14 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /dist/ 3 | *.pyc 4 | *.o 5 | /kvsLoop 6 | /kvsTestWIc 7 | /kvsTestWIf 8 | /kvsTestf 9 | /.stack-work 10 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/Readme.md: -------------------------------------------------------------------------------- 1 | Key value storage server 2 | ======================== 3 | 4 | Inspired by networkspaces, which was in turn inspired by the Linda coordination language. 5 | 6 | Similar systems exist, the point of this one is to provide a simple to deploy and reasonably functional and efficient store that is easy to integrate with many different programming environments. 
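For a sense of the programming model, here is a minimal sketch (not one of the package's shipped examples; it assumes this copy of kvsstcp is importable as the `disbatch.kvsstcp` subpackage):

    from disbatch.kvsstcp import KVSClient, KVSServerThread

    server = KVSServerThread('localhost', 0)  # let the OS pick a free port
    client = KVSClient(server.cinfo)          # cinfo is the (host, port) the server bound

    client.put('answer', {'x': 42})           # values are pickled by default
    print(client.view('answer'))              # peek without removing the value
    print(client.get('answer'))               # atomically remove it; blocks if no value is present

    client.close()
    server.shutdown()

Multiple values may be queued under a single key, and `get`/`view` block until a value is available, which is what makes the store useful for coordination.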
7 | 8 | The reference python implementation should work with any stock python 2.7 or above: 9 | 10 | * `kvscommon.py` contains the line protocol description and common utilities, 11 | * `kvsstcp.py` contains the server, which can be run from the command line or from within another python module as `KVSServer()` to start the server thread 12 | * `kvsclient.py` contains the client interface, which can be run from the command line or from within another python module as `KVSClient(host, port)` 13 | 14 | "kvsSupport.[ch]" contains a client that can be linked with C or FORTRAN codes. 15 | 16 | "kvsTest.py" provides a simple example of use. 17 | 18 | "kvsRing.py" can be used to generate some basic timing information. 19 | 20 | "kvsLoop.c" and "kvsTestf.f" are example codes for C and FORTRAN. "Makefile" can be used to build these. 21 | 22 | "kvsBatchWrapper.sh" is a short script to invoke a program that uses KVS via a Slurm sbatch submission, e.g.: 23 | 24 | sbatch -N 2 --ntasks-per-node=28 --exclusive kvsBatchWrapper.sh ./kvsTestf 25 | 26 | `wskvsmu.py` is a prototype web interface for displaying the state of a KVS server (and injecting values into it). Uses `wskvspage.html` as the frontend. 27 | 28 | "kvsTestWIc.c" and "kvsTestWIf.f" provide example codes that use KVS via wskvsmu.py to enter input from a web browser into C or FORTRAN. 29 | 30 | ## License 31 | 32 | Copyright 2017 Simons Foundation 33 | 34 | Licensed under the Apache License, Version 2.0 (the "License"); 35 | you may not use this file except in compliance with the License. 36 | You may obtain a copy of the License at 37 | 38 | http://www.apache.org/licenses/LICENSE-2.0 39 | 40 | Unless required by applicable law or agreed to in writing, software 41 | distributed under the License is distributed on an "AS IS" BASIS, 42 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 43 | See the License for the specific language governing permissions and 44 | limitations under the License. 45 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['KVSClient', 'KVSServerThread'] 2 | 3 | from .kvsclient import KVSClient 4 | from .kvsstcp import KVSServer as KVSServerThread 5 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/kvsclient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import errno 4 | import os 5 | import socket 6 | import sys 7 | import time 8 | from pickle import dumps as PDS 9 | from pickle import loads as PLS 10 | 11 | from .kvscommon import AsciiLenChars, AsciiLenFormat, recvall 12 | 13 | 14 | class KVSClient: 15 | """KVS convenience wrapper that includes pickling by default.""" 16 | 17 | def __init__(self, host=None, port=None, retry=0): 18 | """Establish connection to a key value storage server at an address 19 | given by host, port or "host:port" 20 | 21 | If retry > 0, retry the connection this many times if it fails. 22 | """ 23 | if not host: 24 | host = os.environ.get('KVSSTCP_HOST', None) 25 | # TODO: Silently overrides user supplied value, if there is one. 
26 | port = os.environ.get('KVSSTCP_PORT', None) 27 | 28 | if not host: 29 | raise Exception('Missing host') 30 | 31 | if not port: 32 | if type(host) is tuple: 33 | host, port = host 34 | elif ':' in host: 35 | host, port = host.rsplit(':', 1) 36 | else: 37 | raise Exception('Missing port') 38 | 39 | self.addr = (host, int(port)) 40 | self.socket = None 41 | self.connect(retry) 42 | 43 | def clone(self): 44 | """Create a new connection to the same server as this one.""" 45 | return KVSClient(self.addr) 46 | 47 | # Low-level network operations 48 | def _close(self): 49 | if not self.socket: 50 | return 51 | try: 52 | self._real_socket().close() 53 | except OSError: 54 | pass 55 | self.socket = None 56 | 57 | def _recvValue(self, doPickle=False): 58 | L = int(recvall(self.socket, AsciiLenChars)) 59 | payload = recvall(self.socket, L) 60 | if doPickle: 61 | payload = PLS(payload) 62 | return payload 63 | 64 | def _sendLenAndBytes(self, payload): 65 | if type(payload) is not bytes: 66 | payload = bytes(payload, 'utf-8') 67 | self.socket.sendall(AsciiLenFormat(len(payload))) 68 | self.socket.sendall(payload) 69 | 70 | class SocketWaiting: 71 | """Used as placeholder socket when there's an incomplete get/view call 72 | that must be retried. The real socket and outstanding op are stashed.""" 73 | 74 | def __init__(self, socket, op): 75 | self.socket = socket 76 | self.op = op 77 | 78 | def __nonzero__(self): 79 | return True 80 | 81 | def __bool__(self): 82 | return True 83 | 84 | def __getattr__(self, attr): 85 | """Disallow any other operations on a waiting socket.""" 86 | raise Exception( 87 | "Previous {} timed out: you must retreive the previously requested '{}' value first.".format(*self.op) 88 | ) 89 | 90 | def _real_socket(self): 91 | """Get the real socket, even if we have an outstanding SocketWaiting.""" 92 | try: 93 | # for SocketWaiting 94 | return self.socket.socket 95 | except AttributeError: 96 | return self.socket 97 | 98 | def _get_view(self, op, k, encoding, timeout=None): 99 | try: 100 | # check if we're waiting for something 101 | waiting = self.socket.op 102 | except AttributeError: 103 | waiting = None 104 | if waiting == (op, k): 105 | # continue previous timedout wait 106 | self.socket = self.socket.socket 107 | else: 108 | # new wait 109 | self.socket.sendall(op) 110 | self._sendLenAndBytes(k) 111 | if timeout is None: 112 | coding = recvall(self.socket, 4) 113 | else: 114 | self.socket.settimeout(timeout) 115 | try: 116 | c = self.socket.recv(1) 117 | except socket.timeout: 118 | self.socket = self.SocketWaiting(self.socket, (op, k)) 119 | return 120 | except OSError as e: 121 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 122 | self.socket = self.SocketWaiting(self.socket, (op, k)) 123 | return 124 | else: 125 | raise 126 | finally: 127 | self._real_socket().settimeout(None) 128 | if not c: 129 | raise OSError('Connection closed') 130 | coding = c + recvall(self.socket, 3) 131 | v = self._recvValue(encoding is True and coding == b'PYPK') 132 | return v if isinstance(encoding, bool) else (coding, v) 133 | 134 | def connect(self, retry=0): 135 | """Reconnect, if necessary. 
Can be used after an explicit close.""" 136 | if self.socket: 137 | return 138 | rep = 0 139 | while 1: 140 | try: 141 | self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 142 | self.socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 143 | self.socket.connect(self.addr) 144 | return 145 | except OSError as msg: 146 | self._close() 147 | if rep >= retry: 148 | raise 149 | print(f'kvs socket error: {msg}, retrying', file=sys.stderr) 150 | # exponential backoff 151 | time.sleep(2**rep) 152 | rep += 1 153 | 154 | def close(self): 155 | """Close the connection to the KVS storage server. Does a socket shutdown as well.""" 156 | if not self.socket: 157 | return 158 | try: 159 | self.socket.sendall(b'clos') 160 | self.socket.shutdown(socket.SHUT_RDWR) 161 | except OSError as e: 162 | # this is the client --- cannot assume logging is available. 163 | print(f'Ignoring exception during client close: "{e}"', file=sys.stderr) 164 | self._close() 165 | 166 | def dump(self): 167 | """Returns a snapshot of the KV store and its statistics.""" 168 | self.socket.sendall(b'dump') 169 | return self._recvValue(True) 170 | 171 | def get(self, key, encoding=True): 172 | """Retrieve and remove a value from the store. If there is no value 173 | associated with this key, block until one is added by another client 174 | (with put). 175 | 176 | If encoding is True, and the value was pickled, then the value will be 177 | unpickled before being returned. If encoding is False, just return the 178 | raw value. For anything else, return (encoding, value). 179 | """ 180 | return self._get_view(b'get_', key, encoding) 181 | 182 | def _get_nb(self, key, encoding=True, timeout=None): 183 | """Non-blocking get. 184 | 185 | If timeout is not None, this will only wait for timeout seconds before 186 | returning None. In this case, you MUST call this function again in the 187 | future until it returns a value before doing any other operation, 188 | otherwise the value may be lost.""" 189 | return self._get_view(b'get_', key, encoding, timeout) 190 | 191 | def view(self, key, encoding=True): 192 | """Retrieve, but do not remove, a value from the store. See 'get'.""" 193 | return self._get_view(b'view', key, encoding) 194 | 195 | def _view_nb(self, key, encoding=True, timeout=None): 196 | """Non-blocking view. See '_get_nb' and 'view'.""" 197 | return self._get_view(b'view', key, encoding, timeout) 198 | 199 | def put(self, key, value, encoding=True): 200 | """Add a value to the key. If encoding is True, pickle the value and 201 | encode as PYPK. If False, convert to string and store as ASTR. 202 | Otherwise, encoding must be a 4 character string, and value must be a 203 | string.""" 204 | if encoding is True: 205 | value = PDS(value) 206 | encoding = b'PYPK' 207 | elif encoding is False: 208 | # TODO: Is this silent stringification too clever by half? 209 | # Maybe, since unicode strings will end up as "u'\\u...'". perhaps utf8-encode strings, and fail on other types? 
210 | if type(value) is not str and type(value) is not bytes: 211 | value = repr(value) 212 | encoding = b'ASTR' 213 | else: 214 | if type(encoding) is not bytes: 215 | if type(encoding) is not str: 216 | encoding = repr(encoding) 217 | encoding = bytes(encoding, 'utf-8') 218 | if len(encoding) != 4: 219 | raise TypeError(f'Invalid encoding: {encoding}') 220 | 221 | self.socket.sendall(b'put_') 222 | self._sendLenAndBytes(key) 223 | self.socket.sendall(encoding) 224 | self._sendLenAndBytes(value) 225 | 226 | def monkey(self, mkey, value): 227 | """Make mkey a monitor key. Value encodes what events to monitor and 228 | for which key: 229 | 230 | Key:Events 231 | 232 | Whenever a listed event occurs for "Key", a put will be done 233 | to "Mkey" with the value " ". If 'Key' is empty, 234 | the events listed will be monitored for all keys. 'Events' is 235 | some subset of 'g', 'p', 'v' and 'w' (get, put, view and 236 | wait). Monitoring of any event *not* listed is turned off for 237 | the specified key. 238 | """ 239 | self.socket.sendall(b'mkey') 240 | self._sendLenAndBytes(mkey) 241 | self._sendLenAndBytes(value) 242 | 243 | def shutdown(self): 244 | """Tell the KVS server to shutdown (and run the close() method for this client).""" 245 | try: 246 | self._real_socket().sendall(b'down') 247 | finally: 248 | self._close() 249 | 250 | 251 | def addKVSServerArgument(argp, name='kvsserver'): 252 | """Add an argument to the given ArgumentParser that accepts the address of a running KVSServer, defaulting to $KVSSTCP_HOST:$KVSSTCP_PORT.""" 253 | host = os.environ.get('KVSSTCP_HOST') 254 | port = os.environ.get('KVSSTCP_PORT') if host else None 255 | argp.add_argument( 256 | name, 257 | metavar='host:port', 258 | nargs='?' if port else None, 259 | default=host + ':' + port if port else None, 260 | help='KVS server address.', 261 | ) 262 | 263 | 264 | if '__main__' == __name__: 265 | import argparse 266 | 267 | class OpAction(argparse.Action): 268 | def __call__(self, parser, namespace, values, option_string=None): 269 | items = getattr(namespace, 'ops', []) 270 | op = self.option_strings[1][2:] 271 | if op in ('get', 'view', 'put'): 272 | encoding = getattr(namespace, 'encoding', False) 273 | values.append(encoding) 274 | if encoding is True and op == 'put': 275 | values[1] = eval(values[1], {}) 276 | if op in ('get', 'view'): 277 | op = '_' + op + '_nb' 278 | values.append(getattr(namespace, 'timeout', None)) 279 | values.insert(0, op) 280 | items.append(values) 281 | namespace.ops = items 282 | 283 | argp = argparse.ArgumentParser(description='Command-line client to key-value storage server.') 284 | argp.add_argument( 285 | '-R', '--retry', default=0, type=int, metavar='COUNT', help='Number of times to retry on connect failure [0]' 286 | ) 287 | argp.add_argument( 288 | '-P', '--pickle', dest='encoding', action='store_true', help='(Un-)Pickle values to/from python expressions' 289 | ) 290 | argp.add_argument( 291 | '-A', '--no-pickle', dest='encoding', action='store_false', help="Don't (un-)pickle values (default)" 292 | ) 293 | argp.add_argument( 294 | '-E', 295 | '--encoding', 296 | dest='encoding', 297 | type=str, 298 | metavar='CODE', 299 | help='Explicitly set/get encoding (4-character string, ignored on get) [ASTR or PYPK with -P]', 300 | ) 301 | argp.add_argument('-T', '--timeout', type=float, metavar='SECS', nargs='?', help='Timeout waiting for get/view') 302 | argp.add_argument('-d', '--dump', action=OpAction, nargs=0, help='Dump the current state') 303 | argp.add_argument('-g', '--get', 
action=OpAction, nargs=1, metavar='KEY', help='Retrieve and remove a value') 304 | argp.add_argument('-v', '--view', action=OpAction, nargs=1, metavar='KEY', help='Retrieve a value') 305 | argp.add_argument('-p', '--put', action=OpAction, nargs=2, metavar=('KEY', 'VALUE'), help='Put a value') 306 | argp.add_argument( 307 | '-m', 308 | '--monkey', 309 | action=OpAction, 310 | nargs=2, 311 | metavar=('MKEY', 'KEY:EVENTS'), 312 | help='Create or update a monitor for the key and events', 313 | ) 314 | argp.add_argument('-S', '--shutdown', action=OpAction, nargs=0, help='Tell the server to shutdown') 315 | argp.add_argument('-s', '--sleep', action=OpAction, nargs=1, type=float, metavar='SECS', help='Pause for a time') 316 | addKVSServerArgument(argp, 'server') 317 | args = argp.parse_args() 318 | 319 | kvs = KVSClient(args.server, retry=args.retry) 320 | 321 | if hasattr(args, 'ops') and args.ops: 322 | for cmd in args.ops: 323 | op = cmd.pop(0) 324 | if op == 'sleep': 325 | time.sleep(*cmd) 326 | else: 327 | try: 328 | r = getattr(kvs, op)(*cmd) 329 | if r is not None: 330 | print(r) 331 | except Exception as e: 332 | print(e, file=sys.stderr) 333 | else: 334 | print('Nothing to do.') 335 | kvs.close() 336 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/kvscommon.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | 4 | # The line protocol is very simple: 5 | # 6 | # 4 byte operation ('clos', 'dump', 'get_', 'mkey', 'put_', 'view') 7 | # 8 | # 'clos': No additional argument 9 | # 10 | # 'dump': No additional argument 11 | # 12 | # 'get_': One key argument, expects a value argument in reply. 13 | # 14 | # 'mkey' (monitor key): Two key arguments. The second may have a ': ...' 15 | # suffix indicating events to be monitored. 16 | # 17 | # 'put_': One key argument followed by one value argument. 18 | # 19 | # 'view': One key argument, expects a value argument in reply. 20 | # 21 | # Key representation: 22 | # 10 bytes: A 10 character string (ascii, not null terminated) with the base 10 23 | # representation of the byte length of the key string 24 | # length bytes: the key string 25 | # 26 | # Value representatin: 27 | # 4 bytes: coding scheme. 28 | # 10 bytes: A 10 character string (ascii, not null terminated) with the base 10 29 | # representation of the byte length of the argument 30 | # length bytes: the string representing the key 31 | # 32 | # Notes: 33 | # 34 | # 1) Coding schemes for values is a work in progress. 
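#
# 2) Worked example (illustrative key and value): the client call
#       put('x', 'hello', encoding=False)
#    travels on the wire as
#       b'put_' + b'         1' + b'x' + b'ASTR' + b'         5' + b'hello'
#    i.e. the 4-byte op, the 10-character right-justified ASCII length of the
#    key followed by the key bytes, the 4-byte coding scheme, and the
#    10-character ASCII length of the value followed by the value bytes.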
35 | # 36 | 37 | AsciiLenChars = 10 38 | 39 | 40 | def AsciiLenFormat(n): 41 | assert n <= 9999999999 42 | return str(n).encode('ascii').rjust(AsciiLenChars) 43 | 44 | 45 | if hasattr(socket, 'MSG_WAITALL') and os.uname()[0] != 'Darwin': 46 | # MSG_WAITALL on OSX ends up blocking if the tcp buffer is not big enough for the entire message: don't use it 47 | def recvall(s, n): 48 | if s is None: 49 | raise OSError('socket is None, cannot receive') 50 | if not n: 51 | return b'' 52 | r = s.recv(n, socket.MSG_WAITALL) 53 | if len(r) < n: 54 | raise OSError('Connection dropped') 55 | return r 56 | else: 57 | 58 | def recvall(s, n): 59 | """Wrapper to deal with partial recvs when we know there are N bytes to be had.""" 60 | if s is None: 61 | raise OSError('socket is None, cannot receive') 62 | d = b'' 63 | while n: 64 | b = s.recv(n) 65 | if not b: 66 | raise OSError('Connection dropped') 67 | d += b 68 | n -= len(b) 69 | return d 70 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/kvsstcp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import errno 3 | import logging 4 | import os 5 | import resource 6 | import select 7 | import socket 8 | import threading 9 | from collections import defaultdict as DD 10 | from functools import partial 11 | from pickle import dumps as PDS 12 | 13 | from .kvscommon import AsciiLenChars, AsciiLenFormat 14 | 15 | logger = logging.getLogger('kvs') 16 | 17 | # There are some cyclic references in in asyncio, handlers, waiters, etc., so I'm re-enabling this: 18 | # gc.disable() 19 | 20 | _DISCONNECTED = frozenset( 21 | (errno.ECONNRESET, errno.ENOTCONN, errno.ESHUTDOWN, errno.ECONNABORTED, errno.EPIPE, errno.EBADF) 22 | ) 23 | _BUFSIZ = 8192 24 | 25 | # Concepts: 26 | # 27 | # Every connection is represented by a dispatcher. 28 | # 29 | # Every dispatcher is registered with a handler, which in effect runs 30 | # the KVS server loop. 31 | # 32 | # The handler runs an infinite loop that mostly sits on a poll of some 33 | # sort waiting for one or more events associated with registered 34 | # connections (identified by their file descriptor). 35 | # 36 | # When an event occurs the dispatcher associated with the connection 37 | # is used to process the event. 38 | # 39 | # The listening socket is treated just like any other connection and 40 | # has its own dispatcher. An "event" on this connection triggers an 41 | # accept that leads to the creation of a new dispatcher 42 | # (KVSRequestDispatcher) to handle exchanges with the client. 43 | # 44 | # This approach has the very important benefit that it is single threaded. 45 | 46 | 47 | class Handler: 48 | """Based on asyncore, but with a simpler, stricter per-thread interface that allows better performance.""" 49 | 50 | def __init__(self): 51 | self.disps = dict() 52 | self.current = None 53 | self.running = True 54 | 55 | def register(self, disp): 56 | self.disps[disp.fd] = disp 57 | 58 | def unregister(self, disp): 59 | del self.disps[disp.fd] 60 | 61 | def run(self): 62 | while self.running: 63 | try: 64 | self.poll() 65 | except OSError as e: 66 | if e.errno == errno.EINTR: 67 | continue 68 | raise 69 | for d in list(self.disps.values()): 70 | try: 71 | d.close() 72 | except Exception as e: 73 | logger.info('%r reported %r on close in handler.', d, e) 74 | self.close() 75 | 76 | def writable(self, disp): 77 | "Equivalent to setting mask | OUT, but safe to be called from other (non-current) handlers." 
78 | if disp.mask & self.OUT: 79 | return 80 | disp.mask |= self.OUT 81 | # write can be called from other threads 82 | if self.current is not disp: 83 | self.modify(disp) 84 | 85 | def close(self): 86 | self.running = False 87 | 88 | 89 | class PollHandler(Handler): 90 | def __init__(self): 91 | self.IN, self.OUT, self.EOF = select.POLLIN, select.POLLOUT, select.POLLHUP 92 | self.poller = select.poll() 93 | Handler.__init__(self) 94 | 95 | def register(self, disp): 96 | Handler.register(self, disp) 97 | self.poller.register(disp.fd, disp.mask) 98 | 99 | def unregister(self, disp): 100 | self.poller.unregister(disp.fd) 101 | Handler.unregister(self, disp) 102 | 103 | def modify(self, disp): 104 | self.poller.modify(disp.fd, disp.mask) 105 | 106 | def poll(self): 107 | ev = self.poller.poll() 108 | for f, e in ev: 109 | d = self.current = self.disps[f] 110 | oldm = d.mask 111 | if e & self.EOF: 112 | d.handle_close() 113 | continue 114 | if e & self.IN: 115 | d.handle_read() 116 | if d.mask & self.OUT: 117 | d.handle_write() 118 | self.current = None 119 | if d.mask != oldm and not (d.mask & self.EOF): 120 | self.modify(d) 121 | 122 | def stop(self, disp): 123 | Handler.close(self) 124 | 125 | 126 | class EPollHandler(PollHandler): 127 | def __init__(self): 128 | self.IN, self.OUT, self.EOF = select.EPOLLIN, select.EPOLLOUT, select.EPOLLHUP 129 | self.poller = select.epoll() 130 | Handler.__init__(self) 131 | 132 | def close(self): 133 | self.poller.close() 134 | Handler.close(self) 135 | 136 | 137 | class KQueueHandler(Handler): 138 | def __init__(self): 139 | self.IN, self.OUT, self.EOF = 1, 2, 4 140 | self.kqueue = select.kqueue() 141 | Handler.__init__(self) 142 | 143 | def register(self, disp): 144 | Handler.register(self, disp) 145 | disp.curmask = 0 146 | self.modify(disp) 147 | 148 | def unregister(self, disp): 149 | disp.mask = 0 150 | self.modify(disp) 151 | Handler.unregister(self, disp) 152 | 153 | def modify(self, disp): 154 | c = [] 155 | if disp.mask & self.IN: 156 | if not (disp.curmask & self.IN): 157 | c.append(select.kevent(disp.fd, select.KQ_FILTER_READ, select.KQ_EV_ADD)) 158 | elif disp.curmask & self.IN: 159 | c.append(select.kevent(disp.fd, select.KQ_FILTER_READ, select.KQ_EV_DELETE)) 160 | if disp.mask & self.OUT: 161 | if not (disp.curmask & self.OUT): 162 | c.append(select.kevent(disp.fd, select.KQ_FILTER_WRITE, select.KQ_EV_ADD)) 163 | elif disp.curmask & self.OUT: 164 | c.append(select.kevent(disp.fd, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)) 165 | if c: 166 | self.kqueue.control(c, 0) 167 | disp.curmask = disp.mask 168 | 169 | def poll(self): 170 | try: 171 | ev = self.kqueue.control(None, 1024) 172 | except OSError as e: 173 | if e.errno == errno.EBADF: 174 | self.running = False 175 | return 176 | raise 177 | for e in ev: 178 | d = self.current = self.disps[e.ident] 179 | if e.filter == select.KQ_FILTER_READ: 180 | d.handle_read() 181 | elif e.filter == select.KQ_FILTER_WRITE: 182 | d.handle_write() 183 | self.current = None 184 | if self.running: 185 | self.modify(d) 186 | 187 | def close(self): 188 | self.kqueue.close() 189 | Handler.close(self) 190 | 191 | def stop(self, disp): 192 | self.close() 193 | 194 | 195 | class Dispatcher: 196 | def __init__(self, sock, handler, mask=0): 197 | self.sock = sock 198 | self.fd = sock.fileno() 199 | self.mask = mask 200 | sock.setblocking(0) 201 | self.handler = handler 202 | 203 | def open(self): 204 | self.handler.register(self) 205 | 206 | def close(self): 207 | self.mask = self.handler.EOF 208 | 
self.handler.unregister(self) 209 | try: 210 | self.sock.close() 211 | except OSError: 212 | pass 213 | 214 | def accept(self): 215 | try: 216 | return self.sock.accept() 217 | except OSError as e: 218 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 219 | return 220 | if e.errno in _DISCONNECTED or e.errno == errno.EINVAL: 221 | self.handle_close() 222 | return 223 | raise 224 | 225 | def send(self, data): 226 | try: 227 | return self.sock.send(data) 228 | except OSError as e: 229 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 230 | return 0 231 | if e.errno in _DISCONNECTED: 232 | self.handle_close() 233 | return 0 234 | raise 235 | 236 | def recv(self, siz): 237 | try: 238 | data = self.sock.recv(siz) 239 | if not data: 240 | self.handle_close() 241 | return data 242 | except OSError as e: 243 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 244 | return b'' 245 | if e.errno in _DISCONNECTED: 246 | self.handle_close() 247 | return b'' 248 | raise 249 | 250 | def recv_into(self, buf): 251 | try: 252 | n = self.sock.recv_into(buf) 253 | if n == 0: 254 | self.handle_close() 255 | return n 256 | except OSError as e: 257 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 258 | return b'' 259 | if e.errno in _DISCONNECTED: 260 | self.handle_close() 261 | return b'' 262 | raise 263 | 264 | def shutdown(self): 265 | try: 266 | self.mask |= self.handler.IN 267 | self.sock.shutdown(socket.SHUT_RDWR) 268 | except OSError as e: 269 | if e.errno not in _DISCONNECTED: 270 | raise 271 | 272 | def handle_close(self): 273 | self.close() 274 | 275 | 276 | class StreamDispatcher(Dispatcher): 277 | """Based on asyncore.dispatcher_with_send, works with EventHandler. 278 | Also allows input of known-size blocks.""" 279 | 280 | def __init__(self, sock, handler): 281 | super().__init__(sock, handler) 282 | self.out_buf = [] 283 | self.in_buf = memoryview(bytearray(_BUFSIZ)) 284 | self.in_off = 0 285 | self.read_size = 0 286 | self.read_handler = None 287 | 288 | def write(self, *data): 289 | for d in data: 290 | self.out_buf.append(memoryview(d)) 291 | self.handler.writable(self) 292 | 293 | def handle_write(self): 294 | while self.out_buf: 295 | buf = self.out_buf[0] 296 | r = self.send(buf[:1048576]) 297 | if r < len(buf): 298 | if r: 299 | self.out_buf[0] = buf[r:] 300 | return 301 | self.out_buf.pop(0) 302 | self.mask &= ~self.handler.OUT 303 | 304 | def next_read(self, size, f): 305 | self.read_size = size 306 | if size > len(self.in_buf): 307 | buf = memoryview(bytearray(max(size, _BUFSIZ))) 308 | buf[: self.in_off] = self.in_buf[: self.in_off] 309 | self.in_buf = buf 310 | self.read_handler = f 311 | self.mask |= self.handler.IN 312 | 313 | def handle_read(self): 314 | if self.in_off < len(self.in_buf): 315 | self.in_off += self.recv_into(self.in_buf[self.in_off :]) 316 | while True: 317 | handler = self.read_handler 318 | z = self.read_size 319 | if not handler or self.in_off < z: 320 | return 321 | i = self.in_buf[:z] 322 | self.in_buf = self.in_buf[z:] 323 | self.in_off -= z 324 | self.read_handler = None 325 | self.mask &= ~self.handler.IN 326 | handler(i) 327 | 328 | 329 | class KVSRequestDispatcher(StreamDispatcher): 330 | def __init__(self, pair, server, handler): 331 | sock, self.addr = pair 332 | self.server = server 333 | # Keep track of any currently waiting get: 334 | self.waiter = None 335 | sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 336 | super().__init__(sock, handler) 337 | logger.info('Accepted connect from %r', self.addr) 338 | self.next_op() 339 | self.open() 
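        # From here on, each request is processed as a chain of fixed-size reads:
        # next_op() waits for a 4-byte op code, next_lendata() reads a
        # 10-character ASCII length followed by that many bytes, and each
        # handle_* callback schedules the next read.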
340 | 341 | def handle_close(self): 342 | self.cancel_waiter() 343 | logger.info('Closing connection from %r', self.addr) 344 | self.close() 345 | 346 | def error(self, msg): 347 | logger.error(f'Error from {self.addr!r}: {msg}') 348 | self.close() 349 | 350 | def cancel_waiter(self): 351 | if self.waiter: 352 | self.server.kvs.cancel_wait(self.waiter) 353 | self.waiter = None 354 | 355 | def next_op(self): 356 | self.next_read(4, self.handle_op) 357 | 358 | def next_lendata(self, handler): 359 | # wait for variable-length data prefixed by AsciiLenFormat 360 | def handle_len(L): 361 | L = L.tobytes() 362 | try: 363 | n = int(L) 364 | except ValueError: 365 | n = -1 366 | if n < 0: 367 | self.error(f"invalid data len: '{L}'") 368 | return 369 | self.next_read(n, handler) 370 | 371 | self.next_read(AsciiLenChars, handle_len) 372 | 373 | def handle_op(self, op): 374 | op = op.tobytes() 375 | if b'clos' == op: 376 | self.shutdown() 377 | elif b'down' == op: 378 | logger.info('Calling server shutdown') 379 | self.server.shutdown() 380 | elif b'dump' == op: 381 | d = self.server.kvs.dump() 382 | self.write(AsciiLenFormat(len(d)), d) 383 | self.next_op() 384 | elif op in [b'get_', b'mkey', b'put_', b'view']: 385 | self.next_lendata(partial(self.handle_opkey, op)) 386 | else: 387 | self.error(f"Unknown op: '{op!r}'") 388 | 389 | def handle_opkey(self, op, key): 390 | key = key.tobytes() 391 | # DEBUGOFF logger.debug('(%s) %s key "%s"', whoAmI, reqtxt, key) 392 | if b'mkey' == op: 393 | self.next_lendata(partial(self.handle_mkey, key)) 394 | elif b'put_' == op: 395 | self.next_read(4, lambda encoding: self.next_lendata(partial(self.handle_put, key, encoding))) 396 | else: # 'get_' or 'view' 397 | # Cancel waiting for any previous get/view operation (since client wouldn't be able to distinguish the async response) 398 | self.cancel_waiter() 399 | self.waiter = KVSWaiter(op, key, self.handle_got) 400 | self.server.kvs.wait(self.waiter) 401 | # But keep listening for another op (like 'clos') to cancel this one 402 | self.next_op() 403 | 404 | def handle_mkey(self, key, val): 405 | # DEBUGOFF logger.debug('(%s) val: %s', whoAmI, repr(val)) 406 | self.server.kvs.monkey(key, val) 407 | self.next_op() 408 | 409 | def handle_put(self, key, encoding, val): 410 | # TODO: bytearray val? 411 | # DEBUGOFF logger.debug('(%s) val: %s', whoAmI, repr(val)) 412 | self.server.kvs.put(key, (encoding, val)) 413 | self.next_op() 414 | 415 | def handle_got(self, encval): 416 | (encoding, val) = encval 417 | self.write(encoding, AsciiLenFormat(len(val)), val) 418 | self.waiter = None 419 | 420 | 421 | class KVSWaiter: 422 | def __init__(self, op, key, handler): 423 | if op == b'get_': 424 | op = b'get' 425 | self.op = op 426 | self.delete = op == b'get' 427 | self.key = key 428 | self.handler = handler 429 | 430 | 431 | class KVS: 432 | """Get/Put/View implements a client-server key value store. If no 433 | value is associated with a given key, clients will block on get or 434 | view until a value is available. Multiple values may be associated 435 | with any given key. 436 | 437 | This is, by design, a very simple, lightweight service that only 438 | depends on standard Python modules. 439 | 440 | """ 441 | 442 | def __init__(self, getIndex=0, viewIndex=-1): 443 | self.getIndex, self.viewIndex = getIndex, viewIndex # TODO: Add sanity checks? 444 | self.key2mon = DD(lambda: DD(set)) # Maps a normal key to keys that monitor it. 445 | self.monkeys = set() # List of monitor keys. 
446 | # store and waiters are mutually exclusive, and could be kept in the same place 447 | self.store = DD(list) 448 | self.waiters = DD(list) 449 | self.opCounts = {b'get': 0, b'put': 0, b'view': 0, b'wait': 0} 450 | self.ac, self.rc = 0, 0 451 | 452 | def _doMonkeys(self, op, k): 453 | # Don't monitor operations on monitor keys. 454 | if k in self.monkeys: 455 | return 456 | # DEBUGOFF logger.debug('doMonkeys: %s %s %s', op, k, repr(self.key2mon[True][op] | self.key2mon[k][op])) 457 | for p in (True, k): 458 | for mk in self.key2mon[p][op]: 459 | self.put(mk, (b'ASTR', repr((op, k)))) 460 | 461 | def dump(self): 462 | """Utility function that returns a snapshot of the KV store.""" 463 | 464 | def vrep(v): 465 | t = v[0].tobytes() 466 | # Omit or truncate some values, in which cases add the original length as a third value 467 | if v == b'JSON' or t == b'HTML': 468 | return (t, v[1].tobytes()) 469 | if t != b'ASTR': 470 | return (t, None, len(v[1])) 471 | if v[1][:6].tobytes().lower() == '': 472 | return (t, v[1].tobytes()) # for backwards compatibility only 473 | if len(v[1]) > 50: 474 | return (t, v[1][:24].tobytes() + '...' + v[1][-23:].tobytes(), len(v[1])) 475 | return (t, v[1].tobytes()) 476 | 477 | return PDS( 478 | ( 479 | [ 480 | self.opCounts[b'get'], 481 | self.opCounts[b'put'], 482 | self.opCounts[b'view'], 483 | self.opCounts[b'wait'], 484 | self.ac, 485 | self.rc, 486 | ], 487 | [(k, len(v)) for k, v in self.waiters.items() if v], 488 | [(k, len(vv), vrep(vv[-1])) for k, vv in self.store.items() if vv], 489 | ) 490 | ) 491 | 492 | def wait(self, waiter): 493 | """Atomically (remove and) return a value associated with key k. If 494 | none, block.""" 495 | # DEBUGOFF logger.debug('wait: %s, %s', repr(waiter.key), repr(waiter.op)) 496 | self._doMonkeys(waiter.op, waiter.key) 497 | vv = self.store.get(waiter.key) 498 | if vv: 499 | if waiter.delete: 500 | v = vv.pop(self.getIndex) 501 | if not vv: 502 | self.store.pop(waiter.key) 503 | else: 504 | v = vv[self.viewIndex] 505 | self.opCounts[waiter.op] += 1 506 | # DEBUGOFF logger.debug('_gv (%s): %s => %s (%d)', waiter.op, waiter.key, repr(v[0]), len(v[1])) 507 | waiter.handler(v) 508 | else: 509 | self.waiters[waiter.key].append(waiter) 510 | self.opCounts[b'wait'] += 1 511 | self._doMonkeys(b'wait', waiter.key) 512 | # DEBUGOFF logger.debug('(%s) %s acquiring', repr(waiter), repr(s)) 513 | self.ac += 1 514 | 515 | def cancel_wait(self, waiter): 516 | ww = self.waiters.get(waiter.key) 517 | if ww: 518 | try: 519 | ww.remove(waiter) 520 | except ValueError: 521 | pass 522 | if not ww: 523 | self.waiters.pop(waiter.key) 524 | 525 | def monkey(self, mkey, v): 526 | """Make Mkey a monitor key. Value encodes what events to monitor and 527 | for which key: 528 | 529 | Key:Events 530 | 531 | Whenever a listed event occurs for "Key", a put will be done 532 | to "Mkey" with the value " ". If 'Key' is empty, 533 | the events listed will be monitored for all keys. 'Events' is 534 | some subset of 'g', 'p', 'v' and 'w' (get, put, view and 535 | wait). Monitoring of any event *not* listed is turned off for 536 | the specified key. 537 | 538 | """ 539 | # DEBUGOFF logger.debug('monkey: %s %s', mkey, v) 540 | if b':' not in v: 541 | return # TODO: Add some sort of error handling? 
542 | self.monkeys.add(mkey) 543 | k, events = v.rsplit(b':', 1) 544 | if not k: 545 | k = True 546 | for e, op in [(b'g', b'get'), (b'p', b'put'), (b'v', b'view'), (b'w', b'wait')]: 547 | if e in events: 548 | self.key2mon[k][op].add(mkey) 549 | else: 550 | try: 551 | self.key2mon[k][op].remove(mkey) 552 | except KeyError: 553 | pass 554 | # DEBUGOFF logger.debug('monkey: %s', repr(self.key2mon)) 555 | 556 | def put(self, k, v): 557 | """Add value v to those associated with the key k.""" 558 | # DEBUGOFF logger.debug('put: %s, %s', repr(k), repr(v)) 559 | self.opCounts[b'put'] += 1 560 | ww = self.waiters.get(k) # No waiters is probably most common, so optimize for 561 | # that. ww will be None if no waiters have been 562 | # registered for key k. 563 | consumed = False 564 | if ww: 565 | while ww: 566 | waiter = ww.pop(0) 567 | # DEBUGOFF logger.debug('%s releasing', repr(waiter)) 568 | self.rc += 1 569 | self.opCounts[waiter.op] += 1 570 | waiter.handler(v) 571 | if waiter.delete: 572 | consumed = True 573 | break 574 | if not ww: 575 | self.waiters.pop(k) 576 | 577 | if not consumed: 578 | self.store[k].append(v) 579 | self._doMonkeys(b'put', k) 580 | 581 | 582 | class KVSServer(threading.Thread, Dispatcher): 583 | def __init__(self, host=None, port=0): 584 | if not host: 585 | host = socket.gethostname() 586 | 587 | self.kvs = KVS() 588 | 589 | snof, hnof = resource.getrlimit(resource.RLIMIT_NOFILE) 590 | hnof = min(hnof, 1000000) # don't need unreasonably many 591 | if snof < hnof: 592 | try: 593 | resource.setrlimit(resource.RLIMIT_NOFILE, (hnof, hnof)) 594 | logger.info('Raised max open files from %d to %d', snof, hnof) 595 | except Exception: 596 | logger.info('Failed to raise max open files from %d to %d; continuing anyway', snof, hnof) 597 | pass 598 | 599 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 600 | self.sock.setblocking(1) 601 | self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 602 | self.sock.bind((host, port)) 603 | logger.info('Setting queue size to 4000') 604 | self.sock.listen(4000) 605 | self.cinfo = self.sock.getsockname() 606 | 607 | if hasattr(select, 'epoll'): 608 | self.handler = EPollHandler() 609 | elif hasattr(select, 'kqueue'): 610 | self.handler = KQueueHandler() 611 | else: 612 | self.handler = PollHandler() 613 | Dispatcher.__init__(self, self.sock, self.handler, self.handler.IN) 614 | self.open() 615 | 616 | threading.Thread.__init__(self, name='KVSServerThread', target=self.handler.run) 617 | self.start() 618 | 619 | def handle_read(self): 620 | pair = self.accept() 621 | if pair: 622 | KVSRequestDispatcher(pair, self, self.handler) 623 | 624 | def handle_close(self): 625 | logger.info('Server shutting down') 626 | self.close() 627 | self.handler.close() 628 | 629 | def shutdown(self): 630 | if self.handler.running: 631 | super().shutdown() 632 | self.handler.stop(self) 633 | 634 | def env(self, env=os.environ.copy()): 635 | """Add the KVSSTCP environment variables to the given environment.""" 636 | env['KVSSTCP_HOST'] = self.cinfo[0] 637 | env['KVSSTCP_PORT'] = str(self.cinfo[1]) 638 | return env 639 | 640 | 641 | if '__main__' == __name__: 642 | import argparse 643 | 644 | argp = argparse.ArgumentParser(description='Start key-value storage server.') 645 | argp.add_argument('-H', '--host', default='', help='Host interface (default is hostname).') 646 | argp.add_argument('-p', '--port', type=int, default=0, help='Port (default is 0 --- let the OS choose).') 647 | argp.add_argument( 648 | '-a', 649 | '--addrfile', 650 | 
default=None, 651 | metavar='AddressFile', 652 | type=argparse.FileType('w'), 653 | help='Write address to this file.', 654 | ) 655 | argp.add_argument( 656 | '-e', '--execcmd', default=None, metavar='COMMAND SEQUENCE', help='Execute command with augmented environment.' 657 | ) 658 | argp.add_argument( 659 | '-l', 660 | '--logfile', 661 | default=None, 662 | metavar='KVSSLogfile', 663 | type=argparse.FileType('w'), 664 | help='Log file for key-value storage server.', 665 | ) 666 | args = argp.parse_args() 667 | 668 | # TODO: figure out where this should really go. 669 | lconf = {'format': '%(asctime)s %(levelname)-8s %(name)-15s: %(message)s', 'level': logging.DEBUG} 670 | if args.logfile: 671 | args.logfile.close() 672 | lconf['filename'] = args.logfile.name 673 | logging.basicConfig(**lconf) 674 | 675 | t = KVSServer(args.host, args.port) 676 | addr = '{:s}:{:d}'.format(*t.cinfo) 677 | logger.info('Server running at %s.', addr) 678 | if args.addrfile: 679 | args.addrfile.write(addr) 680 | args.addrfile.close() 681 | 682 | try: 683 | if args.execcmd: 684 | import subprocess 685 | 686 | logger.info('Launching: %r, env %r', args.execcmd, t.env()) 687 | subprocess.check_call(args.execcmd, shell=True, env=t.env()) 688 | else: 689 | while t.isAlive(): 690 | t.join(60) 691 | finally: 692 | t.shutdown() 693 | t.join() 694 | -------------------------------------------------------------------------------- /exampleTaskFiles/4KChecks: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Argument list should be the log files from a 4KTasksRep run. 4 | 5 | fc=4000 6 | al="$#" 7 | [[ ${al} == ${fc} ]] && echo "Found ${fc} files, as expected." || echo "Wrong number of log files ${al}, expected ${fc}." 8 | 9 | sum=4007998000 10 | dbsum=$(awk 'FNR == 2{c = c + $1}END{printf("%d\n", c)}' "$@") 11 | [[ ${sum} == ${dbsum} ]] && echo "Sum is ${sum}, as expected." || echo "Wrong sum ${dbsum}, expected ${sum}." 12 | 13 | echo "Should be ~13.00" 14 | awk 'FNR == 3{s = $1}FNR == 4 {c = c + $1 - s}END{printf("%.2f\n", c/4000.)}' "$@" 15 | 16 | echo -e "Now run:\n awk -F'\\\\t' '{print \$5}' | sort | uniq -c " 17 | -------------------------------------------------------------------------------- /exampleTaskFiles/4KTasksRep: -------------------------------------------------------------------------------- 1 | #DISBATCH PREFIX ( hostname ; echo $DISBATCH_REPEAT_INDEX ; date +%s.%3N; sleep 13 ; date +%s.%3N ) > /CHANGE/THIS/PATH/DeleteMe_${DISBATCH_JOBID}_${DISBATCH_TASKID}.log 2>&1 2 | #DISBATCH REPEAT 4000 start 1000000 3 | -------------------------------------------------------------------------------- /exampleTaskFiles/DBtasksOneBadOneLeaky: -------------------------------------------------------------------------------- 1 | # Note there is a space at the end of the next line. 2 | #DISBATCH PREFIX cd dbTestOutputDir ; 3 | #DISBATCH SUFFIX &>> ${DISBATCH_NAMETASKS}_engine_${DISBATCH_ENGINE_RANK}.log 4 | #DISBATCH PERENGINE START ( echo -n "perengine start on " ; hostname ; date ; sleep 10 ; date ) 5 | #DISBATCH PERENGINE STOP ( echo -n "perengine stop on " ; hostname ; date ; sleep 10 ; date ) 6 | 7 | 8 | #DISBATCH SUFFIX &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}.log 9 | # parentheses are important in this example so that all output is 10 | # collected and captured by the redirection. 
11 | ( echo "13 running on" $(hostname) ; date ; sleep 10 ; date ; ) 12 | ( echo "14 running on" $(hostname) ; date ; sleep 10 ; date ; ) 13 | ( echo "15 running on" $(hostname) ; date ; sleep 10 ; date ; ) 14 | ( echo "16 running on" $(hostname) ; date ; sleep 10 ; date ; ) 15 | ( echo "17 running on" $(hostname) ; date ; sleep 10 ; date ; ) 16 | ( echo "18 running on" $(hostname) ; date ; sleep 10 ; date ; ) 17 | ( echo "19 running on" $(hostname) ; date ; sleep 10 ; date ; ) 18 | ( echo "20 running on" $(hostname) ; date ; sleep 10 ; date ; ) 19 | ( echo "21 running on" $(hostname) ; date ; sleep 10 ; date ; ) 20 | ( echo "22 running on" $(hostname) ; date ; sleep 10 ; date ; ) 21 | ( echo "23 running on" $(hostname) ; date ; sleep 10 ; date ; ) 22 | ( echo "24 running on" $(hostname) ; date ; sleep 10 ; date ; ) 23 | ( echo "25 running on" $(hostname) ; date ; sleep 10 ; date ; ) 24 | 25 | # generate non-zero return code 26 | ( echo "26 running on" $(hostname) ; date ; sleep 10 ; date ; exit 13 ) 27 | 28 | ( echo "27 running on" $(hostname) ; date ; sleep 10 ; date ; ) 29 | ( echo "28 running on" $(hostname) ; date ; sleep 10 ; date ; ) 30 | ( echo "29 running on" $(hostname) ; date ; sleep 10 ; date ; ) 31 | #DISBATCH BARRIER 32 | #DISBATCH BARRIER mykey 33 | #DISBATCH BARRIER 34 | ( echo "33 running on" $(hostname) ; date ; sleep 10 ; date ; ) 35 | ( echo "34 running on" $(hostname) ; date ; sleep 10 ; date ; ) 36 | ( echo "35 running on" $(hostname) ; date ; sleep 10 ; date ; ) 37 | ( echo "36 running on" $(hostname) ; date ; sleep 10 ; date ; ) 38 | ( echo "37 running on" $(hostname) ; date ; sleep 10 ; date ; ) 39 | 40 | # leak some output 41 | ( echo "38 running on" $(hostname) ; date ; sleep 10 ; date ; ) ; echo 'missed some output' 42 | 43 | # singleton repeat 44 | #DISBATCH SUFFIX &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}_rx_${DISBATCH_REPEAT_INDEX}.log 45 | #DISBATCH REPEAT 1 start 39 ( echo "$DISBATCH_REPEAT_INDEX running on" $(hostname) ; date ; sleep 10 ; date ; ) 46 | 47 | # empty repeat 48 | #DISBATCH REPEAT 0 start 1 49 | 50 | # use zero padding for env variables in file name. 51 | #DISBATCH BARRIER 52 | #DISBATCH SUFFIX ( echo "${DISBATCH_REPEAT_INDEX} running on" $(hostname) ; echo "Zero-padded stream index: ${DISBATCH_STREAM_INDEX_ZP}" ; date ; sleep 10 ; date ; ) &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID_ZP}_rx_${DISBATCH_REPEAT_INDEX_ZP}.log 53 | #DISBATCH REPEAT 7 start 40 54 | 55 | # fail two in repeat 56 | #DISBATCH SUFFIX &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}_rx_${DISBATCH_REPEAT_INDEX}.log 57 | #DISBATCH REPEAT 3 start 50 ( echo "$DISBATCH_REPEAT_INDEX running on" $(hostname) ; date ; sleep 10 ; date ; [[ $DISBATCH_REPEAT_INDEX -eq 51 ]] ) 58 | 59 | #DISBATCH BARRIER 60 | #DISBATCH SUFFIX 61 | # All engines are idle at this point. Running one last task should trigger retirement(s) of all but one engine. 62 | ( echo "55 running on" $(hostname) ; date ; sleep 10 ; date ; ) &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}.log 63 | -------------------------------------------------------------------------------- /exampleTaskFiles/DCPTTasks: -------------------------------------------------------------------------------- 1 | #DISBATCH PREFIX ( hostname ; echo $DISBATCH_REPEAT_INDEX ; date +%s.%3N ; echo I have "$DISBATCH_CORES_PER_TASK cores." 
; sleep 30 ; date +%s.%3N ) > DCPTTest/DeleteMe_${DISBATCH_JOBID}_${DISBATCH_TASKID}.log 2>&1 2 | #DISBATCH REPEAT 20 start 1000000 3 | -------------------------------------------------------------------------------- /exampleTaskFiles/GPUTasks: -------------------------------------------------------------------------------- 1 | #DISBATCH PREFIX ( hostname ; echo $DISBATCH_REPEAT_INDEX ; date +%s.%3N ; env | egrep 'CUDA|GPU' ; nvidia-smi ; sleep 30 ; date +%s.%3N ) > GPUTest/DeleteMe_${DISBATCH_JOBID}_${DISBATCH_TASKID}.log 2>&1 2 | #DISBATCH REPEAT 20 start 1000000 3 | -------------------------------------------------------------------------------- /exampleTaskFiles/barrierCheckFail: -------------------------------------------------------------------------------- 1 | ( echo "23 running on" $(hostname) ; date ; sleep 10 ; date ; ) 2 | ( echo "24 running on" $(hostname) ; date ; sleep 10 ; date ; ) 3 | ( echo "25 running on" $(hostname) ; date ; sleep 10 ; date ; ) 4 | 5 | # generate non-zero return code 6 | ( echo "26 running on" $(hostname) ; date ; sleep 10 ; date ; exit 13 ) 7 | 8 | ( echo "27 running on" $(hostname) ; date ; sleep 10 ; date ; ) 9 | ( echo "28 running on" $(hostname) ; date ; sleep 10 ; date ; ) 10 | ( echo "29 running on" $(hostname) ; date ; sleep 10 ; date ; ) 11 | #DISBATCH BARRIER CHECK 12 | ( echo "33 running on" $(hostname) ; date ; sleep 10 ; date ; ) 13 | ( echo "34 running on" $(hostname) ; date ; sleep 10 ; date ; ) 14 | ( echo "35 running on" $(hostname) ; date ; sleep 10 ; date ; ) 15 | -------------------------------------------------------------------------------- /exampleTaskFiles/dberTest.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | import sys 4 | 5 | from disbatch import disBatch 6 | 7 | # This test script requires at least one argument: the number of tasks to run. 8 | # The rest, if any, are arguments that will be passed to disBatch: 9 | # 10 | # - If testing on your local machine, try something like 11 | # 12 | # dberTest.py 15 -s localhost:5 13 | # 14 | # - If testing via a Slurm submission, disBatch will auto detect 15 | # that, so no additional arguments are needed 16 | # 17 | NumTasks = int(sys.argv[1]) 18 | dbArgs = sys.argv[2:] 19 | 20 | # The first argument is a prefix that will be used internally to 21 | # identify support activities related to this run. The rest are 22 | # arguments for disBatch. 23 | db = disBatch.DisBatcher(tasksname='testing', args=dbArgs) 24 | 25 | # We use this to keep track of the tasks. 26 | # disBatch assigns a numeric ID to each tasks, starting from 0. We need 27 | # to do the same to track the tasks. 28 | tasks = {} 29 | for x in range(NumTasks): 30 | # Tasks are simply ASCII command lines. The '{}' in the following 31 | # are interpreted by python, not bash. 32 | # We force an error return of task 7. 33 | tasks[x] = ( 34 | f'{{ date ; hostname ; sleep 2 ; echo {x}^2 $(( {x} * {x} )) ; [[ {x} == 7 ]] && exit 1 ; date ; }} > square.log_{x:03d} 2>&1 ' 35 | ) 36 | 37 | # Submit the task. 38 | db.submit(tasks[x]) 39 | 40 | # syncTasks waits for all tasks identified by the keys of "tasks" to 41 | # complete. It returns a dictionary that maps an id to a return code 42 | # and the complete status report for the task. syncTasks maintains an 43 | # internal dictionary of return codes, so this operation is 44 | # idempotent. 
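# Each status report is a dict; the fields used below are 'TaskId',
# 'ReturnCode', and 'TaskCmd'.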
45 | tid2status = db.syncTasks(tasks) 46 | for tid in tasks: 47 | print( 48 | 'task {:d}: {:s} returned {:d}, matched: {}'.format( 49 | tid, 50 | repr(tasks[tid]), 51 | tid2status[tid]['ReturnCode'], 52 | repr(tasks[tid]) == tid2status[tid]['TaskCmd'], 53 | ) 54 | ) 55 | 56 | # Now try a repeat construct. Force an error for the index 112. 57 | db.submit( 58 | f'#DISBATCH REPEAT {NumTasks} start 100 step 3 x=${{DISBATCH_REPEAT_INDEX}} ; {{ date ; hostname ; sleep 2 ; echo $x^3 $(( x * x * x )) ; [[ $x == 112 ]] && exit 1 ; date ; }} > cube.log_$(printf "%03d" $x) 2>&1' 59 | ) 60 | 61 | # The ids for the new tasks are the next NumTasks consecutive integers. 62 | target_tids = set(range(NumTasks, 2 * NumTasks)) 63 | for x in range(NumTasks): 64 | # Wait for one task and return its status info. 65 | s = db.wait_one_task() 66 | assert s['TaskId'] in target_tids 67 | print(f'task {s["TaskId"]:d}: returned {s["ReturnCode"]:d}, "{s["TaskCmd"]:s}"') 68 | 69 | # Tell DisBatcher no more tasks are coming. 70 | db.done() 71 | -------------------------------------------------------------------------------- /exampleTaskFiles/dberTest.submit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./dberTest.py 23 --fill &> dberTestSlurm.log 4 | -------------------------------------------------------------------------------- /exampleTaskFiles/emptyTaskFile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatironinstitute/disBatch/4072dc9f95d14b3bdd8ee9259505d993eeded424/exampleTaskFiles/emptyTaskFile -------------------------------------------------------------------------------- /exampleTaskFiles/latePETask: -------------------------------------------------------------------------------- 1 | echo 'hi there' 2 | #This should fail. 3 | #DISBATCH PERENGINE START echo 'Did I already say "hi there"?'
4 | echo 'goodbye now' 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "disbatch" 3 | description = "Dynamically distribute a list of tasks over a pool of compute resources" 4 | readme = "Readme.md" 5 | authors = [ 6 | { name = "Nick Carriero" }, 7 | { name = "Lehman Garrison", email = "lgarrison@flatironinstitute.org" }, 8 | ] 9 | requires-python = ">=3.9" 10 | dependencies = [] 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | 14 | [project.scripts] 15 | disBatch = "disbatch:main" 16 | disbatch = "disbatch:main" 17 | 18 | [build-system] 19 | requires = ["hatchling", "hatch-vcs"] 20 | build-backend = "hatchling.build" 21 | 22 | [tool.ruff] 23 | line-length = 120 24 | 25 | [tool.ruff.format] 26 | quote-style = "single" 27 | 28 | [tool.ruff.lint] 29 | select = ["E4", "E7", "E9", "F", "I", "UP"] 30 | 31 | [tool.hatch.version] 32 | source = "vcs" 33 | 34 | [tool.hatch.build.hooks.vcs] 35 | version-file = "disbatch/_version.py" 36 | 37 | [dependency-groups] 38 | dev = [ 39 | "pre-commit>=4.0.1", 40 | ] 41 | -------------------------------------------------------------------------------- /tests/test_slurm/Tasks: -------------------------------------------------------------------------------- 1 | touch A.txt 2 | touch B.txt 3 | touch C.txt 4 | -------------------------------------------------------------------------------- /tests/test_slurm/Tasks_failfast: -------------------------------------------------------------------------------- 1 | sleep 1000 2 | exit 1 3 | touch A.txt 4 | -------------------------------------------------------------------------------- /tests/test_slurm/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | exit_fail() { 4 | err=$? 5 | echo "Slurm test failed! Output is in $workdir" 6 | exit $err 7 | } 8 | 9 | trap exit_fail ERR 10 | 11 | workdir=$(mktemp -d -p $PWD disbatch-test.XXXX) 12 | cp Tasks Tasks_failfast $workdir 13 | cd $workdir 14 | 15 | # Run the test 16 | salloc -n 2 disBatch Tasks 17 | 18 | # Check that all 3 tasks ran, 19 | # which means A.txt, B.txt, and C.txt exist 20 | [[ -f A.txt && -f B.txt && -f C.txt ]] 21 | rm -f A.txt B.txt C.txt 22 | 23 | # Add a task and check that we can resume 24 | echo "touch D.txt" >> Tasks 25 | salloc -n 2 disBatch Tasks -r Tasks*_status.txt 26 | 27 | [[ -f D.txt && ! -f A.txt && ! -f B.txt && ! -f C.txt ]] 28 | 29 | # Test empty task file 30 | salloc -n 2 disBatch /dev/null 31 | 32 | # disBatch is expected to exit with a non-zero exit code here 33 | salloc -n 2 disBatch --fail-fast Tasks_failfast || true 34 | 35 | # check that we failed fast and didn't run any more tasks 36 | [[ ! -f A.txt ]] 37 | 38 | cd - > /dev/null 39 | 40 | trap - ERR 41 | echo "Slurm test passed." 
42 | # NFS sometimes leaves stale file handles, but don't fail the test 43 | rm -rf $workdir || true 44 | -------------------------------------------------------------------------------- /tests/test_ssh/Tasks: -------------------------------------------------------------------------------- 1 | touch A.txt 2 | touch B.txt 3 | touch C.txt 4 | -------------------------------------------------------------------------------- /tests/test_ssh/Tasks_failfast: -------------------------------------------------------------------------------- 1 | sleep 1000 2 | exit 1 3 | touch A.txt 4 | -------------------------------------------------------------------------------- /tests/test_ssh/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | exit_fail() { 4 | err=$? 5 | echo "SSH test failed! Output is in $workdir" 6 | exit $err 7 | } 8 | 9 | trap exit_fail ERR 10 | 11 | workdir=$(mktemp -d -p $PWD disbatch-test.XXXX) 12 | cp Tasks Tasks_failfast $workdir 13 | cd $workdir 14 | 15 | # Run the test 16 | disBatch -s localhost:2 Tasks 17 | 18 | # Check that all 3 tasks ran, 19 | # which means A.txt, B.txt, and C.txt exist 20 | [[ -f A.txt && -f B.txt && -f C.txt ]] 21 | rm -f A.txt B.txt C.txt 22 | 23 | # Add a task and check that we can resume 24 | echo "touch D.txt" >> Tasks 25 | disBatch -s localhost:2 Tasks -r Tasks*_status.txt 26 | 27 | [[ -f D.txt && ! -f A.txt && ! -f B.txt && ! -f C.txt ]] 28 | 29 | # Test empty task file 30 | disBatch -s localhost:2 /dev/null 31 | 32 | # disBatch is expected to exit with a non-zero exit code here 33 | disbatch -s localhost:2 --fail-fast Tasks_failfast || true 34 | 35 | # check that we failed fast and didn't run any more tasks 36 | [[ ! -f A.txt ]] 37 | 38 | cd - > /dev/null 39 | 40 | trap - ERR 41 | echo "SSH test passed." 42 | # NFS sometimes leaves stale file handles, but don't fail the test 43 | rm -rf $workdir || true 44 | -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.9" 3 | 4 | [[package]] 5 | name = "cfgv" 6 | version = "3.4.0" 7 | source = { registry = "https://pypi.org/simple" } 8 | sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114 } 9 | wheels = [ 10 | { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249 }, 11 | ] 12 | 13 | [[package]] 14 | name = "disbatch" 15 | version = "2.7.dev17+g406b65a.d20241108" 16 | source = { editable = "." 
} 17 | 18 | [package.dev-dependencies] 19 | dev = [ 20 | { name = "pre-commit" }, 21 | ] 22 | 23 | [package.metadata] 24 | 25 | [package.metadata.requires-dev] 26 | dev = [{ name = "pre-commit", specifier = ">=4.0.1" }] 27 | 28 | [[package]] 29 | name = "distlib" 30 | version = "0.3.9" 31 | source = { registry = "https://pypi.org/simple" } 32 | sdist = { url = "https://files.pythonhosted.org/packages/0d/dd/1bec4c5ddb504ca60fc29472f3d27e8d4da1257a854e1d96742f15c1d02d/distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403", size = 613923 } 33 | wheels = [ 34 | { url = "https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87", size = 468973 }, 35 | ] 36 | 37 | [[package]] 38 | name = "filelock" 39 | version = "3.16.1" 40 | source = { registry = "https://pypi.org/simple" } 41 | sdist = { url = "https://files.pythonhosted.org/packages/9d/db/3ef5bb276dae18d6ec2124224403d1d67bccdbefc17af4cc8f553e341ab1/filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435", size = 18037 } 42 | wheels = [ 43 | { url = "https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0", size = 16163 }, 44 | ] 45 | 46 | [[package]] 47 | name = "identify" 48 | version = "2.6.1" 49 | source = { registry = "https://pypi.org/simple" } 50 | sdist = { url = "https://files.pythonhosted.org/packages/29/bb/25024dbcc93516c492b75919e76f389bac754a3e4248682fba32b250c880/identify-2.6.1.tar.gz", hash = "sha256:91478c5fb7c3aac5ff7bf9b4344f803843dc586832d5f110d672b19aa1984c98", size = 99097 } 51 | wheels = [ 52 | { url = "https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl", hash = "sha256:53863bcac7caf8d2ed85bd20312ea5dcfc22226800f6d6881f232d861db5a8f0", size = 98972 }, 53 | ] 54 | 55 | [[package]] 56 | name = "nodeenv" 57 | version = "1.9.1" 58 | source = { registry = "https://pypi.org/simple" } 59 | sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437 } 60 | wheels = [ 61 | { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 }, 62 | ] 63 | 64 | [[package]] 65 | name = "platformdirs" 66 | version = "4.3.6" 67 | source = { registry = "https://pypi.org/simple" } 68 | sdist = { url = "https://files.pythonhosted.org/packages/13/fc/128cc9cb8f03208bdbf93d3aa862e16d376844a14f9a0ce5cf4507372de4/platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907", size = 21302 } 69 | wheels = [ 70 | { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439 }, 71 | ] 72 | 73 | [[package]] 74 | name = "pre-commit" 75 | version = 
"4.0.1" 76 | source = { registry = "https://pypi.org/simple" } 77 | dependencies = [ 78 | { name = "cfgv" }, 79 | { name = "identify" }, 80 | { name = "nodeenv" }, 81 | { name = "pyyaml" }, 82 | { name = "virtualenv" }, 83 | ] 84 | sdist = { url = "https://files.pythonhosted.org/packages/2e/c8/e22c292035f1bac8b9f5237a2622305bc0304e776080b246f3df57c4ff9f/pre_commit-4.0.1.tar.gz", hash = "sha256:80905ac375958c0444c65e9cebebd948b3cdb518f335a091a670a89d652139d2", size = 191678 } 85 | wheels = [ 86 | { url = "https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl", hash = "sha256:efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878", size = 218713 }, 87 | ] 88 | 89 | [[package]] 90 | name = "pyyaml" 91 | version = "6.0.2" 92 | source = { registry = "https://pypi.org/simple" } 93 | sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631 } 94 | wheels = [ 95 | { url = "https://files.pythonhosted.org/packages/9b/95/a3fac87cb7158e231b5a6012e438c647e1a87f09f8e0d123acec8ab8bf71/PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", size = 184199 }, 96 | { url = "https://files.pythonhosted.org/packages/c7/7a/68bd47624dab8fd4afbfd3c48e3b79efe09098ae941de5b58abcbadff5cb/PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", size = 171758 }, 97 | { url = "https://files.pythonhosted.org/packages/49/ee/14c54df452143b9ee9f0f29074d7ca5516a36edb0b4cc40c3f280131656f/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", size = 718463 }, 98 | { url = "https://files.pythonhosted.org/packages/4d/61/de363a97476e766574650d742205be468921a7b532aa2499fcd886b62530/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", size = 719280 }, 99 | { url = "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", size = 751239 }, 100 | { url = "https://files.pythonhosted.org/packages/b7/33/5504b3a9a4464893c32f118a9cc045190a91637b119a9c881da1cf6b7a72/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", size = 695802 }, 101 | { url = "https://files.pythonhosted.org/packages/5c/20/8347dcabd41ef3a3cdc4f7b7a2aff3d06598c8779faa189cdbf878b626a4/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", size = 720527 }, 102 | { url = "https://files.pythonhosted.org/packages/be/aa/5afe99233fb360d0ff37377145a949ae258aaab831bde4792b32650a4378/PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", size = 144052 }, 103 | { url = "https://files.pythonhosted.org/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = 
"sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", size = 161774 }, 104 | { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612 }, 105 | { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040 }, 106 | { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829 }, 107 | { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167 }, 108 | { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952 }, 109 | { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301 }, 110 | { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638 }, 111 | { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850 }, 112 | { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980 }, 113 | { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873 }, 114 | { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302 }, 115 | { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154 }, 116 | { url = 
"https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223 }, 117 | { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542 }, 118 | { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164 }, 119 | { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611 }, 120 | { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591 }, 121 | { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338 }, 122 | { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309 }, 123 | { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679 }, 124 | { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428 }, 125 | { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361 }, 126 | { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523 }, 127 | { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660 }, 128 | { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 }, 129 | { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, 130 | { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, 131 | { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777 }, 132 | { url = "https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318 }, 133 | { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891 }, 134 | { url = "https://files.pythonhosted.org/packages/e9/6c/6e1b7f40181bc4805e2e07f4abc10a88ce4648e7e95ff1abe4ae4014a9b2/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12", size = 722614 }, 135 | { url = "https://files.pythonhosted.org/packages/3d/32/e7bd8535d22ea2874cef6a81021ba019474ace0d13a4819c2a4bce79bd6a/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19", size = 737360 }, 136 | { url = "https://files.pythonhosted.org/packages/d7/12/7322c1e30b9be969670b672573d45479edef72c9a0deac3bb2868f5d7469/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e", size = 699006 }, 137 | { url = "https://files.pythonhosted.org/packages/82/72/04fcad41ca56491995076630c3ec1e834be241664c0c09a64c9a2589b507/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725", size = 723577 }, 138 | { url = "https://files.pythonhosted.org/packages/ed/5e/46168b1f2757f1fcd442bc3029cd8767d88a98c9c05770d8b420948743bb/PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631", size = 144593 }, 139 | { url = "https://files.pythonhosted.org/packages/19/87/5124b1c1f2412bb95c59ec481eaf936cd32f0fe2a7b16b97b81c4c017a6a/PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8", size = 162312 }, 140 | ] 141 | 142 | [[package]] 143 | name = "virtualenv" 144 | version = "20.27.1" 145 | source = { registry = "https://pypi.org/simple" } 146 | dependencies = [ 147 | { name = "distlib" }, 148 | { name = "filelock" }, 149 | { name = "platformdirs" }, 150 | ] 151 | sdist = { url = "https://files.pythonhosted.org/packages/8c/b3/7b6a79c5c8cf6d90ea681310e169cf2db2884f4d583d16c6e1d5a75a4e04/virtualenv-20.27.1.tar.gz", hash = 
"sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba", size = 6491145 } 152 | wheels = [ 153 | { url = "https://files.pythonhosted.org/packages/ae/92/78324ff89391e00c8f4cf6b8526c41c6ef36b4ea2d2c132250b1a6fc2b8d/virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4", size = 3117838 }, 154 | ] 155 | --------------------------------------------------------------------------------