├── .github ├── dependabot.yaml └── workflows │ ├── pypi.yaml │ └── tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CHANGES.md ├── DesignNotes.md ├── LICENSE ├── Readme.md ├── disbatch ├── __init__.py ├── __main__.py ├── dbMon.py ├── dbUtil.template.sh ├── disBatch.py └── kvsstcp │ ├── .gitignore │ ├── Readme.md │ ├── __init__.py │ ├── kvsclient.py │ ├── kvscommon.py │ └── kvsstcp.py ├── exampleTaskFiles ├── 4KChecks ├── 4KTasksRep ├── DBtasksOneBadOneLeaky ├── DCPTTasks ├── GPUTasks ├── WayTooLongTask ├── barrierCheckFail ├── dberTest.py ├── dberTest.submit ├── emptyTaskFile └── latePETask ├── pyproject.toml ├── tests ├── test_slurm │ ├── Tasks │ ├── Tasks_failfast │ └── run.sh └── test_ssh │ ├── Tasks │ ├── Tasks_failfast │ └── run.sh └── uv.lock /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Dist 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build: 9 | name: Build 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Install uv 15 | uses: astral-sh/setup-uv@v6 16 | with: 17 | version: "0.4.30" 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version-file: ".python-version" 23 | 24 | - name: Build 25 | run: uv build 26 | 27 | - name: Test wheels 28 | run: | 29 | uv venv --no-project testwhl 30 | . testwhl/bin/activate 31 | uv pip install dist/*.whl 32 | cd tests/test_ssh 33 | ./run.sh 34 | 35 | - name: Test sdist 36 | run: | 37 | uv venv --no-project testsdist 38 | . 
testsdist/bin/activate 39 | uv pip install dist/*.tar.gz 40 | cd tests/test_ssh 41 | ./run.sh 42 | 43 | - name: Upload dist artifacts 44 | uses: actions/upload-artifact@v4 45 | with: 46 | name: dists 47 | path: dist/* 48 | 49 | upload: 50 | name: Upload 51 | needs: [build] 52 | runs-on: ubuntu-latest 53 | environment: pypi 54 | permissions: 55 | id-token: write 56 | steps: 57 | - name: Install uv 58 | uses: astral-sh/setup-uv@v6 59 | with: 60 | version: "0.4.30" 61 | 62 | - name: Download dist artifacts 63 | uses: actions/download-artifact@v4 64 | with: 65 | name: dists 66 | path: dist 67 | 68 | - name: Publish 69 | run: uv publish 70 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 12 | os: [ubuntu-latest] 13 | include: 14 | - os: macos-latest 15 | python-version: "3.13" 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v6 22 | with: 23 | version: "0.4.30" 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install the project 31 | run: | 32 | uv sync --all-extras --dev 33 | 34 | - name: Run local-mode (ssh) test 35 | working-directory: ./tests/test_ssh 36 | run: | 37 | uv run ./run.sh 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /exampleTaskFiles/*.log 3 | /exampleTaskFiles/*_disBatch_* 4 | /exampleTaskFiles/slurm*out 5 | /exampleTaskFiles/slurm*resize.*sh 6 | /exampleTaskFiles/dbTestOutputDir 7 | /tmp 8 | /tests/test_*/disbatch-test.* 9 | __pycache__/ 10 | *.egg-info/ 11 | /disbatch/_version.py 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: monthly 3 | 4 | repos: 5 | - repo: https://github.com/astral-sh/ruff-pre-commit 6 | rev: v0.11.12 7 | hooks: 8 | - id: ruff 9 | args: [ --fix ] 10 | - id: ruff-format 11 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 3.0.0 (2024-12-05) 4 | 5 | https://github.com/flatironinstitute/disBatch/pull/32 6 | 7 | ### Breaking changes 8 | - The Python package has been renamed `disbatch` from `disbatchc` 9 | - Removed the `disBatch` script from the repo root. Instead, a `disBatch` command will be placed on the `PATH` upon installation, or see the [installation instructions](Readme.md#Installation) for other options. 10 | 11 | ### Fixes 12 | - disBatch now installs all the necessary helper files so out-of-place installs work 13 | - Bugs (e.g. 
misspelled variables) in less common code paths fixed 14 | 15 | ### Enhancements 16 | - PEP518 compliant build system 17 | - More robust discovery of disBatch installation by worker processes 18 | - Initial release on PyPI 19 | - uvx and pipx support 20 | - Set up linting and formatting 21 | - The executable can be invoked as `disbatch` or `disBatch` 22 | - Refreshed the readme 23 | - Added `disbatch --version` and `disbatch.__version__` 24 | - Added MacOS test 25 | - Added `--fail-fast` option [https://github.com/flatironinstitute/disBatch/pull/38] 26 | - Gracefully handle empty task list [https://github.com/flatironinstitute/disBatch/pull/38] 27 | 28 | ### Changes 29 | - `kvsstcp` submodule is now vendored 30 | -------------------------------------------------------------------------------- /DesignNotes.md: -------------------------------------------------------------------------------- 1 | Basic Design 2 | ============ 3 | 4 | With version **2**, disBatch consists of three major components: 5 | 6 | * A driver (aka _controller_) that maintains the state of the task processing. 7 | * An execution context that encapsulates one or more engines running on one or more nodes. A disBatch run may have multiple contexts. 8 | * An engine that is a collection of cylinder threads. Each cylinder runs a loop that waits for a task from the controller, spawns a sub-process to evaluate it, waits for the sub-process to exit, and then sends a report to the controller. 9 | 10 | 11 | Driver 12 | ----- 13 | 14 | In normal operation, the driver spawns a couple of threads. One implements the KVS service. Another is the task feeder. This takes tasks from a task generator and hands them off to the controller via KVS. 15 | 16 | Each task has an age, which reflects the number of synchronization events that preceded it. Synchronization events are barriers and per engine tasks. Per engine tasks are posted to KVS. A barrier is tracked by the controller. When all tasks prior to the barrier have been completed, the barrier is satisfied, a per engine event to this effect is posted to KVS and the controller's age is increased. The controller notifies the task feeder thread of the new age. The task feeder will not issue a task to the controller unless the controller's age is equal to the task's age. This interplay ensures no task is available for execution until all previous barriers (and thus in turn, all previous tasks) have been accounted for. Something akin to this takes place between an engine and its cylinders to implement per engine synchronization. See below. 17 | 18 | The controller executes a main loop that waits for a controller event to arrive from KVS. These events include a new task from the task feeder, a completed task report from a cylinder, the registration of an execution context or an engine, a cylinder start, the notification that a context, engine or cylinder has stopped, requests to shut down a context or a cylinder, and a few other events. 19 | 20 | Each pass through the loop, the controller: 21 | 22 | - Accepts a controller message from KVS. These may lead it to alter its internal state (say add a new cylinder) or execute an operation like sending a shutdown message to an engine. Of particular interest are messages providing a new task to execute, which causes that task to be added to a list of tasks with the same age, and messages reporting the completion of a task, which causes the cylinder it was assigned to to be marked available again and the finished task id to be recorded.
23 | - Checks to see if all necessary tasks have been completed to satisfy a barrier. If so the age is advanced, and other barriers iteratively checked---that is the completion of one task could in effect satisfy a series of successive barriers. 24 | - If there are tasks for the current age and available cylinders, assign tasks to the individual cylinders until we run out of one or the other. **Note:** If we record the assignments (including the full task), it should be straightforward to reissue tasks upon engine "failure". 25 | - Update overall status info that is kept in KVS. This is used by `dbmon.py` to provide quasi-realtime info about the state of a disBatch run. 26 | 27 | As noted, the driver receives messages informing it of new contexts, engines and cylinders. A portion of this information is incorporated in the status report placed in KVS. It is also used to implement task limits for contexts. Once the controller has assigned the cylinder(s) of the engine(s) of a context a total number of tasks equal to the task limit specified for the context, it sends a shutdown request to every engine in the context. 28 | 29 | Execution context 30 | ----------------- 31 | 32 | A context is responsible for interfacing between a collection of computational resources and a controller. Currently two kinds are supported: 33 | 34 | * SLURM: This context makes use of environment variables set by SLURM to identify the allocated nodes and uses `srun` to start engines. The code here could serve as a model for implementing contexts for other batch queuing systems. 35 | * SSH: The nodes to be used are passed via the command line option (`-s`) or the environment variable `DISBATCH_SSH_NODELIST`. Engines are started via `ssh`. 36 | 37 | Each context monitors its engines and invokes a retirement method, if provided, when an engine exits. 38 | 39 | A context is also a logical shutdown unit. The user can, for example via `dbmon.py`, request that a context be shutdown. This is implemented by sending a shutdown request to each of the context's engines. **Note:** Such a request waits politely for all cylinders to complete any currently assigned tasks before stopping the engine. 40 | 41 | 42 | Engine 43 | ------ 44 | 45 | An engine is a collection of N+1 cylinder threads, where N is the number of allowable concurrently executing tasks specified for the engine. The extra cylinder handles the per-engine tasks. Per-engine tasks are maintained as an ordered queue in KVS: engines `view` values using a key with an index, stepping the index each time. Thus an engine joining at any given time can "replay" all the per engine activity. As it does so, it updates its internal age, and notifies each of its cylinders of the current age. A cylinder will not execute an assigned task until the engine has reached that task's age. 46 | 47 | 48 | Use modes 49 | --------- 50 | 51 | With the exception of some reporting details, the "standard" case should be the same as with version **1**. 52 | 53 | With version **2**, a user can invoke `disBatch` with `-S`, which starts a disBatch "service"---effectively just the controller. In this case, the name of a utility script is displayed. This script (always created by version **2**), can be submitted via sbatch to add an execution context. One could even submit this with a job array specification, and so add nodes on the fly to the disBatch run. The same script can be invoked with `-s` to add some ssh hosts to the mix, e.g., the user's own workstation. 
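A minimal sketch of this flow (the script name here is illustrative; the actual name carries the run's prefix):

```
disBatch -S Tasks            # start the controller only; it reports the name of the utility script
sbatch -n 40 dbUtil.sh       # add an execution context through the batch system
./dbUtil.sh -s localhost:4   # add an ssh-based context, e.g. your own workstation
```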
54 | 55 | The script can be invoked with `--mon` to start up a simple ASCII-UI to monitor progress and request shutdown of an engine or a context. 56 | 57 | Comments 58 | -------- 59 | 1. The controller is supposed to be the only single point of failure, nothing else (in the disBatch system) should be (assuming non malicious failure). Barriers (including an implicit one at the end), of course, might not be satisfied, but that aside a disBatch run can keep going even if a context or engine dies (if all engines died, more would have to be added to make more progress). 60 | 61 | 2. Idempotency and task reissue. 62 | 63 | 3. cli version of dbmon.py. 64 | 65 | 4. Job array demo. (Theory vs practice.) 66 | 67 | 5. Add option to insert `timeout`? 68 | 69 | 6. Add heartbeat as a failure detection mechanism? 70 | 71 | 7. pernode vs perengine 72 | 73 | 8. Remove delay for explicitly started engines? Probably not ... 74 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # disBatch 2 | 3 | Distributed processing of a batch of tasks. 
4 | 5 | [![Tests](https://github.com/flatironinstitute/disBatch/actions/workflows/tests.yaml/badge.svg)](https://github.com/flatironinstitute/disBatch/actions/workflows/tests.yaml) 6 | 7 | ## Quickstart 8 | 9 | Install with pip: 10 | 11 | pip install disbatch 12 | 13 | Create a file called `Tasks` with a list of commands you want to run. These should be Bash commands as one would run on the command line: 14 | 15 | myprog arg0 &> myprog_0.log 16 | myprog arg1 &> myprog_1.log 17 | ... 18 | myprog argN &> myprog_N.log 19 | 20 | This file can have as many tasks (lines) as you like. The `...` is just a stand-in and wouldn't literally be in the task file. 21 | 22 | Then, to run 5 tasks at a time in parallel on your local machine, run: 23 | 24 | disBatch -s localhost:5 Tasks 25 | 26 | `disBatch` will start the first five running concurrently. When one finishes, the next will be started until all are done. 27 | 28 | Or, to distribute this work on a Slurm cluster, run: 29 | 30 | sbatch -n 5 disBatch Tasks 31 | 32 | You may need to provide additional arguments specific to your cluster to specify a partition, time limit, etc. 33 | 34 | ## Overview 35 | 36 | One common usage pattern for distributed computing involves processing a 37 | long list of commands (aka *tasks*): 38 | 39 | myprog -a 0 -b 0 -c 0 40 | myprog -a 0 -b 0 -c 1 41 | ... 42 | myprog -a 9 -b 9 -c 9 43 | 44 | One could run this by submitting 1,000 separate jobs to a cluster, but that may 45 | present problems for the queuing system and can behave badly if the 46 | system is configured to handle jobs in a simple first-come, first-served 47 | fashion. For short tasks, the job launch overhead may dominate the runtime, too. 48 | 49 | One could simplify this by using, e.g., Slurm job arrays, but each job in a job 50 | array is an independent Slurm job, so this suffers from the same per-job overheads 51 | as if you submitted 1000 independent jobs. Furthermore, if nodes are being allocated 52 | exclusively (i.e. the nodes that are allocated to your job are not shared by other jobs), 53 | then the job array approach can hugely underutilize the compute resources unless each 54 | task is using a full node's worth of resources. 55 | 56 | And what if you don't have a cluster available, but do have a collection of networked computers? Or you just want to make use of multiple cores on your own computer? 57 | 58 | In any event, when processing such a list of tasks, it is helpful to 59 | acquire metadata about the execution of each task: where it ran, how 60 | long it took, its exit code, etc. 61 | 62 | disBatch has been designed to support this usage in a simple and 63 | portable way, as well as to provide the sort of metadata that can be 64 | helpful for debugging and reissuing failed tasks. 65 | 66 | It can take as input a file, each of whose lines is a task in the form of a 67 | Bash command. For example, the file could consist of the 1000 commands listed above. It launches the tasks one 68 | after the other until all specified execution resources are in use. Then as one 69 | executing task exits, the next task in the file is launched. This repeats until all 70 | the lines in the file have been processed. 71 | 72 | Each task is run in a new shell; i.e. all lines are independent of one another.
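Because a task file is just plain text, it is often easiest to generate it with a short script rather than write it by hand. Here is a minimal sketch, using the hypothetical `myprog` from above, that writes one task line per parameter combination:

```
# Generate 1000 task lines, one per (a, b, c) combination, each logging to its own file.
for a in {0..9}; do
  for b in {0..9}; do
    for c in {0..9}; do
      echo "myprog -a $a -b $b -c $c &> task_${a}_${b}_${c}.log"
    done
  done
done > Tasks
```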
73 | 74 | Here's a more complicated example, demonstrating how to control the execution environment and capture the output of the tasks: 75 | 76 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 0 -b 0 -c 0 ) &> task_0_0_0.log 77 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 0 -b 0 -c 1 ) &> task_0_0_1.log 78 | ... 79 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 9 -b 9 -c 8 ) &> task_9_9_8.log 80 | ( cd /path/to/workdir ; source SetupEnv ; myprog -a 9 -b 9 -c 9 ) &> task_9_9_9.log 81 | 82 | Each line uses standard Bash syntax. Let's break it down: 83 | 84 | 1. the `( ... ) &> task_0_0_0.log` captures all output (stdout and stderr) from any command in the parentheses and writes it to `task_0_0_0.log`; 85 | 2. `cd /path/to/workdir` changes the working directory; 86 | 3. `source SetupEnv` executes a script called `SetupEnv`, which could contain commands like `export PATH=...` or `module load ...` to set up the environment; 87 | 4. `myprog -a 0 -b 0 -c 0` is the command you want to run. 88 | 89 | The semicolons between the last 3 statements are Bash syntax to run a series of commands on the same line. 90 | 91 | You can simplify this kind of task file with the `#DISBATCH PREFIX` and `#DISBATCH SUFFIX` directives. See the [#DISBATCH directives](#disbatch-directives) section for full details, but here's how that could look: 92 | 93 | #DISBATCH PREFIX ( cd /path/to/workdir ; source SetupEnv ; myprog 94 | #DISBATCH SUFFIX ) &> task_${DISBATCH_TASKID}.log 95 | -a 0 -b 0 -c 0 96 | -a 0 -b 0 -c 1 97 | ... 98 | -a 9 -b 9 -c 9 99 | 100 | 101 | Note that for a simple environment setup, you don't need a `source SetupEnv`. You can just set an environment variable directly in the task line, as you can in Bash: 102 | 103 | export LD_LIBRARY_PATH=/d0/d1/d2:$LD_LIBRARY_PATH ; rest ; of ; command ; sequence 104 | 105 | For more complex setups, command sequences and input/output redirection requirements, you could place everything in a small shell script with appropriate arguments for the parts that vary from task to task, say `RunMyprog.sh`: 106 | 107 | #!/bin/bash 108 | 109 | id=$1 110 | shift 111 | cd /path/to/workdir 112 | module purge 113 | module load gcc openblas python3 114 | 115 | export LD_LIBRARY_PATH=/d0/d1/d2:$LD_LIBRARY_PATH 116 | myprog "$@" > results/${id}.out 2> logs/${id}.log 117 | 118 | The task file would then contain: 119 | 120 | ./RunMyprog.sh 0_0_0 -a 0 -b 0 -c 0 121 | ./RunMyprog.sh 0_0_1 -a 0 -b 0 -c 1 122 | ... 123 | ./RunMyprog.sh 9_9_8 -a 9 -b 9 -c 8 124 | ./RunMyprog.sh 9_9_9 -a 9 -b 9 -c 9 125 | 126 | See [#DISBATCH directives](#disbatch-directives) for more ways to simplify task lines. disBatch also sets some environment variables that can be used in your commands as arguments or to generate task-specific file names: 127 | 128 | * `DISBATCH_JOBID`: A name disBatch creates that should be unique to the job 129 | * `DISBATCH_NAMETASKS`: The basename of the task file 130 | * `DISBATCH_REPEAT_INDEX`: See the repeat construct in [\#DISBATCH directives](#disbatch-directives) 131 | * `DISBATCH_STREAM_INDEX`: The 1-based line number of the line from the task file that generated the task 132 | * `DISBATCH_TASKID`: 0-based sequential counter value that uniquely identifies each task 133 | 134 | Appending `_ZP` to any of the last three will produce a 0-padded value (to six places). If these variables are used to create file names, 0-padding will result in file names that sort correctly. 135 | 136 | Once you have created the task file, running disBatch is straightforward.
For example, working with a cluster managed by Slurm, 137 | all that needs to be done is to submit a job like the following: 138 | 139 | sbatch -n 20 -c 4 disBatch TaskFileName 140 | 141 | This particular invocation will allocate sufficient resources to process 142 | 20 tasks at a time, each of which needs 4 cores. 143 | disBatch will use environment variables initialized by Slurm to determine the execution resources to use for the run. 144 | This invocation assumes an appropriately installed disBatch is in your PATH; see [installation](#installation) for details. 145 | 146 | disBatch also allows the pool of execution resources to be increased or decreased during the course of a run: 147 | 148 | sbatch -n 10 -c 4 ./TaskFileName_dbUtil.sh 149 | 150 | will add enough resources to run 10 more tasks concurrently. `TaskFileName_dbUtil.sh` is a utility script created by `disBatch` when the run starts (the actual name is a little more complex, see [startup](#user-content-startup)). 151 | 152 | Various log files will be created as the run unfolds: 153 | 154 | * `TaskFileName_*_status.txt`: status of every task (details below). `*` elides a unique identifier disBatch creates to distinguish one run from another. This is the most important output file and we recommend checking it after every run. 155 | * `TaskFileName_*_[context|driver|engine].log`: 156 | The disBatch driver log file contains details mostly of interest in case of a 157 | problem with disBatch itself. (The driver log file name can be changed with `--logfile`). It can generally be ignored by end 158 | users (but keep it around in the event that something did go 159 | wrong—it will aid debugging). The `*_[context|engine].log` files contain similar information for the disBatch components that manage execution resources. 160 | * `disBatch_*_kvsinfo.txt`: TCP address of invoked KVS server if any (for additional advanced status monitoring) 161 | 162 | > [!TIP] 163 | > The `*_status.txt` file is the most important disBatch output file and we recommend checking it after every run. 164 | 165 | While disBatch is a Python 3 application, it can run tasks from any language environment—anything you can run from a shell can be run as a task. 166 | 167 | ### Status file 168 | 169 | The status file is the most important disBatch output file and we recommend checking it after every run. The filename is `TaskFileName_*_status.txt`. It contains tab-delimited lines of the form: 170 | 171 | 314 315 -1 worker032 8016 0 10.0486528873 1458660919.78 1458660929.83 0 "" 0 "" cd /path/to/workdir ; myprog -a 3 -b 1 -c 4 > task_3_1_4.log 2>&1 172 | 173 | These fields are: 174 | 175 | 1. Flags: The first field, blank in this case, may contain `E`, `O`, `R`, `B`, or `S` flags. 176 | Each program/task should be invoked in such a way that standard error 177 | and standard output end up in appropriate files. If that is not the case 178 | `E` or `O` flags will be raised. `R` indicates that the task 179 | returned a non-zero exit code. `B` indicates a [barrier](#disbatch-directives). `S` indicates the job was skipped (this may happen during "resume" runs). 180 | 1. Task ID: The `314` is the 0-based index of the task (starting from the beginning of the task file, incremented for each task, including repeats). 181 | 1. Line number: The `315` is the 1-based line from the task file. Blank lines, comments, directives and repeats may cause this to drift considerably from the value of Task ID. 182 | 1. 
Repeat index: The `-1` is the repeat index (as in this example, `-1` indicates this task was not part of a repeat directive). 183 | 1. Node: `worker032` identifies the node on which the task ran. 184 | 1. PID: `8016` is the PID of the bash shell used to run the task. 185 | 1. Exit code: `0` is the exit code returned. 186 | 1. Elapsed time: `10.0486528873` (seconds), 187 | 1. Start time:`1458660919.78` (Unix epoch based), 188 | 1. Finish time: `1458660929.83` (Unix epoch based). 189 | 1. Bytes of *leaked* output (not redirected to a file), 190 | 1. Output snippet (up to 80 bytes consisting of the prefix and suffix of the output), 191 | 1. Bytes of leaked error output, 192 | 1. Error snippet, 193 | 1. Command: `cd ...` is the text of the task (repeated from the task file, but subject to modification by [directives](#disbatch-directives)). 194 | 195 | 196 | ## Installation 197 | 198 | **Users of Flatiron clusters: disBatch is available via the module system. You can run `module load disBatch` instead of installing it.** 199 | 200 | There are several ways to get disBatch: 201 | 202 | 1. installation with pip; 203 | 1. direct invocation with pipx or uvx; 204 | 1. cloning the repo. 205 | 206 | Most users can install via pip. Direct invocation with uvx may be of particular interest for users on systems without a modern Python, as uvx will bootstrap Python for you. 207 | 208 | ### Installation with pip 209 | You can use pip to install disbatch just like a normal Python package: 210 | 211 | 1. from PyPI: `pip install disbatch` 212 | 2. from GitHub: `pip install git+https://github.com/flatironinstitute/disBatch.git` 213 | 214 | These should be run in a venv. Installing with `pip install --user disbatch` may work instead, but as a general practice is discouraged. 215 | 216 | After installation, disBatch will be available via the `disbatch` and `disBatch` executables on the `PATH` so long as the venv is activated. Likewise, disBatch can be run as a module with `python -m disbatch`. 217 | 218 |
219 | Click here for a complete example using pip and venv 220 | 221 | You'll need a modern Python to install disBatch this way. We recommend the uvx installation method below if you don't have one, as uv will bootstrap Python for you. 222 | 223 | ``` 224 | python -m venv venv 225 | . venv/bin/activate 226 | pip install disbatch 227 | disbatch TaskFile 228 | ``` 229 |
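If the installation worked, a quick check like the following should print the installed version (assuming the venv is still active):

```
disBatch --version
# or, equivalently:
python -m disbatch --version
```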
230 | 231 | ### Direct invocation with pipx or uvx 232 | 233 | [pipx](https://pipx.pypa.io/stable/) and [uvx](https://docs.astral.sh/uv/guides/tools/) are two tools that will create an isolated venv, download and install disbatch into that venv, and run it all in a single command: 234 | 235 | 1. `pipx run disbatch TaskFile` 236 | 1. `uvx disbatch TaskFile` 237 | 238 | pipx already requires a somewhat modern Python, so for disbatch's purposes it just saves you the step of creating and activating a venv and installing disBatch. 239 | 240 | uvx, on the other hand, will download a modern Python for you if you don't have one available locally. It requires [installing uv](https://docs.astral.sh/uv/getting-started/installation/), which is straightforward and portable. 241 | 242 | Here's a complete example of running disbatch on a system without a modern Python: 243 | 244 | ``` 245 | curl -LsSf https://astral.sh/uv/install.sh | sh 246 | source $HOME/.local/bin/env 247 | uvx disbatch TaskFile 248 | ``` 249 | 250 | Afterwards, disbatch will always be available as `uvx disbatch`. 251 | 252 | For Slurm users, note that the above will install disbatch into the user's default cache directory. If this directory is not visible to all nodes on the cluster, then disbatch jobs will fail. One can specify a different cache directory with `uvx --cache-dir=...`, but the simplest fix is to do a `tool install`: 253 | 254 | ``` 255 | uv tool install disbatch 256 | sbatch disbatch TaskFile 257 | ``` 258 | 259 | This places `disbatch` on the `PATH` in a persistent location; no need to use `uvx` anymore. 260 | 261 | 262 | ### Cloning the repo 263 | Users or developers who want to work on the code should clone the repo and then do an editable install into a venv: 264 | 265 | ``` 266 | git clone https://github.com/flatironinstitute/disBatch.git 267 | pip install -e ./disBatch 268 | ``` 269 | 270 | Setting `PYTHONPATH` may also work, but as a general practice is discouraged. If you don't have a modern Python available, [uv](https://docs.astral.sh/uv/getting-started/installation/) can bootstrap one for you. 271 | 272 | ## Execution Environments 273 | disBatch is designed to support a variety of execution environments, from your own desktop, to a local collection of workstations, to large clusters managed by job schedulers. 274 | It currently supports Slurm and can be executed from `sbatch`, but it is architected to make it simple to add support for other resource managers. 275 | 276 | You can also run directly on one or more machines by setting an environment variable: 277 | 278 | DISBATCH_SSH_NODELIST=localhost:7,otherhost:3 279 | 280 | or specifying an invocation argument: 281 | 282 | -s localhost:7,otherhost:3 283 | 284 | This allows execution directly on your `localhost` and via ssh for remote hosts without the need for a resource management system. 285 | In this example, disBatch is told it can use seven CPUs on your local host and three on `otherhost`. Assuming the default mapping of one task to one CPU applies in this example, seven tasks could be in progress at any given time on `localhost`, and three on `otherhost`. Note that `localhost` is an actual name you can use to refer to the machine on which you are currently working. `otherhost` is fictitious. 286 | Hosts used via ssh must be set up to allow ssh to work without a password and must share the working directory for the disBatch run. 287 | 288 | disBatch refers to a collection of execution resources as a *context* and the resources proper as *engines*. 
So the Slurm example `sbatch -n 20 -c 4`, run on a cluster with 16-core nodes, might create one context with five engines (one each for five 16-core nodes, capable of running four concurrent 4-core tasks each), while the SSH example creates one context with two engines (capable of running seven and three concurrent tasks, respectively). 289 | 290 | ## Invocation 291 | ``` 292 | usage: disbatch [-h] [-e] [--force-resume] [--kvsserver [HOST:PORT]] [--logfile FILE] 293 | [--loglevel {CRITICAL,ERROR,WARNING,INFO,DEBUG}] [--mailFreq N] [--mailTo ADDR] [-p PATH] 294 | [-r STATUSFILE] [-R] [-S] [--status-header] [--use-address HOST:PORT] [-w] [-f] 295 | [--taskcommand COMMAND] [--taskserver [HOST:PORT]] [--version] [-C TASK_LIMIT] [-c N] [--fill] 296 | [--no-retire] [-l COMMAND] [--retire-cmd COMMAND] [-s HOST:CORECOUNT] [-t N] 297 | [taskfile] 298 | 299 | Use batch resources to process a file of tasks, one task per line. 300 | 301 | positional arguments: 302 | taskfile File with tasks, one task per line ("-" for stdin) 303 | 304 | options: 305 | -h, --help show this help message and exit 306 | -e, --exit-code When any task fails, exit with non-zero status (default: only if disBatch itself fails) 307 | --force-resume With -r, proceed even if task commands/lines are different. 308 | --kvsserver [HOST:PORT] 309 | Use a running KVS server. 310 | --logfile FILE Log file. 311 | --loglevel {CRITICAL,ERROR,WARNING,INFO,DEBUG} 312 | Logging level (default: INFO). 313 | --mailFreq N Send email every N task completions (default: 1). "--mailTo" must be given. 314 | --mailTo ADDR Mail address for task completion notification(s). 315 | -p PATH, --prefix PATH 316 | Path for log, dbUtil, and status files (default: "."). If ends with non-directory component, 317 | use as prefix for these files names (default: _disBatch__). 318 | -r STATUSFILE, --resume-from STATUSFILE 319 | Read the status file from a previous run and skip any completed tasks (may be specified 320 | multiple times). 321 | -R, --retry With -r, also retry any tasks which failed in previous runs (non-zero return). 322 | -S, --startup-only Startup only the disBatch server (and KVS server if appropriate). Use "dbUtil..." script to 323 | add execution contexts. Incompatible with "--ssh-node". 324 | --status-header Add header line to status file. 325 | --use-address HOST:PORT 326 | Specify hostname and port to use for this run. 327 | -w, --web Enable web interface. 328 | -f, --fail-fast Exit on first task failure. Running tasks will be interrupted and disBatch will exit with a 329 | non-zero exit code. 330 | --taskcommand COMMAND 331 | Tasks will come from the command specified via the KVS server (passed in the environment). 332 | --taskserver [HOST:PORT] 333 | Tasks will come from the KVS server. 334 | --version Print the version and exit 335 | -C TASK_LIMIT, --context-task-limit TASK_LIMIT 336 | Shutdown after running COUNT tasks (0 => no limit). 337 | -c N, --cpusPerTask N 338 | Number of cores used per task; may be fractional (default: 1). 339 | --fill Try to use extra cores if allocated cores exceeds requested cores. 340 | --no-retire Don't retire nodes from the batch system (e.g., if running as part of a larger job). 341 | -l COMMAND, --label COMMAND 342 | Label for this context. Should be unique. 343 | --retire-cmd COMMAND Shell command to run to retire a node (environment includes $NODE being retired, remaining 344 | $ACTIVE node list, $RETIRED node list; default based on batch system). Incompatible with "-- 345 | ssh-node". 
346 | -s HOST:CORECOUNT, --ssh-node HOST:CORECOUNT 347 | Run tasks over SSH on the given nodes (can be specified multiple times for additional hosts; 348 | equivalent to setting DISBATCH_SSH_NODELIST) 349 | -t N, --tasksPerNode N 350 | Maximum concurrently executing tasks per node (up to cores/cpusPerTask). 351 | ``` 352 | 353 | The options for mail will only work if your computing environment permits processes to access mail via SMTP. 354 | 355 | A value for `-c` < 1 effectively allows you to run more tasks concurrently than CPUs specified for the run. This is somewhat unusual, and generally not recommended, but could be appropriate in some cases. 356 | 357 | The `--no-retire` and `--retire-cmd` flags allow you to control what disBatch does when a node is no longer needed to run jobs. 358 | When running under Slurm, disBatch will by default run the command: 359 | 360 | scontrol update JobId="$SLURM_JOBID" NodeList="${DRIVER_NODE:+$DRIVER_NODE,}$ACTIVE" 361 | 362 | which will tell Slurm to release any nodes no longer being used. 363 | You can set this to run a different command, or nothing at all. 364 | While running this command, the following environment variables will be set: `NODE` (the node that is no longer needed), `ACTIVE` (a comma-delimited list of nodes that are still active), `RETIRED` (a comma-delimited list of nodes that are no longer active, including `$NODE`), and possibly `DRIVER_NODE` (the node still running the main disBatch script, if it's not in `ACTIVE`). 365 | 366 | `-S` Startup only mode. In this mode, `disBatch` starts up the task management system and then waits for execution resources to be added. 367 | At startup, `disBatch` always generates a script `<prefix>_dbUtil.sh`, where `<prefix>` refers to the `-p` option or default, see above. We'll call this simply `dbUtil.sh` here, 368 | but remember to include `<prefix>_` in actual use. You can add execution resources by doing one or more of the following multiple times: 369 | 1. Submit `dbUtil.sh` as a job, e.g.: 370 | 371 | `sbatch -n 40 dbUtil.sh` 372 | 373 | 2. Use ssh, e.g.: 374 | 375 | `./dbUtil.sh -s localhost:4,friendlyNeighbor:5` 376 | 377 | Each of these creates an execution context, which contains one or more execution engines (if using, for example, 8-core nodes, then five for the first; two in the second). 378 | An engine can run one or more tasks concurrently. In the first example, each of the five engines will run up to eight tasks concurrently, while in the 379 | second example, the engine on `localhost` will run up to four tasks concurrently and the engine on `friendlyNeighbor` will run up to five. 380 | `./dbUtil.sh --mon` will start a simple ASCII-based monitor that tracks the overall state of the disBatch run, and the activity of the individual 381 | contexts and engines. By cursoring over an engine, you can send a shutdown signal to the engine or its context. This signal is *soft*, triggering 382 | a graceful shutdown that will occur only after currently assigned tasks are complete. Other execution resources are unaffected. 383 | 384 | When a context is started, you can also supply the argument `--context-task-limit N`. This will shut down the context and all associated engines 385 | after it has run `N` tasks. 
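As a concrete illustration, a session that chains several allocations together might look like the following sketch (`dbUtil.sh` again stands for the generated `<prefix>_dbUtil.sh`; the Slurm options and task counts are only illustrative):

```
# In a screen/tmux session on the login node: start just the controller.
disBatch -S TaskFileName

# As allocations become available, add execution contexts.
# Each context shuts itself down after completing 5000 tasks.
sbatch -n 40 dbUtil.sh --context-task-limit 5000

# Add a workstation over ssh as well.
./dbUtil.sh -s friendlyNeighbor:8

# Watch progress and, if desired, request soft shutdowns.
./dbUtil.sh --mon
```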
386 | 387 | Taken together, these mechanisms enable disBatch to run on a dynamic pool of execution resources, so you can "borrow" a colleague's workstation overnight, or 388 | claim a large chunk of a currently idle partition, but return some if demand picks up, or chain together a series of time-limited allocations to 389 | accomplish a long run. When using this mode, keep in mind two caveats: (i) The time quantum is determined by your task duration. If any given task might 390 | run for hours or days, then the utility of this is limited. You can still use standard means (kill, scancel) to terminate contexts and engines, but 391 | you will likely have incomplete tasks to 392 | reckon with; (ii) The task management system must itself be run in a setting where a long-lived process is OK. Say in a `screen` or `tmux` session on 393 | the login node of a cluster, or on your personal workstation (assuming it has the appropriate connectivity to reach the other resources you plan to use). 394 | 395 | 396 | `-r` uses the status file of a previous run to determine what tasks to run during this disBatch invocation. Only those tasks that haven't yet run (or with `-R`, those that haven't run or did but returned a non-zero exit code) are run this time. By default, the numeric task identifier and the text of the command are used to determine if a current task is the same as one found in the status file. `--force-resume` restricts the comparison to just the numeric identifier. 397 | 398 | `--use-address HOST:PORT` can be used if disBatch is not able to determine the correct hostname for the machine it is running on (or you need to override what was detected). This is often the case when running on a personal laptop without a "real" network configuration. In this case `--use-address=localhost:0` will generally be sufficient. 399 | 400 | `--kvsserver`, `--taskcommand`, and `--taskserver` implement advanced functionality (placing disBatch in an existing shared key store context and allowing for a programmatic rather than textual task interface). Contact the authors for more details. 401 | 402 | 403 | ### Considerations for large runs 404 | 405 | If you do submit jobs with on the order of 10,000 or more tasks, you should 406 | carefully consider how you want to organize the output (and error) files 407 | produced by each of the tasks. It is generally a bad idea to have more 408 | than a few thousand files in any one directory, so you will probably 409 | want to introduce at least one extra level of directory hierarchy so 410 | that the files can be divided into smaller groups. Intermediate 411 | directory `13`, say, might hold all the files for tasks 13000 to 412 | 13999. 413 | 414 | ## #DISBATCH directives 415 | 416 | ### PREFIX and SUFFIX 417 | 418 | In order to simplify task files, disBatch supports a couple of 419 | directives to specify common task prefix strings and suffix strings. As noted above, it 420 | also sets environment variables to identify various aspects of the 421 | submission. Here's an example: 422 | 423 | # Note there is a space at the end of the next line. 424 | #DISBATCH PREFIX ( cd /path/to/workdir ; source SetupEnv ; 425 | #DISBATCH SUFFIX ) &> ${DISBATCH_NAMETASKS}_${DISBATCH_JOBID}_${DISBATCH_TASKID_ZP}.log 426 | 427 | These are textually prepended and appended, respectively, to the text of 428 | each subsequent task line. If the suffix includes redirection and a task is a proper command sequence (a series of 429 | commands joined by `;`), then the task should be wrapped in `( ... 
)`, as in this example, so that the standard error and standard output of the whole sequence 430 | will be redirected to the log file. If this is not done, only standard 431 | error and standard output for the last component of the command sequence 432 | will be captured. This is probably not what you want unless you have 433 | redirected these outputs for the previous individual parts of the 434 | command sequence. 435 | 436 | Using these, the above commands could be replaced with: 437 | 438 | myprog -a 0 -b 0 -c 0 439 | myprog -a 0 -b 0 -c 1 440 | ... 441 | myprog -a 9 -b 9 -c 8 442 | myprog -a 9 -b 9 -c 9 443 | 444 | Note: the log files will have a different naming scheme, but there will still be one per task. 445 | 446 | Later occurrences of `#DISBATCH PREFIX` or `#DISBATCH SUFFIX` in a task 447 | file simply replace previous ones. When these are used, the tasks 448 | reported in the status file include the prefix and suffix in 449 | force at the time the task was launched. 450 | 451 | ### BARRIER 452 | 453 | If your tasks fall into groups where a later group should only begin 454 | after all tasks of the previous group have completely finished, you can 455 | use this directive: 456 | 457 | #DISBATCH BARRIER 458 | 459 | When disBatch encounters this directive, it will not launch another task 460 | until all tasks in progress have completed. The following form: 461 | 462 | #DISBATCH BARRIER CHECK 463 | 464 | checks the exit status of the tasks done since the last barrier (or 465 | start of the run). If any task had a non-zero exit status, the run 466 | will exit once this barrier is met. 467 | 468 | ### REPEAT 469 | 470 | For those problems that are easily handled via a job-array-like approach: 471 | 472 | #DISBATCH REPEAT 5 myprog file${DISBATCH_REPEAT_INDEX} 473 | 474 | will expand into five tasks, each with the environment variable 475 | `DISBATCH_REPEAT_INDEX` set to one of 0, 1, 2, 3 or 4. 476 | 477 | The starting index and step size can also be changed: 478 | 479 | #DISBATCH REPEAT 5 start 100 step 50 myprog file${DISBATCH_REPEAT_INDEX} 480 | 481 | This will result in indices 100, 150, 200, 250, and 300. `start` defaults 482 | to 0, and `step` to 1. 483 | 484 | The command is actually optional; one might want to omit the command 485 | if a prefix and/or suffix are in place. Returning to our earlier example, the task file 486 | could be: 487 | 488 | #DISBATCH PREFIX a=$((DISBATCH_REPEAT_INDEX/100)) b=$(((DISBATCH_REPEAT_INDEX%100)/10 )) c=$((DISBATCH_REPEAT_INDEX%10)) ; ( cd /path/to/workdir ; source SetupEnv ; myprog -a $a -b $b -c $c ) &> task_${a}_${b}_${c}.log 489 | #DISBATCH REPEAT 1000 490 | 491 | This is not a model of clarity, but it does illustrate that the repeat construct can be relatively powerful. Many users may find it more convenient to use the tool of their choice to generate a text file with 1000 invocations explicitly written out. 492 | 493 | ### PERENGINE 494 | 495 | #DISBATCH PERENGINE START { command ; sequence ; } &> engine_start_${DISBATCH_ENGINE_RANK}.log 496 | #DISBATCH PERENGINE STOP { command ; sequence ; } &> engine_stop_${DISBATCH_ENGINE_RANK}.log 497 | 498 | Use these to specify commands that should run at the time an engine joins a disBatch run or at the time the engine leaves the disBatch run, respectively. 499 | You could, for example, use these to bulk copy some heavily referenced read-only data to the engine's local storage area before any tasks are run, and then delete that data when the engine shuts down. 
500 | You can use the environment variable DISBATCH_ENGINE_RANK to distinguish one engine from another; for example, it is used here to keep log files separate. 501 | 502 | These directives must come before any other tasks. 503 | 504 | ## Embedded disBatch 505 | 506 | You can start disBatch from within a python script by instantiating a "DisBatcher" object. 507 | 508 | See `exampleTaskFiles/dberTest.py` for an example. 509 | 510 | The "DisBatcher" class (defined in `disbatch/disBatch.py`) illustrates how to interact with disBatch via KVS. This approach could be used to enable similar functionality in other language settings. 511 | 512 | ## License 513 | 514 | Copyright 2024 Simons Foundation 515 | 516 | Licensed under the Apache License, Version 2.0 (the "License"); 517 | you may not use this file except in compliance with the License. 518 | You may obtain a copy of the License at 519 | 520 | http://www.apache.org/licenses/LICENSE-2.0 521 | 522 | Unless required by applicable law or agreed to in writing, software 523 | distributed under the License is distributed on an "AS IS" BASIS, 524 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 525 | See the License for the specific language governing permissions and 526 | limitations under the License. 527 | -------------------------------------------------------------------------------- /disbatch/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['main', '__version__'] 2 | 3 | from .disBatch import main 4 | 5 | try: 6 | from ._version import __version__ 7 | except Exception: 8 | # TODO: hatch-vcs doesn't seem to work well with editable installs 9 | # We could switch back to setuptools, but maybe we just wait for the uv build backend... 10 | __version__ = 'editable' 11 | -------------------------------------------------------------------------------- /disbatch/__main__.py: -------------------------------------------------------------------------------- 1 | from . import main 2 | 3 | main() 4 | -------------------------------------------------------------------------------- /disbatch/dbMon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import curses 4 | import json 5 | import os 6 | import sys 7 | import time 8 | from queue import Queue 9 | from threading import Thread 10 | 11 | from disbatch.kvsstcp import KVSClient 12 | 13 | # Connect to the disBatch communication service for this run. 14 | try: 15 | kvscStatus = KVSClient(os.environ['DISBATCH_KVSSTCP_HOST']) 16 | kvscDisplay = kvscStatus.clone() 17 | except Exception: 18 | print('Cannot contact the disBatch server. 
This usally means the run has ended.', file=sys.stderr) 19 | sys.exit(1) 20 | 21 | uniqueId = sys.argv[1] 22 | uniqueIdName = os.path.split(uniqueId)[-1] 23 | 24 | curses.initscr() 25 | curses.start_color() 26 | curses.init_pair(1, curses.COLOR_CYAN, curses.COLOR_BLACK) 27 | curses.init_pair(2, curses.COLOR_GREEN, curses.COLOR_BLACK) 28 | curses.init_pair(3, curses.COLOR_BLACK, curses.COLOR_RED) 29 | curses.init_pair(4, curses.COLOR_YELLOW, curses.COLOR_BLACK) 30 | curses.init_pair(5, curses.COLOR_RED, curses.COLOR_BLACK) 31 | curses.init_pair(6, curses.COLOR_BLACK, curses.COLOR_BLACK) 32 | curses.init_pair(7, curses.COLOR_WHITE, curses.COLOR_WHITE) 33 | curses.curs_set(False) 34 | 35 | CPCB, CPGB, CPBR, CPYB, CPRB, CPBB, CPWW = (curses.color_pair(x) for x in range(1, 8)) 36 | 37 | Diamond = curses.ACS_DIAMOND 38 | Horizontal, Vertical = curses.ACS_HLINE, curses.ACS_VLINE 39 | CornerUL, CornerUR, CornerLL, CornerLR = ( 40 | curses.ACS_ULCORNER, 41 | curses.ACS_URCORNER, 42 | curses.ACS_LLCORNER, 43 | curses.ACS_LRCORNER, 44 | ) 45 | TeeD, TeeU, TeeR, TeeL = curses.ACS_TTEE, curses.ACS_BTEE, curses.ACS_LTEE, curses.ACS_RTEE 46 | 47 | # TODO: Come up with a better way to set these based on the actual 48 | # layout encoded in dbStatus. 49 | HeaderLength = 6 50 | FooterLength = 1 51 | Width = 85 52 | 53 | MinLines, MinCols = HeaderLength + FooterLength + 10, Width + 2 54 | 55 | 56 | # Thread that periodically checks for status updates from the disBatch 57 | # controller. Puts formatted results and auxillary data on the shared 58 | # queue. 59 | def dbStatus(kvsc, outq): 60 | while True: 61 | try: 62 | j = kvsc.view('DisBatch status') 63 | except Exception: 64 | outq.put(('stop', None)) 65 | break 66 | 67 | if j != b'': 68 | statusd = json.loads(j) 69 | 70 | now = time.time() 71 | 72 | # convert keys back to ints after json transform. 73 | engines = {int(k): v for k, v in statusd['engines'].items()} 74 | contexts = {int(k): v for k, v in statusd['contexts'].items()} 75 | ee = engines.values() 76 | statusd['slots'] = sum([e['active'] for e in ee if e['status'] == 'running']) 77 | statusd['finished'] = sum([e['finished'] for e in ee]) 78 | statusd['failed'] = sum([e['failed'] for e in ee]) 79 | header = [] 80 | tuin = uniqueIdName if len(uniqueIdName) <= 40 else (uniqueIdName[:17] + '...' 
+ uniqueIdName[-20:]) 81 | label = f'Run label: {tuin:<40s} Status: {statusd["more"]:15s}' 82 | header.append(([CornerUL] + [Horizontal] * Width + [CornerUR], CPCB)) 83 | header.append(([Vertical] + [label + ' ' * (Width - len(label))] + [Vertical], CPCB)) 84 | header.append( 85 | ( 86 | [Vertical] 87 | + [ 88 | 'Slots {slots:5d} Tasks: Finished {finished:7d} Failed{failed:5d} Barrier{barriers:3d}'.format( 89 | **statusd 90 | ) 91 | ] 92 | + [Vertical], 93 | CPCB, 94 | ) 95 | ) 96 | header.append(([TeeR] + [Horizontal] * Width + [TeeL], CPCB)) 97 | # '01234 012345678901 01234567890123456789 0123456 0123456 0123456789 0123456789 0123456' 98 | header.append( 99 | ( 100 | [Vertical] 101 | + ['Rank Context Host Last Avail Assigned Finished Failed'] 102 | + [Vertical], 103 | CPCB, 104 | ) 105 | ) 106 | header.append(([CornerLL] + [Horizontal] * Width + [CornerLR], CPCB)) 107 | assert len(header) == HeaderLength 108 | 109 | ee = sorted(engines.items()) 110 | content = [] 111 | for rank, engine in ee: 112 | if engine['status'] == 'stopped': 113 | continue 114 | engine['delay'] = now - engine['last'] 115 | engine['cLabel'] = contexts[engine['cRank']]['label'] 116 | content.append( 117 | ( 118 | rank, 119 | '{rank:5d} {cLabel:12.12s} {hostname:20.20s} {delay:6.0f}s {active:7d} {assigned:10d} {finished:10d} {failed:7d}'.format( 120 | **engine 121 | ), 122 | ) 123 | ) 124 | outq.put(('status', (engines, contexts, header, content))) 125 | time.sleep(3) 126 | 127 | 128 | # Utility to pop up a Yes/No/Cancel dialog. Read reply from shared 129 | # queue, return first acceptable response. 130 | def popYNC(msg, parent, inq, title='Confirm'): 131 | ph, pw = parent.getmaxyx() 132 | h = int(ph * 0.75) 133 | w = int(pw * 0.85) 134 | ro, co = int((ph - h) * 0.5), int((pw - w) * 0.5) 135 | 136 | # Wrap msg to fit in pop up. 137 | L, msgw = '', [] 138 | for word in msg.split(): 139 | if len(word) > w: 140 | word = word[: w - 3] + '...' 141 | if len(L) + 1 + len(word) > w: 142 | msgw.append(L) 143 | L = word 144 | else: 145 | L = L + (' ' if L else '') + word 146 | msgw.append(L) 147 | if len(msgw) > h: 148 | missing = 1 + len(msgw) - h 149 | msgw = msgw[: h - 1] 150 | msgw.append(f'{missing:d} lines elided.') 151 | 152 | nw = curses.newwin(h + 2, w + 2, ro, co) 153 | nw.border() 154 | nw.addstr(0, int((w - len(title)) * 0.5), title) 155 | for r, L in enumerate(msgw): 156 | nw.addstr(r + 1, 1, L) 157 | nw.addstr(r + 2, int((w - 19) * 0.5), '[Y]es/[N]o/[C]ancel', curses.A_REVERSE) 158 | nw.refresh() 159 | 160 | # Acceptable responses. Treat a resize event as "cancel". 161 | resp = { 162 | ord('y'): 'Y', 163 | ord('Y'): 'Y', 164 | ord('n'): 'N', 165 | ord('N'): 'N', 166 | ord('c'): 'C', 167 | ord('C'): 'C', 168 | curses.KEY_RESIZE: 'C', 169 | } 170 | while True: 171 | tag, k = inq.get() 172 | if tag == 'key' and k in resp: 173 | break 174 | # TODO: If tag isn't key raise exception? 175 | 176 | parent.redrawwin() 177 | parent.refresh() 178 | return resp[k] 179 | 180 | 181 | # Thread that paints the display and responds to user input. Reads status 182 | # updates and keyboard input (including resize events) from the shared queue. 
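# Messages on the shared queue are (tag, payload) pairs:
#   ('key', keycode)                                  -- keyboard/resize input forwarded by main()
#   ('status', (engines, contexts, header, content))  -- a formatted update from dbStatus()
#   ('stop', None)                                    -- the disBatch controller is no longer reachable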
183 | def display(S, kvsc, inq): 184 | content = [] 185 | lenContent = len(content) 186 | 187 | header = [(' ', CPBB)] * 4 188 | 189 | tooSmall = curses.LINES < MinLines or curses.COLS < MinCols 190 | displayLines = curses.LINES - (HeaderLength + FooterLength) 191 | 192 | engines = None # TODO: we may be relying on this being populated before it's referenced below 193 | localEngineStatus = {} 194 | 195 | contentCursor, contentFirst, done = 0, 0, False 196 | msg = '' 197 | while True: 198 | S.clear() 199 | 200 | if tooSmall: 201 | S.addstr(0, 0, f'Screen must be at least {MinLines:d}X{MinCols:d}', CPRB) 202 | else: 203 | # Header 204 | for r, (L, cp) in enumerate(header): 205 | S.move(r, 0) 206 | for e in L: 207 | if type(e) is int: 208 | S.addch(e, cp) 209 | else: 210 | S.addstr(e, cp) 211 | 212 | # Footer 213 | if msg or done: 214 | if done: 215 | msg = '[disBatch controller has exited]' + (' ' if msg else '') + msg 216 | S.addstr(curses.LINES - 1, 0, msg, CPBR) 217 | 218 | # Main content 219 | if content: 220 | # Adjust window to ensure cursor displays. 221 | if contentCursor < contentFirst: 222 | # move window so last line corresponds to cursor, i.e.: 223 | # contentCursor == contentFirst + (displayLines-1) 224 | contentFirst = max(0, contentCursor - (displayLines - 1)) 225 | elif contentCursor >= (contentFirst + displayLines): 226 | # move window so first line corresponds to cursor. 227 | contentFirst = contentCursor 228 | # ensure window is as full as possible. 229 | contentLast = min(contentFirst + displayLines, lenContent) 230 | contentFirst = max(0, contentLast - displayLines) 231 | for r, (rank, L) in enumerate(content[contentFirst:contentLast]): 232 | if len(L) > curses.COLS - 1: 233 | L = L[: curses.COLS - 4] + '...' 234 | cp = CPGB 235 | if engines[rank]['status'] == 'stopping': 236 | cp = CPRB 237 | elif localEngineStatus.get(rank, '') == 'requesting shutdown': 238 | cp = CPYB 239 | S.addstr(HeaderLength + r, 1, L, cp) 240 | 241 | # Scroll indicator and cursor 242 | regionStart = (displayLines * contentFirst) // lenContent 243 | regionEnd = (displayLines * contentLast + lenContent - 1) // lenContent 244 | S.addch(HeaderLength + regionStart, 0, TeeD, CPYB) 245 | for r in range(regionStart + 1, regionEnd - 1): 246 | S.addch(HeaderLength + r, 0, Vertical, CPYB) 247 | S.addch(HeaderLength + regionEnd - 1, 0, TeeU, CPYB) 248 | S.addch(HeaderLength + (contentCursor - contentFirst), 0, Diamond, CPCB) 249 | else: 250 | S.addstr(HeaderLength, 0, '', CPRB) 251 | 252 | S.refresh() 253 | 254 | tag, o = inq.get() 255 | if tag == 'key': 256 | msg = '' 257 | k = o 258 | if k == curses.KEY_RESIZE: 259 | curses.update_lines_cols() 260 | if curses.LINES < MinLines or curses.COLS < MinCols: 261 | tooSmall = True 262 | continue 263 | tooSmall = False 264 | 265 | displayLines = curses.LINES - (HeaderLength + FooterLength) 266 | if displayLines > (lenContent - contentCursor): 267 | contentFirst = max(0, lenContent - displayLines) 268 | else: 269 | contentFirst = max(0, contentCursor - displayLines // 2) 270 | 271 | S.clear() 272 | S.refresh() 273 | continue 274 | 275 | if k == ord('u') or k == curses.KEY_UP: 276 | contentCursor = max(0, contentCursor - 1) 277 | elif k == ord('d') or k == curses.KEY_DOWN: 278 | contentCursor = min(max(0, lenContent - 1), contentCursor + 1) 279 | elif k == ord('q'): 280 | break 281 | elif k in [ord('h'), ord('?')]: 282 | msg = 'C: Shutdown context; E: Shutdown engine; q: quit' 283 | elif k in [ord('C'), ord('E')]: 284 | if not done: 285 | target = 
content[contentCursor][0] 286 | if target is not None: 287 | if k == ord('C'): 288 | cRank = engines[target]['cRank'] 289 | r = popYNC('Stopping context {cLabel:s} ({cRank:d})'.format(**engines[target]), S, inq) 290 | if r == 'Y': 291 | try: 292 | msg = f'Asking controller to stop context {cRank!r}' 293 | kvsc.put('.controller', ('stop context', cRank)) 294 | for rank, e in engines.items(): 295 | if e['cRank'] == cRank: 296 | localEngineStatus[rank] = 'requesting shutdown' 297 | except OSError: 298 | pass 299 | elif k == ord('E'): 300 | r = popYNC( 301 | 'Stopping engine {rank:d} ({hostname:s}, {pid:d})'.format(**engines[target]), S, inq 302 | ) 303 | if r == 'Y': 304 | try: 305 | msg = f'Asking controller to stop engine {target!r}' 306 | kvsc.put('.controller', ('stop engine', target)) 307 | localEngineStatus[target] = 'requesting shutdown' 308 | except OSError: 309 | pass 310 | else: 311 | msg = f'Got unrecognized key: {k:d}' 312 | elif tag == 'status': 313 | engines, contexts, header, content = o 314 | # Adjust cursor location if needed. 315 | oldLen, lenContent = lenContent, len(content) 316 | if oldLen > lenContent: 317 | f = contentCursor / oldLen 318 | contentCursor = int(f * lenContent) 319 | elif tag == 'stop': 320 | done = True 321 | else: 322 | msg = f'Unrecognized tag: "{tag}",' 323 | 324 | 325 | # (Wrapped) main. 326 | # Creates a shared queue, sets up status and display threads, and then waits for 327 | # keyboard events and writes them to the shared queue. Intercepts "q" to quit. 328 | # 329 | # It appears that getch() needs to be called from the main processes. 330 | def main(S): 331 | S.bkgdset(CPBB) 332 | S.clear() 333 | S.refresh() 334 | 335 | inq = Queue() 336 | gc = Thread(target=display, args=(S, kvscDisplay, inq)) 337 | gc.daemon = True 338 | gc.start() 339 | db = Thread(target=dbStatus, args=(kvscStatus, inq)) 340 | db.daemon = True 341 | db.start() 342 | 343 | while True: 344 | k = S.getch() 345 | if k == ord('q'): 346 | break 347 | inq.put(('key', k)) 348 | 349 | 350 | curses.wrapper(main) 351 | -------------------------------------------------------------------------------- /disbatch/dbUtil.template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DISBATCH_KVSSTCP_HOST={kvsserver:s} 4 | 5 | if [[ $1 == '--mon' ]] 6 | then 7 | exec {DisBatchPython} -m disbatch.dbMon {uniqueId:s} 8 | elif [[ $1 == '--engine' ]] 9 | then 10 | exec {DisBatchPython} -m disbatch "$@" 11 | else 12 | exec {DisBatchPython} -m disbatch --context {DbUtilPath:} "$@" < /dev/null 1> {uniqueId:s}_${{BASHPID-$$}}_context_launch.log 13 | fi 14 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /dist/ 3 | *.pyc 4 | *.o 5 | /kvsLoop 6 | /kvsTestWIc 7 | /kvsTestWIf 8 | /kvsTestf 9 | /.stack-work 10 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/Readme.md: -------------------------------------------------------------------------------- 1 | Key value storage server 2 | ======================== 3 | 4 | Inspired by networkspaces, which was in turn inspired by the Linda coordination language. 5 | 6 | Similar systems exist, the point of this one is to provide a simple to deploy and reasonably functional and efficient store that is easy to integrate with many different programming environments. 
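For a sense of the programming model, here is a minimal sketch (not one of the package's shipped examples; it assumes this copy of kvsstcp is importable as the `disbatch.kvsstcp` subpackage):

    from disbatch.kvsstcp import KVSClient, KVSServerThread

    server = KVSServerThread('localhost', 0)  # let the OS pick a free port
    client = KVSClient(server.cinfo)          # cinfo is the (host, port) the server bound

    client.put('answer', {'x': 42})           # values are pickled by default
    print(client.view('answer'))              # peek without removing the value
    print(client.get('answer'))               # atomically remove it; blocks if no value is present

    client.close()
    server.shutdown()

Multiple values may be queued under a single key, and `get`/`view` block until a value is available, which is what makes the store useful for coordination.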
7 | 8 | The reference python implementation should work with any stock python 2.7 or above: 9 | 10 | * `kvscommon.py` contains the line protocol description and common utilities, 11 | * `kvsstcp.py` contains the server, which can be run from the command line or from within another python module as `KVSServer()` to start the server thread 12 | * `kvsclient.py` contains the client interface, which can be run from the command line or from within another python module as `KVSClient(host, port)` 13 | 14 | "kvsSupport.[ch]" contains a client that can be linked with C or FORTRAN codes. 15 | 16 | "kvsTest.py" provides a simple example of use. 17 | 18 | "kvsRing.py" can be used to generate some basic timing information. 19 | 20 | "kvsLoop.c" and "kvsTestf.f" are example codes for C and FORTRAN. "Makefile" can be used to build these. 21 | 22 | "kvsBatchWrapper.sh" is a short script to invoke a program that uses KVS via a Slurm sbatch submission, e.g.: 23 | 24 | sbatch -N 2 --ntasks-per-node=28 --exclusive kvsBatchWrapper.sh ./kvsTestf 25 | 26 | `wskvsmu.py` is a prototype web interface for displaying the state of a KVS server (and injecting values into it). Uses `wskvspage.html` as the frontend. 27 | 28 | "kvsTestWIc.c" and "kvsTestWIf.f" provide example codes that use KVS via wskvsmu.py to enter input from a web browser into C or FORTRAN. 29 | 30 | ## License 31 | 32 | Copyright 2017 Simons Foundation 33 | 34 | Licensed under the Apache License, Version 2.0 (the "License"); 35 | you may not use this file except in compliance with the License. 36 | You may obtain a copy of the License at 37 | 38 | http://www.apache.org/licenses/LICENSE-2.0 39 | 40 | Unless required by applicable law or agreed to in writing, software 41 | distributed under the License is distributed on an "AS IS" BASIS, 42 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 43 | See the License for the specific language governing permissions and 44 | limitations under the License. 45 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['KVSClient', 'KVSServerThread'] 2 | 3 | from .kvsclient import KVSClient 4 | from .kvsstcp import KVSServer as KVSServerThread 5 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/kvsclient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import errno 4 | import os 5 | import socket 6 | import sys 7 | import time 8 | from pickle import dumps as PDS 9 | from pickle import loads as PLS 10 | 11 | from .kvscommon import AsciiLenChars, AsciiLenFormat, recvall 12 | 13 | 14 | class KVSClient: 15 | """KVS convenience wrapper that includes pickling by default.""" 16 | 17 | def __init__(self, host=None, port=None, retry=0): 18 | """Establish connection to a key value storage server at an address 19 | given by host, port or "host:port" 20 | 21 | If retry > 0, retry the connection this many times if it fails. 22 | """ 23 | if not host: 24 | host = os.environ.get('KVSSTCP_HOST', None) 25 | # TODO: Silently overrides user supplied value, if there is one. 
26 | port = os.environ.get('KVSSTCP_PORT', None) 27 | 28 | if not host: 29 | raise Exception('Missing host') 30 | 31 | if not port: 32 | if type(host) is tuple: 33 | host, port = host 34 | elif ':' in host: 35 | host, port = host.rsplit(':', 1) 36 | else: 37 | raise Exception('Missing port') 38 | 39 | self.addr = (host, int(port)) 40 | self.socket = None 41 | self.connect(retry) 42 | 43 | def clone(self): 44 | """Create a new connection to the same server as this one.""" 45 | return KVSClient(self.addr) 46 | 47 | # Low-level network operations 48 | def _close(self): 49 | if not self.socket: 50 | return 51 | try: 52 | self._real_socket().close() 53 | except OSError: 54 | pass 55 | self.socket = None 56 | 57 | def _recvValue(self, doPickle=False): 58 | L = int(recvall(self.socket, AsciiLenChars)) 59 | payload = recvall(self.socket, L) 60 | if doPickle: 61 | payload = PLS(payload) 62 | return payload 63 | 64 | def _sendLenAndBytes(self, payload): 65 | if type(payload) is not bytes: 66 | payload = bytes(payload, 'utf-8') 67 | self.socket.sendall(AsciiLenFormat(len(payload))) 68 | self.socket.sendall(payload) 69 | 70 | class SocketWaiting: 71 | """Used as placeholder socket when there's an incomplete get/view call 72 | that must be retried. The real socket and outstanding op are stashed.""" 73 | 74 | def __init__(self, socket, op): 75 | self.socket = socket 76 | self.op = op 77 | 78 | def __nonzero__(self): 79 | return True 80 | 81 | def __bool__(self): 82 | return True 83 | 84 | def __getattr__(self, attr): 85 | """Disallow any other operations on a waiting socket.""" 86 | raise Exception( 87 | "Previous {} timed out: you must retreive the previously requested '{}' value first.".format(*self.op) 88 | ) 89 | 90 | def _real_socket(self): 91 | """Get the real socket, even if we have an outstanding SocketWaiting.""" 92 | try: 93 | # for SocketWaiting 94 | return self.socket.socket 95 | except AttributeError: 96 | return self.socket 97 | 98 | def _get_view(self, op, k, encoding, timeout=None): 99 | try: 100 | # check if we're waiting for something 101 | waiting = self.socket.op 102 | except AttributeError: 103 | waiting = None 104 | if waiting == (op, k): 105 | # continue previous timedout wait 106 | self.socket = self.socket.socket 107 | else: 108 | # new wait 109 | self.socket.sendall(op) 110 | self._sendLenAndBytes(k) 111 | if timeout is None: 112 | coding = recvall(self.socket, 4) 113 | else: 114 | self.socket.settimeout(timeout) 115 | try: 116 | c = self.socket.recv(1) 117 | except socket.timeout: 118 | self.socket = self.SocketWaiting(self.socket, (op, k)) 119 | return 120 | except OSError as e: 121 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 122 | self.socket = self.SocketWaiting(self.socket, (op, k)) 123 | return 124 | else: 125 | raise 126 | finally: 127 | self._real_socket().settimeout(None) 128 | if not c: 129 | raise OSError('Connection closed') 130 | coding = c + recvall(self.socket, 3) 131 | v = self._recvValue(encoding is True and coding == b'PYPK') 132 | return v if isinstance(encoding, bool) else (coding, v) 133 | 134 | def connect(self, retry=0): 135 | """Reconnect, if necessary. 
Can be used after an explicit close.""" 136 | if self.socket: 137 | return 138 | rep = 0 139 | while 1: 140 | try: 141 | self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 142 | self.socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 143 | self.socket.connect(self.addr) 144 | return 145 | except OSError as msg: 146 | self._close() 147 | if rep >= retry: 148 | raise 149 | print(f'kvs socket error: {msg}, retrying', file=sys.stderr) 150 | # exponential backoff 151 | time.sleep(2**rep) 152 | rep += 1 153 | 154 | def close(self): 155 | """Close the connection to the KVS storage server. Does a socket shutdown as well.""" 156 | if not self.socket: 157 | return 158 | try: 159 | self.socket.sendall(b'clos') 160 | self.socket.shutdown(socket.SHUT_RDWR) 161 | except OSError as e: 162 | # this is the client --- cannot assume logging is available. 163 | print(f'Ignoring exception during client close: "{e}"', file=sys.stderr) 164 | self._close() 165 | 166 | def dump(self): 167 | """Returns a snapshot of the KV store and its statistics.""" 168 | self.socket.sendall(b'dump') 169 | return self._recvValue(True) 170 | 171 | def get(self, key, encoding=True): 172 | """Retrieve and remove a value from the store. If there is no value 173 | associated with this key, block until one is added by another client 174 | (with put). 175 | 176 | If encoding is True, and the value was pickled, then the value will be 177 | unpickled before being returned. If encoding is False, just return the 178 | raw value. For anything else, return (encoding, value). 179 | """ 180 | return self._get_view(b'get_', key, encoding) 181 | 182 | def _get_nb(self, key, encoding=True, timeout=None): 183 | """Non-blocking get. 184 | 185 | If timeout is not None, this will only wait for timeout seconds before 186 | returning None. In this case, you MUST call this function again in the 187 | future until it returns a value before doing any other operation, 188 | otherwise the value may be lost.""" 189 | return self._get_view(b'get_', key, encoding, timeout) 190 | 191 | def view(self, key, encoding=True): 192 | """Retrieve, but do not remove, a value from the store. See 'get'.""" 193 | return self._get_view(b'view', key, encoding) 194 | 195 | def _view_nb(self, key, encoding=True, timeout=None): 196 | """Non-blocking view. See '_get_nb' and 'view'.""" 197 | return self._get_view(b'view', key, encoding, timeout) 198 | 199 | def put(self, key, value, encoding=True): 200 | """Add a value to the key. If encoding is True, pickle the value and 201 | encode as PYPK. If False, convert to string and store as ASTR. 202 | Otherwise, encoding must be a 4 character string, and value must be a 203 | string.""" 204 | if encoding is True: 205 | value = PDS(value) 206 | encoding = b'PYPK' 207 | elif encoding is False: 208 | # TODO: Is this silent stringification too clever by half? 209 | # Maybe, since unicode strings will end up as "u'\\u...'". perhaps utf8-encode strings, and fail on other types? 
210 | if type(value) is not str and type(value) is not bytes: 211 | value = repr(value) 212 | encoding = b'ASTR' 213 | else: 214 | if type(encoding) is not bytes: 215 | if type(encoding) is not str: 216 | encoding = repr(encoding) 217 | encoding = bytes(encoding, 'utf-8') 218 | if len(encoding) != 4: 219 | raise TypeError(f'Invalid encoding: {encoding}') 220 | 221 | self.socket.sendall(b'put_') 222 | self._sendLenAndBytes(key) 223 | self.socket.sendall(encoding) 224 | self._sendLenAndBytes(value) 225 | 226 | def monkey(self, mkey, value): 227 | """Make mkey a monitor key. Value encodes what events to monitor and 228 | for which key: 229 | 230 | Key:Events 231 | 232 | Whenever a listed event occurs for "Key", a put will be done 233 | to "Mkey" with the value " ". If 'Key' is empty, 234 | the events listed will be monitored for all keys. 'Events' is 235 | some subset of 'g', 'p', 'v' and 'w' (get, put, view and 236 | wait). Monitoring of any event *not* listed is turned off for 237 | the specified key. 238 | """ 239 | self.socket.sendall(b'mkey') 240 | self._sendLenAndBytes(mkey) 241 | self._sendLenAndBytes(value) 242 | 243 | def shutdown(self): 244 | """Tell the KVS server to shutdown (and run the close() method for this client).""" 245 | try: 246 | self._real_socket().sendall(b'down') 247 | finally: 248 | self._close() 249 | 250 | 251 | def addKVSServerArgument(argp, name='kvsserver'): 252 | """Add an argument to the given ArgumentParser that accepts the address of a running KVSServer, defaulting to $KVSSTCP_HOST:$KVSSTCP_PORT.""" 253 | host = os.environ.get('KVSSTCP_HOST') 254 | port = os.environ.get('KVSSTCP_PORT') if host else None 255 | argp.add_argument( 256 | name, 257 | metavar='host:port', 258 | nargs='?' if port else None, 259 | default=host + ':' + port if port else None, 260 | help='KVS server address.', 261 | ) 262 | 263 | 264 | if '__main__' == __name__: 265 | import argparse 266 | 267 | class OpAction(argparse.Action): 268 | def __call__(self, parser, namespace, values, option_string=None): 269 | items = getattr(namespace, 'ops', []) 270 | op = self.option_strings[1][2:] 271 | if op in ('get', 'view', 'put'): 272 | encoding = getattr(namespace, 'encoding', False) 273 | values.append(encoding) 274 | if encoding is True and op == 'put': 275 | values[1] = eval(values[1], {}) 276 | if op in ('get', 'view'): 277 | op = '_' + op + '_nb' 278 | values.append(getattr(namespace, 'timeout', None)) 279 | values.insert(0, op) 280 | items.append(values) 281 | namespace.ops = items 282 | 283 | argp = argparse.ArgumentParser(description='Command-line client to key-value storage server.') 284 | argp.add_argument( 285 | '-R', '--retry', default=0, type=int, metavar='COUNT', help='Number of times to retry on connect failure [0]' 286 | ) 287 | argp.add_argument( 288 | '-P', '--pickle', dest='encoding', action='store_true', help='(Un-)Pickle values to/from python expressions' 289 | ) 290 | argp.add_argument( 291 | '-A', '--no-pickle', dest='encoding', action='store_false', help="Don't (un-)pickle values (default)" 292 | ) 293 | argp.add_argument( 294 | '-E', 295 | '--encoding', 296 | dest='encoding', 297 | type=str, 298 | metavar='CODE', 299 | help='Explicitly set/get encoding (4-character string, ignored on get) [ASTR or PYPK with -P]', 300 | ) 301 | argp.add_argument('-T', '--timeout', type=float, metavar='SECS', nargs='?', help='Timeout waiting for get/view') 302 | argp.add_argument('-d', '--dump', action=OpAction, nargs=0, help='Dump the current state') 303 | argp.add_argument('-g', '--get', 
action=OpAction, nargs=1, metavar='KEY', help='Retrieve and remove a value') 304 | argp.add_argument('-v', '--view', action=OpAction, nargs=1, metavar='KEY', help='Retrieve a value') 305 | argp.add_argument('-p', '--put', action=OpAction, nargs=2, metavar=('KEY', 'VALUE'), help='Put a value') 306 | argp.add_argument( 307 | '-m', 308 | '--monkey', 309 | action=OpAction, 310 | nargs=2, 311 | metavar=('MKEY', 'KEY:EVENTS'), 312 | help='Create or update a monitor for the key and events', 313 | ) 314 | argp.add_argument('-S', '--shutdown', action=OpAction, nargs=0, help='Tell the server to shutdown') 315 | argp.add_argument('-s', '--sleep', action=OpAction, nargs=1, type=float, metavar='SECS', help='Pause for a time') 316 | addKVSServerArgument(argp, 'server') 317 | args = argp.parse_args() 318 | 319 | kvs = KVSClient(args.server, retry=args.retry) 320 | 321 | if hasattr(args, 'ops') and args.ops: 322 | for cmd in args.ops: 323 | op = cmd.pop(0) 324 | if op == 'sleep': 325 | time.sleep(*cmd) 326 | else: 327 | try: 328 | r = getattr(kvs, op)(*cmd) 329 | if r is not None: 330 | print(r) 331 | except Exception as e: 332 | print(e, file=sys.stderr) 333 | else: 334 | print('Nothing to do.') 335 | kvs.close() 336 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/kvscommon.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | 4 | # The line protocol is very simple: 5 | # 6 | # 4 byte operation ('clos', 'dump', 'get_', 'mkey', 'put_', 'view') 7 | # 8 | # 'clos': No additional argument 9 | # 10 | # 'dump': No additional argument 11 | # 12 | # 'get_': One key argument, expects a value argument in reply. 13 | # 14 | # 'mkey' (monitor key): Two key arguments. The second may have a ': ...' 15 | # suffix indicating events to be monitored. 16 | # 17 | # 'put_': One key argument followed by one value argument. 18 | # 19 | # 'view': One key argument, expects a value argument in reply. 20 | # 21 | # Key representation: 22 | # 10 bytes: A 10 character string (ascii, not null terminated) with the base 10 23 | # representation of the byte length of the key string 24 | # length bytes: the key string 25 | # 26 | # Value representatin: 27 | # 4 bytes: coding scheme. 28 | # 10 bytes: A 10 character string (ascii, not null terminated) with the base 10 29 | # representation of the byte length of the argument 30 | # length bytes: the string representing the key 31 | # 32 | # Notes: 33 | # 34 | # 1) Coding schemes for values is a work in progress. 
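#
# 2) Worked example (illustrative key and value): the client call
#       put('x', 'hello', encoding=False)
#    travels on the wire as
#       b'put_' + b'         1' + b'x' + b'ASTR' + b'         5' + b'hello'
#    i.e. the 4-byte op, the 10-character right-justified ASCII length of the
#    key followed by the key bytes, the 4-byte coding scheme, and the
#    10-character ASCII length of the value followed by the value bytes.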
35 | # 36 | 37 | AsciiLenChars = 10 38 | 39 | 40 | def AsciiLenFormat(n): 41 | assert n <= 9999999999 42 | return str(n).encode('ascii').rjust(AsciiLenChars) 43 | 44 | 45 | if hasattr(socket, 'MSG_WAITALL') and os.uname()[0] != 'Darwin': 46 | # MSG_WAITALL on OSX ends up blocking if the tcp buffer is not big enough for the entire message: don't use it 47 | def recvall(s, n): 48 | if s is None: 49 | raise OSError('socket is None, cannot receive') 50 | if not n: 51 | return b'' 52 | r = s.recv(n, socket.MSG_WAITALL) 53 | if len(r) < n: 54 | raise OSError('Connection dropped') 55 | return r 56 | else: 57 | 58 | def recvall(s, n): 59 | """Wrapper to deal with partial recvs when we know there are N bytes to be had.""" 60 | if s is None: 61 | raise OSError('socket is None, cannot receive') 62 | d = b'' 63 | while n: 64 | b = s.recv(n) 65 | if not b: 66 | raise OSError('Connection dropped') 67 | d += b 68 | n -= len(b) 69 | return d 70 | -------------------------------------------------------------------------------- /disbatch/kvsstcp/kvsstcp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import errno 3 | import logging 4 | import os 5 | import resource 6 | import select 7 | import socket 8 | import threading 9 | from collections import defaultdict as DD 10 | from functools import partial 11 | from pickle import dumps as PDS 12 | 13 | from .kvscommon import AsciiLenChars, AsciiLenFormat 14 | 15 | logger = logging.getLogger('kvs') 16 | 17 | # There are some cyclic references in in asyncio, handlers, waiters, etc., so I'm re-enabling this: 18 | # gc.disable() 19 | 20 | _DISCONNECTED = frozenset( 21 | (errno.ECONNRESET, errno.ENOTCONN, errno.ESHUTDOWN, errno.ECONNABORTED, errno.EPIPE, errno.EBADF) 22 | ) 23 | _BUFSIZ = 8192 24 | 25 | # Concepts: 26 | # 27 | # Every connection is represented by a dispatcher. 28 | # 29 | # Every dispatcher is registered with a handler, which in effect runs 30 | # the KVS server loop. 31 | # 32 | # The handler runs an infinite loop that mostly sits on a poll of some 33 | # sort waiting for one or more events associated with registered 34 | # connections (identified by their file descriptor). 35 | # 36 | # When an event occurs the dispatcher associated with the connection 37 | # is used to process the event. 38 | # 39 | # The listening socket is treated just like any other connection and 40 | # has its own dispatcher. An "event" on this connection triggers an 41 | # accept that leads to the creation of a new dispatcher 42 | # (KVSRequestDispatcher) to handle exchanges with the client. 43 | # 44 | # This approach has the very important benefit that it is single threaded. 45 | 46 | 47 | class Handler: 48 | """Based on asyncore, but with a simpler, stricter per-thread interface that allows better performance.""" 49 | 50 | def __init__(self): 51 | self.disps = dict() 52 | self.current = None 53 | self.running = True 54 | 55 | def register(self, disp): 56 | self.disps[disp.fd] = disp 57 | 58 | def unregister(self, disp): 59 | del self.disps[disp.fd] 60 | 61 | def run(self): 62 | while self.running: 63 | try: 64 | self.poll() 65 | except OSError as e: 66 | if e.errno == errno.EINTR: 67 | continue 68 | raise 69 | for d in list(self.disps.values()): 70 | try: 71 | d.close() 72 | except Exception as e: 73 | logger.info('%r reported %r on close in handler.', d, e) 74 | self.close() 75 | 76 | def writable(self, disp): 77 | "Equivalent to setting mask | OUT, but safe to be called from other (non-current) handlers." 
78 | if disp.mask & self.OUT: 79 | return 80 | disp.mask |= self.OUT 81 | # write can be called from other threads 82 | if self.current is not disp: 83 | self.modify(disp) 84 | 85 | def close(self): 86 | self.running = False 87 | 88 | 89 | class PollHandler(Handler): 90 | def __init__(self): 91 | self.IN, self.OUT, self.EOF = select.POLLIN, select.POLLOUT, select.POLLHUP 92 | self.poller = select.poll() 93 | Handler.__init__(self) 94 | 95 | def register(self, disp): 96 | Handler.register(self, disp) 97 | self.poller.register(disp.fd, disp.mask) 98 | 99 | def unregister(self, disp): 100 | self.poller.unregister(disp.fd) 101 | Handler.unregister(self, disp) 102 | 103 | def modify(self, disp): 104 | self.poller.modify(disp.fd, disp.mask) 105 | 106 | def poll(self): 107 | ev = self.poller.poll() 108 | for f, e in ev: 109 | d = self.current = self.disps[f] 110 | oldm = d.mask 111 | if e & self.EOF: 112 | d.handle_close() 113 | continue 114 | if e & self.IN: 115 | d.handle_read() 116 | if d.mask & self.OUT: 117 | d.handle_write() 118 | self.current = None 119 | if d.mask != oldm and not (d.mask & self.EOF): 120 | self.modify(d) 121 | 122 | def stop(self, disp): 123 | Handler.close(self) 124 | 125 | 126 | class EPollHandler(PollHandler): 127 | def __init__(self): 128 | self.IN, self.OUT, self.EOF = select.EPOLLIN, select.EPOLLOUT, select.EPOLLHUP 129 | self.poller = select.epoll() 130 | Handler.__init__(self) 131 | 132 | def close(self): 133 | self.poller.close() 134 | Handler.close(self) 135 | 136 | 137 | class KQueueHandler(Handler): 138 | def __init__(self): 139 | self.IN, self.OUT, self.EOF = 1, 2, 4 140 | self.kqueue = select.kqueue() 141 | Handler.__init__(self) 142 | 143 | def register(self, disp): 144 | Handler.register(self, disp) 145 | disp.curmask = 0 146 | self.modify(disp) 147 | 148 | def unregister(self, disp): 149 | disp.mask = 0 150 | self.modify(disp) 151 | Handler.unregister(self, disp) 152 | 153 | def modify(self, disp): 154 | c = [] 155 | if disp.mask & self.IN: 156 | if not (disp.curmask & self.IN): 157 | c.append(select.kevent(disp.fd, select.KQ_FILTER_READ, select.KQ_EV_ADD)) 158 | elif disp.curmask & self.IN: 159 | c.append(select.kevent(disp.fd, select.KQ_FILTER_READ, select.KQ_EV_DELETE)) 160 | if disp.mask & self.OUT: 161 | if not (disp.curmask & self.OUT): 162 | c.append(select.kevent(disp.fd, select.KQ_FILTER_WRITE, select.KQ_EV_ADD)) 163 | elif disp.curmask & self.OUT: 164 | c.append(select.kevent(disp.fd, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)) 165 | if c: 166 | self.kqueue.control(c, 0) 167 | disp.curmask = disp.mask 168 | 169 | def poll(self): 170 | try: 171 | ev = self.kqueue.control(None, 1024) 172 | except OSError as e: 173 | if e.errno == errno.EBADF: 174 | self.running = False 175 | return 176 | raise 177 | for e in ev: 178 | d = self.current = self.disps[e.ident] 179 | if e.filter == select.KQ_FILTER_READ: 180 | d.handle_read() 181 | elif e.filter == select.KQ_FILTER_WRITE: 182 | d.handle_write() 183 | self.current = None 184 | if self.running: 185 | self.modify(d) 186 | 187 | def close(self): 188 | self.kqueue.close() 189 | Handler.close(self) 190 | 191 | def stop(self, disp): 192 | self.close() 193 | 194 | 195 | class Dispatcher: 196 | def __init__(self, sock, handler, mask=0): 197 | self.sock = sock 198 | self.fd = sock.fileno() 199 | self.mask = mask 200 | sock.setblocking(0) 201 | self.handler = handler 202 | 203 | def open(self): 204 | self.handler.register(self) 205 | 206 | def close(self): 207 | self.mask = self.handler.EOF 208 | 
self.handler.unregister(self) 209 | try: 210 | self.sock.close() 211 | except OSError: 212 | pass 213 | 214 | def accept(self): 215 | try: 216 | return self.sock.accept() 217 | except OSError as e: 218 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 219 | return 220 | if e.errno in _DISCONNECTED or e.errno == errno.EINVAL: 221 | self.handle_close() 222 | return 223 | raise 224 | 225 | def send(self, data): 226 | try: 227 | return self.sock.send(data) 228 | except OSError as e: 229 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 230 | return 0 231 | if e.errno in _DISCONNECTED: 232 | self.handle_close() 233 | return 0 234 | raise 235 | 236 | def recv(self, siz): 237 | try: 238 | data = self.sock.recv(siz) 239 | if not data: 240 | self.handle_close() 241 | return data 242 | except OSError as e: 243 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 244 | return b'' 245 | if e.errno in _DISCONNECTED: 246 | self.handle_close() 247 | return b'' 248 | raise 249 | 250 | def recv_into(self, buf): 251 | try: 252 | n = self.sock.recv_into(buf) 253 | if n == 0: 254 | self.handle_close() 255 | return n 256 | except OSError as e: 257 | if e.errno in (errno.EWOULDBLOCK, errno.EAGAIN): 258 | return b'' 259 | if e.errno in _DISCONNECTED: 260 | self.handle_close() 261 | return b'' 262 | raise 263 | 264 | def shutdown(self): 265 | try: 266 | self.mask |= self.handler.IN 267 | self.sock.shutdown(socket.SHUT_RDWR) 268 | except OSError as e: 269 | if e.errno not in _DISCONNECTED: 270 | raise 271 | 272 | def handle_close(self): 273 | self.close() 274 | 275 | 276 | class StreamDispatcher(Dispatcher): 277 | """Based on asyncore.dispatcher_with_send, works with EventHandler. 278 | Also allows input of known-size blocks.""" 279 | 280 | def __init__(self, sock, handler): 281 | super().__init__(sock, handler) 282 | self.out_buf = [] 283 | self.in_buf = memoryview(bytearray(_BUFSIZ)) 284 | self.in_off = 0 285 | self.read_size = 0 286 | self.read_handler = None 287 | 288 | def write(self, *data): 289 | for d in data: 290 | self.out_buf.append(memoryview(d)) 291 | self.handler.writable(self) 292 | 293 | def handle_write(self): 294 | while self.out_buf: 295 | buf = self.out_buf[0] 296 | r = self.send(buf[:1048576]) 297 | if r < len(buf): 298 | if r: 299 | self.out_buf[0] = buf[r:] 300 | return 301 | self.out_buf.pop(0) 302 | self.mask &= ~self.handler.OUT 303 | 304 | def next_read(self, size, f): 305 | self.read_size = size 306 | if size > len(self.in_buf): 307 | buf = memoryview(bytearray(max(size, _BUFSIZ))) 308 | buf[: self.in_off] = self.in_buf[: self.in_off] 309 | self.in_buf = buf 310 | self.read_handler = f 311 | self.mask |= self.handler.IN 312 | 313 | def handle_read(self): 314 | if self.in_off < len(self.in_buf): 315 | self.in_off += self.recv_into(self.in_buf[self.in_off :]) 316 | while True: 317 | handler = self.read_handler 318 | z = self.read_size 319 | if not handler or self.in_off < z: 320 | return 321 | i = self.in_buf[:z] 322 | self.in_buf = self.in_buf[z:] 323 | self.in_off -= z 324 | self.read_handler = None 325 | self.mask &= ~self.handler.IN 326 | handler(i) 327 | 328 | 329 | class KVSRequestDispatcher(StreamDispatcher): 330 | def __init__(self, pair, server, handler): 331 | sock, self.addr = pair 332 | self.server = server 333 | # Keep track of any currently waiting get: 334 | self.waiter = None 335 | sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 336 | super().__init__(sock, handler) 337 | logger.info('Accepted connect from %r', self.addr) 338 | self.next_op() 339 | self.open() 
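        # From here on, each request is processed as a chain of fixed-size reads:
        # next_op() waits for a 4-byte op code, next_lendata() reads a
        # 10-character ASCII length followed by that many bytes, and each
        # handle_* callback schedules the next read.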
340 | 341 | def handle_close(self): 342 | self.cancel_waiter() 343 | logger.info('Closing connection from %r', self.addr) 344 | self.close() 345 | 346 | def error(self, msg): 347 | logger.error(f'Error from {self.addr!r}: {msg}') 348 | self.close() 349 | 350 | def cancel_waiter(self): 351 | if self.waiter: 352 | self.server.kvs.cancel_wait(self.waiter) 353 | self.waiter = None 354 | 355 | def next_op(self): 356 | self.next_read(4, self.handle_op) 357 | 358 | def next_lendata(self, handler): 359 | # wait for variable-length data prefixed by AsciiLenFormat 360 | def handle_len(L): 361 | L = L.tobytes() 362 | try: 363 | n = int(L) 364 | except ValueError: 365 | n = -1 366 | if n < 0: 367 | self.error(f"invalid data len: '{L}'") 368 | return 369 | self.next_read(n, handler) 370 | 371 | self.next_read(AsciiLenChars, handle_len) 372 | 373 | def handle_op(self, op): 374 | op = op.tobytes() 375 | if b'clos' == op: 376 | self.shutdown() 377 | elif b'down' == op: 378 | logger.info('Calling server shutdown') 379 | self.server.shutdown() 380 | elif b'dump' == op: 381 | d = self.server.kvs.dump() 382 | self.write(AsciiLenFormat(len(d)), d) 383 | self.next_op() 384 | elif op in [b'get_', b'mkey', b'put_', b'view']: 385 | self.next_lendata(partial(self.handle_opkey, op)) 386 | else: 387 | self.error(f"Unknown op: '{op!r}'") 388 | 389 | def handle_opkey(self, op, key): 390 | key = key.tobytes() 391 | # DEBUGOFF logger.debug('(%s) %s key "%s"', whoAmI, reqtxt, key) 392 | if b'mkey' == op: 393 | self.next_lendata(partial(self.handle_mkey, key)) 394 | elif b'put_' == op: 395 | self.next_read(4, lambda encoding: self.next_lendata(partial(self.handle_put, key, encoding))) 396 | else: # 'get_' or 'view' 397 | # Cancel waiting for any previous get/view operation (since client wouldn't be able to distinguish the async response) 398 | self.cancel_waiter() 399 | self.waiter = KVSWaiter(op, key, self.handle_got) 400 | self.server.kvs.wait(self.waiter) 401 | # But keep listening for another op (like 'clos') to cancel this one 402 | self.next_op() 403 | 404 | def handle_mkey(self, key, val): 405 | # DEBUGOFF logger.debug('(%s) val: %s', whoAmI, repr(val)) 406 | self.server.kvs.monkey(key, val) 407 | self.next_op() 408 | 409 | def handle_put(self, key, encoding, val): 410 | # TODO: bytearray val? 411 | # DEBUGOFF logger.debug('(%s) val: %s', whoAmI, repr(val)) 412 | self.server.kvs.put(key, (encoding, val)) 413 | self.next_op() 414 | 415 | def handle_got(self, encval): 416 | (encoding, val) = encval 417 | self.write(encoding, AsciiLenFormat(len(val)), val) 418 | self.waiter = None 419 | 420 | 421 | class KVSWaiter: 422 | def __init__(self, op, key, handler): 423 | if op == b'get_': 424 | op = b'get' 425 | self.op = op 426 | self.delete = op == b'get' 427 | self.key = key 428 | self.handler = handler 429 | 430 | 431 | class KVS: 432 | """Get/Put/View implements a client-server key value store. If no 433 | value is associated with a given key, clients will block on get or 434 | view until a value is available. Multiple values may be associated 435 | with any given key. 436 | 437 | This is, by design, a very simple, lightweight service that only 438 | depends on standard Python modules. 439 | 440 | """ 441 | 442 | def __init__(self, getIndex=0, viewIndex=-1): 443 | self.getIndex, self.viewIndex = getIndex, viewIndex # TODO: Add sanity checks? 444 | self.key2mon = DD(lambda: DD(set)) # Maps a normal key to keys that monitor it. 445 | self.monkeys = set() # List of monitor keys. 
446 | # store and waiters are mutually exclusive, and could be kept in the same place 447 | self.store = DD(list) 448 | self.waiters = DD(list) 449 | self.opCounts = {b'get': 0, b'put': 0, b'view': 0, b'wait': 0} 450 | self.ac, self.rc = 0, 0 451 | 452 | def _doMonkeys(self, op, k): 453 | # Don't monitor operations on monitor keys. 454 | if k in self.monkeys: 455 | return 456 | # DEBUGOFF logger.debug('doMonkeys: %s %s %s', op, k, repr(self.key2mon[True][op] | self.key2mon[k][op])) 457 | for p in (True, k): 458 | for mk in self.key2mon[p][op]: 459 | self.put(mk, (b'ASTR', repr((op, k)))) 460 | 461 | def dump(self): 462 | """Utility function that returns a snapshot of the KV store.""" 463 | 464 | def vrep(v): 465 | t = v[0].tobytes() 466 | # Omit or truncate some values, in which cases add the original length as a third value 467 | if v == b'JSON' or t == b'HTML': 468 | return (t, v[1].tobytes()) 469 | if t != b'ASTR': 470 | return (t, None, len(v[1])) 471 | if v[1][:6].tobytes().lower() == '': 472 | return (t, v[1].tobytes()) # for backwards compatibility only 473 | if len(v[1]) > 50: 474 | return (t, v[1][:24].tobytes() + '...' + v[1][-23:].tobytes(), len(v[1])) 475 | return (t, v[1].tobytes()) 476 | 477 | return PDS( 478 | ( 479 | [ 480 | self.opCounts[b'get'], 481 | self.opCounts[b'put'], 482 | self.opCounts[b'view'], 483 | self.opCounts[b'wait'], 484 | self.ac, 485 | self.rc, 486 | ], 487 | [(k, len(v)) for k, v in self.waiters.items() if v], 488 | [(k, len(vv), vrep(vv[-1])) for k, vv in self.store.items() if vv], 489 | ) 490 | ) 491 | 492 | def wait(self, waiter): 493 | """Atomically (remove and) return a value associated with key k. If 494 | none, block.""" 495 | # DEBUGOFF logger.debug('wait: %s, %s', repr(waiter.key), repr(waiter.op)) 496 | self._doMonkeys(waiter.op, waiter.key) 497 | vv = self.store.get(waiter.key) 498 | if vv: 499 | if waiter.delete: 500 | v = vv.pop(self.getIndex) 501 | if not vv: 502 | self.store.pop(waiter.key) 503 | else: 504 | v = vv[self.viewIndex] 505 | self.opCounts[waiter.op] += 1 506 | # DEBUGOFF logger.debug('_gv (%s): %s => %s (%d)', waiter.op, waiter.key, repr(v[0]), len(v[1])) 507 | waiter.handler(v) 508 | else: 509 | self.waiters[waiter.key].append(waiter) 510 | self.opCounts[b'wait'] += 1 511 | self._doMonkeys(b'wait', waiter.key) 512 | # DEBUGOFF logger.debug('(%s) %s acquiring', repr(waiter), repr(s)) 513 | self.ac += 1 514 | 515 | def cancel_wait(self, waiter): 516 | ww = self.waiters.get(waiter.key) 517 | if ww: 518 | try: 519 | ww.remove(waiter) 520 | except ValueError: 521 | pass 522 | if not ww: 523 | self.waiters.pop(waiter.key) 524 | 525 | def monkey(self, mkey, v): 526 | """Make Mkey a monitor key. Value encodes what events to monitor and 527 | for which key: 528 | 529 | Key:Events 530 | 531 | Whenever a listed event occurs for "Key", a put will be done 532 | to "Mkey" with the value " ". If 'Key' is empty, 533 | the events listed will be monitored for all keys. 'Events' is 534 | some subset of 'g', 'p', 'v' and 'w' (get, put, view and 535 | wait). Monitoring of any event *not* listed is turned off for 536 | the specified key. 537 | 538 | """ 539 | # DEBUGOFF logger.debug('monkey: %s %s', mkey, v) 540 | if b':' not in v: 541 | return # TODO: Add some sort of error handling? 
542 | self.monkeys.add(mkey) 543 | k, events = v.rsplit(b':', 1) 544 | if not k: 545 | k = True 546 | for e, op in [(b'g', b'get'), (b'p', b'put'), (b'v', b'view'), (b'w', b'wait')]: 547 | if e in events: 548 | self.key2mon[k][op].add(mkey) 549 | else: 550 | try: 551 | self.key2mon[k][op].remove(mkey) 552 | except KeyError: 553 | pass 554 | # DEBUGOFF logger.debug('monkey: %s', repr(self.key2mon)) 555 | 556 | def put(self, k, v): 557 | """Add value v to those associated with the key k.""" 558 | # DEBUGOFF logger.debug('put: %s, %s', repr(k), repr(v)) 559 | self.opCounts[b'put'] += 1 560 | ww = self.waiters.get(k) # No waiters is probably most common, so optimize for 561 | # that. ww will be None if no waiters have been 562 | # registered for key k. 563 | consumed = False 564 | if ww: 565 | while ww: 566 | waiter = ww.pop(0) 567 | # DEBUGOFF logger.debug('%s releasing', repr(waiter)) 568 | self.rc += 1 569 | self.opCounts[waiter.op] += 1 570 | waiter.handler(v) 571 | if waiter.delete: 572 | consumed = True 573 | break 574 | if not ww: 575 | self.waiters.pop(k) 576 | 577 | if not consumed: 578 | self.store[k].append(v) 579 | self._doMonkeys(b'put', k) 580 | 581 | 582 | class KVSServer(threading.Thread, Dispatcher): 583 | def __init__(self, host=None, port=0): 584 | if not host: 585 | host = socket.gethostname() 586 | 587 | self.kvs = KVS() 588 | 589 | snof, hnof = resource.getrlimit(resource.RLIMIT_NOFILE) 590 | hnof = min(hnof, 1000000) # don't need unreasonably many 591 | if snof < hnof: 592 | try: 593 | resource.setrlimit(resource.RLIMIT_NOFILE, (hnof, hnof)) 594 | logger.info('Raised max open files from %d to %d', snof, hnof) 595 | except Exception: 596 | logger.info('Failed to raise max open files from %d to %d; continuing anyway', snof, hnof) 597 | pass 598 | 599 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 600 | self.sock.setblocking(1) 601 | self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 602 | self.sock.bind((host, port)) 603 | logger.info('Setting queue size to 4000') 604 | self.sock.listen(4000) 605 | self.cinfo = self.sock.getsockname() 606 | 607 | if hasattr(select, 'epoll'): 608 | self.handler = EPollHandler() 609 | elif hasattr(select, 'kqueue'): 610 | self.handler = KQueueHandler() 611 | else: 612 | self.handler = PollHandler() 613 | Dispatcher.__init__(self, self.sock, self.handler, self.handler.IN) 614 | self.open() 615 | 616 | threading.Thread.__init__(self, name='KVSServerThread', target=self.handler.run) 617 | self.start() 618 | 619 | def handle_read(self): 620 | pair = self.accept() 621 | if pair: 622 | KVSRequestDispatcher(pair, self, self.handler) 623 | 624 | def handle_close(self): 625 | logger.info('Server shutting down') 626 | self.close() 627 | self.handler.close() 628 | 629 | def shutdown(self): 630 | if self.handler.running: 631 | super().shutdown() 632 | self.handler.stop(self) 633 | 634 | def env(self, env=os.environ.copy()): 635 | """Add the KVSSTCP environment variables to the given environment.""" 636 | env['KVSSTCP_HOST'] = self.cinfo[0] 637 | env['KVSSTCP_PORT'] = str(self.cinfo[1]) 638 | return env 639 | 640 | 641 | if '__main__' == __name__: 642 | import argparse 643 | 644 | argp = argparse.ArgumentParser(description='Start key-value storage server.') 645 | argp.add_argument('-H', '--host', default='', help='Host interface (default is hostname).') 646 | argp.add_argument('-p', '--port', type=int, default=0, help='Port (default is 0 --- let the OS choose).') 647 | argp.add_argument( 648 | '-a', 649 | '--addrfile', 650 | 
default=None, 651 | metavar='AddressFile', 652 | type=argparse.FileType('w'), 653 | help='Write address to this file.', 654 | ) 655 | argp.add_argument( 656 | '-e', '--execcmd', default=None, metavar='COMMAND SEQUENCE', help='Execute command with augmented environment.' 657 | ) 658 | argp.add_argument( 659 | '-l', 660 | '--logfile', 661 | default=None, 662 | metavar='KVSSLogfile', 663 | type=argparse.FileType('w'), 664 | help='Log file for key-value storage server.', 665 | ) 666 | args = argp.parse_args() 667 | 668 | # TODO: figure out where this should really go. 669 | lconf = {'format': '%(asctime)s %(levelname)-8s %(name)-15s: %(message)s', 'level': logging.DEBUG} 670 | if args.logfile: 671 | args.logfile.close() 672 | lconf['filename'] = args.logfile.name 673 | logging.basicConfig(**lconf) 674 | 675 | t = KVSServer(args.host, args.port) 676 | addr = '{:s}:{:d}'.format(*t.cinfo) 677 | logger.info('Server running at %s.', addr) 678 | if args.addrfile: 679 | args.addrfile.write(addr) 680 | args.addrfile.close() 681 | 682 | try: 683 | if args.execcmd: 684 | import subprocess 685 | 686 | logger.info('Launching: %r, env %r', args.execcmd, t.env()) 687 | subprocess.check_call(args.execcmd, shell=True, env=t.env()) 688 | else: 689 | while t.isAlive(): 690 | t.join(60) 691 | finally: 692 | t.shutdown() 693 | t.join() 694 | -------------------------------------------------------------------------------- /exampleTaskFiles/4KChecks: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Argument list should be the log files from a 4KTasksRep run. 4 | 5 | fc=4000 6 | al="$#" 7 | [[ ${al} == ${fc} ]] && echo "Found ${fc} files, as expected." || echo "Wrong number of log files ${al}, expected ${fc}." 8 | 9 | sum=4007998000 10 | dbsum=$(awk 'FNR == 2{c = c + $1}END{printf("%d\n", c)}' "$@") 11 | [[ ${sum} == ${dbsum} ]] && echo "Sum is ${sum}, as expected." || echo "Wrong sum ${dbsum}, expected ${sum}." 12 | 13 | echo "Should be ~13.00" 14 | awk 'FNR == 3{s = $1}FNR == 4 {c = c + $1 - s}END{printf("%.2f\n", c/4000.)}' "$@" 15 | 16 | echo -e "Now run:\n awk -F'\\\\t' '{print \$5}' | sort | uniq -c " 17 | -------------------------------------------------------------------------------- /exampleTaskFiles/4KTasksRep: -------------------------------------------------------------------------------- 1 | #DISBATCH PREFIX ( hostname ; echo $DISBATCH_REPEAT_INDEX ; date +%s.%3N; sleep 13 ; date +%s.%3N ) > /CHANGE/THIS/PATH/DeleteMe_${DISBATCH_JOBID}_${DISBATCH_TASKID}.log 2>&1 2 | #DISBATCH REPEAT 4000 start 1000000 3 | -------------------------------------------------------------------------------- /exampleTaskFiles/DBtasksOneBadOneLeaky: -------------------------------------------------------------------------------- 1 | # Note there is a space at the end of the next line. 2 | #DISBATCH PREFIX cd dbTestOutputDir ; 3 | #DISBATCH SUFFIX &>> ${DISBATCH_NAMETASKS}_engine_${DISBATCH_ENGINE_RANK}.log 4 | #DISBATCH PERENGINE START ( echo -n "perengine start on " ; hostname ; date ; sleep 10 ; date ) 5 | #DISBATCH PERENGINE STOP ( echo -n "perengine stop on " ; hostname ; date ; sleep 10 ; date ) 6 | 7 | 8 | #DISBATCH SUFFIX &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}.log 9 | # parentheses are important in this example so that all output is 10 | # collected and captured by the redirection. 
11 | ( echo "13 running on" $(hostname) ; date ; sleep 10 ; date ; ) 12 | ( echo "14 running on" $(hostname) ; date ; sleep 10 ; date ; ) 13 | ( echo "15 running on" $(hostname) ; date ; sleep 10 ; date ; ) 14 | ( echo "16 running on" $(hostname) ; date ; sleep 10 ; date ; ) 15 | ( echo "17 running on" $(hostname) ; date ; sleep 10 ; date ; ) 16 | ( echo "18 running on" $(hostname) ; date ; sleep 10 ; date ; ) 17 | ( echo "19 running on" $(hostname) ; date ; sleep 10 ; date ; ) 18 | ( echo "20 running on" $(hostname) ; date ; sleep 10 ; date ; ) 19 | ( echo "21 running on" $(hostname) ; date ; sleep 10 ; date ; ) 20 | ( echo "22 running on" $(hostname) ; date ; sleep 10 ; date ; ) 21 | ( echo "23 running on" $(hostname) ; date ; sleep 10 ; date ; ) 22 | ( echo "24 running on" $(hostname) ; date ; sleep 10 ; date ; ) 23 | ( echo "25 running on" $(hostname) ; date ; sleep 10 ; date ; ) 24 | 25 | # generate non-zero return code 26 | ( echo "26 running on" $(hostname) ; date ; sleep 10 ; date ; exit 13 ) 27 | 28 | ( echo "27 running on" $(hostname) ; date ; sleep 10 ; date ; ) 29 | ( echo "28 running on" $(hostname) ; date ; sleep 10 ; date ; ) 30 | ( echo "29 running on" $(hostname) ; date ; sleep 10 ; date ; ) 31 | #DISBATCH BARRIER 32 | #DISBATCH BARRIER mykey 33 | #DISBATCH BARRIER 34 | ( echo "33 running on" $(hostname) ; date ; sleep 10 ; date ; ) 35 | ( echo "34 running on" $(hostname) ; date ; sleep 10 ; date ; ) 36 | ( echo "35 running on" $(hostname) ; date ; sleep 10 ; date ; ) 37 | ( echo "36 running on" $(hostname) ; date ; sleep 10 ; date ; ) 38 | ( echo "37 running on" $(hostname) ; date ; sleep 10 ; date ; ) 39 | 40 | # leak some output 41 | ( echo "38 running on" $(hostname) ; date ; sleep 10 ; date ; ) ; echo 'missed some output' 42 | 43 | # singleton repeat 44 | #DISBATCH SUFFIX &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}_rx_${DISBATCH_REPEAT_INDEX}.log 45 | #DISBATCH REPEAT 1 start 39 ( echo "$DISBATCH_REPEAT_INDEX running on" $(hostname) ; date ; sleep 10 ; date ; ) 46 | 47 | # empty repeat 48 | #DISBATCH REPEAT 0 start 1 49 | 50 | # use zero padding for env variables in file name. 51 | #DISBATCH BARRIER 52 | #DISBATCH SUFFIX ( echo "${DISBATCH_REPEAT_INDEX} running on" $(hostname) ; echo "Zero-padded stream index: ${DISBATCH_STREAM_INDEX_ZP}" ; date ; sleep 10 ; date ; ) &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID_ZP}_rx_${DISBATCH_REPEAT_INDEX_ZP}.log 53 | #DISBATCH REPEAT 7 start 40 54 | 55 | # fail two in repeat 56 | #DISBATCH SUFFIX &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}_rx_${DISBATCH_REPEAT_INDEX}.log 57 | #DISBATCH REPEAT 3 start 50 ( echo "$DISBATCH_REPEAT_INDEX running on" $(hostname) ; date ; sleep 10 ; date ; [[ $DISBATCH_REPEAT_INDEX -eq 51 ]] ) 58 | 59 | #DISBATCH BARRIER 60 | #DISBATCH SUFFIX 61 | # All engines are idle at this point. Running one last task should trigger retirement(s) of all but one engine. 62 | ( echo "55 running on" $(hostname) ; date ; sleep 10 ; date ; ) &> ${DISBATCH_NAMETASKS}_task_${DISBATCH_TASKID}.log 63 | -------------------------------------------------------------------------------- /exampleTaskFiles/DCPTTasks: -------------------------------------------------------------------------------- 1 | #DISBATCH PREFIX ( hostname ; echo $DISBATCH_REPEAT_INDEX ; date +%s.%3N ; echo I have "$DISBATCH_CORES_PER_TASK cores." 
; sleep 30 ; date +%s.%3N ) > DCPTTest/DeleteMe_${DISBATCH_JOBID}_${DISBATCH_TASKID}.log 2>&1 2 | #DISBATCH REPEAT 20 start 1000000 3 | -------------------------------------------------------------------------------- /exampleTaskFiles/GPUTasks: -------------------------------------------------------------------------------- 1 | #DISBATCH PREFIX ( hostname ; echo $DISBATCH_REPEAT_INDEX ; date +%s.%3N ; env | egrep 'CUDA|GPU' ; nvidia-smi ; sleep 30 ; date +%s.%3N ) > GPUTest/DeleteMe_${DISBATCH_JOBID}_${DISBATCH_TASKID}.log 2>&1 2 | #DISBATCH REPEAT 20 start 1000000 3 | -------------------------------------------------------------------------------- /exampleTaskFiles/barrierCheckFail: -------------------------------------------------------------------------------- 1 | ( echo "23 running on" $(hostname) ; date ; sleep 10 ; date ; ) 2 | ( echo "24 running on" $(hostname) ; date ; sleep 10 ; date ; ) 3 | ( echo "25 running on" $(hostname) ; date ; sleep 10 ; date ; ) 4 | 5 | # generate non-zero return code 6 | ( echo "26 running on" $(hostname) ; date ; sleep 10 ; date ; exit 13 ) 7 | 8 | ( echo "27 running on" $(hostname) ; date ; sleep 10 ; date ; ) 9 | ( echo "28 running on" $(hostname) ; date ; sleep 10 ; date ; ) 10 | ( echo "29 running on" $(hostname) ; date ; sleep 10 ; date ; ) 11 | #DISBATCH BARRIER CHECK 12 | ( echo "33 running on" $(hostname) ; date ; sleep 10 ; date ; ) 13 | ( echo "34 running on" $(hostname) ; date ; sleep 10 ; date ; ) 14 | ( echo "35 running on" $(hostname) ; date ; sleep 10 ; date ; ) 15 | -------------------------------------------------------------------------------- /exampleTaskFiles/dberTest.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | import sys 4 | 5 | from disbatch import disBatch 6 | 7 | # This test script requires at least one argument: the number of tasks to run. 8 | # The rest, if any, are arguments that will be passed to disBatch: 9 | # 10 | # - If testing on your local machine, try something like 11 | # 12 | # dberTest.py 15 -s localhost:5 13 | # 14 | # - If testing via a Slurm submission, disBatch will auto detect 15 | # that, so no additional arguments are needed 16 | # 17 | NumTasks = int(sys.argv[1]) 18 | dbArgs = sys.argv[2:] 19 | 20 | # The first argument is a prefix that will be used internally to 21 | # identify support activities related to this run. The rest are 22 | # arguments for disBatch. 23 | db = disBatch.DisBatcher(tasksname='testing', args=dbArgs) 24 | 25 | # We use this to keep track of the tasks. 26 | # disBatch assigns a numeric ID to each tasks, starting from 0. We need 27 | # to do the same to track the tasks. 28 | tasks = {} 29 | for x in range(NumTasks): 30 | # Tasks are simply ASCII command lines. The '{}' in the following 31 | # are interpreted by python, not bash. 32 | # We force an error return of task 7. 33 | tasks[x] = ( 34 | f'{{ date ; hostname ; sleep 2 ; echo {x}^2 $(( {x} * {x} )) ; [[ {x} == 7 ]] && exit 1 ; date ; }} > square.log_{x:03d} 2>&1 ' 35 | ) 36 | 37 | # Submit the task. 38 | db.submit(tasks[x]) 39 | 40 | # syncTasks waits for all tasks identified by the keys of "tasks" to 41 | # complete. It returns a dictionary that maps an id to a return code 42 | # and the complete status report for the task. syncTasks maintains an 43 | # internal dictionary of return codes, so this operation is 44 | # idempotent. 
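# Each status report is a dict; the fields used below are 'TaskId',
# 'ReturnCode', and 'TaskCmd'.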
45 | tid2status = db.syncTasks(tasks) 46 | for tid in tasks: 47 | print( 48 | 'task {:d}: {:s} returned {:d}, matched: {}'.format( 49 | tid, 50 | repr(tasks[tid]), 51 | tid2status[tid]['ReturnCode'], 52 | repr(tasks[tid]) == tid2status[tid]['TaskCmd'], 53 | ) 54 | ) 55 | 56 | # Now try a repeat construct. Force an error for the index 112. 57 | db.submit( 58 | f'#DISBATCH REPEAT {NumTasks} start 100 step 3 x=${{DISBATCH_REPEAT_INDEX}} ; {{ date ; hostname ; sleep 2 ; echo $x^3 $(( x * x * x )) ; [[ $x == 112 ]] && exit 1 ; date ; }} > cube.log_$(printf "%03d" $x) 2>&1' 59 | ) 60 | 61 | # The ids for the new tasks are the next NumTasks consecutive integers. 62 | target_tids = set(range(NumTasks, 2 * NumTasks)) 63 | for x in range(NumTasks): 64 | # Wait for one task and return its status info. 65 | s = db.wait_one_task() 66 | assert s['TaskId'] in target_tids 67 | print(f'task {s["TaskId"]:d}: returned {s["ReturnCode"]:d}, "{s["TaskCmd"]:s}"') 68 | 69 | # Tell DisBatcher no more tasks are coming. 70 | db.done() 71 | -------------------------------------------------------------------------------- /exampleTaskFiles/dberTest.submit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./dberTest.py 23 --fill &> dberTestSlurm.log 4 | -------------------------------------------------------------------------------- /exampleTaskFiles/emptyTaskFile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatironinstitute/disBatch/4072dc9f95d14b3bdd8ee9259505d993eeded424/exampleTaskFiles/emptyTaskFile -------------------------------------------------------------------------------- /exampleTaskFiles/latePETask: -------------------------------------------------------------------------------- 1 | echo 'hi there' 2 | #This should fail. 3 | #DISBATCH PERENGINE START echo 'Did I already say "hi there"?'
4 | echo 'goodbye now' 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "disbatch" 3 | description = "Dynamically distribute a list of tasks over a pool of compute resources" 4 | readme = "Readme.md" 5 | authors = [ 6 | { name = "Nick Carriero" }, 7 | { name = "Lehman Garrison", email = "lgarrison@flatironinstitute.org" }, 8 | ] 9 | requires-python = ">=3.9" 10 | dependencies = [] 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | 14 | [project.scripts] 15 | disBatch = "disbatch:main" 16 | disbatch = "disbatch:main" 17 | 18 | [build-system] 19 | requires = ["hatchling", "hatch-vcs"] 20 | build-backend = "hatchling.build" 21 | 22 | [tool.ruff] 23 | line-length = 120 24 | 25 | [tool.ruff.format] 26 | quote-style = "single" 27 | 28 | [tool.ruff.lint] 29 | select = ["E4", "E7", "E9", "F", "I", "UP"] 30 | 31 | [tool.hatch.version] 32 | source = "vcs" 33 | 34 | [tool.hatch.build.hooks.vcs] 35 | version-file = "disbatch/_version.py" 36 | 37 | [dependency-groups] 38 | dev = [ 39 | "pre-commit>=4.0.1", 40 | ] 41 | -------------------------------------------------------------------------------- /tests/test_slurm/Tasks: -------------------------------------------------------------------------------- 1 | touch A.txt 2 | touch B.txt 3 | touch C.txt 4 | -------------------------------------------------------------------------------- /tests/test_slurm/Tasks_failfast: -------------------------------------------------------------------------------- 1 | sleep 1000 2 | exit 1 3 | touch A.txt 4 | -------------------------------------------------------------------------------- /tests/test_slurm/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | exit_fail() { 4 | err=$? 5 | echo "Slurm test failed! Output is in $workdir" 6 | exit $err 7 | } 8 | 9 | trap exit_fail ERR 10 | 11 | workdir=$(mktemp -d -p $PWD disbatch-test.XXXX) 12 | cp Tasks Tasks_failfast $workdir 13 | cd $workdir 14 | 15 | # Run the test 16 | salloc -n 2 disBatch Tasks 17 | 18 | # Check that all 3 tasks ran, 19 | # which means A.txt, B.txt, and C.txt exist 20 | [[ -f A.txt && -f B.txt && -f C.txt ]] 21 | rm -f A.txt B.txt C.txt 22 | 23 | # Add a task and check that we can resume 24 | echo "touch D.txt" >> Tasks 25 | salloc -n 2 disBatch Tasks -r Tasks*_status.txt 26 | 27 | [[ -f D.txt && ! -f A.txt && ! -f B.txt && ! -f C.txt ]] 28 | 29 | # Test empty task file 30 | salloc -n 2 disBatch /dev/null 31 | 32 | # disBatch is expected to exit with a non-zero exit code here 33 | salloc -n 2 disBatch --fail-fast Tasks_failfast || true 34 | 35 | # check that we failed fast and didn't run any more tasks 36 | [[ ! -f A.txt ]] 37 | 38 | cd - > /dev/null 39 | 40 | trap - ERR 41 | echo "Slurm test passed." 
42 | # NFS sometimes leaves stale file handles, but don't fail the test 43 | rm -rf $workdir || true 44 | -------------------------------------------------------------------------------- /tests/test_ssh/Tasks: -------------------------------------------------------------------------------- 1 | touch A.txt 2 | touch B.txt 3 | touch C.txt 4 | -------------------------------------------------------------------------------- /tests/test_ssh/Tasks_failfast: -------------------------------------------------------------------------------- 1 | sleep 1000 2 | exit 1 3 | touch A.txt 4 | -------------------------------------------------------------------------------- /tests/test_ssh/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | exit_fail() { 4 | err=$? 5 | echo "SSH test failed! Output is in $workdir" 6 | exit $err 7 | } 8 | 9 | trap exit_fail ERR 10 | 11 | workdir=$(mktemp -d -p $PWD disbatch-test.XXXX) 12 | cp Tasks Tasks_failfast $workdir 13 | cd $workdir 14 | 15 | # Run the test 16 | disBatch -s localhost:2 Tasks 17 | 18 | # Check that all 3 tasks ran, 19 | # which means A.txt, B.txt, and C.txt exist 20 | [[ -f A.txt && -f B.txt && -f C.txt ]] 21 | rm -f A.txt B.txt C.txt 22 | 23 | # Add a task and check that we can resume 24 | echo "touch D.txt" >> Tasks 25 | disBatch -s localhost:2 Tasks -r Tasks*_status.txt 26 | 27 | [[ -f D.txt && ! -f A.txt && ! -f B.txt && ! -f C.txt ]] 28 | 29 | # Test empty task file 30 | disBatch -s localhost:2 /dev/null 31 | 32 | # disBatch is expected to exit with a non-zero exit code here 33 | disbatch -s localhost:2 --fail-fast Tasks_failfast || true 34 | 35 | # check that we failed fast and didn't run any more tasks 36 | [[ ! -f A.txt ]] 37 | 38 | cd - > /dev/null 39 | 40 | trap - ERR 41 | echo "SSH test passed." 42 | # NFS sometimes leaves stale file handles, but don't fail the test 43 | rm -rf $workdir || true 44 | -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.9" 3 | 4 | [[package]] 5 | name = "cfgv" 6 | version = "3.4.0" 7 | source = { registry = "https://pypi.org/simple" } 8 | sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114 } 9 | wheels = [ 10 | { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249 }, 11 | ] 12 | 13 | [[package]] 14 | name = "disbatch" 15 | version = "2.7.dev17+g406b65a.d20241108" 16 | source = { editable = "." 
} 17 | 18 | [package.dev-dependencies] 19 | dev = [ 20 | { name = "pre-commit" }, 21 | ] 22 | 23 | [package.metadata] 24 | 25 | [package.metadata.requires-dev] 26 | dev = [{ name = "pre-commit", specifier = ">=4.0.1" }] 27 | 28 | [[package]] 29 | name = "distlib" 30 | version = "0.3.9" 31 | source = { registry = "https://pypi.org/simple" } 32 | sdist = { url = "https://files.pythonhosted.org/packages/0d/dd/1bec4c5ddb504ca60fc29472f3d27e8d4da1257a854e1d96742f15c1d02d/distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403", size = 613923 } 33 | wheels = [ 34 | { url = "https://files.pythonhosted.org/packages/91/a1/cf2472db20f7ce4a6be1253a81cfdf85ad9c7885ffbed7047fb72c24cf87/distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87", size = 468973 }, 35 | ] 36 | 37 | [[package]] 38 | name = "filelock" 39 | version = "3.16.1" 40 | source = { registry = "https://pypi.org/simple" } 41 | sdist = { url = "https://files.pythonhosted.org/packages/9d/db/3ef5bb276dae18d6ec2124224403d1d67bccdbefc17af4cc8f553e341ab1/filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435", size = 18037 } 42 | wheels = [ 43 | { url = "https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0", size = 16163 }, 44 | ] 45 | 46 | [[package]] 47 | name = "identify" 48 | version = "2.6.1" 49 | source = { registry = "https://pypi.org/simple" } 50 | sdist = { url = "https://files.pythonhosted.org/packages/29/bb/25024dbcc93516c492b75919e76f389bac754a3e4248682fba32b250c880/identify-2.6.1.tar.gz", hash = "sha256:91478c5fb7c3aac5ff7bf9b4344f803843dc586832d5f110d672b19aa1984c98", size = 99097 } 51 | wheels = [ 52 | { url = "https://files.pythonhosted.org/packages/7d/0c/4ef72754c050979fdcc06c744715ae70ea37e734816bb6514f79df77a42f/identify-2.6.1-py2.py3-none-any.whl", hash = "sha256:53863bcac7caf8d2ed85bd20312ea5dcfc22226800f6d6881f232d861db5a8f0", size = 98972 }, 53 | ] 54 | 55 | [[package]] 56 | name = "nodeenv" 57 | version = "1.9.1" 58 | source = { registry = "https://pypi.org/simple" } 59 | sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437 } 60 | wheels = [ 61 | { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314 }, 62 | ] 63 | 64 | [[package]] 65 | name = "platformdirs" 66 | version = "4.3.6" 67 | source = { registry = "https://pypi.org/simple" } 68 | sdist = { url = "https://files.pythonhosted.org/packages/13/fc/128cc9cb8f03208bdbf93d3aa862e16d376844a14f9a0ce5cf4507372de4/platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907", size = 21302 } 69 | wheels = [ 70 | { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439 }, 71 | ] 72 | 73 | [[package]] 74 | name = "pre-commit" 75 | version = 
"4.0.1" 76 | source = { registry = "https://pypi.org/simple" } 77 | dependencies = [ 78 | { name = "cfgv" }, 79 | { name = "identify" }, 80 | { name = "nodeenv" }, 81 | { name = "pyyaml" }, 82 | { name = "virtualenv" }, 83 | ] 84 | sdist = { url = "https://files.pythonhosted.org/packages/2e/c8/e22c292035f1bac8b9f5237a2622305bc0304e776080b246f3df57c4ff9f/pre_commit-4.0.1.tar.gz", hash = "sha256:80905ac375958c0444c65e9cebebd948b3cdb518f335a091a670a89d652139d2", size = 191678 } 85 | wheels = [ 86 | { url = "https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl", hash = "sha256:efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878", size = 218713 }, 87 | ] 88 | 89 | [[package]] 90 | name = "pyyaml" 91 | version = "6.0.2" 92 | source = { registry = "https://pypi.org/simple" } 93 | sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631 } 94 | wheels = [ 95 | { url = "https://files.pythonhosted.org/packages/9b/95/a3fac87cb7158e231b5a6012e438c647e1a87f09f8e0d123acec8ab8bf71/PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", size = 184199 }, 96 | { url = "https://files.pythonhosted.org/packages/c7/7a/68bd47624dab8fd4afbfd3c48e3b79efe09098ae941de5b58abcbadff5cb/PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", size = 171758 }, 97 | { url = "https://files.pythonhosted.org/packages/49/ee/14c54df452143b9ee9f0f29074d7ca5516a36edb0b4cc40c3f280131656f/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", size = 718463 }, 98 | { url = "https://files.pythonhosted.org/packages/4d/61/de363a97476e766574650d742205be468921a7b532aa2499fcd886b62530/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", size = 719280 }, 99 | { url = "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", size = 751239 }, 100 | { url = "https://files.pythonhosted.org/packages/b7/33/5504b3a9a4464893c32f118a9cc045190a91637b119a9c881da1cf6b7a72/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", size = 695802 }, 101 | { url = "https://files.pythonhosted.org/packages/5c/20/8347dcabd41ef3a3cdc4f7b7a2aff3d06598c8779faa189cdbf878b626a4/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", size = 720527 }, 102 | { url = "https://files.pythonhosted.org/packages/be/aa/5afe99233fb360d0ff37377145a949ae258aaab831bde4792b32650a4378/PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", size = 144052 }, 103 | { url = "https://files.pythonhosted.org/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = 
"sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", size = 161774 }, 104 | { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612 }, 105 | { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040 }, 106 | { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829 }, 107 | { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167 }, 108 | { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952 }, 109 | { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301 }, 110 | { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638 }, 111 | { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850 }, 112 | { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980 }, 113 | { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873 }, 114 | { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302 }, 115 | { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154 }, 116 | { url = 
"https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223 }, 117 | { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542 }, 118 | { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164 }, 119 | { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611 }, 120 | { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591 }, 121 | { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338 }, 122 | { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309 }, 123 | { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679 }, 124 | { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428 }, 125 | { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361 }, 126 | { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523 }, 127 | { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660 }, 128 | { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 }, 129 | { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, 130 | { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, 131 | { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777 }, 132 | { url = "https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318 }, 133 | { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891 }, 134 | { url = "https://files.pythonhosted.org/packages/e9/6c/6e1b7f40181bc4805e2e07f4abc10a88ce4648e7e95ff1abe4ae4014a9b2/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12", size = 722614 }, 135 | { url = "https://files.pythonhosted.org/packages/3d/32/e7bd8535d22ea2874cef6a81021ba019474ace0d13a4819c2a4bce79bd6a/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19", size = 737360 }, 136 | { url = "https://files.pythonhosted.org/packages/d7/12/7322c1e30b9be969670b672573d45479edef72c9a0deac3bb2868f5d7469/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e", size = 699006 }, 137 | { url = "https://files.pythonhosted.org/packages/82/72/04fcad41ca56491995076630c3ec1e834be241664c0c09a64c9a2589b507/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725", size = 723577 }, 138 | { url = "https://files.pythonhosted.org/packages/ed/5e/46168b1f2757f1fcd442bc3029cd8767d88a98c9c05770d8b420948743bb/PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631", size = 144593 }, 139 | { url = "https://files.pythonhosted.org/packages/19/87/5124b1c1f2412bb95c59ec481eaf936cd32f0fe2a7b16b97b81c4c017a6a/PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8", size = 162312 }, 140 | ] 141 | 142 | [[package]] 143 | name = "virtualenv" 144 | version = "20.27.1" 145 | source = { registry = "https://pypi.org/simple" } 146 | dependencies = [ 147 | { name = "distlib" }, 148 | { name = "filelock" }, 149 | { name = "platformdirs" }, 150 | ] 151 | sdist = { url = "https://files.pythonhosted.org/packages/8c/b3/7b6a79c5c8cf6d90ea681310e169cf2db2884f4d583d16c6e1d5a75a4e04/virtualenv-20.27.1.tar.gz", hash = 
"sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba", size = 6491145 } 152 | wheels = [ 153 | { url = "https://files.pythonhosted.org/packages/ae/92/78324ff89391e00c8f4cf6b8526c41c6ef36b4ea2d2c132250b1a6fc2b8d/virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4", size = 3117838 }, 154 | ] 155 | --------------------------------------------------------------------------------