├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── coverage.yml │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── ABOUT.md ├── LICENSE ├── README.md ├── imgs └── example.png ├── mypy.ini ├── noxfile.py ├── poetry.lock ├── pyproject.toml ├── src └── reportseff │ ├── __init__.py │ ├── console.py │ ├── db_inquirer.py │ ├── job.py │ ├── job_collection.py │ ├── output_renderer.py │ └── parameters.py └── tests ├── conftest.py ├── test_db_inquirer.py ├── test_job.py ├── test_job_collection.py ├── test_output_renderer.py └── test_reportseff.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = ANN,B,B9,BLK,C,D,DAR,E,F,I,W 3 | max-complexity = 10 4 | ignore = E203,W503,E501,ANN101,ANN401 5 | max-line-length = 80 6 | application-import-names= reportseff,tests 7 | import-order-style = google 8 | docstring-convention = google 9 | per-file-ignores = 10 | tests/*:S101,ANN,DAR 11 | noxfile.py:ANN,DAR 12 | src/reportseff/console.py:DAR101 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: troycomi 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Version [e.g. 
22] 29 | 30 | **Debug Output** 31 | ```bash 32 | reportseff --debug [other options] 33 | # paste result here 34 | ``` 35 | 36 | **Additional context** 37 | Add any other context about the problem here. 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: troycomi 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | on: [push, pull_request] 3 | jobs: 4 | coverage: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v3 8 | - uses: wntrblm/nox@2022.11.21 9 | with: 10 | python-versions: "3.9, 3.10, 3.11" 11 | - run: pipx install poetry==1.3.1 12 | - run: pipx inject poetry poetry-plugin-export 13 | - run: nox --sessions tests-3.10 coverage 14 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | release: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: wntrblm/nox@2022.11.21 11 | with: 12 | python-versions: "3.9, 3.10, 3.11" 13 | - run: pipx install poetry==1.3.1 14 | - run: pipx inject poetry poetry-plugin-export 15 | - run: nox 16 | - run: poetry build 17 | - run: poetry publish --username=__token__ --password=${{ secrets.PYPI_TOKEN }} 18 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request] 3 | jobs: 4 | tests: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v3 8 | - uses: wntrblm/nox@2022.11.21 9 | with: 10 | python-versions: "3.9, 3.10, 3.11" 11 | - run: pipx install poetry==1.3.1 12 | - run: pipx inject poetry poetry-plugin-export 13 | - run: nox 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.egg-info 3 | *.swp 4 | 
-------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: "v5.0.0" 4 | hooks: 5 | - id: check-added-large-files 6 | - id: check-case-conflict 7 | - id: check-merge-conflict 8 | - id: check-symlinks 9 | - id: check-yaml 10 | - id: debug-statements 11 | - id: end-of-file-fixer 12 | - id: mixed-line-ending 13 | - id: name-tests-test 14 | args: ["--pytest-test-first"] 15 | - id: requirements-txt-fixer 16 | - id: trailing-whitespace 17 | 18 | - repo: https://github.com/pre-commit/pygrep-hooks 19 | rev: v1.9.0 20 | hooks: 21 | - id: python-check-blanket-noqa 22 | - id: python-no-eval 23 | - id: python-use-type-annotations 24 | - id: python-check-blanket-type-ignore 25 | - id: python-check-mock-methods 26 | 27 | - repo: https://github.com/codespell-project/codespell 28 | rev: v2.1.0 29 | hooks: 30 | - id: codespell 31 | args: [--ignore-words-list, "absense,inout"] 32 | 33 | - repo: https://github.com/shellcheck-py/shellcheck-py 34 | rev: v0.8.0.4 35 | hooks: 36 | - id: shellcheck 37 | 38 | - repo: https://github.com/astral-sh/ruff-pre-commit 39 | rev: v0.5.5 40 | hooks: 41 | - id: ruff 42 | args: [ --fix ] 43 | - id: ruff-format 44 | -------------------------------------------------------------------------------- /ABOUT.md: -------------------------------------------------------------------------------- 1 | # Monitoring slurm efficiency with reportseff 2 | 3 | > Troy Comi 4 | 5 | ## Motivation 6 | 7 | As I started using Snakemake, I had hundreds of jobs which I wanted to get 8 | performance information about. seff gave the efficiency information I wanted, 9 | but for only a single job at a time. `sacct` handles multiple jobs, but couldn't 10 | give the efficiency. 
With the current python implementation, 11 | all job information is obtained from a single 12 | `sacct` call and with click the output is colored to quickly see how things are 13 | running. (But color isn't displayed below due to markdown limitations). 14 | 15 | ## Be good to your scheduler 16 | 17 | ### An introduction to scheduling efficiency 18 | 19 | Have you ever hosted an event that had to provide food? Perhaps you sent out 20 | RSVP's to estimate how many people would attend, guessed a handful of people 21 | would show up but not respond, and ordered some pizza. If you ordered enough 22 | food for 20 people and 18 showed, that would be a pizza efficiency of 90%. 23 | But what if only 2 people showed up? Or 30? As extreme as these numbers seem, 24 | memory and cpu usage efficiencies around 10% are not uncommon. 25 | 26 | The goal of a scheduler is to take the user-provided resource 27 | estimates for many jobs and decide who runs when. Let's say I have a small 28 | cluster with 64 cores, 128 GB of memory and want to run an array job of 29 | single-core processes with an estimated memory usage of 4 GB. The scheduler 30 | will allow only 32 jobs to run at once (128 GB / 4 GB) leaving half of the 31 | cores idling. If I actually only use 1 GB of memory, 64 jobs could be running 32 | instead. 33 | 34 | **Good jobs use the resources they promise to.** 35 | 36 | In practice, many more details of the system and user are incorporated into 37 | the decision to schedule a job. Once the scheduler decides a job will run, 38 | the scheduler has to dispatch the job. The overhead associated with scheduling 39 | only makes sense if the job will run for longer than a few minutes. Instead of 40 | submitting 1000 jobs that perform 1 minute of work, group 100 subprocesses 41 | together as 10 jobs with 100 minutes of work. 
42 | 43 | **Good jobs run long enough to matter.** 44 | 45 | If every job on a cluster is efficient and long-running, the scheduler can 46 | make accurate decisions on execution order and keep usage high. 47 | 48 | ### Why does it matter as a user? 49 | 50 | "But my qos only allows 2 jobs to run at once if the time 51 | is less than 2 hours! Can't I say my 10 minute job will take 2 hours?" Yes, 52 | but it is *rude* to the scheduler. If that doesn't sway you, improperly 53 | estimating resource usage can: 54 | 55 | - Decrease your priority for subsequent jobs. 56 | - Cause your account to be charged for the full, estimated usage. 57 | - Have fewer of your jobs running simultaneously. 58 | - Make it harder to fit your job into the available cluster resources, 59 | increasing the queue time. 60 | 61 | ### Monitoring efficiency 62 | 63 | Before releasing a swarm of jobs, check the estimated vs predicted usage. 64 | Tune your parameters to improve efficiency. 65 | 66 | [Seff](https://github.com/SchedMD/slurm/tree/master/contribs/seff) provides 67 | efficiency estimates for a single job. But to look at your usage 68 | for many jobs or monitor usage, I wrote 69 | [reportseff](https://github.com/troycomi/reportseff). It polls `sacct` 70 | and calculates the same efficiency information as seff, but outputs 71 | a tabular report. 72 | 73 | During testing, I looked at random ranges of jobids on a Princeton cluster. 
74 | Here is some typical output, with jobids modified to protect the innocent: 75 | 76 | ```txt 77 | Name State Time CPU Memory 78 | XXXXX000 COMPLETED 00:01:53 97.3% 14.0% 79 | XXXXX001 COMPLETED 00:02:19 84.2% 14.0% 80 | XXXXX002 COMPLETED 00:06:33 28.2% 14.0% 81 | XXXXX003 COMPLETED 00:04:59 39.1% 14.0% 82 | XXXXX004 COMPLETED 00:02:31 97.4% 9.2% 83 | XXXXX005 COMPLETED 00:02:38 98.1% 9.1% 84 | XXXXX006 COMPLETED 00:02:24 97.2% 9.1% 85 | XXXXX007 COMPLETED 00:02:40 98.1% 9.0% 86 | XXXXX008 COMPLETED 00:02:39 96.2% 9.1% 87 | XXXXX009 COMPLETED 00:02:45 96.4% 9.0% 88 | XXXXX012 COMPLETED 00:00:53 58.5% 10.6% 89 | XXXXX013 COMPLETED 00:02:13 38.3% 10.6% 90 | XXXXX014 COMPLETED 00:37:02 44.9% 10.6% 91 | XXXXX015 COMPLETED 00:44:33 34.0% 10.6% 92 | XXXXX016 COMPLETED 00:38:29 29.6% 10.7% 93 | XXXXX017 COMPLETED 00:19:57 74.5% 10.8% 94 | XXXXX018 COMPLETED 00:14:25 95.0% 10.8% 95 | XXXXX019 COMPLETED 00:35:38 2.6% 10.6% 96 | XXXXX020 COMPLETED 00:02:16 38.2% 10.6% 97 | XXXXX021 COMPLETED 00:02:34 46.1% 10.9% 98 | XXXXX022 COMPLETED 00:20:53 7.1% 10.6% 99 | XXXXX023 COMPLETED 00:01:00 95.0% 11.1% 100 | XXXXX024 COMPLETED 00:09:06 88.5% 10.5% 101 | XXXXX025 COMPLETED 00:08:08 95.3% 10.6% 102 | ``` 103 | 104 | This is from at least 3 different users across departments. 105 | 106 | Notice how short the jobs are (most <5 minutes) and how little memory is used, 107 | about 500 MB of 4 GB in most cases. Another example is jobs with 4 cores using 108 | 25% of CPU. Though batching together short jobs is slightly difficult (nested 109 | for loops with some arithmetic), using the correct number of cores and cutting 110 | memory to improve usage is a simple fix. 111 | 112 | Try it out and see if you have been good to your scheduler! 
113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 troycomi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/troycomi/reportseff/workflows/Tests/badge.svg)](https://github.com/troycomi/reportseff/actions?workflow=Tests) 2 | [![codecov](https://codecov.io/gh/troycomi/reportseff/branch/main/graph/badge.svg)](https://codecov.io/gh/troycomi/reportseff) 3 | [![PyPI](https://img.shields.io/pypi/v/reportseff.svg)](https://pypi.org/project/reportseff/) 4 | [![DOI](https://img.shields.io/badge/doi-10.1145/3569951.3604396-blue.svg?style=flat&labelColor=whitesmoke&logo=data%3Aimage%2Fpng%3Bbase64%2CiVBORw0KGgoAAAANSUhEUgAAAB8AAAAfCAYAAAAfrhY5AAAJsklEQVR42qWXd1DTaRrHf%2BiB2Hdt5zhrAUKz4IKEYu9IGiGFFJJQ0gkJCAKiWFDWBRdFhCQUF3UVdeVcRQEBxUI3yY9iEnQHb3bdW1fPubnyz%2F11M7lvEHfOQee2ZOYzPyDv%2B3yf9%2Fk95YX4fx%2BltfUt08GcFEuPR4U9hDDZ%2FVngIlhb%2FSiI6InkTgLzgDcgfvtnovhH4BzoVlrbwr55QnhCtBW4QHXnFrZbPBaQoBh4%2FSYH2EnpBEtqcDMVzB93wA%2F8AFwa23XFGcc8CkT3mxz%2BfXWtq9T9IQlLIXYEuHojudb%2BCM7Hgdq8ydi%2FAHiBXyY%2BLjwFlAEnS6Jnar%2FvnQVhvdzasad0eKvWZKe8hvDB2ofLZ%2FZEcWsh%2BhyIuyO5Bxs2iZIE4nRv7NWAb0EO8AC%2FWPxjYAWuOEX2MSXZVgPxzmRL3xKz3ScGpx6p6QnOx4mDIFqO0w6Q4fEhO5IzwxlSwyD2FYHzwAW%2BAZ4fEsf74gCumykwNHskLM7taQxLYjjIyy8MUtraGhTWdkfhkFJqtvuVl%2F9l2ZquDfEyrH8B0W06nnpH3JtIyRGpH1iJ6SfxDIHjRXHJmdQjLpfHeN54gnfFx4W9QRnovx%2FN20aXZeTD2J84hn3%2BqoF2Tqr14VqTPUCIcP%2B5%2Fly4qC%2BUL3sYxSvNj1NwsVYPsWdMUfomsdkYm3Tj0nbV0N1wRKwFe1MgKACDIBdMAhPE%2FwicwNWxll8Ag40w%2BFfhibJkGHmutjYeQ8gVlaN%2BjO51nDysa9TwNUFMqaGbKdRJZFfOJSp6mkRKsv0rRIpEVWjAvyFkxNOEpwvcAVPfEe%2Bl8ojeNTx3nXLBcWRrYGxSRjDEk0VlpxYrbe1ZmaQ5xuT0u3r%2B2qe5j0J5uytiZPGsRL2Jm32AldpxPUNJ3jmmsN4x62z1cXrbedXBQf2yvIFCeZrtyicZZG2U2nrrBJzYorI2EXLrvTfCSB43s41PKEvbZDEfQby6L4JTj%2FfIwam%2B4%2BwucBu%2BDgNK05Nle1rSt9HvR%2FKPC4U6LTfvUIaip1mjIa8fPzykii23h2eanT57zQ7fsyYH5QjywwlooAUcAdOh5QumgTHx6aAO7%2FL52eaQNEShrxfhL6albEDmfhGflrsT4tps8gTHNOJbeDeBlt0WJWDHSgxs6cW6lQqyg1FpD5ZVDfhn1HYF
F1y4Eiaqa18pQf3zzYMBhcanlBjYfgWNayAf%2FASOgklu8bmgD7hADrk4cRlOL7NSOewEcbqSmaivT33QuFdHXj5sdvjlN5yMDrAECmdgDWG2L8P%2BAKLs9ZLZ7dJda%2BB4Xl84t7QvnKfvpXJv9obz2KgK8dXyqISyV0sXGZ0U47hOA%2FAiigbEMECJxC9aoKp86re5O5prxOlHkcksutSQJzxZRlPZmrOKhsQBF5zEZKybUC0vVjG8PqOnhOq46qyDTDnj5gZBriWCk4DvXrudQnXQmnXblebhAC2cCB6zIbM4PYgGl0elPSgIf3iFEA21aLdHYLHUQuVkpgi02SxFdrG862Y8ymYGMvXDzUmiX8DS5vKZyZlGmsSgQqfLub5RyLNS4zfDiZc9Edzh%2FtCE%2BX8j9k%2FqWB071rcZyMImne1SLkL4GRw4UPHMV3jjwEYpPG5uW5fAEot0aTSJnsGAwHJi2nvF1Y5OIqWziVCQd5NT7t6Q8guOSpgS%2Fa1dSRn8JGGaCD3BPXDyQRG4Bqhu8XrgAp0yy8DMSvvyVXDgJcJTcr1wQ2BvFKf65jqhvmxXUuDpGBlRvV36XvGjQzLi8KAKT2lYOnmxQPGorURSV0NhyTIuIyqOmKTMhQ%2BieEsgOgpc4KBbfDM4B3SIgFljvfHF6cef7qpyLBXAiQcXvg5l3Iunp%2FWv4dH6qFziO%2BL9PbrimQ9RY6MQphEfGUpOmma7KkGzuS8sPUFnCtIYcKCaI9EXo4HlQLgGrBjbiK5EqMj2AKWt9QWcIFMtnVvQVDQV9lXJJqdPVtUQpbh6gCI2Ov1nvZts7yYdsnvRgxiWFOtNJcOMVLn1vgptVi6qrNiFOfEjHCDB3J%2BHDLqUB77YgQGwX%2Fb1eYna3hGKdlqJKIyiE4nSbV8VFgxmxR4b5mVkkeUhMgs5YTi4ja2XZ009xJRHdkfwMi%2BfocaancuO7h%2FMlcLOa0V%2FSw6Dq47CumRQAKhgbOP8t%2BMTjuxjJGhXCY6XpmDDFqWlVYbQ1aDJ5Cptdw4oLbf3Ck%2BdWkVP0LpH7s9XLPXI%2FQX8ws%2Bj2In63IcRvOOo%2BTTjiN%2BlssfRsanW%2B3REVKoavBOAPTXABW4AL7e4NygHdpAKBscmlDh9Jysp4wxbnUNna3L3xBvyE1jyrGIkUHaqQMuxhHElV6oj1picvgL1QEuS5PyZTEaivqh5vUCKJqOuIgPFGESns8kyFk7%2FDxyima3cYxi%2FYOQCj%2F%2B9Ms2Ll%2Bhn4FmKnl7JkGXQGDKDAz9rUGL1TIlBpuJr9Be2JjK6qPzyDg495UxXYF7JY1qKimw9jWjF0iV6DRIqE%2B%2FeWG0J2ofmZTk0mLYVd4GLiFCOoKR0Cg727tWq981InYynvCuKW43aXgEjofVbxIqrm0VL76zlH3gQzWP3R3Bv9oXxclrlO7VVtgBRpSP4hMFWJ8BrUSBCJXC07l40X4jWuvtc42ofNCxtlX2JH6bdeojXgTh5TxOBKEyY5wvBE%2BACh8BtOPNPkApjoxi5h%2B%2FFMQQNpWvZaMH7MKFu5Ax8HoCQdmGkJrtnOiLHwD3uS5y8%2F2xTSDrE%2F4PT1yqtt6vGe8ldMBVMEPd6KwqiYECHDlfbvzphcWP%2BJiZuL5swoWQYlS%2Br7Yu5mNUiGD2retxBi9fl6RDGn4Ti9B1oyYy%2BMP5G87D%2FCpRlvdnuy0PY6RC8BzTA40NXqckQ9TaOUDywkYsudxJzPgyDoAWn%2BB6nEFbaVxxC6UXjJiuDkW9TWq7uRBOJocky9iMfUhGpv%2FdQuVVIuGjYqACbXf8aa%2BPeYNIHZsM7l4s5gAQuUAzRUoT51hnH3EWofXf2vkD5HJJ33vwE%2FaEWp36GHr6GpMaH4AAPuqM5eabH%2FhfG9zcCz4nN6cPinuAw6IHwtvyB%2FdO1to
ZciBaPh25U0ducR2PI3Zl7mokyLWKkSnEDOg1x5fCsJE9EKhH7HwFNhWMGMS7%2BqxyYsbHHRUDUH4I%2FAheQY7wujJNnFUH4KdCju83riuQeHU9WEqNzjsJFuF%2FdTDAZ%2FK7%2F1WaAU%2BAWymT59pVMT4g2AxcwNa0XEBDdBDpAPvgDIH73R25teeuAF5ime2Ul0OUIiG4GpSAEJeYW9wDTf43wfwHgHLKJoPznkwAAAABJRU5ErkJggg%3D%3D)](https://doi.org/10.1145/3569951.3604396) 5 | 6 | # `reportseff` 7 | 8 | > A python script for tabular display of slurm efficiency information 9 | 10 | ![Example](https://github.com/troycomi/reportseff/raw/main/imgs/example.png) 11 | 12 | ## About 13 | 14 | ### Motivation 15 | 16 | Whether a sys admin or cluster user, knowing how well you are estimating job 17 | resources can help streamline job scheduling and maximize your priority. If you 18 | have ever tried to use `sacct` you probably had some trouble interpreting the 19 | output. While `seff` or `jobstats` can provide detailed summaries, they don't 20 | scale easily to array jobs or offer a way to see all the jobs from a single 21 | user. `reportseff` aims to fill this role. Read more about the [motivation 22 | for reportseff](https://github.com/troycomi/reportseff/blob/main/ABOUT.md). 23 | 24 | ### Audience 25 | 26 | If you are running more than one slurm job at a time, you should try 27 | `reportseff`. Users of HPC systems can get an idea how well they estimate 28 | resource usage. By tuning these values, you can get scheduled earlier and not 29 | be penalized for unused allocations. Since `reportseff` can parse job ids from 30 | slurm output files, it simplifies the task of identifying which jobs have 31 | failed and why. Sys admins can pipe `reportseff` output to identify users with 32 | poor utilization or produce summaries at the end of a billing cycle. 33 | 34 | ### Implementation 35 | 36 | `reportseff` is a wrapper around `sacct` that provides more complex option 37 | parsing, simpler options, and cleaner, colored outputs. All querying is 38 | performed in a single call to `sacct` and should have similar performance. 
39 | Multi-node and GPU utilization is acquired from information contained in the 40 | `AdminComment` field, as generated by `jobstats`. 41 | 42 | ## Usage 43 | 44 | ### Installation 45 | 46 | `reportseff` runs on python >= 3.6. 47 | The only external dependency is click (>= 6.7). 48 | Calling 49 | 50 | ```sh 51 | pip install --user reportseff 52 | # OR 53 | pipx install reportseff 54 | ``` 55 | 56 | will create command line bindings and install click. 57 | 58 | ### Sample Usage 59 | 60 | Try `reportseff -u $USER` or just `reportseff` in a directory with some slurm 61 | outputs. You may be surprised by your results! 62 | 63 | #### Single job 64 | 65 | Calling `reportseff` with a single jobid will provide equivalent information to 66 | seff for that job. `reportseff 24371789` and `reportseff map_fastq_24371789` 67 | produce the following output: 68 | 69 | ```txt 70 | JobID State Elapsed CPUEff MemEff 71 | 24371789 COMPLETED 03:08:03 71.2% 45.7% 72 | ``` 73 | 74 | #### Single array job 75 | 76 | Providing either the raw job id or the array job id will get efficiency 77 | information for a single element of the array job. `reportseff 24220929_421` 78 | and `reportseff 24221219` generate: 79 | 80 | ```txt 81 | JobID State Elapsed CPUEff MemEff 82 | 24220929_421 COMPLETED 00:09:34 99.0% 34.6% 83 | ``` 84 | 85 | #### Array job group 86 | 87 | If the base job id of an array is provided, all elements of the array will 88 | be added to the output. `reportseff 24220929` 89 | 90 | ```txt 91 | JobID State Elapsed CPUEff MemEff 92 | 24220929_1 COMPLETED 00:10:43 99.2% 33.4% 93 | 24220929_11 COMPLETED 00:10:10 99.2% 37.5% 94 | 24220929_21 COMPLETED 00:09:25 98.8% 36.1% 95 | 24220929_31 COMPLETED 00:09:19 98.9% 33.3% 96 | 24220929_41 COMPLETED 00:09:23 98.9% 33.3% 97 | 24220929_51 COMPLETED 00:08:02 98.5% 36.3% 98 | ... 
99 | 24220929_951 COMPLETED 00:25:12 99.5% 33.5% 100 | 24220929_961 COMPLETED 00:39:26 99.7% 34.1% 101 | 24220929_971 COMPLETED 00:24:11 99.5% 34.2% 102 | 24220929_981 COMPLETED 00:24:50 99.5% 44.3% 103 | 24220929_991 COMPLETED 00:13:05 98.7% 33.7% 104 | ``` 105 | 106 | #### Glob expansion of slurm outputs 107 | 108 | Because slurm output files can act as job id inputs, the following can 109 | get all seff information for a given job name: 110 | 111 | ```txt 112 | slurm_out ❯❯❯ reportseff split_ubam_24* 113 | JobID State Elapsed CPUEff MemEff 114 | split_ubam_24342816 COMPLETED 23:30:32 99.9% 4.5% 115 | split_ubam_24342914 COMPLETED 22:40:51 99.9% 4.6% 116 | split_ubam_24393599 COMPLETED 23:43:36 99.4% 4.4% 117 | split_ubam_24393655 COMPLETED 21:36:58 99.3% 4.5% 118 | split_ubam_24418960 RUNNING 02:53:11 --- --- 119 | split_ubam_24419972 RUNNING 01:26:26 --- --- 120 | ``` 121 | 122 | #### No arguments 123 | 124 | Without arguments, reportseff will try to find slurm output files in the 125 | current directory. Combine with `watch` to monitor job progress: 126 | `watch -cn 300 reportseff --color --modified-sort` 127 | 128 | ```txt 129 | JobID State Elapsed CPUEff MemEff 130 | split_ubam_24418960 RUNNING 02:56:14 --- --- 131 | fastq_to_ubam_24419971 RUNNING 01:29:29 --- --- 132 | split_ubam_24419972 RUNNING 01:29:29 --- --- 133 | fastq_to_ubam_24393600 COMPLETED 1-02:00:47 58.3% 41.1% 134 | map_fastq_24419330 RUNNING 02:14:53 --- --- 135 | map_fastq_24419323 RUNNING 02:15:24 --- --- 136 | map_fastq_24419324 RUNNING 02:15:24 --- --- 137 | map_fastq_24419322 RUNNING 02:15:24 --- --- 138 | mark_adapters_24418437 COMPLETED 01:29:23 99.8% 48.2% 139 | mark_adapters_24418436 COMPLETED 01:29:03 99.9% 47.4% 140 | ``` 141 | 142 | #### Filtering slurm output files 143 | 144 | One useful application of `reportseff` is filtering a directory of slurm output 145 | files based on the state or time since running. 
Additionally, if only the 146 | `jobid` is specified as a format output, the filenames will be returned in a 147 | pipe-friendly manner: 148 | 149 | ```txt 150 | old_runs ❯❯❯ reportseff --since d=4 --state Timeout 151 | 152 | JobID State Elapsed CPUEff MemEff 153 | call_variants_31550458 TIMEOUT 20:05:17 99.5% 0.0% 154 | call_variants_31550474 TIMEOUT 20:05:17 99.6% 0.0% 155 | call_variants_31550500 TIMEOUT 20:05:08 99.7% 0.0% 156 | old_runs ❯❯❯ reportseff --since d=4 --state Timeout --format jobid 157 | call_variants_31550458 158 | call_variants_31550474 159 | call_variants_31550500 160 | ``` 161 | 162 | To find all lines with `output:` in jobs which have timed out or failed 163 | in the last 4 days: 164 | 165 | ```sh 166 | reportseff --since 'd=4' --state TO,F --format jobid | xargs grep output: 167 | ``` 168 | 169 | ### Arguments 170 | 171 | Jobs can be passed as arguments in the following ways: 172 | 173 | - Job ID such as 1234567. If the id is part of an array job, only the element 174 | for that ID will be displayed. If the id is the base part of an array job, 175 | all elements in the array will be displayed. 176 | - Array Job ID such as 1234567\_89. Will display only the element specified. 177 | - Slurm output file. Format must be BASE\_%A\_%a. BASE is optional as is a 178 | '.out' suffix. Unix glob expansions can also be used to filter which jobs 179 | are displayed. 180 | - From current directory. If no argument is supplied, `reportseff` will attempt 181 | to find slurm output files in the current directory as described above. 182 | If a user is provided, instead `reportseff` will show recent jobs for that user. 183 | If only `since` is set, all recent jobs for all users will be shown (if allowed). 184 | - Supplying a directory as a single argument will override the current 185 | directory to check for slurm outputs. 186 | 187 | ### Options 188 | 189 | - `--color/--no-color`: Force color output or not. By default, will force color 190 | output. 
With the no-color flag, click will strip color codes for everything 191 | besides stdout. 192 | - `--modified-sort`: Instead of sorting by filename/jobid, sort by last 193 | modification time of the slurm output file. 194 | - `--debug`: Write sacct result to stderr. 195 | - `--user/-u`: Ignore job arguments and instead query sacct with provided user. 196 | Returns all jobs from the last week. 197 | - `--state/-s`: Output only jobs with states matching one of the provided options. 198 | Accepts comma separated values of job codes (e.g. 'R') or full names 199 | (e.g. RUNNING). Case insensitive. 200 | - `--not-state/-S`: Output only jobs with states not matching any of the provided options. 201 | Accepts comma separated values of job codes (e.g. 'R') or full names 202 | (e.g. RUNNING). Case insensitive. 203 | - `--format`: Provide a comma separated list of columns to produce. Prefixing the 204 | argument with `+` adds the specified values to the defaults. Values can 205 | be any valid column name to sacct and the custom efficiency values: TimeEff, 206 | cpuEff, MemEff. Can also optionally set alignment (<, ^, >) and maximum width. 207 | Default is center-aligned with a width of the maximum column entry. For 208 | example, `--format 'jobid%>,state%10,memeff%<5'` produces 3 columns with: 209 | - JobId aligned right, width set automatically 210 | - State with width 10 (center aligned by default) 211 | - MemEff aligned left, width 5 212 | - `--slurm-format`: The filename pattern passed to sbatch during job submission. 213 | Overrides the default regex for job id parsing from filenames. E.g. to match 214 | filenames like `123456.out` set `--slurm-format %j.out`. 215 | - `--since`: Limit results to those occurring after the specified time. Accepts 216 | sacct formats and a comma separated list of key/value pairs. To get jobs in 217 | the last hour and a half, can pass `h=1,m=30`. 218 | -`--until`: Limit results to those occurring before the specified time. 
Accepts 219 | sacct formats and a comma separated list of key/value pairs. 220 | Useful in combination with the 'since' option to query a specific range. 221 | - `--partition`: Limit results to a specific partition. 222 | - `--cluster/-M`: Select specific cluster (for multi-cluster systems) 223 | - `--node/-n`: Display information for multi-node jobs; requires additional 224 | sacct fields from jobstats. 225 | - `--node-and-gpu/-g`: Display information for multi-node jobs and GPU information; 226 | requires additional sacct fields from jobstats. 227 | - `--parsable/-p`: Ignore formatting and output as a `|` delimited table. Useful 228 | for piping into more complex analyses. 229 | 230 | ## Status, Contributions, and Support 231 | 232 | `reportseff` is actively maintained but currently feature complete. If there 233 | is a function missing, please open an issue to discuss its merit! 234 | 235 | Bug reports, pull requests, and any feedback are welcome! Prior to submitting 236 | a pull request, be sure any new features have been tested and all unit tests 237 | are passing. In the cloned repo with 238 | [poetry](https://github.com/python-poetry/poetry#installation) installed: 239 | 240 | ```sh 241 | poetry install 242 | poetry run pytest 243 | poetry run pre-commit install 244 | nox 245 | ``` 246 | 247 | ## Troubleshooting 248 | 249 | ### I can't install, what is pip? 250 | 251 | [pip](https://pip.pypa.io/en/stable/) is the package installer for python. If 252 | you get an error that pip isn't found, look for a python/anaconda/conda module. 253 | [pipx](https://pypa.github.io/pipx/) ensures that each application is installed 254 | in an isolated environment. This resolves issues of dependency versions and 255 | allows applications to be run from any environment. 256 | 257 | ### The output has no color with many jobs! 258 | 259 | Click should determine if the output supports color display and react automatically 260 | in a way you expect. 
Check that your terminal is set up to display colors and 261 | that your pager (probably less) will display color by default. Some commands, 262 | e.g. `watch` aren't handled properly even when invoked to support color. Here 263 | are some useful settings for your `.bashrc`: 264 | ``` 265 | # have less display colors by default. Will fix `reportseff` not showing colors 266 | export LESS="-R" 267 | # for watch aliases, include the `--color` option 268 | watch -cn 300 reportseff --color --modified-sort 269 | # ^ ^^^^^^^ 270 | ``` 271 | You can always force display of color (or suppress it) with the `--color/--no-color` 272 | options. 273 | 274 | ### I get an error about broken pipes when chaining to other commands 275 | 276 | Python will report that the consumer of process output has closed the stream 277 | (i.e. the pipe) while still attempting to write. Newer versions of click 278 | should suppress the warning output, but it seems to not always work. Besides 279 | some extra printing on stderr, the output is not affected. 280 | 281 | ### My jobs don't have any information about multiple nodes or GPU efficiency 282 | 283 | Because `sacct` doesn't currently record this information, `reportseff` 284 | retrieves it from a custom field from `jobstats`, developed at Princeton 285 | University. If you are outside a Research Computing cluster, that information 286 | will likely be absent. Node-level reporting is only shown for jobs which use 287 | multiple nodes or GPUs. If you need a list of where jobs were run, you can add 288 | `--format +NodeList`. 289 | 290 | ## Acknowledgments 291 | 292 | The code for calling sacct and parsing the returning information was taken 293 | from [Slurmee](https://github.com/PrincetonUniversity/slurmee). 
294 | 295 | Style and tooling from [hypermodern python](https://cjolowicz.github.io/posts/hypermodern-python-01-setup/) 296 | 297 | Code review provided from a [repo-review](https://researchcomputing.princeton.edu/services/repo-review-consultations) 298 | which vastly improved this readme. 299 | -------------------------------------------------------------------------------- /imgs/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/troycomi/reportseff/9dd39ecf52db79cafa5f7c32716f9d2af7d63fa6/imgs/example.png -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | 3 | [mypy-nox.*,pytest] 4 | ignore_missing_imports = True 5 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Nox sessions.""" 2 | 3 | import tempfile 4 | 5 | import nox 6 | 7 | locations = "src", "tests", "noxfile.py" 8 | nox.options.sessions = "lint", "pip_audit", "mypy", "pytype", "tests", "tests_old_click" 9 | package = "reportseff" 10 | 11 | 12 | def install_with_constraints(session, *args, **kwargs): 13 | """Install packages with poetry's lock file.""" 14 | with tempfile.NamedTemporaryFile() as requirements: 15 | session.run( 16 | "poetry", 17 | "export", 18 | "--with", 19 | "dev", 20 | "--format=requirements.txt", 21 | "--without-hashes", 22 | f"--output={requirements.name}", 23 | external=True, 24 | ) 25 | # strip extras 26 | session.run( 27 | "sed", 28 | "-i", 29 | r"s/\[.*\]//g", 30 | f"{requirements.name}", 31 | external=True, 32 | ) 33 | session.install(f"--constraint={requirements.name}", *args, **kwargs) 34 | 35 | 36 | @nox.session(python=["3.9", "3.10", "3.11"]) 37 | def tests(session): 38 | """Run test suite with pytest and coverage.""" 39 | args = 
session.posargs 40 | session.install(".") 41 | install_with_constraints( 42 | session, "coverage[toml]", "pytest", "pytest-cov", "pytest-mock" 43 | ) 44 | session.run("pytest", "--cov", *args) 45 | 46 | 47 | @nox.session(python=["3.9", "3.10", "3.11"]) 48 | def tests_old_click(session): 49 | """Run test suite with pytest and coverage, using click 6.7.""" 50 | args = session.posargs 51 | session.install(".") 52 | session.run("pip", "install", "click==6.7") 53 | install_with_constraints( 54 | session, "coverage[toml]", "pytest", "pytest-cov", "pytest-mock" 55 | ) 56 | session.run("pytest", "--cov", *args) 57 | 58 | 59 | @nox.session(python="3.10") 60 | def black(session): 61 | """Format code with black.""" 62 | args = session.posargs or locations 63 | install_with_constraints(session, "ruff") 64 | session.run("ruff", "format", *args) 65 | 66 | 67 | @nox.session(python="3.10") 68 | def lint(session): 69 | """Lint code with ruff.""" 70 | args = session.posargs or locations 71 | install_with_constraints( 72 | session, 73 | "ruff", 74 | ) 75 | session.run("ruff", "check", *args) 76 | 77 | 78 | @nox.session(python="3.10") 79 | def pip_audit(session): 80 | """Scan dependencies for insecure packages.""" 81 | with tempfile.NamedTemporaryFile() as requirements: 82 | session.run( 83 | "poetry", 84 | "export", 85 | "--with", 86 | "dev", 87 | "--format=requirements.txt", 88 | "--without-hashes", 89 | f"--output={requirements.name}", 90 | external=True, 91 | ) 92 | install_with_constraints(session, "pip-audit") 93 | session.run( 94 | "pip-audit", 95 | "-r", 96 | requirements.name, 97 | ) 98 | 99 | 100 | @nox.session(python=["3.9", "3.10"]) 101 | def mypy(session): 102 | """Type-check with mypy.""" 103 | args = session.posargs or locations 104 | install_with_constraints(session, "mypy", "types-click") 105 | session.run("mypy", *args) 106 | 107 | 108 | @nox.session(python="3.10") 109 | def pytype(session): 110 | """Run the static type checker pytype.""" 111 | args = session.posargs 
or ["--disable=import-error", *locations] 112 | install_with_constraints(session, "pytype") 113 | session.run("pytype", *args) 114 | 115 | 116 | @nox.session(python="3.10") 117 | def typeguard(session): 118 | """Runtime type checking during unit tests.""" 119 | args = session.posargs 120 | session.run("poetry", "install", "--only", "main", external=True) 121 | install_with_constraints(session, "pytest", "pytest-mock", "typeguard") 122 | session.run("pytest", f"--typeguard-packages={package}", *args) 123 | 124 | 125 | @nox.session(python="3.10") 126 | def coverage(session): 127 | """Upload coverage data.""" 128 | install_with_constraints(session, "coverage[toml]", "codecov") 129 | session.run("coverage", "xml", "--fail-under=0") 130 | session.run("codecov", *session.posargs) 131 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "reportseff" 3 | version = "2.8.3" 4 | description= "Tablular seff output" 5 | authors = ["Troy Comi "] 6 | license = "MIT" 7 | readme = "README.md" 8 | homepage= "https://github.com/troycomi/reportseff" 9 | repository= "https://github.com/troycomi/reportseff" 10 | keywords= ["slurm","seff"] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.9.1,<4.0" 14 | click = ">=6.7" 15 | importlib-metadata = {version = "^4.8.2", python = "<3.8"} 16 | 17 | [tool.poetry.dev-dependencies] 18 | pytest = "^8.2.2" 19 | pytest-mock = "^3.14.0" 20 | coverage = {extras = ["toml"], version = "^7.5.4"} 21 | pytest-cov = "^5.0.0" 22 | mypy = "^1.10.1" 23 | types-click = "^7.1.8" 24 | pytype = "^2024.4.11" 25 | typeguard = "^4.3.0" 26 | darglint = ">=1.8.1" 27 | codecov = ">=2.1.13" 28 | ruff = "^0.5.5" 29 | 30 | [tool.poetry.scripts] 31 | reportseff = "reportseff.console:main" 32 | 33 | [tool.poetry.group.dev-dependencies.dependencies] 34 | pip-audit = "^2.9.0" 35 | 36 | [build-system] 37 | requires = 
["poetry-core>=1.0.0"] 38 | build-backend = "poetry.core.masonry.api" 39 | 40 | # pyproject.toml 41 | [tool.coverage.paths] 42 | source = ["src", "*/site-packages"] 43 | 44 | [tool.coverage.run] 45 | branch = true 46 | source = ["reportseff"] 47 | 48 | [tool.coverage.report] 49 | show_missing = true 50 | fail_under = 100 51 | 52 | [tool.ruff.lint] 53 | select = [ 54 | "A", # builtins 55 | "ANN", # annotations 56 | "ARG", # tidy imports 57 | "B", # bugbear 58 | "BLE", 59 | "C4", # comprehensions 60 | "COM", # commas 61 | "D", # pydocstyle 62 | "E", # pycode style Error 63 | "ERA", # commented out code 64 | "F", # pyflakes 65 | "FA", # error message 66 | "FBT", # boolean trap 67 | "FURB", # refurb 68 | "I", # isort 69 | "ICN", # import conventions 70 | "INT", # gettext 71 | "ISC", # String concatenation 72 | "N", # pep8-naming 73 | "PERF", # perflint 74 | "PIE", # pie 75 | "PL", # pylint 76 | "PT", # pytest style 77 | "PTH", # use pathlib 78 | "Q", # Quotes 79 | "RET", # return 80 | "RSE", # raise 81 | "RUF", # ruff 82 | "S", # bandit 83 | "SIM", # simplify 84 | "SLF", # Self 85 | "T10", # debugger 86 | "T20", # print 87 | "TCH", # type checking 88 | "TD", # todos 89 | "TID", # tidy imports 90 | "TRY", # tryceratops 91 | "UP", # pyupgrade 92 | "W", # pycode style Warning 93 | ] 94 | 95 | ignore = [ 96 | "ANN101", # missing self 97 | "ANN401", # allow Any 98 | "COM812", # allow missing commas on last element 99 | "ISC001", # recommended by format 100 | ] 101 | 102 | 103 | [tool.ruff.lint.per-file-ignores] 104 | "tests/*" = [ 105 | "ANN", # type annotations 106 | "S101", # use assert 107 | "PLR2004", # magic numbers in tests 108 | "SLF001", # allow private methods in tests 109 | ] 110 | 111 | "noxfile.py" = [ 112 | "ANN", # type annotations 113 | ] 114 | 115 | [tool.ruff.lint.pydocstyle] 116 | convention = "google" 117 | -------------------------------------------------------------------------------- /src/reportseff/__init__.py: 
"""Tabular efficiency with reportseff."""

try:
    from importlib.metadata import PackageNotFoundError, version  # type: ignore[import]
except ImportError:  # pragma: no cover
    # Fall back to the importlib-metadata backport declared in pyproject.toml
    # for python < 3.8.  Previously this branch re-imported the stdlib
    # `importlib.metadata`, so the fallback could never succeed where the
    # primary import failed.
    from importlib_metadata import PackageNotFoundError, version  # type: ignore[import]


try:
    # Resolve the installed package version at import time.
    __version__ = version(__name__)
except PackageNotFoundError:  # pragma: no cover
    # Package is not installed (e.g. running from a source checkout).
    __version__ = "unknown"
JobID%>15 aligns job id right with max ' 42 | "width of 15 characters. Generally NAME[[%:][ALIGNMENT][WIDTH[e$]?]]. " 43 | "When an `e` or `$` is present after a width argument, " 44 | "the output will be truncated to the right." 45 | "Prefix with a + to add to the defaults. " 46 | "A single format token will suppress the header line. " 47 | "Wrap in quotes to pass a string literal, " 48 | "otherwise alignment may be misinterpreted.", 49 | ) 50 | @click.option( 51 | "--slurm-format", 52 | default="", 53 | help="Filename pattern passed to sbatch. By default, will handle " 54 | "patterns like slurm_%j.out, %x_%j, or slurm_%A_%a. In particular, the " 55 | "jobid is expected to start with '_'. Setting this to the same entry " 56 | "as used in sbatch will allow parsing slurm outputs like `1234.out`. " 57 | "Array jobs must have %A_%a to properly interface with sacct.", 58 | ) 59 | @click.option( 60 | "--debug", default=False, is_flag=True, help="Print raw db query to stderr" 61 | ) 62 | @click.option( 63 | "-u", 64 | "--user", 65 | default="", 66 | help="Ignore jobs, return all jobs in last week from user", 67 | ) 68 | @click.option( 69 | "--partition", 70 | default="", 71 | help="Only include jobs with the specified partition", 72 | ) 73 | @click.option( 74 | "-M", 75 | "--cluster", 76 | default="", 77 | help="Select specific cluster, for multi-cluster system only", 78 | ) 79 | @click.option( 80 | "--extra-args", 81 | default="", 82 | help="Extra arguments to forward to sacct", 83 | ) 84 | @click.option( 85 | "-s", "--state", default="", help="Only include jobs with the specified states" 86 | ) 87 | @click.option( 88 | "-S", "--not-state", default="", help="Include jobs without the specified states" 89 | ) 90 | @click.option( 91 | "--since", 92 | default="", 93 | help="Only include jobs after this time. Can be valid sacct " 94 | "or as a comma separated list of time deltas, e.g. d=2,h=1 " 95 | "means 2 days, 1 hour before current time. 
Weeks, days, " 96 | "hours, and minutes can use case-insensitive abbreviations. " 97 | "Minutes is the minimum resolution, while weeks is the coarsest.", 98 | ) 99 | @click.option( 100 | "--until", 101 | default="", 102 | help="Only include jobs before this time. Can be valid sacct " 103 | "or as a comma separated list of time deltas, e.g. d=2,h=1 " 104 | "means 2 days, 1 hour before current time. Weeks, days, " 105 | "hours, and minutes can use case-insensitive abbreviations. " 106 | "Minutes is the minimum resolution, while weeks is the coarsest.", 107 | ) 108 | @click.option( 109 | "--node/--no-node", 110 | "-n/-N", 111 | default=False, 112 | help="Report node-level statistics. Adds `jobid` to format for proper display.", 113 | ) 114 | @click.option( 115 | "--node-and-gpu/--no-node-gpu", 116 | "-g/-G", 117 | default=False, 118 | help=( 119 | "Report each GPU for each node. " 120 | "Sets `node` and adds `GPU` to format automatically." 121 | ), 122 | ) 123 | @click.option( 124 | "--parsable", 125 | "-p", 126 | is_flag=True, 127 | default=False, 128 | help="Output will be delmited without a delimiter at the end. " 129 | "Delimiter is by default '|', to change it see --delimiter flag.", 130 | ) 131 | @click.option( 132 | "--delimiter", 133 | "-d", 134 | default="|", 135 | help="Delimiter used for parsable output. The default default " 136 | "delimiter is '|' when --parsable is specified. 
" 137 | "This option is ignored if --parsable or -p is not specified.", 138 | ) 139 | @click.version_option(version=__version__) 140 | @click.argument("jobs", nargs=-1) 141 | def main(**kwargs: Any) -> None: 142 | """Main entry point for reportseff.""" 143 | args = ReportseffParameters(**kwargs) 144 | 145 | output, entries = get_jobs(args) 146 | 147 | if entries > MAX_ENTRIES_TO_ECHO: 148 | click.echo_via_pager(output, color=args.color) 149 | else: 150 | click.echo(output, color=args.color) 151 | 152 | 153 | def get_jobs(args: ReportseffParameters) -> tuple[str, int]: 154 | """Helper method to get jobs from db_inquirer. 155 | 156 | Returns: 157 | The string to display, tabulated and colored 158 | The number of jobs found to use paging properly 159 | 160 | Raises: 161 | Exception: if there is an error processing entries 162 | """ 163 | job_collection = JobCollection() 164 | 165 | if args.slurm_format: 166 | job_collection.set_custom_seff_format(args.slurm_format) 167 | 168 | inquirer, renderer = get_implementation( 169 | args.format_str, 170 | node=args.node, 171 | node_and_gpu=args.node_and_gpu, 172 | parsable=args.parsable, 173 | delimiter=args.delimiter, 174 | ) 175 | 176 | inquirer.set_state(args.state) 177 | inquirer.set_not_state(args.not_state) 178 | 179 | inquirer.set_since(args.since) 180 | inquirer.set_until(args.until) 181 | 182 | inquirer.set_partition(args.partition) 183 | inquirer.set_cluster(args.cluster) 184 | 185 | inquirer.set_extra_args(args.extra_args) 186 | 187 | add_jobs = False 188 | 189 | try: 190 | if args.user: 191 | inquirer.set_user(args.user) 192 | add_jobs = True 193 | elif inquirer.has_since() and not args.jobs: # since is set 194 | inquirer.all_users() 195 | add_jobs = True 196 | else: 197 | job_collection.set_jobs(args.jobs) 198 | 199 | except ValueError as error: 200 | click.secho(str(error), fg="red", err=True) 201 | sys.exit(1) 202 | 203 | job_collection.set_partition_limits(inquirer.get_partition_timelimits()) 204 | db_output = 
get_db_output( 205 | inquirer, 206 | renderer, 207 | job_collection, 208 | debug=args.debug, 209 | ) 210 | entry = None 211 | try: 212 | for entry in db_output: 213 | job_collection.process_entry(entry, add_job=add_jobs) 214 | except Exception: 215 | click.echo(f"Error processing entry: {entry}", err=True) 216 | raise 217 | 218 | found_jobs = job_collection.get_sorted_jobs(change_sort=args.modified_sort) 219 | found_jobs = [j for j in found_jobs if j.state] 220 | 221 | return renderer.format_jobs(found_jobs), len(found_jobs) 222 | 223 | 224 | def get_implementation( 225 | format_str: str, 226 | *, 227 | node: bool = False, 228 | node_and_gpu: bool = False, 229 | parsable: bool = False, 230 | delimiter: str = " ", 231 | ) -> tuple[BaseInquirer, OutputRenderer]: 232 | """Get system-specific objects. 233 | 234 | Args: 235 | format_str: the formatting options specified by user 236 | node: control if node-level stats are displayed 237 | node_and_gpu: control if node and gpu stats are displayed 238 | parsable: produce output with a delimiter separating columns 239 | delimiter: delimiter used for parsable output 240 | 241 | Returns: 242 | A db_inqurirer 243 | An output renderer 244 | """ 245 | if which("sacct") is not None: 246 | inquirer = SacctInquirer() 247 | renderer = OutputRenderer( 248 | inquirer.get_valid_formats(), 249 | RenderOptions( 250 | node=node or node_and_gpu, 251 | gpu=node_and_gpu, 252 | parsable=parsable, 253 | delimiter=delimiter, 254 | ), 255 | format_str, 256 | ) 257 | else: 258 | click.secho("No supported scheduling systems found!", fg="red", err=True) 259 | sys.exit(1) 260 | 261 | return inquirer, renderer 262 | 263 | 264 | def get_db_output( 265 | inquirer: BaseInquirer, 266 | renderer: OutputRenderer, 267 | job_collection: JobCollection, 268 | *, 269 | debug: bool, 270 | ) -> list[dict[str, str]]: 271 | """Get output from inquirer. 
272 | 273 | Returns: 274 | The db inquirer entries for the provided objects 275 | """ 276 | 277 | def print_debug(info: str) -> None: 278 | click.echo(info, err=True) 279 | 280 | debug_cmd = None 281 | if debug: 282 | debug_cmd = print_debug 283 | 284 | try: 285 | result = inquirer.get_db_output( 286 | renderer.query_columns, job_collection.get_jobs(), debug_cmd 287 | ) 288 | except RuntimeError as error: 289 | click.secho(str(error), fg="red", err=True) 290 | sys.exit(1) 291 | 292 | return result 293 | -------------------------------------------------------------------------------- /src/reportseff/db_inquirer.py: -------------------------------------------------------------------------------- 1 | """Abstract and concrete implementations of scheduler databases.""" 2 | 3 | from __future__ import annotations 4 | 5 | import datetime 6 | import re 7 | import shlex 8 | import subprocess 9 | from abc import ABC, abstractmethod 10 | from typing import Callable 11 | 12 | import click 13 | 14 | 15 | class BaseInquirer(ABC): 16 | """Abstract interface for inquiring different schedulers.""" 17 | 18 | @abstractmethod 19 | def __init__(self) -> None: 20 | """Initialize a new inquirer.""" 21 | 22 | @abstractmethod 23 | def get_valid_formats(self) -> list[str]: 24 | """Get the valid formatting options supported by the inquirer. 25 | 26 | Returns: 27 | List of valid format options 28 | """ 29 | 30 | @abstractmethod 31 | def set_sacct_args(self, jobs: list[str]) -> list[str]: 32 | """Set arguments of sacct query. 33 | 34 | Args: 35 | jobs: list of job names 36 | 37 | Returns: 38 | String of sacct arguments 39 | 40 | Raises: 41 | RuntimeError: if sacct doesn't return properly 42 | """ 43 | 44 | @abstractmethod 45 | def get_db_output( 46 | self, 47 | columns: list[str], 48 | jobs: list[str], 49 | debug_cmd: Callable | None, 50 | ) -> list[dict[str, str]]: 51 | """Query the database with the supplied columns. 
52 | 53 | Args: 54 | columns: validated format names as strings 55 | jobs: list of job names 56 | debug_cmd: If specified, the raw output will passed to this function 57 | 58 | Returns: 59 | List of rows, where each row is a dictionary 60 | with the columns as keys and entries as values 61 | Output order is not garunteed to match the jobs list 62 | 63 | """ 64 | 65 | @abstractmethod 66 | def set_user(self, user: str) -> None: 67 | """Set the collection of jobs based on the provided user. 68 | 69 | Args: 70 | user: user name 71 | """ 72 | 73 | @abstractmethod 74 | def set_partition(self, partition: str) -> None: 75 | """Set the collection of jobs based on the provided partition. 76 | 77 | Args: 78 | partition: partition name 79 | """ 80 | 81 | @abstractmethod 82 | def set_cluster(self, cluster: str) -> None: 83 | """Set the collection of jobs based on the provided cluster. 84 | 85 | Args: 86 | cluster: cluster name 87 | """ 88 | 89 | @abstractmethod 90 | def set_extra_args(self, extra_args: str) -> None: 91 | """Set extra arguments to be forwarded to sacct. 92 | 93 | Args: 94 | extra_args: list of arguments 95 | """ 96 | 97 | @abstractmethod 98 | def all_users(self) -> None: 99 | """Ignore provided jobs, query for all users.""" 100 | 101 | @abstractmethod 102 | def set_state(self, state: str) -> None: 103 | """Set the state to include output jobs. 104 | 105 | Args: 106 | state: comma separated list of state names or codes 107 | """ 108 | 109 | @abstractmethod 110 | def set_not_state(self, state: str) -> None: 111 | """Set the state to exclude from output jobs. 112 | 113 | Args: 114 | state: comma separated list of state names or codes 115 | """ 116 | 117 | @abstractmethod 118 | def parse_date(self, d: str) -> str: 119 | """Parse and convert custom string date format. 120 | 121 | Args: 122 | d: the string of date. 
123 | 124 | Returns: 125 | converted string of date 126 | """ 127 | 128 | @abstractmethod 129 | def set_until(self, until: str) -> None: 130 | """Set the filter for time of jobs to consider. 131 | 132 | Args: 133 | until: the string for filtering. If specified as time=amount 134 | will subtract that amount from the current time 135 | """ 136 | 137 | @abstractmethod 138 | def set_since(self, since: str) -> None: 139 | """Set the filter for time of jobs to consider. 140 | 141 | Args: 142 | since: the string for filtering. If specified as time=amount 143 | will subtract that amount from the current time 144 | """ 145 | 146 | @abstractmethod 147 | def has_since(self) -> bool: 148 | """Tests if `since` has been set. 149 | 150 | Returns: 151 | True if set_since has been called on this inquirer 152 | """ 153 | 154 | @abstractmethod 155 | def get_partition_timelimits(self) -> dict: 156 | """Get partition time limits. 157 | 158 | Returns: 159 | dict mapping partition names to maximum timelimits. 160 | """ 161 | 162 | 163 | class SacctInquirer(BaseInquirer): 164 | """Implementation of BaseInquirer for the sacct slurm function.""" 165 | 166 | def __init__(self) -> None: 167 | """Initialize a new inquirer.""" 168 | self.default_args = "sacct --parsable -n --delimiter=^|^".split() 169 | self.user: str | None = None 170 | self.state: set | None = None 171 | self.not_state: set | None = None 172 | self.since: str | None = None 173 | self.until: str | None = None 174 | self.query_all_users: bool = False 175 | self.partition: str | None = None 176 | self.cluster: str | None = None 177 | self.extra_args: str | None = None 178 | 179 | def get_valid_formats(self) -> list[str]: 180 | """Get the valid formatting options supported by the inquirer. 
def set_sacct_args(self, jobs: list[str]) -> list[str]:
    """Build the argument list for an sacct query from the current settings.

    Args:
        jobs: job ids to request when neither a user nor all-users query
            is active

    Returns:
        List of sacct command-line arguments.
    """
    args = []
    if self.user:
        # Default to the last week of jobs when no explicit start was given.
        if not self.since:
            week_ago = datetime.date.today() - datetime.timedelta(days=7)
            self.since = week_ago.strftime("%m%d%y")  # MMDDYY
        args.append(f"--user={self.user}")
    elif self.query_all_users:
        args.append("--allusers")
    else:
        args.append("--jobs=" + ",".join(jobs))

    # Optional filters, emitted only when set, in sacct's expected order.
    optional_flags = (
        ("--starttime", self.since),
        ("--partition", self.partition),
        ("--cluster", self.cluster),
        ("--endtime", self.until),
    )
    args.extend(f"{flag}={value}" for flag, value in optional_flags if value)

    if self.extra_args:
        # Forward verbatim user-supplied arguments, split shell-style.
        args.extend(shlex.split(self.extra_args))
    return args
242 | 243 | Args: 244 | columns: validated format names as strings 245 | jobs: list of job names 246 | debug_cmd: If specified, the raw output will passed to this function 247 | 248 | Returns: 249 | List of rows, where each row is a dictionary 250 | with the columns as keys and entries as values 251 | Output order is not guaranteed to match the jobs list 252 | 253 | Raises: 254 | RuntimeError: if sacct doesn't return properly 255 | """ 256 | args = [*self.default_args, "--format=" + ",".join(columns)] 257 | args += self.set_sacct_args(jobs) 258 | try: 259 | cmd_result = subprocess.run( 260 | args=args, 261 | stdout=subprocess.PIPE, 262 | encoding="utf8", 263 | check=True, 264 | text=True, 265 | shell=False, 266 | ) 267 | cmd_result.check_returncode() 268 | 269 | except subprocess.CalledProcessError as error: 270 | msg = f"Error running sacct!\n{error.stderr}" 271 | raise RuntimeError(msg) from error 272 | 273 | sacct_line_split = re.compile(r"\^\|\^\n") 274 | # convert newlines to printable \n 275 | lines = [ 276 | line.replace("\n", "\\n") 277 | for line in sacct_line_split.split(cmd_result.stdout) 278 | ] 279 | if debug_cmd is not None: 280 | debug_cmd("\n".join(line.replace("\n", "\\n") for line in lines)) 281 | 282 | sacct_split = re.compile(r"\^\|\^") 283 | result = [dict(zip(columns, sacct_split.split(line))) for line in lines if line] 284 | 285 | # Sometimes the main job has a different state than the sub jobs 286 | # e.g. timeouts have a state of canceled for the batch jobs. 287 | # When state filtering is active, need to filter main ids, then retain 288 | # only the jobs with matching job ids 289 | if self.state or self.not_state: 290 | main_jobs = [r for r in result if "." 
not in r["JobID"]] 291 | if self.state: 292 | # split to get first word in entries like "CANCELLED BY X" 293 | main_jobs = [ 294 | r for r in main_jobs if r["State"].split()[0] in self.state 295 | ] 296 | 297 | if self.not_state: 298 | # split to get first word in entries like "CANCELLED BY X" 299 | main_jobs = [ 300 | r for r in main_jobs if r["State"].split()[0] not in self.not_state 301 | ] 302 | 303 | main_job_ids = {r["JobID"] for r in main_jobs} 304 | result = [r for r in result if r["JobID"].split(".")[0] in main_job_ids] 305 | 306 | return result 307 | 308 | def set_user(self, user: str) -> None: 309 | """Set the collection of jobs based on the provided user. 310 | 311 | Args: 312 | user: user name 313 | """ 314 | self.user = user 315 | 316 | def set_partition(self, partition: str) -> None: 317 | """Set the collection of jobs based on the provided partition. 318 | 319 | Args: 320 | partition: partition name 321 | """ 322 | self.partition = partition 323 | 324 | def set_cluster(self, cluster: str) -> None: 325 | """Set the specific cluster in multi-cluster environment. 326 | 327 | Args: 328 | cluster: cluster name 329 | """ 330 | self.cluster = cluster 331 | 332 | def set_extra_args(self, extra_args: str) -> None: 333 | """Set extra arguments to be forwarded to sacct. 334 | 335 | Args: 336 | extra_args: list of arguments 337 | """ 338 | self.extra_args = extra_args 339 | 340 | def all_users(self) -> None: 341 | """Query for all users if `since` is set.""" 342 | self.query_all_users = True 343 | 344 | def set_state(self, state: str) -> None: 345 | """Set the state to include output jobs. 
def parse_date(self, d: str) -> str:
    """Parse and convert custom string date format.

    Args:
        d: comma separated list of time deltas, e.g. ``d=2,h=1`` meaning
            2 days and 1 hour before the current time.  Accepted units are
            weeks, days, hours, and minutes, with case-insensitive
            single-letter abbreviations.  Malformed or unknown entries
            are silently ignored.

    Returns:
        The resulting absolute time as a string formatted for sacct,
        ``YYYY-MM-DDTHH:MM``.
    """
    # Lowercase first so a single-case table covers both abbreviations.
    abbrev_to_key = {
        "w": "weeks",
        "d": "days",
        "h": "hours",
        "m": "minutes",
    }
    valid_args = ("weeks", "days", "hours", "minutes")
    date_args = {}

    for arg in d.split(","):
        if "=" not in arg:
            continue  # skip entries without an amount

        toks = arg.split("=")
        key = toks[0].lower()
        key = abbrev_to_key.get(key, key)

        if key in valid_args:
            try:
                date_args[key] = int(toks[1])
            except ValueError:
                continue  # non-integer amounts are ignored

    date = datetime.datetime.today()
    date -= datetime.timedelta(**date_args)
    # sacct-compatible timestamp, e.g. 2024-01-31T13:45
    return date.strftime("%Y-%m-%dT%H:%M")
422 | 423 | Args: 424 | until: the string for filtering. If specified as time=amount 425 | will subtract that amount from the current time 426 | """ 427 | if not until: 428 | return 429 | if "=" in until: # handle custom format 430 | self.until = self.parse_date(until) 431 | else: 432 | self.until = until 433 | 434 | def set_since(self, since: str) -> None: 435 | """Set the filter for time of jobs to consider. 436 | 437 | Args: 438 | since: the string for filtering. If specified as time=amount 439 | will subtract that amount from the current time 440 | """ 441 | if not since: 442 | return 443 | if "=" in since: # handle custom format 444 | self.since = self.parse_date(since) 445 | else: 446 | self.since = since 447 | 448 | def has_since(self) -> bool: 449 | """Check if since has been set. 450 | 451 | Returns: 452 | True if since has been set properly 453 | """ 454 | return bool(self.since) 455 | 456 | def get_partition_timelimits(self) -> dict: 457 | """Get partition time limits. 458 | 459 | Returns: 460 | dict mapping partition names to maximum timelimits. 461 | 462 | Raises: 463 | RuntimeError: if scontrol raises an error 464 | """ 465 | args = "" 466 | if self.cluster: 467 | args = f"--cluster {self.cluster}" 468 | 469 | command_args = f"scontrol {args} show partition".split() 470 | cmd_result = subprocess.run( 471 | args=command_args, 472 | stdout=subprocess.PIPE, 473 | encoding="utf8", 474 | check=True, 475 | text=True, 476 | shell=False, 477 | ) 478 | if cmd_result.returncode != 0: 479 | msg = "Error retrieving information from scontrol" 480 | raise RuntimeError(msg) 481 | 482 | partition_name = re.compile(r"^PartitionName=(?P\S+)$") 483 | time_limit = re.compile(r"MaxTime=(?P