├── .coveragerc
├── .flake8
├── .gitignore
├── .gitlab-ci.yml
├── .vscode
│   ├── launch.json
│   └── settings.json
├── LICENSE
├── README.md
├── coverage_report.sh
├── docs
│   ├── Makefile
│   ├── make.bat
│   └── source
│       ├── bsub.rst
│       ├── conf.py
│       ├── index.rst
│       ├── job.rst
│       ├── pbs.rst
│       ├── pbs_batch.rst
│       └── scripts.rst
├── examples
│   ├── basic
│   │   ├── launch.py
│   │   └── nas.py
│   ├── batch_no_limit
│   │   └── launch.py
│   ├── batch_with_job_limit
│   │   └── launch.py
│   ├── dependency_chain
│   │   └── launch.py
│   ├── hybrid_openmp_mpi
│   │   └── launch.py
│   └── job_array
│       └── write_pbs_file.py
├── pbs4py
│   ├── __init__.py
│   ├── bsub.py
│   ├── directory_utils.py
│   ├── fake_pbs.py
│   ├── job.py
│   ├── launcher_base.py
│   ├── pbs.py
│   ├── pbs_batch.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── job_dir.py
│   │   └── qdel_user_jobs.py
│   └── slurm.py
├── pyproject.toml
└── tests
    ├── job_test
    │   └── empty_file
    ├── pbs_test_files
    │   ├── golden0.lsf
    │   ├── golden0.pbs
    │   └── golden0.slurm
    ├── test_bsub.py
    ├── test_bsub_regression.py
    ├── test_fake_pbs.py
    ├── test_job.py
    ├── test_launch_base.py
    ├── test_output_files
    │   └── .empty
    ├── test_pbs.py
    ├── test_pbs_batch.py
    ├── test_pbs_batch_job.py
    ├── test_pbs_header.py
    ├── test_pbs_regression.py
    ├── test_slurm_header.py
    ├── test_slurm_regression.py
    └── testing_bashrc

--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source=pbs4py
3 | omit=pbs4py/directory_utils.py
4 |     pbs4py/scripts/*.py
5 | 
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | 
4 | extend-ignore =
5 |     # black
6 |     E203,
7 |     # too many leading '#' for block comment
8 |     E266,
9 |     # expected 2 blank lines, found 1
10 |     E302,
11 |     # do not use mutable data structures for argument defaults (too many false positives)
12 |     B006,
13 |     # ===== TODO: to be fixed:
14 |     # invalid escape sequence, necessary for sphinx directives in docstrings but should switch to raw string
15 |     W605,
16 |     # line length, exceeded by some docstrings
17 |     E501,
18 |     # Function definition does not bind loop variable, happens everywhere in our code
19 |     B023,
20 |     # pydocstyle
21 |     D
22 | 
23 | # Only add patterns here that are not included by the defaults of flake8 or other plugins
24 | # extend-select =
25 | 
26 | # flake8-docstrings
27 | docstring-convention = numpy
28 | 
29 | # flake8-rst-docstrings
30 | rst-roles =
31 |     class,
32 |     func,
33 |     ref,
34 |     meth,
35 | 
36 | rst-directives =
37 |     # Custom directives defined in the sphinx_mdolab_theme
38 |     embed-compare,
39 |     embed-bibtex,
40 |     embed-code,
41 |     embed-shell-cmd,
42 |     embed-n2,
43 | 
44 | # mccabe complexity
45 | # max-complexity = 10
46 | 
47 | # ignored files/directories
48 | # we use exclude here and extend-exclude in repo-specific config files
49 | # so that we can pass both to flake8 directly without needing to merge them first
50 | exclude =
51 |     # No need to traverse the git directory
52 |     .git,
53 |     # There's no value in checking cache directories
54 |     __pycache__,
55 |     # The conf file is mostly autogenerated, ignore it
56 |     doc/conf.py,
57 |     # No need for init and setup files
58 |     __init__.py,
59 |     setup.py,
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *pyc
2 | *swp
3 | *html
4 | *egg
5 | *egg-info
6 | build
7 | .vscode/.ropeproject/objectdb
8 | .vscode/.ropeproject/config.py
9 |
tests/test_output_files 10 | htmlcov 11 | .coverage 12 | .DS_Store 13 | sample*txt 14 | 15 | *pbs 16 | *log 17 | *out 18 | cov.xml 19 | docs/source/_build 20 | *tar.gz 21 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - checkout 3 | - test 4 | - docs 5 | - deploy 6 | 7 | variables: 8 | user_name: fun3d 9 | build_machine: k4 10 | build_directory: /hpnobackup2/fun3d/component-ci 11 | project_repo: git@gitlab.larc.nasa.gov:kejacob1/pbs4py.git 12 | project_name: pbs4py 13 | build_tag: pbs4py-$CI_PIPELINE_ID 14 | project_dir: ${build_directory}/pbs4py-$CI_PIPELINE_ID/${project_name} 15 | 16 | 17 | checkout_branch: 18 | only: 19 | refs: 20 | - main 21 | - merge_requests 22 | stage: checkout 23 | variables: 24 | branch_name: $CI_COMMIT_REF_NAME 25 | checkout_sha: $CI_COMMIT_SHA 26 | tags: 27 | - gitlab_runner 28 | script: 29 | - echo $PWD 30 | - hostname 31 | - whoami 32 | - "ssh -o LogLevel=error ${user_name}@${build_machine} \"cd ${build_directory} && mkdir -p ${build_tag} 33 | && cd ${build_tag} && git clone ${project_repo} ${project_name} \" " 34 | - "ssh -o LogLevel=error ${user_name}@${build_machine} \"cd ${project_dir} 35 | && git checkout ${branch_name} && git checkout ${checkout_sha} \" " 36 | 37 | unit_tests: 38 | stage: test 39 | only: 40 | - merge_requests 41 | tags: 42 | - gitlab_runner 43 | script: 44 | - "ssh -o LogLevel=error ${user_name}@${build_machine} \"cd ${project_dir} 45 | && module load Python_3.7.1 && setenv PYTHONPATH `pwd`:$PYTHONPATH && pytest -vs \" " 46 | 47 | test_doc_build: 48 | stage: docs 49 | only: 50 | - merge_requests 51 | tags: 52 | - gitlab_runner 53 | script: 54 | - "ssh -o LogLevel=error ${user_name}@${build_machine} \"cd ${project_dir} 55 | && module load Python_3.7.1 && setenv PYTHONPATH `pwd`:$PYTHONPATH 56 | && setenv PATH /u/fun3d/.local/bin:$PATH && cd docs 57 | && make html SPHINXOPTS='-W --keep-going' 58 | && cd build && tar zcvf pbs4py_html.tgz html \" " 59 | 60 | build_docs: 61 | stage: docs 62 | only: 63 | refs: 64 | - main 65 | tags: 66 | - gitlab_runner 67 | script: 68 | - "ssh -o LogLevel=error ${user_name}@${build_machine} \"cd ${project_dir} 69 | && module load Python_3.7.1 && setenv PYTHONPATH `pwd`:$PYTHONPATH 70 | && setenv PATH /u/fun3d/.local/bin:$PATH && cd docs && make html 71 | && cd build && tar zcvf pbs4py_html.tgz html \" " 72 | after_script: 73 | - "scp -q ${user_name}@${build_machine}:${project_dir}/docs/build/pbs4py_html.tgz . || true" 74 | artifacts: 75 | paths: 76 | - pbs4py_html.tgz 77 | expire_in: 1 week 78 | 79 | pages: 80 | stage: deploy 81 | only: 82 | refs: 83 | - main 84 | tags: 85 | - gitlab_runner 86 | script: 87 | - rm -rf public 88 | - tar xzvf pbs4py_html.tgz 89 | - mv html public 90 | artifacts: 91 | paths: 92 | - public 93 | expire_in: 1 week 94 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "cwd": "${fileDirname}" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.defaultFormatter": "ms-python.autopep8", 4 | "editor.formatOnSave": true, 5 | }, 6 | "flake8.args": [ 7 | "--config", 8 | ".flake8" 9 | ], 10 | "autopep8.args": [ 11 | "--max-line-length", 12 | "100", 13 | "--experimental" 14 | ], 15 | "python.testing.pytestArgs": [ 16 | "tests" 17 | ], 18 | "python.testing.unittestEnabled": false, 19 | "python.testing.pytestEnabled": true 20 | } 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | NASA OPEN SOURCE AGREEMENT VERSION 1.3 2 | 3 | THIS OPEN SOURCE AGREEMENT (“AGREEMENT”) DEFINES THE RIGHTS OF USE, REPRODUCTION, DISTRIBUTION, MODIFICATION AND REDISTRIBUTION OF CERTAIN COMPUTER SOFTWARE ORIGINALLY RELEASED BY THE UNITED STATES GOVERNMENT AS REPRESENTED BY THE GOVERNMENT AGENCY LISTED BELOW ("GOVERNMENT AGENCY"). THE UNITED STATES GOVERNMENT, AS REPRESENTED BY GOVERNMENT AGENCY, IS AN INTENDED THIRD-PARTY BENEFICIARY OF ALL SUBSEQUENT DISTRIBUTIONS OR REDISTRIBUTIONS OF THE SUBJECT SOFTWARE. ANYONE WHO USES, REPRODUCES, DISTRIBUTES, MODIFIES OR REDISTRIBUTES THE SUBJECT SOFTWARE, AS DEFINED HEREIN, OR ANY PART THEREOF, IS, BY THAT ACTION, ACCEPTING IN FULL THE RESPONSIBILITIES AND OBLIGATIONS CONTAINED IN THIS AGREEMENT. 4 | 5 | Government Agency: National Aeronautics and Space Administration 6 | Government Agency Original Software Designation: LAR-20193-1 7 | Government Agency Original Software Title: pbs4py - python utilities for submitting jobs on high performance computing systems 8 | Government Agency Point of Contact for Original Software: kevin.e.jacobson@nasa.gov 9 | 10 | 11 | 1. DEFINITIONS 12 | 13 | A. “Contributor” means Government Agency, as the developer of the Original Software, and any entity that makes a Modification. 14 | B. “Covered Patents” mean patent claims licensable by a Contributor that are necessarily infringed by the use or sale of its Modification alone or when combined with the Subject Software. 15 | C. “Display” means the showing of a copy of the Subject Software, either directly or by means of an image, or any other device. 16 | D. “Distribution” means conveyance or transfer of the Subject Software, regardless of means, to another. 17 | E. “Larger Work” means computer software that combines Subject Software, or portions thereof, with software separate from the Subject Software that is not governed by the terms of this Agreement. 18 | F. “Modification” means any alteration of, including addition to or deletion from, the substance or structure of either the Original Software or Subject Software, and includes derivative works, as that term is defined in the Copyright Statute, 17 USC 101. However, the act of including Subject Software as part of a Larger Work does not in and of itself constitute a Modification. 19 | G. 
“Original Software” means the computer software first released under this Agreement by Government Agency with Government Agency designation LAR-20193-1 and entitled pbs4py - python utilities for submitting jobs on high performance computing systems including source code, object code and accompanying documentation, if any. 20 | H. “Recipient” means anyone who acquires the Subject Software under this Agreement, including all Contributors. 21 | I. “Redistribution” means Distribution of the Subject Software after a Modification has been made. 22 | J. “Reproduction” means the making of a counterpart, image or copy of the Subject Software. 23 | K. “Sale” means the exchange of the Subject Software for money or equivalent value. 24 | L. “Subject Software” means the Original Software, Modifications, or any respective parts thereof. 25 | M. “Use” means the application or employment of the Subject Software for any purpose. 26 | 27 | 2. GRANT OF RIGHTS 28 | 29 | A. Under Non-Patent Rights: Subject to the terms and conditions of this Agreement, each Contributor, with respect to its own contribution to the Subject Software, hereby grants to each Recipient a non-exclusive, world-wide, royalty-free license to engage in the following activities pertaining to the Subject Software: 30 | 31 | 1. Use 32 | 2. Distribution 33 | 3. Reproduction 34 | 4. Modification 35 | 5. Redistribution 36 | 6. Display 37 | 38 | B. Under Patent Rights: Subject to the terms and conditions of this Agreement, each Contributor, with respect to its own contribution to the Subject Software, hereby grants to each Recipient under Covered Patents a non-exclusive, world-wide, royalty-free license to engage in the following activities pertaining to the Subject Software: 39 | 40 | 1. Use 41 | 2. Distribution 42 | 3. Reproduction 43 | 4. Sale 44 | 5. Offer for Sale 45 | 46 | C. The rights granted under Paragraph B. also apply to the combination of a Contributor’s Modification and the Subject Software if, at the time the Modification is added by the Contributor, the addition of such Modification causes the combination to be covered by the Covered Patents. It does not apply to any other combinations that include a Modification. 47 | 48 | D. The rights granted in Paragraphs A. and B. allow the Recipient to sublicense those same rights. Such sublicense must be under the same terms and conditions of this Agreement. 49 | 50 | 3. OBLIGATIONS OF RECIPIENT 51 | 52 | A. Distribution or Redistribution of the Subject Software must be made under this Agreement except for additions covered under paragraph 3H. 53 | 54 | 1. Whenever a Recipient distributes or redistributes the Subject Software, a copy of this Agreement must be included with each copy of the Subject Software; and 55 | 2. If Recipient distributes or redistributes the Subject Software in any form other than source code, Recipient must also make the source code freely available, and must provide with each copy of the Subject Software information on how to 56 | obtain the source code in a reasonable manner on or through a medium customarily used for software exchange. 57 | 58 | B. Each Recipient must ensure that the following copyright notice appears prominently in the Subject Software: 59 | This software may be used, reproduced, and provided to others only as permitted under the terms of the agreement under which it was acquired from the U.S. Government. 60 | Neither title to, nor ownership of, the software is hereby transferred. This notice shall remain on all copies of the software. 
61 | 62 | Copyright 2022 United States Government as represented by the Administrator of the National Aeronautics and Space Administration. No copyright is claimed in the United States under Title 17, U.S. Code. All Other Rights Reserved. 63 | 64 | Third Party Software: 65 | 66 | This software calls the following third party software, which is subject to the terms and conditions of its licensor, as applicable at the time of licensing. Third party software is not bundled with this software, but may be available from the licensor. License hyperlinks are provided here for information purposes only: numpy, BSD 3-Clause "New" or "Revised" License, https://github.com/numpy/numpy/blob/main/LICENSE.txt 67 | Third Party Software: 68 | This software derives analyses using Google Earth Engine's (GEE’s) free and publicly accessible data catalog. GEE is not bundled with this software, but users of this software must obtain their own account at code.earthengine.google.com, which is subject to the terms and conditions of its licensor, as applicable at the time of licensing. License hyperlink is provided here for information purposes only: https://earthengine.google.com/terms/. 69 | 70 | C. Each Contributor must characterize its alteration of the Subject Software as a Modification and must identify itself as the originator of its Modification in a manner that reasonably allows subsequent Recipients to identify the originator of the Modification. In fulfillment of these requirements, Contributor must include a file (e.g., a change log file) that describes the alterations made and the date of the alterations, identifies Contributor as originator of the alterations, and consents to characterization of the alterations as a Modification, for example, by including a statement that the Modification is derived, directly or indirectly, from Original Software provided by Government Agency. Once consent is granted, it may not thereafter be revoked. 71 | 72 | D. A Contributor may add its own copyright notice to the Subject Software. Once a copyright notice has been added to the Subject Software, a Recipient may not remove it without the express permission of the Contributor who added the notice. 73 | 74 | E. A Recipient may not make any representation in the Subject Software or in any promotional, advertising or other material that may be construed as an endorsement by Government Agency or by any prior Recipient of any product or service provided by Recipient, or that may seek to obtain commercial advantage by the fact of Government Agency's or a prior Recipient’s participation in this Agreement. 75 | 76 | F. In an effort to track usage and maintain accurate records of the Subject Software, each Recipient, upon receipt of the Subject Software, is requested to provide Government Agency, by e-mail to the Government Agency Point of Contact listed in clause 5.F., the following information: First and Last Name; Email Address; and Affiliation. Recipient’s name and personal information shall be used for statistical purposes only. Once a Recipient makes a Modification available, it is requested that the Recipient 77 | inform Government Agency, by e-mail to the Government Agency Point of Contact listed in clause 5.F., how to access the Modification. 78 | 79 | G. 
Each Contributor represents that that its Modification is believed to be Contributor’s original creation and does not violate any existing agreements, regulations, statutes or rules, and further that Contributor has sufficient rights to grant the rights conveyed by this Agreement. 80 | 81 | H. A Recipient may choose to offer, and to charge a fee for, warranty, support, indemnity and/or liability obligations to one or more other Recipients of the Subject Software. A Recipient may do so, however, only on its own behalf and not on behalf of Government Agency or any other Recipient. Such a Recipient must make it absolutely clear that any such warranty, support, indemnity and/or liability obligation is offered by that Recipient alone. Further, such Recipient agrees to indemnify Government Agency and every other Recipient for any liability incurred by them as a result of warranty, support, indemnity and/or liability offered by such Recipient. 82 | 83 | I. A Recipient may create a Larger Work by combining Subject Software with separate software not governed by the terms of this agreement and distribute the Larger Work as a single product. In such case, the Recipient must make sure Subject Software, or portions thereof, included in the Larger Work is subject to this Agreement. 84 | 85 | J. Notwithstanding any provisions contained herein, Recipient is hereby put on notice that export of any goods or technical data from the United States may require some form of export license from the U.S. Government. Failure to obtain necessary export licenses may result in criminal liability under U.S. laws. Government Agency neither represents that a license shall not be required nor that, if required, it shall be issued. Nothing granted herein provides any such export license. 86 | 87 | 4. DISCLAIMER OF WARRANTIES AND LIABILITIES; WAIVER AND INDEMNIFICATION 88 | 89 | A. No Warranty: THE SUBJECT SOFTWARE IS PROVIDED “AS IS” WITHOUT ANY WARRANTY OF ANY KIND, EITHER EXPRESSED, IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR FREEDOM FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR FREE, OR ANY WARRANTY THAT DOCUMENTATION, IF PROVIDED, WILL CONFORM TO THE SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, RESULTING DESIGNS, HARDWARE, SOFTWARE PRODUCTS OR ANY OTHER APPLICATIONS RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF PRESENT IN THE ORIGINAL SOFTWARE, AND DISTRIBUTES IT “AS IS.” 90 | B. Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT. IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE RESULTS IN ANY LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, INCLUDING ANY DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE OF THE SUBJECT SOFTWARE, RECIPIENT SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH MATTER SHALL BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT. 91 | 92 | 5. GENERAL TERMS 93 | 94 | A. 
Termination: This Agreement and the rights granted hereunder will terminate automatically if a Recipient fails to comply with these terms and conditions, and fails to cure such noncompliance within thirty (30) days of becoming aware of such noncompliance. Upon termination, a Recipient agrees to immediately cease use and distribution of the Subject Software. All sublicenses to the Subject Software properly granted by the breaching Recipient shall survive any such termination of this Agreement.
95 | 
96 | B. Severability: If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement.
97 | 
98 | C. Applicable Law: This Agreement shall be subject to United States federal law only for all purposes, including, but not limited to, determining the validity of this Agreement, the meaning of its provisions and the rights, obligations and remedies of the parties.
99 | 
100 | D. Entire Understanding: This Agreement constitutes the entire understanding and agreement of the parties relating to release of the Subject Software and may not be superseded, modified or amended except by further written agreement duly executed by the parties.
101 | 
102 | E. Binding Authority: By accepting and using the Subject Software under this Agreement, a Recipient affirms its authority to bind the Recipient to all terms and conditions of this Agreement and that that Recipient hereby agrees to all terms and conditions herein.
103 | 
104 | F. Point of Contact: Any Recipient contact with Government Agency is to be directed to the designated representative as follows:
105 | 
106 | Maxine Saunders
107 | Software Release Authority
108 | MS 020, NASA Langley Research Center Hampton, VA 23681
109 | Phone: 757-864-2025
110 | 
111 | Email: larc-sra@mail.nasa.gov
112 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Description
2 | 
3 | pbs4py is a Python module for automating submission of compute jobs on High Performance
4 | Computing clusters, such as those that use the Portable Batch System (PBS).
5 | It includes pre-configured launchers for common NASA HPC systems: the Langley K cluster
6 | and NASA Advanced Supercomputing (NAS) systems.
7 | 
8 | Example uses are uncertainty quantification, where many jobs are submitted
9 | simultaneously, and optimization, where sequences of jobs need to be scripted.
10 | 
11 | pbs4py also includes scripts for performing tasks associated with PBS jobs,
12 | such as a script that, given a job number, prints the directory from which the job was launched,
13 | and a script that can delete multiple jobs based on filters.
14 | 
15 | 
16 | # Documentation
17 | [Documentation is hosted using Github Pages](https://nasa.github.io/pbs4py/)
18 | 
19 | The pbs4py documentation is generated from the source code with [Sphinx](https://www.sphinx-doc.org/en/master/).
20 | Once you have installed pbs4py, the documentation is built by running `make html` in the docs directory.
21 | The generated documentation will be in `docs/build/html`.
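For example, assuming Sphinx and pbs4py are already installed:

```
cd docs
make html
```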
22 | 23 | # Installation 24 | pbs4py can be installed with 25 | 26 | ``` 27 | pip install pbs4py 28 | ``` 29 | 30 | # Quick Start 31 | 32 | After installation, 33 | 34 | On the K cluster: 35 | ```python 36 | from pbs4py import PBS 37 | pbs = PBS.k4() 38 | pbs.requested_number_of_nodes = 1 39 | pbs.launch(job_name='example_job',job_body=['echo "Hello World"']) 40 | ``` 41 | 42 | On NAS: 43 | ```python 44 | from pbs4py import PBS 45 | group = 'a1111' # your project ID to charge here 46 | pbs = PBS.nas(group, proc_type='san', queue='devel', time=1) 47 | pbs.launch(job_name='example_job',job_body=['echo "Hello World"']) 48 | ``` 49 | 50 | # License Notices and Disclaimers 51 | Notices: 52 | Copyright 2022 United States Government as represented by the Administrator of 53 | the National Aeronautics and Space Administration. No copyright is claimed in 54 | the United States under Title 17, U.S. Code. All Other Rights Reserved. 55 | 56 | Third Party Software: 57 | 58 | This software calls the following third party software, which is subject to the 59 | terms and conditions of its licensor, as applicable at the time of licensing. 60 | Third party software is not bundled with this software, but may be available 61 | from the licensor. License hyperlinks are provided here for information purposes 62 | only: numpy, BSD 3-Clause "New" or "Revised" License, 63 | https://github.com/numpy/numpy/blob/main/LICENSE.txt. 64 | 65 | Disclaimers 66 | No Warranty: THE SUBJECT SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY OF 67 | ANY KIND, EITHER EXPRESSED, IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED 68 | TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM TO SPECIFICATIONS, ANY 69 | IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR 70 | FREEDOM FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR 71 | FREE, OR ANY WARRANTY THAT DOCUMENTATION, IF PROVIDED, WILL CONFORM TO THE 72 | SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN 73 | ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, 74 | RESULTING DESIGNS, HARDWARE, SOFTWARE PRODUCTS OR ANY OTHER APPLICATIONS 75 | RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY 76 | DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF 77 | PRESENT IN THE ORIGINAL SOFTWARE, AND DISTRIBUTES IT "AS IS." 78 | 79 | Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE 80 | UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY 81 | PRIOR RECIPIENT. IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE RESULTS IN ANY 82 | LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, 83 | INCLUDING ANY DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE 84 | OF THE SUBJECT SOFTWARE, RECIPIENT SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED 85 | STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR 86 | RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH 87 | MATTER SHALL BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT. 
88 | -------------------------------------------------------------------------------- /coverage_report.sh: -------------------------------------------------------------------------------- 1 | pytest --cov-report xml:cov.xml --cov pbs4py 2 | coverage report -m 3 | coverage html 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/bsub.rst: -------------------------------------------------------------------------------- 1 | .. _bsub_section: 2 | 3 | BSUB Job Launcher 4 | %%%%%%%%%%%%%%%%% 5 | 6 | The BSUB class is a tool to write and launch job scripts on DOE HPC systems. 7 | 8 | 9 | BSUB Class 10 | ========== 11 | 12 | .. automodule:: pbs4py.bsub 13 | 14 | .. autoclass:: BSUB 15 | :members: 16 | :inherited-members: 17 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('../../'))
16 | 
17 | 
18 | # -- Project information -----------------------------------------------------
19 | 
20 | project = 'pbs4py'
21 | copyright = '2021, NASA'
22 | author = 'NASA'
23 | 
24 | 
25 | # -- General configuration ---------------------------------------------------
26 | 
27 | # Add any Sphinx extension module names here, as strings. They can be
28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
29 | # ones.
30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon',
31 |               'sphinx_autodoc_typehints', 'sphinxarg.ext']
32 | 
33 | # Add any paths that contain templates here, relative to this directory.
34 | templates_path = ['_templates']
35 | 
36 | # List of patterns, relative to source directory, that match files and
37 | # directories to ignore when looking for source files.
38 | # This pattern also affects html_static_path and html_extra_path.
39 | exclude_patterns = []
40 | 
41 | autoclass_content = 'both'
42 | autodoc_member_order = 'bysource'
43 | autodoc_default_flags = ['members', 'inherited-members']
44 | 
45 | # -- Options for HTML output -------------------------------------------------
46 | 
47 | # The theme to use for HTML and HTML Help pages.  See the documentation for
48 | # a list of builtin themes.
49 | #
50 | html_theme = 'sphinxdoc'
51 | 
52 | # Add any paths that contain custom static files (such as style sheets) here,
53 | # relative to this directory. They are copied after the builtin static files,
54 | # so a file named "default.css" will overwrite the builtin "default.css".
55 | #html_static_path = ['_static']
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | pbs4py
2 | ======
3 | 
4 | Python scripting for launching and managing PBS jobs.
5 | 
6 | 
7 | Launching jobs
8 | --------------
9 | 
10 | .. toctree::
11 |     :maxdepth: 1
12 | 
13 |     pbs.rst
14 |     pbs_batch.rst
15 |     bsub.rst
16 | 
17 | 
18 | Managing jobs
19 | -------------
20 | .. toctree::
21 |     :maxdepth: 2
22 | 
23 |     job.rst
24 |     scripts.rst
25 | 
26 | Indices and tables
27 | ==================
28 | 
29 | * :ref:`genindex`
30 | * :ref:`modindex`
31 | * :ref:`search`
--------------------------------------------------------------------------------
/docs/source/job.rst:
--------------------------------------------------------------------------------
1 | .. _pbs_job_section:
2 | 
3 | PBS Job Class
4 | %%%%%%%%%%%%%
5 | 
6 | .. automodule:: pbs4py.job
7 | 
8 | .. autoclass:: PBSJob
9 |     :members:
10 |
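11 | Example
12 | =======
13 | A short usage sketch; the job id here is hypothetical and would normally come
14 | from the output of a previously launched job:
15 | 
16 | .. code-block :: python
17 | 
18 |     from pbs4py.job import PBSJob
19 | 
20 |     job = PBSJob('12345')
21 |     print(job.state, job.workdir)
22 |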
--------------------------------------------------------------------------------
/docs/source/pbs.rst:
--------------------------------------------------------------------------------
1 | .. _pbs_section:
2 | 
3 | PBS Job Launcher
4 | %%%%%%%%%%%%%%%%
5 | 
6 | The PBS class is a tool to define properties of the PBS setup you want to use,
7 | write PBS scripts, and launch jobs.
8 | The ``PBS`` class has several classmethods that serve as alternate constructors which fill in properties of some NASA HPC systems and queues.
9 | Examples of instantiating with these methods are shown below.
10 | For systems or queues not covered by these classmethods, the basic queue attributes are
11 | set in the standard constructor, and less common ones can be adjusted by
12 | changing the attributes of the object.
13 | 
14 | 
15 | PBS Class
16 | =========
17 | 
18 | .. automodule:: pbs4py.pbs
19 | 
20 | .. autoclass:: PBS
21 |     :members:
22 |     :inherited-members:
23 | 
24 | PBS's classmethod constructors
25 | ------------------------------
26 | 
27 | 
28 | .. code-block :: python
29 | 
30 |     from pbs4py import PBS
31 | 
32 |     k4 = PBS.k4(time=48)
33 |     k3 = PBS.k3()
34 |     k3a = PBS.k3a()
35 |     nas = PBS.nas(group_list='n1337', proc_type='skylake', time=72)
36 | 
37 | FakePBS Class
38 | =============
39 | Some scripts may originally be set up with the PBS job handler, but you may want to run
40 | the script within an existing PBS job without launching new PBS jobs.
41 | The FakePBS object appears to driving scripts to be a standard PBS object,
42 | but it directly runs the commands instead of putting them into a PBS job and launching the job.
43 | 
44 | .. automodule:: pbs4py.fake_pbs
45 | 
46 | .. autoclass:: FakePBS
47 |     :members:
--------------------------------------------------------------------------------
/docs/source/pbs_batch.rst:
--------------------------------------------------------------------------------
1 | .. _pbs_batch_section:
2 | 
3 | PBS Job Batch Submission
4 | %%%%%%%%%%%%%%%%%%%%%%%%
5 | 
6 | The PBSBatch class is a tool to launch many jobs simultaneously.
7 | 
8 | The basic steps are:
9 | 
10 | 1. Instantiating a :class:`~pbs4py.pbs.PBS` that will be used to submit the jobs.
11 | 2. Creating a list of :class:`~pbs4py.pbs_batch.BatchJob` objects that hold the name of the job and a list of the commands to run.
12 | 3. Setting up the job directories with the appropriate input files.
13 | 4. Giving the ``PBS`` object and list of ``BatchJob`` objects to the :class:`~pbs4py.pbs_batch.PBSBatch` constructor and then calling one of the launch methods.
14 | 
15 | Setting up the Job Directories
16 | ==============================
17 | By default, jobs are launched in directories with the same name as the job.
18 | This prevents concurrent jobs in the batch from overwriting each other's output files.
19 | 
20 | To set up a job, these directories can be created and populated with code like this:
21 | 
22 | .. code-block :: python
23 | 
24 |     batch = PBSBatch(pbs,jobs)
25 | 
26 |     batch.create_directories()
27 |     common_inputs_to_copy = ['fun3d.nml','*.cfg']
28 | 
29 |     for job in jobs:
30 |         for input in common_inputs_to_copy:
31 |             os.system(f'cp {input} {job.name}')
32 | 
33 | 
34 | Launch Methods
35 | ==============
36 | The batch jobs can be submitted with two different methods of the :class:`~pbs4py.pbs_batch.PBSBatch` class.
37 | 
38 | :func:`~pbs4py.pbs_batch.PBSBatch.launch_jobs_with_limit` will launch every job in the list,
39 | but it will only allow a certain number of jobs to be active in the queue system
40 | (queued, running, held) at a time. This is the preferred launch method if
41 | you have many jobs and, as a courtesy to your fellow HPC users, don't want to
42 | submit hundreds of jobs into the queue at once. A minimal example is shown below.
43 | 
44 | :func:`~pbs4py.pbs_batch.PBSBatch.launch_all_jobs` will launch every job in the list.
45 | It has an optional argument to either wait for the jobs to finish before returning or
46 | return immediately after all of the jobs are submitted to the queue.
47 |
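48 | A minimal sketch of the limited launch, assuming ``batch`` was created as above:
49 | 
50 | .. code-block :: python
51 | 
52 |     # submit everything, but keep at most 5 jobs queued or running at a time
53 |     batch.launch_jobs_with_limit(max_jobs_at_a_time=5)
54 |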
55 | Batch Job Class
56 | ===============
57 | .. automodule:: pbs4py.pbs_batch
58 | 
59 | .. autoclass:: BatchJob
60 |     :members:
61 | 
62 | PBSBatch Class
63 | ==============
64 | 
65 | .. autoclass:: PBSBatch
66 |     :members:
--------------------------------------------------------------------------------
/docs/source/scripts.rst:
--------------------------------------------------------------------------------
1 | .. _pbs_script_section:
2 | 
3 | Scripts
4 | %%%%%%%
5 | 
6 | Job Directory Script
7 | ====================
8 | 
9 | .. argparse::
10 |     :ref: pbs4py.scripts.job_dir.arg_parser
11 |     :prog: job_dir.py
12 | 
13 | qdir alias to cd to job's directory
14 | -----------------------------------
15 | 
16 | This script to print the job directory can be used in combination with bash
17 | aliases to create a ``qdir`` alias for moving to the directory a job is running in:
18 | 
19 | .. code-block:: bash
20 | 
21 |     qdirfun() { cd `job_dir.py $1`;}
22 |     alias qdir=qdirfun
23 | 
24 | Then in the shell instance you can do ``qdir {job_id}`` to move to the job's run directory.
25 | 
26 | Qdel for User Jobs Script
27 | =========================
28 | 
29 | 
30 | .. argparse::
31 |     :ref: pbs4py.scripts.qdel_user_jobs.arg_parser
32 |     :prog: qdel_user_jobs.py
33 | 
34 | 
35 | Example
36 | -------
37 | The following command would delete the current user's jobs that meet these conditions: PBS ids between 1000 and 2400,
38 | in the K3-standard queue, and ``crm`` in the job name. By default, the list of jobs will be
39 | printed to the screen, asking the user for confirmation; adding ``--no-confirm`` skips this step.
40 | 
41 | .. code-block:: bash
42 | 
43 |     qdel_user_jobs.py --id_range 1000 2400 --queue K3-standard --name crm
--------------------------------------------------------------------------------
/examples/basic/launch.py:
--------------------------------------------------------------------------------
1 | from pbs4py import PBS
2 | 
3 | k4 = PBS.k4(time=48)
4 | k4.mpiexec = 'mpiexec_mpt'
5 | k4.requested_number_of_nodes = 3
6 | 
7 | fun3d_command = 'nodet_mpi --gamma 1.14'
8 | fun3d_mpi_command = k4.create_mpi_command(fun3d_command, output_root_name='dog')
9 | 
10 | # list of commands that will be run in the pbs script
11 | pbs_commands = ['echo Start', fun3d_mpi_command, 'echo Done']
12 | 
13 | # submit and move on
14 | job_name = 'test_job'
15 | k4.launch(job_name, pbs_commands, blocking=False)
16 | 
17 | # submit and wait for job to finish before continuing script
18 | job_name = 'blocking_job'
19 | k4.launch(job_name, pbs_commands)
--------------------------------------------------------------------------------
/examples/basic/nas.py:
--------------------------------------------------------------------------------
1 | from pbs4py import PBS
2 | 
3 | group = 'a1234'  # replace with your charge number
4 | nas = PBS.nas(group, proc_type='bro', queue_name='devel', time=2)
5 | commands = [nas.create_mpi_command('nodet_mpi', 'debug')]
6 | nas.requested_number_of_nodes = 4
7 | nas.write_job_file('devel.pbs', 'debug', commands)
--------------------------------------------------------------------------------
/examples/batch_no_limit/launch.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pbs4py import PBS, BatchJob, PBSBatch
3 | 
4 | 
5 | pbs = PBS.k3()
6 | pbs.requested_number_of_nodes = 1
7 | 
8 | jobs = []
9 | for ijob in range(10):
10 |     name = f'sleep{ijob}'
11 |     commands = [f'sleep {ijob*10}',
12 |                 f'cat {name}.txt']
13 |     jobs.append(BatchJob(name, commands))
14 | 
15 | batch = PBSBatch(pbs, jobs)
16 | batch.create_directories()
17 | 
18 | for job in jobs:
19 |     # use job as a context manager to enter the directory with the name job.name and write a file
20 |     with job:
21 |         os.system(f'echo "hello world" > {job.name}.txt')
22 | 
23 | batch.launch_all_jobs(wait_for_jobs_to_finish=True)
24 | print('Done.')
25 |
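26 | # A sketch of the non-blocking alternative described in the docs: submit every
27 | # job and return immediately instead of waiting here, e.g.,
28 | # batch.launch_all_jobs(wait_for_jobs_to_finish=False)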
-------------------------------------------------------------------------------- /examples/batch_with_job_limit/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pbs4py import PBS, PBSBatch, BatchJob 3 | 4 | 5 | pbs = PBS.k3a() 6 | pbs.requested_number_of_nodes = 1 7 | 8 | jobs = [] 9 | for ijob in range(10): 10 | name = f'sample{ijob}' 11 | commands = [f'sleep {ijob*60}', 12 | f'cat {name}.txt'] 13 | jobs.append(BatchJob(name, commands)) 14 | 15 | batch = PBSBatch(pbs, jobs) 16 | batch.create_directories() 17 | 18 | for job in jobs: 19 | # use job as context manager to enter directory with the name of job.name and write a file 20 | with job: 21 | os.system(f'echo "hello world" > {job.name}.txt') 22 | 23 | batch.launch_jobs_with_limit(max_jobs_at_a_time=3) 24 | -------------------------------------------------------------------------------- /examples/dependency_chain/launch.py: -------------------------------------------------------------------------------- 1 | from pbs4py import PBS 2 | 3 | k3 = PBS.k3(time=1) 4 | k3.mem = '4gb' 5 | k3.requested_number_of_nodes = 1 6 | 7 | # test_job2 will wait until test_job1 is done before running 8 | pbs_commands = ['echo Start', 'sleep 1m', 'echo Done'] 9 | pbs1_id = k3.launch('test_job1', pbs_commands, blocking=False) 10 | 11 | new_commands = ['echo Start 2', 'sleep 2m', 'echo Done 2'] 12 | k3.launch('test_job2', new_commands, blocking=False, dependency=pbs1_id) 13 | -------------------------------------------------------------------------------- /examples/hybrid_openmp_mpi/launch.py: -------------------------------------------------------------------------------- 1 | from pbs4py import PBS 2 | 3 | k4 = PBS.k4(time=48) 4 | k4.mpiexec = 'mpiexec_mpt' 5 | k4.requested_number_of_nodes = 2 6 | 7 | fun3d_command = 'nodet_mpi' 8 | fun3d_mpi_command = k4.create_mpi_command(fun3d_command, 'dog', openmp_threads=20) 9 | 10 | # commands that will be run in the pbs script 11 | pbs_commands = [fun3d_mpi_command] 12 | 13 | # submit and wait for job to finish before continuing script 14 | k4.launch('omp_job', pbs_commands) 15 | -------------------------------------------------------------------------------- /examples/job_array/write_pbs_file.py: -------------------------------------------------------------------------------- 1 | from pbs4py import PBS 2 | 3 | k4 = PBS.k4(time=48) 4 | k4.mpiexec = 'mpiexec_mpt' 5 | k4.requested_number_of_nodes = 1 6 | 7 | k4.array_range = '1-4' 8 | 9 | command_list = [f'echo "Array job index = ${{PBS_ARRAY_INDEX}}"'] 10 | 11 | k4.write_job_file('test_array.pbs', 'test_array', command_list) 12 | -------------------------------------------------------------------------------- /pbs4py/__init__.py: -------------------------------------------------------------------------------- 1 | from .pbs import PBS 2 | from .fake_pbs import FakePBS 3 | from .pbs_batch import BatchJob, PBSBatch 4 | -------------------------------------------------------------------------------- /pbs4py/bsub.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | from pbs4py.launcher_base import Launcher 5 | 6 | 7 | class BSUB(Launcher): 8 | def __init__(self, 9 | project: str, 10 | ngpus_per_node: int = 6, 11 | queue_node_limit: int = 1_000_000, 12 | time: int = 72, 13 | profile_filename: str = '~/.bashrc', 14 | requested_number_of_nodes: int = 1): 15 | """ 16 | A Class for creating and running jobs using the Department of 
Energy 17 | batch system. 18 | 19 | Parameters 20 | ---------- 21 | project: 22 | The project which to charge for submitted jobs 23 | ngpu_per_node: 24 | The number of GPUs per compute node 25 | time: 26 | The requested wall time for the job(s) in hours 27 | profile_filename: 28 | The file setting the environment to source inside the PBS job 29 | """ 30 | super().__init__(ngpus_per_node, ngpus_per_node, queue_node_limit, 31 | time, profile_filename, requested_number_of_nodes) 32 | 33 | #: The project which to charge for submitted jobs 34 | self.project: str = project 35 | 36 | #: Mail a job report when complete 37 | self.mail_when_complete: bool = True 38 | 39 | self.profile_filename = profile_filename 40 | self.workdir_env_variable = '$LS_SUBCWD' 41 | self.batch_file_extension = 'lsf' 42 | self.mpiexec = 'jsrun' 43 | 44 | def create_mpi_command(self, command: str, 45 | output_root_name: str, 46 | openmp_threads: int = 1, 47 | ranks_per_node: int = None) -> str: 48 | num_mpi_procs = self.requested_number_of_nodes * self.ngpus_per_node 49 | redirect_output = self._redirect_shell_output(f'{output_root_name}.out') 50 | command = f'{self.mpiexec} -n {num_mpi_procs} -a 1 -c {openmp_threads} -g 1 {command} {redirect_output}' 51 | return command 52 | 53 | def _create_list_of_standard_header_options(self, job_name: str) -> List[str]: 54 | header_lines = [self._create_hashbang(), 55 | self._create_project_line_of_header(), 56 | self._create_job_name_line_of_header(job_name), 57 | self._create_number_of_nodes_line_of_header(), 58 | self._create_wall_time_line_of_header()] 59 | return header_lines 60 | 61 | def _create_project_line_of_header(self) -> str: 62 | return f'#BSUB -P {self.project}' 63 | 64 | def _create_job_name_line_of_header(self, job_name: str) -> str: 65 | return f'#BSUB -J {job_name}' 66 | 67 | def _create_number_of_nodes_line_of_header(self) -> str: 68 | return f'#BSUB -nnodes {self.requested_number_of_nodes}' 69 | 70 | def _create_wall_time_line_of_header(self) -> str: 71 | return f'#BSUB -W {self.time}:00' 72 | 73 | def _create_list_of_optional_header_lines(self, dependency: str) -> List[str]: 74 | header_lines = [] 75 | header_lines.extend(self._create_job_dependency_header_line(dependency)) 76 | header_lines.extend(self._create_mail_header_line()) 77 | return header_lines 78 | 79 | def _create_job_dependency_header_line(self, dependency: str) -> List[str]: 80 | if dependency is not None: 81 | return [f'#BSUB -w ended({dependency})'] 82 | else: 83 | return [] 84 | 85 | def _create_mail_header_line(self) -> List[str]: 86 | if self.mail_when_complete: 87 | return ['#BSUB -N'] 88 | else: 89 | return [] 90 | 91 | def _run_job(self, job_filename: str, blocking: bool, print_command_output: bool = True) -> str: 92 | if blocking: 93 | print('Warning: Blocking for bsub not implemented') 94 | 95 | command = f'bsub {job_filename}' 96 | if print_command_output: 97 | print(command) 98 | return os.popen(command).read() 99 | 100 | def _parse_job_id_out_of_bsub_output(self, bsub_output: str) -> int: 101 | return int(bsub_output.split('>')[0].split('<')[-1]) 102 | -------------------------------------------------------------------------------- /pbs4py/directory_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | class cd: 5 | """Context manager for changing the current working directory""" 6 | def __init__(self, newPath): 7 | self.newPath = os.path.expanduser(newPath) 8 | 9 | def __enter__(self): 10 | 
self.savedPath = os.getcwd()
11 |         os.chdir(self.newPath)
12 | 
13 |     def __exit__(self, etype, value, traceback):
14 |         os.chdir(self.savedPath)
15 | 
--------------------------------------------------------------------------------
/pbs4py/fake_pbs.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import subprocess
3 | from pbs4py import PBS
4 | 
5 | 
6 | class FakePBS(PBS):
7 |     """
8 |     A fake PBS class for directly running commands while still calling as
9 |     if it were a standard PBS driver.
10 |     This can be used to seamlessly switch between modes where PBS jobs are
11 |     launched for each "job" and modes where you don't want to launch a new
12 |     PBS job for each "job", e.g., when driving a script from within an
13 |     existing PBS job.
14 |     """
15 | 
16 |     def __init__(self, profile_filename='', stop_at_first_failure=False):
17 |         super().__init__(profile_filename=profile_filename)
18 |         self.stop_at_first_failure = stop_at_first_failure
19 | 
20 |     def launch(self, job_name: str, job_body: List[str],
21 |                blocking: bool = True, dependency: str = None) -> str:
22 |         """
23 |         Runs the commands in the job_body and determines whether any failed
24 |         based on their return codes
25 | 
26 |         Parameters
27 |         ----------
28 |         job_name:
29 |             [ignored]
30 |         job_body:
31 |             List of commands to run
32 |         blocking:
33 |             [ignored]
34 |         dependency:
35 |             [ignored]
36 | 
37 |         Returns
38 |         -------
39 |         pbs_command_output: str
40 |             A stand-in for the true PBS launch output: 'FakePBS.{number_of_failures}'
41 |         """
42 | 
43 |         number_of_failures = 0
44 |         for line in job_body:
45 |             print(line)
46 |             process = subprocess.Popen(line, shell=True)
47 |             process.wait()
48 | 
49 |             if process.returncode != 0:
50 |                 number_of_failures += 1
51 |                 if self.stop_at_first_failure:
52 |                     break
53 |         return f'FakePBS.{number_of_failures}'
--------------------------------------------------------------------------------
/pbs4py/job.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | from typing import List, Union
4 | import subprocess
5 | 
6 | 
7 | class PBSJob:
8 |     def __init__(self, id: str):
9 |         """
10 |         A class for querying information about and managing a particular submitted
11 |         PBS job. For the id number given in the constructor, the qstat command will
12 |         be used to populate the attributes of the job.
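        If qstat does not recognize the id, the attributes are left empty.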
13 | 
14 |         Parameters
15 |         ----------
16 |         id:
17 |             The id of the PBS job
18 |         """
19 | 
20 |         #: The ID of the PBS job
21 |         self.id: str = id
22 | 
23 |         #: The name of the job
24 |         self.name: str = ""
25 | 
26 |         #: The model attribute on the select line from the job submission
27 |         self.model: str = ""
28 | 
29 |         #: The number of nodes on the select line
30 |         self.requested_number_of_nodes: int = 0
31 | 
32 |         #: The number of cpus per node
33 |         self.ncpus_per_node = 0
34 | 
35 |         #: The queue which this job was submitted to
36 |         self.queue: str = ""
37 | 
38 |         #: Whether the job is queued, running, or finished
39 |         self.state: str = ""
40 | 
41 |         #: The value of $PBS_O_WORKDIR
42 |         self.workdir: str = ""
43 | 
44 |         #: The exit status of the pbs job
45 |         self.exit_status: int = None
46 | 
47 |         self.read_properties_from_qstat()
48 | 
49 |     def read_properties_from_qstat(self):
50 |         """
51 |         Use qstat to get the current attributes of this job
52 |         """
53 |         if "FakePBS" in self.id:
54 |             self._read_properties_from_fake_pbs_launcher_job()
55 |         else:
56 |             self._read_properties_of_real_pbs_job()
57 | 
58 |     def _read_properties_of_real_pbs_job(self):
59 |         qstat_output = self._run_qstat_to_get_full_job_attributes()
60 |         if self._is_a_known_job(qstat_output):
61 |             self._parse_attributes_from_qstat_output(qstat_output)
62 |         else:
63 |             self._set_empty_attributes()
64 | 
65 |     def _read_properties_from_fake_pbs_launcher_job(self):
66 |         self.exit_status = int(self.id.split(".")[-1])
67 | 
68 |     def qdel(self, echo_command: bool = True) -> str:
69 |         """
70 |         Call qdel to delete this job
71 | 
72 |         Parameters
73 |         ----------
74 |         echo_command:
75 |             Whether to print the command before running it
76 | 
77 |         Returns
78 |         -------
79 |         command_output: str
80 |             The output of the shell command
81 |         """
82 |         command = f"qdel {self.id}"
83 |         if echo_command:
84 |             print(command)
85 |         return os.popen(command).read()
86 | 
87 |     def tail_file_until_job_is_finished(self, file_to_tail: str):
88 |         if self._this_job_was_launched_from_fake_pbs():
89 |             # cat the file
90 |             with open(file_to_tail, "r") as file:
91 |                 for line in file:
92 |                     print(line)
93 |         else:
94 |             # touch the file first
95 |             if not os.path.exists(file_to_tail):
96 |                 open(file_to_tail, "w").close()
97 | 
98 |             with open(file_to_tail, "r") as file:
99 |                 for line in file:
100 |                     print(line)
101 |                 while True:
102 |                     line = file.readline()
103 |                     if line:
104 |                         print(line)
105 |                     else:
106 |                         # Sleep for a bit to avoid wasting resources
107 |                         time.sleep(0.1)
108 |                         if self._job_is_still_running_or_queued():
109 |                             continue
110 |                         else:
111 |                             for line in file:
112 |                                 print(line)
113 |                             break
114 | 
115 |     def update_job_state(self):
116 |         """
117 |         Refresh the job's status after it has been submitted by re-reading qstat,
118 |         which updates ``state`` with the job_state entry, e.g.,
119 |         'Q', 'R', 'F', 'H', etc.
120 | 121 | """ 122 | self.read_properties_from_qstat() 123 | 124 | def get_exit_status(self) -> int: 125 | qstat_output = self._run_qstat_to_get_full_job_attributes() 126 | qstat_dict = self._convert_qstat_output_to_a_dictionary(qstat_output) 127 | return qstat_dict.get("Exit_status") 128 | 129 | def _this_job_was_launched_from_fake_pbs(self): 130 | return "FakePBS" in self.id 131 | 132 | def _job_is_still_running_or_queued(self): 133 | self.update_job_state() 134 | if self.state == "Q" or self.state == "R": 135 | return True 136 | else: 137 | return False 138 | 139 | def _run_qstat_to_get_full_job_attributes(self) -> Union[List[str], str]: 140 | result = subprocess.run( 141 | ["qstat", "-xf", str(self.id)], 142 | stdout=subprocess.PIPE, 143 | stderr=subprocess.PIPE, 144 | text=False, # Disable automatic decoding 145 | ) 146 | return result.stdout.decode("utf-8", errors="replace").split("\n") 147 | 148 | def _is_a_known_job(self, qstat_output): 149 | return "Unknown Job Id" not in qstat_output 150 | 151 | def _parse_attributes_from_qstat_output(self, qstat_output: List[str]): 152 | qstat_dict = self._convert_qstat_output_to_a_dictionary(qstat_output) 153 | 154 | self.name: str = qstat_dict["Job_Name"] 155 | self.queue: str = qstat_dict["queue"] 156 | self.state: str = qstat_dict["job_state"] 157 | self.workdir = self._parse_workdir(qstat_dict) 158 | 159 | if "model" in qstat_dict["Resource_List.select"]: 160 | self.model = qstat_dict["Resource_List.select"].split("model=")[-1] 161 | else: 162 | self.model = "" 163 | self.requested_number_of_nodes = int(qstat_dict["Resource_List.select"].split(":")[0]) 164 | self.ncpus_per_node = int( 165 | qstat_dict["Resource_List.select"].split("ncpus=")[-1].split(":")[0]) 166 | 167 | self.exit_status: int = qstat_dict.get("Exit_status") 168 | if self.exit_status is not None: 169 | self.exit_status = int(self.exit_status) 170 | 171 | self.walltime_requested = self._convert_walltime_to_seconds( 172 | qstat_dict["Resource_List.walltime"]) 173 | if self.state != "Q": 174 | self.hostname = qstat_dict["exec_host"].split("/")[0] 175 | self.walltime_used = qstat_dict.get("resources_used.walltime") 176 | if self.walltime_used is not None: 177 | self.walltime_used = self._convert_walltime_to_seconds(self.walltime_used) 178 | self.walltime_remaining = self.walltime_requested - self.walltime_used 179 | else: 180 | self.walltime_remaining = None 181 | 182 | def _convert_walltime_to_seconds(self, walltime: str): 183 | walltime_split = walltime.split(":") 184 | return 3600 * int(walltime_split[0]) + 60 * int(walltime_split[1]) + int(walltime_split[2]) 185 | 186 | def _set_empty_attributes(self): 187 | self.name = "" 188 | self.model = "" 189 | self.queue = "" 190 | self.state = "" 191 | self.workdir = "" 192 | self.requested_number_of_nodes = 0 193 | self.ncpus_per_node = 0 194 | self.exit_status = None 195 | 196 | def _parse_workdir(self, qstat_dict: dict) -> str: 197 | return qstat_dict["Variable_List"].split("PBS_O_WORKDIR=")[-1].split(",")[0] 198 | 199 | def _convert_qstat_output_to_a_dictionary(self, qstat_output: List[str]) -> dict: 200 | qstat_dict = {} 201 | current_key = None 202 | current_value = [] 203 | 204 | for line in qstat_output[1:]: 205 | if len(line) == 0: 206 | continue 207 | 208 | if not self._is_a_continued_qstat_line(line): 209 | split_line = line.split("=", 1) 210 | current_key = split_line[0].strip() 211 | current_value = split_line[1].strip() 212 | qstat_dict[current_key] = current_value 213 | else: 214 | qstat_dict[current_key] += 
215 | 
216 |         return qstat_dict
217 | 
218 |     def _is_a_continued_qstat_line(self, line):
219 |         return line[0] == "\t"
220 | 
--------------------------------------------------------------------------------
/pbs4py/launcher_base.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from typing import List
4 | import numpy as np
5 | 
6 | 
7 | class Launcher:
8 |     def __init__(self, ncpus_per_node: int, ngpus_per_node: int, queue_node_limit: int, time: int,
9 |                  profile_filename: str, requested_number_of_nodes: int):
10 | 
11 |         #: The hashbang line which sets the shell for the PBS script.
12 |         #: If unset, the default is ``#!/usr/bin/env {self.shell}``.
13 |         self.hashbang: str = None
14 | 
15 |         #: The shell flavor to use in the PBS job
16 |         self.shell = 'bash'
17 | 
18 |         #: The maximum number of nodes allowed by the queue
19 |         self.queue_node_limit: int = queue_node_limit
20 | 
21 |         #: The number of compute nodes requested
22 |         self.requested_number_of_nodes: int = requested_number_of_nodes
23 | 
24 |         #: The requested wall time for the pbs job(s) in hours
25 |         self.time: int = time
26 | 
27 |         #: The number of CPU cores per node.
28 |         self.ncpus_per_node: int = ncpus_per_node
29 | 
30 |         #: The number of GPUs per node.
31 |         self.ngpus_per_node: int = ngpus_per_node
32 | 
33 |         #: The mpi execution command name: mpiexec, mpirun, mpiexec_mpt, etc.
34 |         self.mpiexec: str = "mpiexec"
35 |         self.mpiprocs_per_node = None
36 | 
37 |         #: Command line option for mpiexec to specify the number of MPI ranks per host/node.
38 |         #: Default is to set it based on the mpiexec version.
39 |         self.ranks_per_node_flag: str = None
40 | 
41 |         # these are properties that users typically don't need to set, but are
42 |         # specific to each queueing software
43 |         self.workdir_env_variable: str = ''
44 |         self.profile_filename: str = ''
45 |         self.batch_file_extension: str = ''
46 | 
47 |         #: If true, the output of mpi commands is piped through tee instead of redirected to a file
48 |         self.tee_output: bool = False
49 | 
50 |         self.profile_filename = profile_filename
51 | 
52 |     @property
53 |     def requested_number_of_nodes(self):
54 |         """
55 |         The number of nodes to request. That is, the 'select' number in the
56 |         ``#PBS -l select={requested_number_of_nodes}:ncpus=40:mpiprocs=40``.
57 | 
58 |         :type: int
59 |         """
60 |         return self._requested_number_of_nodes
61 | 
62 |     @requested_number_of_nodes.setter
63 |     def requested_number_of_nodes(self, number_of_nodes):
64 |         self._requested_number_of_nodes = np.min((number_of_nodes, self.queue_node_limit))
65 | 
66 |     @property
67 |     def mpiprocs_per_node(self):
68 |         """
69 |         The number of requested mpiprocs per node. If not set, the launcher will default
70 |         to the number of cpus per node.
71 |         ``#PBS -l select=1:ncpus=40:mpiprocs={mpiprocs_per_node}``.
72 | 
73 |         :type: int
74 |         """
75 |         if self._mpiprocs_per_node is None:
76 |             return self.ncpus_per_node
77 |         else:
78 |             return self._mpiprocs_per_node
79 | 
80 |     @mpiprocs_per_node.setter
81 |     def mpiprocs_per_node(self, mpiprocs):
82 |         self._mpiprocs_per_node = mpiprocs
83 | 
84 |     @property
85 |     def profile_filename(self):
86 |         """
87 |         The file to source at the start of the pbs script to set the environment.
88 |         Typical names include '~/.profile', '~/.bashrc', and '~/.cshrc'.
89 |         If you do not wish to source a file, set to ''.
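
        Illustrative usage (``pbs`` is any launcher instance; the setter raises
        FileNotFoundError if the file does not exist):

        >>> pbs.profile_filename = '~/.bashrc'  # sourced at the top of each job script
        >>> pbs.profile_filename = ''           # skip sourcing a file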
90 | 
91 |         :type: str
92 |         """
93 |         return self._profile_filename
94 | 
95 |     @profile_filename.setter
96 |     def profile_filename(self, profile_filename):
97 |         if (profile_filename == '' or
98 |                 os.path.isfile(os.path.expanduser(profile_filename))):
99 |             self._profile_filename = profile_filename
100 |         else:
101 |             raise FileNotFoundError('Unable to set profile file.')
102 | 
103 |     def create_mpi_command(
104 |             self, command: str, output_root_name: str = None, openmp_threads: int = None,
105 |             ranks_per_node: int = None) -> str:
106 |         """
107 |         Wrap a command with mpiexec and route its standard and error output to a file
108 | 
109 |         Parameters
110 |         ----------
111 |         command:
112 |             The command that needs to run in parallel
113 |         output_root_name:
114 |             The root name of the output file, {output_root_name}.out.
115 |         openmp_threads:
116 |             The number of openmp threads per mpi process.
117 |         ranks_per_node:
118 |             The number of MPI ranks per compute node.
119 | 
120 |         Returns
121 |         -------
122 |         full_command: str
123 |             The full command string.
124 |         """
125 |         omp_env_vars = self._determine_omp_settings(openmp_threads)
126 |         ranks_per_node_info = self._set_ranks_per_node_info(openmp_threads, ranks_per_node)
127 |         openmp_info = self._set_openmp_info(openmp_threads)
128 | 
129 |         full_command = [omp_env_vars, self.mpiexec, ranks_per_node_info, openmp_info, command]
130 |         if output_root_name is not None:
131 |             redirect_output = self._redirect_shell_output(f"{output_root_name}.out")
132 |             full_command.append(redirect_output)
133 |         return self._filter_empty_strings_from_list_and_combine(full_command)
134 | 
135 |     def launch(self, job_name: str, job_body: List[str],
136 |                blocking: bool = True, dependency: str = None) -> str:
137 |         """
138 |         Create a job script and launch the job
139 | 
140 |         Parameters
141 |         ----------
142 |         job_name:
143 |             The name of the job.
144 |         job_body:
145 |             List of commands to run in the body of the job.
146 |         blocking:
147 |             If true, this function will wait for the job to complete before returning.
148 |             If false, this function will launch the job but not wait for it to finish.
149 |         dependency:
150 |             Jobs that this one depends on. For PBS, these are colon separated in the string
151 | 
152 |         Returns
153 |         -------
154 |         command_output: str
155 |             The stdout of the launch command. If the job is successfully launched,
156 |             this will be the job id.
157 |         """
158 |         filename = f'{job_name}.{self.batch_file_extension}'
159 |         self.write_job_file(filename, job_name, job_body, dependency)
160 |         return self._run_job(filename, blocking)
161 | 
162 |     def write_job_file(self, job_filename: str, job_name: str,
163 |                        job_body: List[str], dependency: str = None):
164 |         """
165 |         Create a launch script file in the current directory for the commands defined in ``job_body``.
166 | 
167 |         Parameters
168 |         ----------
169 |         job_filename:
170 |             name of the file to write to
171 |         job_name:
172 |             The name of the job.
173 |         job_body:
174 |             List of commands to run in the body of the job.
175 |         dependency:
176 |             Jobs that this one depends on.
For PBS, these are colon separated in the string 177 | """ 178 | with open(job_filename, mode='w') as fh: 179 | header = self._create_header(job_name, dependency) 180 | for line in header: 181 | fh.write(line + '\n') 182 | 183 | for _ in range(2): 184 | fh.write('\n') 185 | 186 | fh.write(f'cd {self.workdir_env_variable}\n') 187 | if len(self.profile_filename) > 0: 188 | fh.write(f'source {self.profile_filename}\n') 189 | 190 | for _ in range(1): 191 | fh.write('\n') 192 | 193 | for line in job_body: 194 | fh.write(line + '\n') 195 | 196 | def _create_header(self, job_name: str, dependency: str = None) -> List[str]: 197 | header = self._create_list_of_standard_header_options(job_name) 198 | header.extend(self._create_list_of_optional_header_lines(dependency)) 199 | return header 200 | 201 | def _create_hashbang(self): 202 | if self.hashbang is not None: 203 | return self.hashbang 204 | else: 205 | return f'#!/usr/bin/env {self.shell}' 206 | 207 | def _create_list_of_standard_header_options(self, job_name: str) -> List[str]: 208 | return [''] 209 | 210 | def _create_list_of_optional_header_lines(self, dependency: str) -> List[str]: 211 | return [''] 212 | 213 | def _run_job(self, job_filename: str, blocking: bool, print_command_output: bool = True) -> str: 214 | raise NotImplementedError('Launcher must implement a _run_job method') 215 | 216 | def _redirect_shell_output(self, output_filename): 217 | if self.tee_output: 218 | return f'2>&1 | tee {output_filename}' 219 | 220 | if self.shell == 'tcsh': 221 | return f'>& {output_filename}' 222 | else: 223 | return f'&> {output_filename}' 224 | 225 | def _use_omplace_command(self) -> bool: 226 | return self._using_mpt() 227 | 228 | def _use_openmp(self, openmp_threads: int = None): 229 | if openmp_threads is not None: 230 | if openmp_threads > 1: 231 | return True 232 | return False 233 | 234 | def _using_mpt(self) -> bool: 235 | if self.mpiexec == "mpiexec_mpt": 236 | return True 237 | 238 | try: 239 | output = subprocess.run( 240 | [self.mpiexec, "--version"], 241 | stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) 242 | return "MPT" in output.stderr or "MPT" in output.stdout 243 | except FileNotFoundError: 244 | print(f"Executable '{self.mpiexec}' not found") 245 | return False 246 | 247 | def _get_ranks_per_node_flag(self): 248 | if self.ranks_per_node_flag is not None: 249 | return self.ranks_per_node_flag 250 | else: 251 | if self._using_mpt(): 252 | return "-perhost" 253 | else: 254 | return "--npernode" 255 | 256 | def _determine_omp_settings(self, openmp_threads: int) -> str: 257 | if openmp_threads is None: 258 | return "" 259 | 260 | omp_env_vars = [f"OMP_NUM_THREADS={openmp_threads}"] 261 | if not self._use_omplace_command(): 262 | omp_env_vars.extend(["OMP_PLACES=cores", "OMP_PROC_BIND=close"]) 263 | return self._filter_empty_strings_from_list_and_combine(omp_env_vars) 264 | 265 | def _filter_empty_strings_from_list_and_combine(self, lis: List[str]) -> str: 266 | filtered_for_empty_strings = filter(None, lis) 267 | return " ".join(filtered_for_empty_strings) 268 | 269 | def _set_ranks_per_node_info(self, openmp_threads: int, ranks_per_node: int) -> str: 270 | if ranks_per_node is None and openmp_threads is None: 271 | return "" 272 | elif ranks_per_node is not None: 273 | mpi_procs_per_node = ranks_per_node 274 | else: # openmp_threads is not None: 275 | mpi_procs_per_node = self.ncpus_per_node // openmp_threads 276 | 277 | ranks_per_node_flag = self._get_ranks_per_node_flag() 278 | ranks_per_proc_info = 
f"{ranks_per_node_flag} {mpi_procs_per_node}" 279 | return ranks_per_proc_info 280 | 281 | def _set_openmp_info(self, openmp_threads: int) -> str: 282 | if not self._use_openmp(openmp_threads): 283 | return "" 284 | 285 | openmp_info = "" 286 | if self._use_omplace_command(): 287 | proc_num_list = ",".join([str(i) for i in range(self.ncpus_per_node)]) 288 | openmp_info = f'omplace -c "{proc_num_list}" -nt {openmp_threads} -vv' 289 | return openmp_info 290 | -------------------------------------------------------------------------------- /pbs4py/pbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import subprocess 4 | from typing import List, Union 5 | import numpy as np 6 | 7 | from pbs4py.launcher_base import Launcher 8 | 9 | 10 | class PBS(Launcher): 11 | def __init__( 12 | self, 13 | queue_name: str = "K4-route", 14 | ncpus_per_node: int = 40, 15 | ngpus_per_node: int = 0, 16 | queue_node_limit: int = 10, 17 | time: int = 72, 18 | mem: str = None, 19 | profile_filename: str = "~/.bashrc", 20 | requested_number_of_nodes: int = 1, 21 | ): 22 | """ 23 | | A class for creating and running pbs jobs. Default queue properties are for K4. 24 | | Defaults not set during instantiation can be adjusted by directly modifying attributes. 25 | 26 | Parameters 27 | ---------- 28 | queue_name: 29 | Queue name which goes on the "#PBS -N {name}" line of the pbs header 30 | ncpus_per_node: 31 | Number of CPU cores per node 32 | ngpus_per_node: 33 | Number of GPUs per node 34 | queue_node_limit: 35 | Maximum number of nodes allowed in this queue 36 | time: 37 | The requested job walltime in hours 38 | mem: 39 | The requested memory size. String to allow specifying in G, MB, etc. 40 | profile_file: 41 | The file setting the environment to source inside the PBS job. Set to 42 | '' if you do not wish to source a file. 43 | requested_number_of_nodes: 44 | The number of compute nodes to request 45 | """ 46 | super().__init__(ncpus_per_node, ngpus_per_node, queue_node_limit, 47 | time, profile_filename, requested_number_of_nodes) 48 | 49 | #: The name of the queue which goes on the ``#PBS -N {queue_name}`` 50 | #: line of the pbs header 51 | self.queue_name: str = queue_name 52 | 53 | #: The processor model if it needs to be specified. 54 | #: The associated PBS header line is ``#PBS -l select=#:ncpus=#:mpiprocs=#:model={model}`` 55 | #: If left as `None`, the ``:model={mode}`` will not be added to the header line 56 | self.model: Union[str, None] = None 57 | 58 | #: The group for the group_list entry of the pbs header if necessary. 59 | #: The associated PBS header line is ``#PBS -W group_list={group_list}`` 60 | self.group_list: Union[str, None] = None 61 | 62 | #: Requested memory size on the select line. Need to include units in the str. 63 | #: The associated PBS header line is ``#PBS -l select=#:mem={mem}`` 64 | self.mem: Union[str, None] = mem 65 | 66 | #: Index range for PBS array of jobs 67 | #: The associated PBS header line is ``#PBS -J {array_range}`` 68 | self.array_range: Union[str, None] = None 69 | 70 | #: ``pbs -m`` mail options. 'e' at exit, 'b' at beginning, 'a' at abort 71 | self.mail_options: str = None 72 | 73 | #: ``pbs -M`` mail list. Who to email when mail_options are triggered 74 | self.mail_list: Union[str, None] = None 75 | 76 | #: Type of dependency if dependency active. 77 | #: Default is 'afterok' which only launches the new job if the previous one was successful. 
78 | self.dependency_type: str = "afterok" 79 | 80 | self.mpiexec: str = "mpiexec" 81 | self.ranks_per_node_flag = None 82 | 83 | self.workdir_env_variable = "$PBS_O_WORKDIR" 84 | self.batch_file_extension = "pbs" 85 | self.requested_number_of_nodes = requested_number_of_nodes 86 | 87 | def _create_list_of_standard_header_options(self, job_name: str) -> List[str]: 88 | header_lines = [ 89 | self._create_hashbang(), 90 | self._create_job_line_of_header(job_name), 91 | self._create_queue_line_of_header(), 92 | self._create_select_line_of_header(), 93 | self._create_walltime_line_of_header(), 94 | self._create_log_name_line_of_header(job_name), 95 | self._create_header_line_to_join_standard_and_error_output(), 96 | self._create_header_line_to_set_that_job_is_not_rerunnable(), 97 | ] 98 | return header_lines 99 | 100 | def _create_job_line_of_header(self, job_name: str) -> str: 101 | return f"#PBS -N {job_name}" 102 | 103 | def _create_queue_line_of_header(self) -> str: 104 | return f"#PBS -q {self.queue_name}" 105 | 106 | def _create_select_line_of_header(self) -> str: 107 | select = f"select={self.requested_number_of_nodes}" 108 | ncpus = f"ncpus={self.ncpus_per_node}" 109 | mpiprocs = f"mpiprocs={self.mpiprocs_per_node}" 110 | 111 | select_line = f"#PBS -l {select}:{ncpus}" 112 | if self.ngpus_per_node > 0: 113 | select_line += f":ngpus={self.ngpus_per_node}" 114 | select_line += f":{mpiprocs}" 115 | if self.mem is not None: 116 | select_line += f":mem={self.mem}" 117 | if self.model is not None: 118 | select_line += f":model={self.model}" 119 | return select_line 120 | 121 | def _create_walltime_line_of_header(self) -> str: 122 | return f"#PBS -l walltime={self.time}:00:00" 123 | 124 | def _create_log_name_line_of_header(self, job_name: str) -> str: 125 | return f"#PBS -o {job_name}_pbs.log" 126 | 127 | def _create_header_line_to_join_standard_and_error_output(self): 128 | return "#PBS -j oe" 129 | 130 | def _create_header_line_to_set_that_job_is_not_rerunnable(self) -> str: 131 | return "#PBS -r n" 132 | 133 | def _create_list_of_optional_header_lines(self, dependency): 134 | header_lines = [] 135 | header_lines.extend(self._create_group_list_header_line()) 136 | header_lines.extend(self._create_array_range_header_line()) 137 | header_lines.extend(self._create_mail_options_header_lines()) 138 | header_lines.extend(self._create_job_dependencies_header_line(dependency)) 139 | return header_lines 140 | 141 | def _create_group_list_header_line(self) -> List[str]: 142 | if self.group_list is not None: 143 | return [f"#PBS -W group_list={self.group_list}"] 144 | else: 145 | return [] 146 | 147 | def _create_array_range_header_line(self) -> List[str]: 148 | if self.array_range is not None: 149 | return [f"#PBS -J {self.array_range}"] 150 | else: 151 | return [] 152 | 153 | def _create_mail_options_header_lines(self) -> List[str]: 154 | header_lines = [] 155 | if self.mail_options is not None: 156 | header_lines.append(f"#PBS -m {self.mail_options}") 157 | if self.mail_list is not None: 158 | header_lines.append(f"#PBS -M {self.mail_list}") 159 | return header_lines 160 | 161 | def _create_job_dependencies_header_line(self, dependency) -> List[str]: 162 | if dependency is not None: 163 | return [f"#PBS -W depend={self.dependency_type}:{dependency}"] 164 | else: 165 | return [] 166 | 167 | def _run_job(self, job_filename: str, blocking: bool, print_command_output=True) -> str: 168 | options = "" 169 | if blocking: 170 | options += "-W block=true" 171 | command_output = os.popen(f"qsub {options} 
{job_filename}").read().strip() 172 | if print_command_output: 173 | print(command_output) 174 | return command_output 175 | 176 | # Alternate constructors for NASA HPC queues 177 | @classmethod 178 | def k4(cls, time: int = 72, profile_filename: str = "~/.bashrc", requested_number_of_nodes: int = 1): 179 | """ 180 | Constructor for the K4 queues on LaRC's K cluster including K4-standard-512. 181 | 182 | Parameters 183 | ---------- 184 | time: 185 | The requested job walltime in hours 186 | profile_file: 187 | The file setting the environment to source inside the PBS job 188 | requested_number_of_nodes: 189 | The number of compute nodes to request 190 | """ 191 | return cls( 192 | queue_name="K4-route", 193 | ncpus_per_node=40, 194 | queue_node_limit=16, 195 | time=time, 196 | profile_filename=profile_filename, 197 | requested_number_of_nodes=requested_number_of_nodes, 198 | ) 199 | 200 | @classmethod 201 | def k3c(cls, time: int = 72, profile_filename: str = "~/.bashrc", requested_number_of_nodes: int = 1): 202 | """ 203 | Constructor for the K3b queues on LaRC's K cluster. 204 | 205 | Parameters 206 | ---------- 207 | time: 208 | The requested job walltime in hours 209 | profile_file: 210 | The file setting the environment to source inside the PBS job 211 | requested_number_of_nodes: 212 | The number of compute nodes to request 213 | """ 214 | return cls( 215 | queue_name="K3c-route", 216 | ncpus_per_node=28, 217 | queue_node_limit=74, 218 | time=time, 219 | profile_filename=profile_filename, 220 | requested_number_of_nodes=requested_number_of_nodes, 221 | ) 222 | 223 | @classmethod 224 | def k3b(cls, time: int = 72, profile_filename: str = "~/.bashrc", requested_number_of_nodes: int = 1): 225 | """ 226 | Constructor for the K3b queues on LaRC's K cluster. 227 | 228 | Parameters 229 | ---------- 230 | time: 231 | The requested job walltime in hours 232 | profile_file: 233 | The file setting the environment to source inside the PBS job 234 | requested_number_of_nodes: 235 | The number of compute nodes to request 236 | """ 237 | return cls( 238 | queue_name="K3b-route", 239 | ncpus_per_node=28, 240 | queue_node_limit=74, 241 | time=time, 242 | profile_filename=profile_filename, 243 | requested_number_of_nodes=requested_number_of_nodes, 244 | ) 245 | 246 | @classmethod 247 | def k3a(cls, time: int = 72, profile_filename: str = "~/.bashrc", requested_number_of_nodes: int = 1): 248 | """ 249 | Constructor for the K3a queue on LaRC's K cluster. 
250 | 
251 |         Parameters
252 |         ----------
253 |         time:
254 |             The requested job walltime in hours
255 |         profile_filename:
256 |             The file setting the environment to source inside the PBS job
257 |         requested_number_of_nodes:
258 |             The number of compute nodes to request
259 |         """
260 |         return cls(
261 |             queue_name="K3a-route",
262 |             ncpus_per_node=16,
263 |             queue_node_limit=25,
264 |             time=time,
265 |             profile_filename=profile_filename,
266 |             requested_number_of_nodes=requested_number_of_nodes,
267 |         )
268 | 
269 |     @classmethod
270 |     def k4_v100(
271 |         cls,
272 |         time: int = 72,
273 |         ncpus_per_node=0,
274 |         ngpus_per_node=4,
275 |         mem="200G",
276 |         profile_filename: str = "~/.bashrc",
277 |         requested_number_of_nodes: int = 1,
278 |     ):
279 |         if ncpus_per_node == 0:
280 |             ncpus_per_node = ngpus_per_node
281 |         return cls(
282 |             queue_name="K4-V100",
283 |             ncpus_per_node=ncpus_per_node,
284 |             ngpus_per_node=ngpus_per_node,
285 |             queue_node_limit=4,
286 |             time=time,
287 |             mem=mem,
288 |             profile_filename=profile_filename,
289 |             requested_number_of_nodes=requested_number_of_nodes,
290 |         )
291 | 
292 |     @classmethod
293 |     def k5_a100_80(
294 |         cls,
295 |         time: int = 72,
296 |         ncpus_per_node=0,
297 |         ngpus_per_node=8,
298 |         mem="700G",
299 |         profile_filename: str = "~/.bashrc",
300 |         requested_number_of_nodes: int = 1,
301 |     ):
302 |         if ncpus_per_node == 0:
303 |             ncpus_per_node = ngpus_per_node
304 |         return cls(
305 |             queue_name="K5-A100-80",
306 |             ncpus_per_node=ncpus_per_node,
307 |             ngpus_per_node=ngpus_per_node,
308 |             queue_node_limit=2,
309 |             time=time,
310 |             mem=mem,
311 |             profile_filename=profile_filename,
312 |             requested_number_of_nodes=requested_number_of_nodes,
313 |         )
314 | 
315 |     @classmethod
316 |     def k5_a100_40(
317 |         cls,
318 |         time: int = 72,
319 |         ncpus_per_node=0,
320 |         ngpus_per_node=8,
321 |         mem="700G",
322 |         profile_filename: str = "~/.bashrc",
323 |         requested_number_of_nodes: int = 1,
324 |     ):
325 |         if ncpus_per_node == 0:
326 |             ncpus_per_node = ngpus_per_node
327 |         return cls(
328 |             queue_name="K5-A100-40",
329 |             ncpus_per_node=ncpus_per_node,
330 |             ngpus_per_node=ngpus_per_node,
331 |             queue_node_limit=2,
332 |             time=time,
333 |             mem=mem,
334 |             profile_filename=profile_filename,
335 |             requested_number_of_nodes=requested_number_of_nodes,
336 |         )
337 | 
338 |     @classmethod
339 |     def nas(
340 |         cls,
341 |         group_list: str,
342 |         proc_type: str = "broadwell",
343 |         queue_name: str = "long",
344 |         time: int = 72,
345 |         mem: str = None,
346 |         profile_filename: str = "~/.bashrc",
347 |         requested_number_of_nodes: int = 1,
348 |     ):
349 |         """
350 |         Constructor for the queues at NAS. Must specify the group_list
351 | 
352 |         Parameters
353 |         ----------
354 |         group_list:
355 |             The charge number or group for the group_list entry of the pbs header.
356 |             The associated PBS header line is "#PBS -W group_list={group_list}".
357 |         proc_type:
358 |             The type of processor to submit to. Full model names or the first 3 letters are accepted:
359 |             'cas', 'sky', 'bro', 'has', 'ivy', 'san', 'rom', 'mil', or GPU models such as 'sky_gpu' or 'mil_a100'.
360 |         queue_name:
361 |             Which queue to submit to: devel, debug, normal, long, etc.
362 |         time:
363 |             The requested job walltime in hours
364 |         profile_filename:
365 |             The file setting the environment to source inside the PBS job
366 |         """
367 |         if "sky_gpu" in proc_type.lower():
368 |             ncpus_per_node = 36
369 |             ngpus_per_node = 4
370 |             model = "sky_gpu"
371 |             mem = "200G"
372 |         elif "cas_gpu" in proc_type.lower():
373 |             ncpus_per_node = 48
374 |             ngpus_per_node = 4
375 |             model = "cas_gpu"
376 |             mem = "200G"
377 |         elif "rom_gpu" in proc_type.lower():
378 |             ncpus_per_node = 128
379 |             ngpus_per_node = 8
380 |             model = "rom_gpu"
381 |             mem = "700G"
382 |         elif "mil_a100" in proc_type.lower():
383 |             ncpus_per_node = 64
384 |             ngpus_per_node = 4
385 |             model = "mil_a100"
386 |             mem = "500G"
387 |         elif "cas" in proc_type.lower():
388 |             ncpus_per_node = 40
389 |             ngpus_per_node = 0
390 |             model = "cas_ait"
391 |         elif "sky" in proc_type.lower():
392 |             ncpus_per_node = 40
393 |             ngpus_per_node = 0
394 |             model = "sky_ele"
395 |         elif "bro" in proc_type.lower():
396 |             ncpus_per_node = 28
397 |             ngpus_per_node = 0
398 |             model = "bro"
399 |         elif "has" in proc_type.lower():
400 |             ncpus_per_node = 24
401 |             ngpus_per_node = 0
402 |             model = "has"
403 |         elif "ivy" in proc_type.lower():
404 |             ncpus_per_node = 20
405 |             ngpus_per_node = 0
406 |             model = "ivy"
407 |         elif "san" in proc_type.lower():
408 |             ncpus_per_node = 16
409 |             ngpus_per_node = 0
410 |             model = "san"
411 |         elif "rom" in proc_type.lower():
412 |             ncpus_per_node = 128
413 |             ngpus_per_node = 0
414 |             model = "rom_ait"
415 |         elif "mil" in proc_type.lower():
416 |             ncpus_per_node = 128
417 |             ngpus_per_node = 0
418 |             model = "mil_ait"
419 |         else:
420 |             raise ValueError("Unknown NAS processor selection")
421 | 
422 |         pbs = cls(
423 |             queue_name=queue_name,
424 |             ncpus_per_node=ncpus_per_node,
425 |             ngpus_per_node=ngpus_per_node,
426 |             queue_node_limit=int(1e6),
427 |             time=time,
428 |             mem=mem,
429 |             profile_filename=profile_filename,
430 |             requested_number_of_nodes=requested_number_of_nodes,
431 |         )
432 | 
433 |         pbs.group_list = group_list
434 |         pbs.model = model
435 |         return pbs
436 | 
437 |     @classmethod
438 |     def cf1(
439 |         cls,
440 |         account: str,
441 |         queue_name: str = "normal",
442 |         queue_node_limit: int = 30,
443 |         time: int = 24,
444 |         ncpus_per_node=64,
445 |         profile_filename: str = "~/.bashrc",
446 |         requested_number_of_nodes: int = 2,
447 |     ):
448 |         pbs = cls(
449 |             queue_name=queue_name,
450 |             queue_node_limit=queue_node_limit,
451 |             ncpus_per_node=ncpus_per_node,
452 |             time=time,
453 |             profile_filename=profile_filename,
454 |             requested_number_of_nodes=requested_number_of_nodes,
455 |         )
456 |         pbs.group_list = account
457 |         pbs.workdir_env_variable = "$SLURM_SUBMIT_DIR"
458 |         return pbs
459 | 
--------------------------------------------------------------------------------
/pbs4py/pbs_batch.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import time
4 | from datetime import datetime
5 | from typing import List
6 | from .pbs import PBS
7 | from .directory_utils import cd
8 | 
9 | 
10 | class BatchJob:
11 |     def __init__(self, name: str, body: List[str]):
12 |         """
13 |         Class for individual PBS jobs within a batch of jobs
14 | 
15 |         Can be used as a context manager to enter/exit a directory
16 |         with the job's name
17 |         """
18 | 
19 |         #: str: Name of the job.
20 |         self.name = name
21 | 
22 |         #: List[str]: list of commands to run in PBS job
23 |         self.body = body
24 | 
25 |         #: str: pbs job identifier returned by qsub
26 |         self.id = None
27 | 
28 |     def get_pbs_job_state(self) -> str:
29 |         """
30 |         Get the job's status after it has been submitted.
31 |         Returns the entry of job_state in the qstat information, e.g.,
32 |         'Q', 'R', 'F', 'H', etc.
33 | 
34 |         """
35 |         if self.id is not None:
36 |             return os.popen(f'qstat -xf {self.id} | grep job_state').read().split()[-1]
37 |         else:
38 |             return ''
39 | 
40 |     def __enter__(self):
41 |         self.savedPath = os.getcwd()
42 |         os.chdir(self.name)
43 | 
44 |     def __exit__(self, etype, value, traceback):
45 |         os.chdir(self.savedPath)
46 | 
47 | 
48 | class PBSBatch:
49 |     def __init__(self, pbs: PBS, jobs: List[BatchJob], use_separate_directories: bool = True):
50 |         """
51 |         Batch of PBS jobs. Assumes all jobs require the same
52 |         job request size. By default, separate directories with the job's name
53 |         will be used to separate output files.
54 | 
55 |         Parameters
56 |         ----------
57 |         pbs:
58 |             PBS handler that will be used to submit the jobs
59 |         jobs:
60 |             List of Job objects that will be run
61 |         use_separate_directories:
62 |             whether to run each job in a separate directory with the job's name
63 |         """
64 |         self.pbs = pbs
65 |         self.jobs = jobs
66 |         self.use_separate_directories = use_separate_directories
67 | 
68 |     def create_directories(self):
69 |         """
70 |         Create the set of directories with the jobs' names
71 |         """
72 |         for job in self.jobs:
73 |             if not os.path.exists(job.name):
74 |                 os.mkdir(job.name)
75 | 
76 |     def launch_all_jobs(self, wait_for_jobs_to_finish: bool = False,
77 |                         check_frequency_in_secs: float = 30):
78 |         """
79 |         Launch all of the jobs in the list. Stores the pbs
80 |         job id in the job objects.
81 | 
82 |         Parameters
83 |         ----------
84 |         wait_for_jobs_to_finish:
85 |             If True, the jobs will be submitted, and this function will not return
86 |             until all of the jobs are finished.
87 | 
88 |         check_frequency_in_secs:
89 |             Time interval to wait before checking if all jobs are done. Only relevant
90 |             if ``wait_for_jobs_to_finish`` is True.
91 |         """
92 |         self._launch_jobs_in_a_list(self.jobs)
93 |         if wait_for_jobs_to_finish:
94 |             self.wait_for_all_jobs_to_finish(check_frequency_in_secs=check_frequency_in_secs)
95 | 
96 |     def launch_jobs_with_limit(self, max_jobs_at_a_time: int = 20,
97 |                                check_frequency_in_secs: float = 30):
98 |         """
99 |         The "courteous" version of ``launch_all_jobs(wait_for_jobs_to_finish=True)``: it caps
100 |         the number of jobs queued or running at a time, since some people
101 |         may not like it if you submit 1000 jobs at once.
102 | 
103 |         Parameters
104 |         ----------
105 |         max_jobs_at_a_time:
106 |             Limit for number of jobs to have queued, running, or held at a time
107 | 
108 |         check_frequency_in_secs:
109 |             Time interval to wait before checking the jobs' statuses.
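
        Illustrative usage (job names and bodies are hypothetical; ``pbs`` is a PBS instance):

        >>> jobs = [BatchJob(f'case{i}', ['echo run']) for i in range(100)]
        >>> batch = PBSBatch(pbs, jobs)
        >>> batch.create_directories()
        >>> batch.launch_jobs_with_limit(max_jobs_at_a_time=10)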
110 | """ 111 | total_num_of_jobs = len(self.jobs) 112 | 113 | next_job_to_submit = 0 114 | while True: 115 | states = self._get_job_states(self.jobs[:next_job_to_submit]) 116 | num_active_jobs = self._count_number_of_jobs_running_queued_or_held(states) 117 | if num_active_jobs < max_jobs_at_a_time: 118 | end_index = np.min( 119 | (total_num_of_jobs, next_job_to_submit + max_jobs_at_a_time - num_active_jobs)) 120 | self._launch_jobs_in_a_list(self.jobs[next_job_to_submit:end_index]) 121 | next_job_to_submit = end_index 122 | 123 | states = self._get_job_states(self.jobs[:next_job_to_submit]) 124 | self._print_summary_of_job_states(states) 125 | if self._all_jobs_submitted(next_job_to_submit): 126 | if not self._any_jobs_are_still_running_queued_or_held(states): 127 | break 128 | time.sleep(check_frequency_in_secs) 129 | 130 | def wait_for_all_jobs_to_finish(self, check_frequency_in_secs: float = 30): 131 | """ 132 | A blocking check for all the jobs in the batch to finish. Can be paired with 133 | ``launch_all_jobs``. 134 | 135 | Parameters 136 | ---------- 137 | check_frequency_in_secs: 138 | How often to check and print the jobs' states 139 | """ 140 | while True: 141 | states = self._get_job_states(self.jobs) 142 | self._print_summary_of_job_states(states) 143 | if self._any_jobs_are_still_running_queued_or_held(states): 144 | time.sleep(check_frequency_in_secs) 145 | else: 146 | break 147 | 148 | def _launch_jobs_in_a_list(self, jobs: List[BatchJob]): 149 | for job in jobs: 150 | dirname = job.name if self.use_separate_directories else '.' 151 | with cd(dirname): 152 | job.id = self.pbs.launch(job.name, job.body, blocking=False) 153 | 154 | def _all_jobs_submitted(self, next_job_to_submit): 155 | return next_job_to_submit == len(self.jobs) 156 | 157 | def _get_job_states(self, jobs: List[BatchJob]) -> List[str]: 158 | states = [] 159 | for job in jobs: 160 | states.append(job.get_pbs_job_state()) 161 | return states 162 | 163 | def _count_number_of_jobs_running_queued_or_held(self, pbs_states: List[str]): 164 | return pbs_states.count('R') + pbs_states.count('Q') + pbs_states.count('H') 165 | 166 | def _any_jobs_are_still_running_queued_or_held(self, pbs_states): 167 | return self._count_number_of_jobs_running_queued_or_held(pbs_states) > 0 168 | 169 | def _print_summary_of_job_states(self, states: List[str]): 170 | running = states.count('R') 171 | queued = states.count('Q') 172 | finished = states.count('F') 173 | other = len(states) - running - queued - finished 174 | print(f'Job states at {datetime.now().isoformat()}:') 175 | print(f' Queued: {queued}') 176 | print(f' Running: {running}') 177 | print(f' Finished: {finished}') 178 | 179 | num_of_jobs_not_yet_submitted = len(self.jobs)-len(states) 180 | if num_of_jobs_not_yet_submitted > 0: 181 | print(f' Yet to submit: {num_of_jobs_not_yet_submitted}') 182 | print(f' Other: {other}') 183 | -------------------------------------------------------------------------------- /pbs4py/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/pbs4py/19a130db07d21358fd02954ef79ca38c61b8c811/pbs4py/scripts/__init__.py -------------------------------------------------------------------------------- /pbs4py/scripts/job_dir.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | A script to print the directory that a job is running in. 
4 | """ 5 | import argparse 6 | from pbs4py.job import PBSJob 7 | 8 | 9 | def arg_parser() -> argparse.ArgumentParser: 10 | parser = argparse.ArgumentParser( 11 | description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter) 12 | 13 | parser.add_argument('job_id', help='The ID number of the job') 14 | return parser 15 | 16 | 17 | def main(): 18 | parser = arg_parser() 19 | 20 | args = parser.parse_args() 21 | job = PBSJob(args.job_id) 22 | print(job.workdir) 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /pbs4py/scripts/qdel_user_jobs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | A script to delete active PBS jobs of the current user. 4 | The list of jobs to be deleted can be filtered by id range, job name substring, and queue. 5 | For safety, the default behavior is to show the user which jobs will be deleted and ask for confirmation 6 | before any jobs are deleted. 7 | """ 8 | import os 9 | import argparse 10 | import re 11 | from typing import List 12 | 13 | from pbs4py.job import PBSJob 14 | 15 | 16 | def arg_parser() -> argparse.ArgumentParser: 17 | parser = argparse.ArgumentParser( 18 | description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter) 19 | 20 | parser.add_argument('--id_range', 21 | nargs=2, 22 | default=(-1, -1), 23 | help='Delete jobs in a range of id numbers, [min id, max id]') 24 | parser.add_argument('--queue', 25 | default='', 26 | help='Delete jobs in a specific queue') 27 | parser.add_argument('--name', 28 | default='', 29 | help='Delete jobs in a specific string in the name') 30 | parser.add_argument('--confirm', 31 | action='store_true', 32 | dest='confirm', 33 | help='Whether to prompt the user for confirmation of before deleting') 34 | parser.add_argument('--no-confirm', dest='confirm', action='store_false') 35 | parser.set_defaults(confirm=True) 36 | return parser 37 | 38 | 39 | def get_active_jobs_for_user(): 40 | user_name = os.environ.get('USER') 41 | qstat_output = os.popen(f'qstat -u {user_name}').read().split('\n') 42 | 43 | # remove header from qstat command 44 | qstat_output = qstat_output[3:] 45 | 46 | jobs = [] 47 | for line in qstat_output: 48 | if line: 49 | id = int(re.match('\s*[0-9]+', line)[0]) 50 | jobs.append(PBSJob(str(id))) 51 | return jobs 52 | 53 | 54 | def filter_jobs_to_delete_by_id_range(user_jobs: List[PBSJob], min_id: int, max_id: int): 55 | return [job for job in user_jobs if (job.id >= min_id and job.id <= max_id)] 56 | 57 | 58 | def filter_jobs_to_delete_by_queue(user_jobs: List[PBSJob], queue: str): 59 | return [job for job in user_jobs if job.queue == queue] 60 | 61 | 62 | def filter_jobs_to_delete_by_name_substring(user_jobs: List[PBSJob], name_substring: str): 63 | return [job for job in user_jobs if name_substring in job.name] 64 | 65 | 66 | def delete_jobs(jobs: List[PBSJob]): 67 | for job in jobs: 68 | job.qdel(echo_command=True) 69 | 70 | 71 | def print_jobs_that_will_be_deleted(jobs_to_delete: List[PBSJob]): 72 | print('Found the following jobs:') 73 | print('------------------------') 74 | for job in jobs_to_delete: 75 | print(f'Job: id = {job.id}, name = {job.name}, queue: {job.queue}') 76 | 77 | 78 | def user_confirms(): 79 | prompt = 'Delete these jobs? 
[y/n]' 80 | valid = {"yes": True, "y": True, "no": False, "n": False} 81 | 82 | while True: 83 | print(prompt) 84 | choice = input().lower() 85 | if choice in valid: 86 | return valid[choice] 87 | else: 88 | print("Please respond with 'yes' or 'no' ", "(or 'y' or 'n').\n") 89 | 90 | 91 | def main(): 92 | parser = arg_parser() 93 | 94 | args = parser.parse_args() 95 | confirm = args.confirm 96 | min_id = int(args.id_range[0]) 97 | max_id = int(args.id_range[1]) 98 | queue = args.queue 99 | name_substring = args.name 100 | 101 | jobs_to_delete = get_active_jobs_for_user() 102 | 103 | if min_id > 0 and max_id > 0: 104 | print('Filtering by id range') 105 | jobs_to_delete = filter_jobs_to_delete_by_id_range(jobs_to_delete, min_id, max_id) 106 | if queue: 107 | print('Filtering by queue') 108 | jobs_to_delete = filter_jobs_to_delete_by_queue(jobs_to_delete, queue) 109 | if name_substring: 110 | print('Filtering by name') 111 | jobs_to_delete = filter_jobs_to_delete_by_name_substring(jobs_to_delete, name_substring) 112 | 113 | if len(jobs_to_delete) == 0: 114 | print(f'No active jobs found for user with specified filters') 115 | exit() 116 | 117 | if confirm: 118 | print_jobs_that_will_be_deleted(jobs_to_delete) 119 | if user_confirms(): 120 | delete_jobs(jobs_to_delete) 121 | else: 122 | print('Skipping') 123 | 124 | else: 125 | delete_jobs(jobs_to_delete) 126 | 127 | 128 | if __name__ == '__main__': 129 | main() 130 | -------------------------------------------------------------------------------- /pbs4py/slurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | This is a SLURM class to be used with pbs4py and was modified from pbs.py 5 | 6 | Written/Copied by Matt Opgenorth 7 | ''' 8 | import os 9 | from typing import List, Union 10 | 11 | from pbs4py.launcher_base import Launcher 12 | 13 | 14 | class SLURM(Launcher): 15 | def __init__( 16 | self, 17 | queue_name: str = "normal", 18 | ncpus_per_node: int = 64, 19 | ngpus_per_node: int = 0, 20 | queue_node_limit: int = 30, 21 | time: int = 24, 22 | mem: str = None, 23 | profile_filename: str = "~/.bashrc", 24 | requested_number_of_nodes: int = 1, 25 | ): 26 | """ 27 | | A class for creating and running slurm jobs. 28 | | Defaults not set during instantiation can be adjusted by directly modifying attributes. 29 | 30 | Parameters 31 | ---------- 32 | queue_name: 33 | Queue name which goes on the "#SBATCH --partition {queue_name}" line of the slurm header 34 | ncpus_per_node: 35 | Number of CPU cores per node 36 | ngpus_per_node: 37 | Number of GPUs per node 38 | queue_node_limit: 39 | Maximum number of nodes allowed in this queue 40 | time: 41 | The requested job walltime in hours 42 | mem: 43 | The requested memory size. String to allow specifying in G, MB, etc. 44 | profile_file: 45 | The file setting the environment to source inside the SLURM job. Set to 46 | '' if you do not wish to source a file. 47 | requested_number_of_nodes: 48 | The number of compute nodes to request 49 | """ 50 | super().__init__(ncpus_per_node, ngpus_per_node, queue_node_limit, 51 | time, profile_filename, requested_number_of_nodes) 52 | 53 | #: The name of the queue which goes on the ``#SBATCH --partition {queue_name}`` 54 | #: line of the slurm header 55 | self.queue_name: str = queue_name 56 | 57 | #: The account for the account entry of the slurm header if necessary. 
58 | #: The associated SLURM header line is ``#SBATCH --account={account}`` 59 | self.account: str = None 60 | 61 | #: Requested memory size on the select line. Need to include units in the str. 62 | #: The associated SLURM header line is ``#SBATCH --mem={mem}`` 63 | self.mem: Union[str, None] = mem 64 | 65 | #: Index range for SLURM array of jobs 66 | #: The associated SLURM header line is ``#SBATCH --array={array_range}`` 67 | self.array_range: Union[str, None] = None 68 | 69 | #: ``sbatch --mail-type`` mail options. BEGIN, END, FAIL 70 | self.mail_options: str = None 71 | 72 | #: ``sbatch --mail-user`` mail list. Who to email when mail_options are triggered 73 | self.mail_list: Union[str, None] = None 74 | 75 | #: Type of dependency if dependency active. 76 | #: Default is 'afterok' which only launches the new job if the previous one was successful. 77 | self.dependency_type: str = "afterok" 78 | 79 | self.mpiexec: str = "mpiexec" 80 | self.ranks_per_node_flag: str = None 81 | 82 | self.workdir_env_variable = "$SLURM_SUBMIT_DIR" 83 | self.batch_file_extension = "slurm" 84 | self.mpiprocs_per_node = None 85 | self.requested_number_of_nodes = requested_number_of_nodes 86 | 87 | #: nodelist 88 | self.nodelist: str = None 89 | 90 | def _create_list_of_standard_header_options(self, job_name: str) -> List[str]: 91 | header_lines = [ 92 | self._create_hashbang(), 93 | self._create_job_line_of_header(job_name), 94 | self._create_queue_line_of_header(), 95 | self._create_nodes_line_of_header(), 96 | self._create_tasks_per_node_line_of_header(), 97 | self._create_walltime_line_of_header(), 98 | self._create_log_name_line_of_header(job_name), 99 | self._create_header_line_to_error_output(job_name), 100 | self._create_header_line_to_set_that_job_is_not_rerunnable(), 101 | ] 102 | return header_lines 103 | 104 | def _create_job_line_of_header(self, job_name: str) -> str: 105 | return f"#SBATCH --job-name={job_name}" 106 | 107 | def _create_queue_line_of_header(self) -> str: 108 | return f"#SBATCH --partition={self.queue_name}" 109 | 110 | def _create_nodes_line_of_header(self) -> str: 111 | return f'#SBATCH --nodes={self.requested_number_of_nodes}' 112 | 113 | def _create_tasks_per_node_line_of_header(self) -> str: 114 | return f'#SBATCH --ntasks-per-node={self.ncpus_per_node}' 115 | 116 | def _create_walltime_line_of_header(self) -> str: 117 | return f"#SBATCH --time={self.time}:00:00" 118 | 119 | def _create_log_name_line_of_header(self, job_name: str) -> str: 120 | return f"#SBATCH --output=qlog_{job_name}" 121 | 122 | def _create_header_line_to_error_output(self, job_name: str): 123 | return f"#SBATCH --error=err_{job_name}" 124 | 125 | def _create_header_line_to_set_that_job_is_not_rerunnable(self) -> str: 126 | return "#SBATCH --no-requeue" 127 | 128 | def _create_list_of_optional_header_lines(self, dependency): 129 | header_lines = [] 130 | header_lines.extend(self._create_account_header_line()) 131 | header_lines.extend(self._create_array_range_header_line()) 132 | header_lines.extend(self._create_mail_options_header_lines()) 133 | header_lines.extend(self._create_job_dependencies_header_line(dependency)) 134 | header_lines.extend(self._create_nodelist_header_line()) 135 | return header_lines 136 | 137 | def _create_account_header_line(self) -> List[str]: 138 | if self.account is not None: 139 | return [f"#SBATCH --account={self.account}"] 140 | else: 141 | return [] 142 | 143 | def _create_array_range_header_line(self) -> List[str]: 144 | if self.array_range is not None: 145 | return 
[f"#SBATCH --array={self.array_range}"] 146 | else: 147 | return [] 148 | 149 | def _create_mail_options_header_lines(self) -> List[str]: 150 | header_lines = [] 151 | if self.mail_options is not None: 152 | header_lines.append(f"#SBATCH --mail-type={self.mail_options}") 153 | if self.mail_list is not None: 154 | header_lines.append(f"#SBATCH --mail-user={self.mail_list}") 155 | return header_lines 156 | 157 | def _create_job_dependencies_header_line(self, dependency) -> List[str]: 158 | if dependency is not None: 159 | return [f"#SBATCH --dependency={self.dependency_type}:{dependency}"] 160 | else: 161 | return [] 162 | 163 | def _create_nodelist_header_line(self) -> List[str]: 164 | if self.nodelist is not None: 165 | return [f"#SBATCH --nodelist={self.nodelist}"] 166 | else: 167 | return [] 168 | 169 | def _run_job(self, job_filename: str, blocking: bool, print_command_output=True) -> str: 170 | options = "" 171 | if blocking: 172 | options += "-W" 173 | command_output = os.popen(f"sbatch {options} {job_filename}").read().strip() 174 | if print_command_output: 175 | print(command_output) 176 | return command_output 177 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "pbs4py" 7 | version = "1.1.0" 8 | description = "PBS scripting utilities" 9 | readme = "README.md" 10 | requires-python = ">=3.6" 11 | authors = [{ name = "Kevin Jacobson", email = "kevin.e.jacobson@nasa.gov" }] 12 | dependencies = ["numpy"] 13 | license = { file = "LICENSE"} 14 | 15 | [project.scripts] 16 | "qdel_user_jobs.py" = "pbs4py.scripts.qdel_user_jobs:main" 17 | "job_dir.py" = "pbs4py.scripts.job_dir:main" 18 | -------------------------------------------------------------------------------- /tests/job_test/empty_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/pbs4py/19a130db07d21358fd02954ef79ca38c61b8c811/tests/job_test/empty_file -------------------------------------------------------------------------------- /tests/pbs_test_files/golden0.lsf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env tcsh 2 | #BSUB -P ard149 3 | #BSUB -J test_job 4 | #BSUB -nnodes 2 5 | #BSUB -W 24:00 6 | #BSUB -N 7 | 8 | 9 | cd $LS_SUBCWD 10 | source_line 11 | 12 | command1 13 | command2 14 | -------------------------------------------------------------------------------- /tests/pbs_test_files/golden0.pbs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | #PBS -N test_job 3 | #PBS -q queue 4 | #PBS -l select=1:ncpus=5:mpiprocs=5 5 | #PBS -l walltime=24:00:00 6 | #PBS -o test_job_pbs.log 7 | #PBS -j oe 8 | #PBS -r n 9 | 10 | 11 | cd $PBS_O_WORKDIR 12 | source_line 13 | 14 | command1 15 | command2 16 | -------------------------------------------------------------------------------- /tests/pbs_test_files/golden0.slurm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | #SBATCH --job-name=test_job 3 | #SBATCH --partition=queue 4 | #SBATCH --nodes=4 5 | #SBATCH --ntasks-per-node=20 6 | #SBATCH --time=13:00:00 7 | #SBATCH --output=qlog_test_job 8 | #SBATCH --error=err_test_job 9 | #SBATCH --no-requeue 10 | 11 | 12 | cd $SLURM_SUBMIT_DIR 13 | 
source_line 14 | 15 | command1 16 | command2 17 | -------------------------------------------------------------------------------- /tests/test_bsub.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from typing import List 4 | from pbs4py.bsub import BSUB 5 | 6 | test_directory = os.path.dirname(os.path.abspath(__file__)) 7 | test_profile = f'{test_directory}/testing_bashrc' 8 | 9 | 10 | def check_list_of_strings(actual: List[str], expected: List[str]): 11 | assert len(actual) == len(expected) 12 | for a, e in zip(actual, expected): 13 | assert a == e 14 | 15 | 16 | @pytest.fixture 17 | def bsub_header_test(): 18 | project = 'ard149' 19 | ngpu = 5 20 | time = 24 21 | hashbang = '#!/usr/bin/tcsh' 22 | bsub_header_test = BSUB(project, ngpus_per_node=ngpu, time=time, 23 | profile_filename=test_profile) 24 | bsub_header_test.hashbang = hashbang 25 | bsub_header_test.requested_number_of_nodes = 2 26 | return bsub_header_test 27 | 28 | 29 | def test_walltime_line(bsub_header_test: BSUB): 30 | bsub_header_test.time = 5 31 | line = bsub_header_test._create_wall_time_line_of_header() 32 | assert line == '#BSUB -W 5:00' 33 | 34 | 35 | def test_number_of_nodes_line(bsub_header_test: BSUB): 36 | line = bsub_header_test._create_number_of_nodes_line_of_header() 37 | assert line == '#BSUB -nnodes 2' 38 | 39 | 40 | def test_job_line(bsub_header_test: BSUB): 41 | job_name = 'test' 42 | line = bsub_header_test._create_job_name_line_of_header(job_name) 43 | assert line == '#BSUB -J test' 44 | 45 | 46 | def test_project_line(bsub_header_test: BSUB): 47 | bsub_header_test.project = 'ard149' 48 | line = bsub_header_test._create_project_line_of_header() 49 | assert line == '#BSUB -P ard149' 50 | 51 | 52 | def test_mail_header(bsub_header_test: BSUB): 53 | bsub_header_test.mail_when_complete = False 54 | header = bsub_header_test._create_mail_header_line() 55 | check_list_of_strings(header, []) 56 | 57 | bsub_header_test.mail_when_complete = True 58 | header = bsub_header_test._create_mail_header_line() 59 | check_list_of_strings(header, ['#BSUB -N']) 60 | 61 | 62 | def test_job_dependency_header(bsub_header_test: BSUB): 63 | header = bsub_header_test._create_job_dependency_header_line(None) 64 | check_list_of_strings(header, []) 65 | 66 | header = bsub_header_test._create_job_dependency_header_line('1234') 67 | check_list_of_strings(header, ['#BSUB -w ended(1234)']) 68 | 69 | 70 | def test_parse_job_id_from_bsub_output(bsub_header_test: BSUB): 71 | output = 'Job <1983914> is submitted to default queue .' 
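    # the job id is the integer between the angle brackets of the bsub output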
72 | id = bsub_header_test._parse_job_id_out_of_bsub_output(output) 73 | assert id == 1983914 74 | 75 | 76 | def test_create_command(bsub_header_test: BSUB): 77 | bsub_header_test.ngpus_per_node = 3 78 | bsub_header_test.requested_number_of_nodes = 2 79 | command = bsub_header_test.create_mpi_command('a.out', 'dog', openmp_threads=2) 80 | assert command == 'jsrun -n 6 -a 1 -c 2 -g 1 a.out &> dog.out' 81 | -------------------------------------------------------------------------------- /tests/test_bsub_regression.py: -------------------------------------------------------------------------------- 1 | import os 2 | import filecmp 3 | from pbs4py.bsub import BSUB 4 | 5 | test_directory = os.path.dirname(os.path.abspath(__file__)) 6 | test_profile = f'{test_directory}/testing_bashrc' 7 | 8 | 9 | def modify_golden_file_to_have_right_path_for_profile(golden_file: str, profile_filename: str): 10 | with open(golden_file, 'r') as fh: 11 | golden_file_contents = fh.readlines() 12 | golden_file_with_profile = [] 13 | for line in golden_file_contents: 14 | if line == 'source_line\n': 15 | golden_file_with_profile.append(f'source {profile_filename}\n') 16 | else: 17 | golden_file_with_profile.append(line) 18 | golden_mod = f'{test_directory}/test_output_files/golden_mod.lsf' 19 | with open(golden_mod, 'w') as fh: 20 | fh.writelines(golden_file_with_profile) 21 | return golden_mod 22 | 23 | 24 | def test_write_job_file_regression_check(): 25 | golden_file = f'{test_directory}/pbs_test_files/golden0.lsf' 26 | project = 'ard149' 27 | time = 24 28 | shell = 'tcsh' 29 | bsub = BSUB(project, time=time, profile_filename=test_profile) 30 | bsub.shell = shell 31 | bsub.requested_number_of_nodes = 2 32 | 33 | job_name = 'test_job' 34 | job_body = ['command1', 'command2'] 35 | bsub_file = f'{test_directory}/test_output_files/test.lsf' 36 | bsub.write_job_file(bsub_file, job_name, job_body) 37 | 38 | golden_mod = modify_golden_file_to_have_right_path_for_profile( 39 | golden_file, bsub.profile_filename) 40 | 41 | assert filecmp.cmp(bsub_file, golden_mod) 42 | 43 | 44 | if __name__ == '__main__': 45 | test_write_job_file_regression_check() 46 | -------------------------------------------------------------------------------- /tests/test_fake_pbs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pbs4py import FakePBS 3 | 4 | test_directory = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | 7 | def test_fakePBS(): 8 | pbs = FakePBS(profile_filename=f'{test_directory}/testing_bashrc') 9 | job_name = 'test' 10 | 11 | file = 'fake_file.txt' 12 | assert not os.path.isfile(f'{test_directory}/{file}') 13 | job_body = [f'touch {test_directory}/{file}'] 14 | job_id = pbs.launch(job_name, job_body) 15 | assert os.path.isfile(f'{test_directory}/{file}') 16 | assert job_id == 'FakePBS.0' 17 | os.system(f'rm {test_directory}/{file}') 18 | -------------------------------------------------------------------------------- /tests/test_job.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pbs4py.job import PBSJob 3 | 4 | class FakeKJob(PBSJob): 5 | def _run_qstat_to_get_full_job_attributes(self): 6 | stand_in_output = [ 7 | "Job Id: 4259576.pbssrv1", 8 | " Job_Name = oat_steady_l6", 9 | " Job_Owner = kejacob1@k4-li1-ib0", 10 | " job_state = Q", 11 | " queue = K4-standard", 12 | " server = pbssrv1", 13 | " Checkpoint = u", 14 | " ctime = Thu Oct 24 12:26:31 2024", 15 | " Error_Path = 
k4-li1-ib0:/lustre3/hpnobackup2/kejacob1/projects/rca/buffet/c", 16 | " ases/oat15a/ncfv_rans_pointwise_revised_grid_cc/grid_l6/aoa3.6/steady/o", 17 | " at_steady_l6.e4259576", 18 | " Hold_Types = n", 19 | " Join_Path = oe", 20 | " Keep_Files = n", 21 | " Mail_Points = a", 22 | " mtime = Thu Oct 24 12:26:32 2024", 23 | " Output_Path = k4-li1-ib0.ccf-beowulf.ndc.nasa.gov:/lustre3/hpnobackup2/keja", 24 | " cob1/projects/rca/buffet/cases/oat15a/ncfv_rans_pointwise_revised_grid_", 25 | " cc/grid_l6/aoa3.6/steady/oat_steady_l6_pbs.log", 26 | " Priority = 0", 27 | " qtime = Thu Oct 24 12:26:31 2024", 28 | " Rerunable = False", 29 | " Resource_List.mem = 96000mb", 30 | " Resource_List.mpiprocs = 400", 31 | " Resource_List.ncpus = 400", 32 | " Resource_List.nodect = 10", 33 | " Resource_List.nodegroup = K4-open", 34 | " Resource_List.place = scatter:excl", 35 | " Resource_List.select = 10:ncpus=40:mpiprocs=40", 36 | " Resource_List.walltime = 72:00:00", 37 | " substate = 10", 38 | " Variable_List = PBS_O_HOME=/u/kejacob1,PBS_O_LANG=C,PBS_O_LOGNAME=kejacob1,", 39 | " PBS_O_PATH=/hpnobackup2/kejacob1/projects/cbse/cbse_clean/cbse/build_o", 40 | " pt/bin:/u/kejacob1/.local/bin:/u/kejacob1/bin:/usr/local/pkgs-viz/cuda_", 41 | " 12.2.2/bin:/usr/local/pkgs-viz/cuda_12.2.2/nvvm/bin:/usr/local/pkgs-viz", 42 | " /cuda_12.2.2/nsight-systems-2023.2.3/bin:/u/shared/fun3d/fun3d_users/mo", 43 | " dules/ParMETIS/4.0.3-mpt-2.25-intel_2019.5.281/bin:/opt/hpe/hpc/mpt/mpt", 44 | " -2.25/bin:/usr/local/pkgs-modules/intel_2019/inspector/bin64:/usr/local", 45 | " /pkgs-modules/intel_2019/advisor/bin64:/usr/local/pkgs-modules/intel_20", 46 | " 19/compilers_and_libraries_2019.5.281/linux/bin/intel64:/usr/local/pkgs", 47 | " -modules/intel_2019/vtune_amplifier/bin64:/hpnobackup2/shared/kejacob1/", 48 | " modules/gdb/python_3.9.5/bin:/usr/local/pkgs-modules/Python_3.9.5/bin:/", 49 | " hpnobackup2/shared/kejacob1/modules/clang-format/16.0.6/bin:/usr/local/", 50 | " pkgs-modules/gcc_8.2.0/bin:/usr/local/pkgs-modules/autoconf_2.72/bin:/u", 51 | " sr/local/pkgs/modules_4.2.4/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:", 52 | " /sbin,PBS_O_MAIL=/var/spool/mail/kejacob1,PBS_O_SHELL=/bin/bash,", 53 | " PBS_O_WORKDIR=/lustre3/hpnobackup2/kejacob1/projects/rca/buffet/cases/", 54 | " oat15a/ncfv_rans_pointwise_revised_grid_cc/grid_l6/aoa3.6/steady,", 55 | " PBS_O_SYSTEM=Linux,PBS_O_QUEUE=K4-route,PBS_O_HOST=k4-li1-ib0", 56 | " comment = Not Running: Queue K4-standard per-user limit reached on resource", 57 | " ncpus", 58 | " etime = Thu Oct 24 12:26:31 2024", 59 | " eligible_time = 00:00:01", 60 | " Submit_arguments = oat_steady_l6.pbs", 61 | " project = _pbs_project_default", 62 | " Submit_Host = k4-li1-ib0"] 63 | return stand_in_output 64 | 65 | class FakeKJobOld(PBSJob): 66 | def _run_qstat_to_get_full_job_attributes(self): 67 | stand_in_output = [ 68 | " Job: 2493765.pbssrv2", "Job_Name = sample0", "Job_Owner = kejacob1@k4-li1-ib0", 69 | "resources_used.cpupercent = 100", "resources_used.cput = 00:00:02", 70 | "resources_used.mem = 1528kb", "resources_used.ncpus = 16", 71 | "resources_used.vmem = 15936kb", "resources_used.walltime = 00:00:02", "job_state = F", 72 | "queue = K3a-standard", "server = pbssrv2", "Checkpoint = u", 73 | "ctime = 1649348639 (Thu Apr 07 12:23:59 EDT 2022)", 74 | "Error_Path = k4-li1-ib0:/lustre3/hpnobackup2/kejacob1/projects/cad_to_solution/pbs4py/examples/batch_with_job_limit/sample0/sample0.e2493765", 75 | "exec_host = k3ar5n1/0*16", "exec_vnode = (k3ar5n1:ncpus=16)", "Hold_Types = n", 76 | "Join_Path 
= oe", "Keep_Files = n", "Mail_Points = a", 77 | "mtime = 1649348653 (Thu Apr 07 12:24:13 EDT 2022)", 78 | "Output_Path = k4-li1-ib0.ccf-beowulf.larc.nasa.gov:/lustre3/hpnobackup2/kejacob1/projects/cad_to_solution/pbs4py/examples/batch_with_job_limit/sample0/sample0_pbs.log", 79 | "Priority = 0", "qtime = 1649348639 (Thu Apr 07 12:23:59 EDT 2022)", 80 | "Rerunable = False", "Resource_List.mem = 31gb", "Resource_List.mpiprocs = 16", 81 | "Resource_List.ncpus = 16", "Resource_List.nodect = 1", 82 | "Resource_List.nodegroup = K3a-open", "Resource_List.place = scatter:excl", 83 | "Resource_List.select = 1:ncpus=16:mpiprocs=16", "Resource_List.walltime = 72:00:00", 84 | "stime = 1649348640 (Thu Apr 07 12:24:00 EDT 2022)", "session_id = 22053", 85 | "jobdir = /u/kejacob1", "substate = 92", 86 | "Variable_List = PBS_O_SYSTEM=Linux,PBS_O_SHELL=/bin/bash,PBS_O_HOME=/u/kejacob1,PBS_O_HOST=k4-li1-ib0,PBS_O_LOGNAME=kejacob1,PBS_O_WORKDIR=/lustre3/hpnobackup2/kejacob1/projects/cad_to_solution/pbs4py/examples/batch_with_job_limit/sample0,PBS_O_LANG=C,PBS_O_PATH=/usr/local/pkgs-viz/cuda_11.0.167/bin:/usr/local/pkgs-viz/cuda_11.0.167/nvvm/bin:/u/kejacob1/bin/gdb/bin:/u/kejacob1/.local/bin:/u/kejacob1/bin:/usr/local/pkgs-modules/cmake_3.6.3/bin:/usr/local/pkgs-modules/intel_2018.0.033/inspector/bin64:/usr/local/pkgs-modules/intel_2018.0.033/advisor/bin64:/usr/local/pkgs-modules/intel_2018.0.033/compilers_and_libraries_2018.3.222/linux/bin/intel64:/usr/local/pkgs-modules/intel_2018.0.033/vtune_amplifier/bin64:/usr/local/pkgs-modules/intel_2018.0.033/compilers_and_libraries_2018.3.222/debugger_2018/gdb/intel64/bin:/usr/local/pkgs-modules/openmpi_3.0.1_intel_2018/bin:/usr/local/pkgs-modules/gcc_6.2.0/bin:/usr/local/pkgs-modules/tecplot360ex-2018R1/bin:/usr/local/pkgs-modules/tecplot360ex-2018R2/360ex_2018r2/bin:/lustre3/hpnobackup2/kejacob1/projects/post2/post2_env/bin:/usr/local/pkgs-modules/Python_3.7.1/bin:/usr/local/pkgs/modules_4.2.4/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin,PBS_O_QUEUE=K3a-route,PBS_O_MAIL=/var/spool/mail/kejacob1", 87 | "comment = Job run at Thu Apr 07 at 12:24 on (k3ar5n1:ncpus=16) and finished", 88 | "etime = 1649348639 (Thu Apr 07 12:23:59 EDT 2022)", "run_count = 1", 89 | "eligible_time = 00:00:00", "Stageout_status = 1", "Exit_status = 0", 90 | "Submit_arguments = sample0.pbs", 91 | "history_timestamp = 1649348653", "project = _pbs_project_default"] 92 | return stand_in_output 93 | 94 | 95 | class FakeNASJob(PBSJob): 96 | def _run_qstat_to_get_full_job_attributes(self): 97 | stand_in_output = [ 98 | "Job: 13198744.pbspl1.nas.nasa.gov", 99 | " Job_Name = C006ste", 100 | " Job_Owner = kejacob1@pfe23.nas.nasa.gov", 101 | " job_state = Q", 102 | " queue = devel", 103 | " server = pbspl1.nas.nasa.gov", 104 | " Checkpoint = u", 105 | " ctime = 1649355753 (Thu Apr 07 11:22:33 PDT 2022)", 106 | " Error_Path = pfe23.nas.nasa.gov:/nobackup/kejacob1/projects/sfe/support/C006ste.e13198744", 107 | " group_list = c1454", 108 | " Hold_Types = n", 109 | " Join_Path = oe", 110 | " Keep_Files = n", 111 | " Mail_Points = a", 112 | " mtime = 1649355753 (Thu Apr 07 11:22:33 PDT 2022)", 113 | " Output_Path = pfe23.nas.nasa.gov:/nobackup/kejacob1/projects/sfe/support/C006ste.o13198744", 114 | " Priority = 0", 115 | " qtime = 1649355753 (Thu Apr 07 11:22:33 PDT 2022)", 116 | " Rerunable = False", 117 | " Resource_List.mpiprocs = 640", 118 | " Resource_List.ncpus = 640", 119 | " Resource_List.nobackupp2 = 1", 120 | " Resource_List.nodect = 16", 121 | " Resource_List.place = scatter:excl", 122 | " 
Resource_List.select = 16:ncpus=40:mpiprocs=40:model=sky_ele", 123 | " Resource_List.walltime = 02:00:00", 124 | " schedselect = 16:ncpus=40:mpiprocs=40:model=sky_ele:aoe=toss3:bigmem=False:reboot=free", 125 | " substate = 10", 126 | " Variable_List = PBS_O_MAIL=/var/mail/kejacob1,PBS_O_PATH=/home1/kejacob1/.local/bin:/home1/kejacob1/bin:/nasa/pkgsrc/toss3/2021Q2/views/python/3.9.5/bin:/usr/local/bin:/usr/local/sbin:/usr/bin:/bin:/usr/X11R6/bin:/PBS/bin:/usr/sbin:/sbin:/opt/c3/bin:/opt/sgi/sbin:/opt/sgi/bin,PBS_O_HOME=/home1/kejacob1,PBS_O_SHELL=/bin/bash,PBS_O_TZ=PST8PDT,PBS_O_SYSTEM=Linux,PBS_O_LOGNAME=kejacob1,PBS_O_LANG=C,PBS_O_WORKDIR=/nobackup/kejacob1/projects/sfe/support,PBS_O_QUEUE=devel,PBS_O_HOST=pfe23.nas.nasa.gov", 127 | " euser = kejacob1", 128 | " egroup = c1454", 129 | " queue_type = E", 130 | " etime = 1649355753 (Thu Apr 07 11:22:33 PDT 2022)", 131 | " eligible_time = 00:00:00", 132 | " Submit_arguments = pfe.pbs", 133 | " project = _pbs_project_default", 134 | " Submit_Host = pfe23.nas.nasa.gov"] 135 | return stand_in_output 136 | 137 | 138 | class FakeUnknownJob(PBSJob): 139 | def _run_qstat_to_get_full_job_attributes(self) -> str: 140 | return 'qstat: Unknown Job Id 123456.pbssrv2' 141 | 142 | 143 | def test_read_K_properties_from_qstat(): 144 | job = FakeKJob('2493761') 145 | 146 | assert job.id == '2493761' 147 | assert job.name == 'oat_steady_l6' 148 | assert job.queue == 'K4-standard' 149 | assert job.state == 'Q' 150 | assert job.workdir == '/lustre3/hpnobackup2/kejacob1/projects/rca/buffet/cases/oat15a/ncfv_rans_pointwise_revised_grid_cc/grid_l6/aoa3.6/steady' 151 | assert job.model == '' 152 | assert job.ncpus_per_node == 40 153 | assert job.requested_number_of_nodes == 10 154 | 155 | def test_read_K_old_properties_from_qstat(): 156 | job = FakeKJobOld('2493765') 157 | 158 | assert job.id == '2493765' 159 | assert job.name == 'sample0' 160 | assert job.queue == 'K3a-standard' 161 | assert job.state == 'F' 162 | assert job.workdir == '/lustre3/hpnobackup2/kejacob1/projects/cad_to_solution/pbs4py/examples/batch_with_job_limit/sample0' 163 | assert job.model == '' 164 | assert job.ncpus_per_node == 16 165 | assert job.requested_number_of_nodes == 1 166 | 167 | 168 | def test_read_NAS_properties_from_qstat(): 169 | job = FakeNASJob('13198744') 170 | 171 | assert job.id == '13198744' 172 | assert job.name == 'C006ste' 173 | assert job.queue == 'devel' 174 | assert job.state == 'Q' 175 | assert job.workdir == '/nobackup/kejacob1/projects/sfe/support' 176 | assert job.model == 'sky_ele' 177 | assert job.ncpus_per_node == 40 178 | assert job.requested_number_of_nodes == 16 179 | 180 | 181 | def test_unknown_job(): 182 | job = FakeUnknownJob('123456') 183 | assert job.id == '123456' 184 | assert job.name == '' 185 | assert job.queue == '' 186 | assert job.state == '' 187 | assert job.workdir == '' 188 | assert job.model == '' 189 | assert job.ncpus_per_node == 0 190 | assert job.requested_number_of_nodes == 0 191 | -------------------------------------------------------------------------------- /tests/test_launch_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from pbs4py.launcher_base import Launcher 4 | 5 | test_directory = os.path.dirname(os.path.abspath(__file__)) 6 | test_profile = f'{test_directory}/testing_bashrc' 7 | 8 | 9 | @pytest.fixture 10 | def launcher(): 11 | return Launcher(1, 1, 1, 1, test_profile, 1) 12 | 13 | 14 | def test_profile_file_checking(): 15 | real_file = 
'pbs4py_unit_test_dummy.txt' 16 | os.system(f'touch {real_file}') 17 | 18 | launcher = Launcher(1, 1, 1, 1, real_file, 1) 19 | assert launcher.profile_filename == real_file 20 | os.system(f'rm {real_file}') 21 | 22 | nonexistent_file = "i_am_not_a_file.xyz" 23 | with pytest.raises(FileNotFoundError): 24 | launcher.profile_filename = nonexistent_file 25 | 26 | 27 | def test_output_redirection(launcher: Launcher): 28 | launcher.shell = 'tcsh' 29 | assert launcher._redirect_shell_output('dog.out') == '>& dog.out' 30 | 31 | launcher.shell = 'bash' 32 | assert launcher._redirect_shell_output('dog.out') == '&> dog.out' 33 | 34 | launcher.tee_output = True 35 | assert launcher._redirect_shell_output('dog.out') == '2>&1 | tee dog.out' 36 | 37 | 38 | def test_create_mpi_command_openmpi(launcher: Launcher): 39 | launcher.ncpus_per_node = 30 40 | launcher.mpiexec = 'mpirun' 41 | dummy_command = 'foo' 42 | output_root_name = 'dog' 43 | 44 | if not launcher._using_mpt(): 45 | mpi_command = launcher.create_mpi_command(dummy_command, output_root_name) 46 | expected_command = 'mpirun foo &> dog.out' 47 | assert mpi_command == expected_command 48 | 49 | mpi_command = launcher.create_mpi_command(dummy_command, output_root_name, openmp_threads=5) 50 | expected_command = 'OMP_NUM_THREADS=5 OMP_PLACES=cores OMP_PROC_BIND=close mpirun --npernode 6 foo &> dog.out' 51 | assert mpi_command == expected_command 52 | 53 | mpi_command = launcher.create_mpi_command(dummy_command, output_root_name, ranks_per_node=3) 54 | expected_command = 'mpirun --npernode 3 foo &> dog.out' 55 | assert mpi_command == expected_command 56 | 57 | 58 | def test_mpiprocs(launcher: Launcher): 59 | launcher.ncpus_per_node = 20 60 | assert launcher.mpiprocs_per_node == 20 61 | 62 | launcher.ncpus_per_node = 40 63 | assert launcher.mpiprocs_per_node == 40 64 | 65 | launcher.mpiprocs_per_node = 4 66 | assert launcher.mpiprocs_per_node == 4 67 | assert launcher.ncpus_per_node == 40 68 | -------------------------------------------------------------------------------- /tests/test_output_files/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/pbs4py/19a130db07d21358fd02954ef79ca38c61b8c811/tests/test_output_files/.empty -------------------------------------------------------------------------------- /tests/test_pbs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from pbs4py import PBS 4 | 5 | test_directory = os.path.dirname(os.path.abspath(__file__)) 6 | test_profile = f'{test_directory}/testing_bashrc' 7 | 8 | 9 | def test_k3a_class_method(): 10 | k3a = PBS.k3a(profile_filename=test_profile) 11 | assert k3a.queue_name == 'K3a-route' 12 | assert k3a.ncpus_per_node == 16 13 | assert k3a.queue_node_limit == 25 14 | 15 | 16 | def test_k3b_class_method(): 17 | k3 = PBS.k3b(profile_filename=test_profile) 18 | assert k3.queue_name == 'K3b-route' 19 | assert k3.ncpus_per_node == 28 20 | assert k3.queue_node_limit == 74 21 | 22 | 23 | def test_k3c_class_method(): 24 | k3 = PBS.k3c(profile_filename=test_profile) 25 | assert k3.queue_name == 'K3c-route' 26 | assert k3.ncpus_per_node == 28 27 | assert k3.queue_node_limit == 74 28 | 29 | 30 | def test_k4_class_method(): 31 | k4 = PBS.k4(profile_filename=test_profile) 32 | assert k4.queue_name == 'K4-route' 33 | assert k4.ncpus_per_node == 40 34 | assert k4.queue_node_limit == 16 35 | 36 | 37 | def test_k4_v100_class_method(): 38 | k4v100 = 
PBS.k4_v100(profile_filename=test_profile) 39 | assert k4v100.queue_name == 'K4-V100' 40 | assert k4v100.ncpus_per_node == 4 41 | assert k4v100.queue_node_limit == 4 42 | 43 | 44 | def test_k5_a100_40_class_method(): 45 | k5 = PBS.k5_a100_40(profile_filename=test_profile) 46 | assert k5.queue_name == 'K5-A100-40' 47 | assert k5.ncpus_per_node == 8 48 | assert k5.queue_node_limit == 2 49 | 50 | 51 | def test_k5_a100_80_class_method(): 52 | k5 = PBS.k5_a100_80(profile_filename=test_profile) 53 | assert k5.queue_name == 'K5-A100-80' 54 | assert k5.ncpus_per_node == 8 55 | assert k5.queue_node_limit == 2 56 | 57 | 58 | def test_nas_cascadelake_class_method(): 59 | nas = PBS.nas('n1337', 'cas', profile_filename=test_profile) 60 | assert nas.group_list == 'n1337' 61 | assert nas.ncpus_per_node == 40 62 | assert nas.model == 'cas_ait' 63 | 64 | 65 | def test_nas_skylake_class_method(): 66 | nas = PBS.nas('n1337', 'skylake', profile_filename=test_profile) 67 | assert nas.group_list == 'n1337' 68 | assert nas.ncpus_per_node == 40 69 | assert nas.model == 'sky_ele' 70 | 71 | 72 | def test_nas_broadwell_class_method(): 73 | nas = PBS.nas('n1337', 'bro', profile_filename=test_profile) 74 | assert nas.group_list == 'n1337' 75 | assert nas.ncpus_per_node == 28 76 | assert nas.model == 'bro' 77 | 78 | 79 | def test_nas_haswell_class_method(): 80 | nas = PBS.nas('n1337', 'has', profile_filename=test_profile) 81 | assert nas.group_list == 'n1337' 82 | assert nas.ncpus_per_node == 24 83 | assert nas.model == 'has' 84 | 85 | 86 | def test_nas_ivybridge_class_method(): 87 | nas = PBS.nas('n1337', 'ivy', profile_filename=test_profile) 88 | assert nas.group_list == 'n1337' 89 | assert nas.ncpus_per_node == 20 90 | assert nas.model == 'ivy' 91 | 92 | 93 | def test_nas_sandybridge_class_method(): 94 | nas = PBS.nas('n1337', 'san', profile_filename=test_profile) 95 | assert nas.group_list == 'n1337' 96 | assert nas.ncpus_per_node == 16 97 | assert nas.model == 'san' 98 | 99 | 100 | def test_nas_mil_class_method(): 101 | nas = PBS.nas('n1337', 'mil', profile_filename=test_profile) 102 | assert nas.group_list == 'n1337' 103 | assert nas.ncpus_per_node == 128 104 | assert nas.model == 'mil_ait' 105 | 106 | 107 | def test_nas_rom_class_method(): 108 | nas = PBS.nas('n1337', 'rom', profile_filename=test_profile) 109 | assert nas.group_list == 'n1337' 110 | assert nas.ncpus_per_node == 128 111 | assert nas.model == 'rom_ait' 112 | 113 | 114 | def test_nas_mil_a100_class_method(): 115 | nas = PBS.nas('n1337', 'mil_a100', profile_filename=test_profile) 116 | assert nas.group_list == 'n1337' 117 | assert nas.ncpus_per_node == 64 118 | assert nas.ngpus_per_node == 4 119 | assert nas.mem == '500G' 120 | assert nas.model == 'mil_a100' 121 | 122 | 123 | def test_nas_sky_gpu_class_method(): 124 | nas = PBS.nas('n1337', 'sky_gpu', profile_filename=test_profile) 125 | assert nas.group_list == 'n1337' 126 | assert nas.ncpus_per_node == 36 127 | assert nas.ngpus_per_node == 4 128 | assert nas.mem == '200G' 129 | assert nas.model == 'sky_gpu' 130 | 131 | 132 | def test_nas_cas_gpu_class_method(): 133 | nas = PBS.nas('n1337', 'cas_gpu', profile_filename=test_profile) 134 | assert nas.group_list == 'n1337' 135 | assert nas.ncpus_per_node == 48 136 | assert nas.ngpus_per_node == 4 137 | assert nas.mem == '200G' 138 | assert nas.model == 'cas_gpu' 139 | 140 | 141 | def test_nas_rom_gpu_class_method(): 142 | nas = PBS.nas('n1337', 'rom_gpu', profile_filename=test_profile) 143 | assert nas.group_list == 'n1337' 144 | assert 
nas.ncpus_per_node == 128 145 | assert nas.ngpus_per_node == 8 146 | assert nas.mem == '700G' 147 | assert nas.model == 'rom_gpu' 148 | 149 | 150 | def test_nas_class_method_with_bad_queue_name(): 151 | with pytest.raises(ValueError): 152 | PBS.nas('n1337', 'not_a_queue', profile_filename=test_profile) 153 | 154 | 155 | def test_cf1_class_method(): 156 | cf1 = PBS.cf1('acct', profile_filename=test_profile) 157 | assert cf1.queue_name == "normal" 158 | assert cf1.group_list == 'acct' 159 | assert cf1.workdir_env_variable == "$SLURM_SUBMIT_DIR" 160 | assert cf1.queue_node_limit == 30 161 | assert cf1.ncpus_per_node == 64 162 | -------------------------------------------------------------------------------- /tests/test_pbs_batch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from typing import List 4 | 5 | from pbs4py import PBSBatch, BatchJob 6 | from pbs4py.directory_utils import cd 7 | 8 | test_directory = os.path.dirname(os.path.abspath(__file__)) 9 | 10 | 11 | class MockJob(BatchJob): 12 | def __init__(self, name: str, body: List[str]): 13 | super().__init__(name, body) 14 | self.state_check_count = 0 15 | 16 | def get_pbs_job_state(self) -> str: 17 | self.state_check_count += 1 18 | if self.state_check_count == 3: 19 | return 'F' 20 | elif self.state_check_count == 2: 21 | return 'R' 22 | elif self.state_check_count == 1: 23 | return 'Q' 24 | 25 | 26 | class MockPBS: 27 | def __init__(self): 28 | self.id_counter = -1 29 | 30 | def launch(self, job_name, job_body, blocking=True): 31 | self.id_counter += 1 32 | return str(self.id_counter) 33 | 34 | 35 | @pytest.fixture 36 | def batch(): 37 | jobs = [MockJob('job0', ['ls']), 38 | MockJob('job1', ['echo Hello World!']), 39 | MockJob('job2', ['pwd'])] 40 | return PBSBatch(MockPBS(), jobs, use_separate_directories=False) 41 | 42 | 43 | def test_create_directories(batch: PBSBatch): 44 | expected_dirs = ['job0', 'job1', 'job2'] 45 | 46 | with cd(test_directory): 47 | for d in expected_dirs: 48 | assert not os.path.exists(d) 49 | 50 | batch.create_directories() 51 | 52 | for d in expected_dirs: 53 | assert os.path.exists(d) 54 | os.system(f'rm -r {d}') 55 | 56 | 57 | def test_launch(batch: PBSBatch): 58 | batch.launch_all_jobs() 59 | for i, job in enumerate(batch.jobs): 60 | assert str(i) == job.id 61 | 62 | 63 | def test_wait_for_all_jobs_to_finish(batch: PBSBatch): 64 | batch.wait_for_all_jobs_to_finish(check_frequency_in_secs=0.1) 65 | 66 | for job in batch.jobs: 67 | assert job.state_check_count == 3 68 | 69 | 70 | def test_all_jobs_submitted(batch: PBSBatch): 71 | assert batch._all_jobs_submitted(3) 72 | assert not batch._all_jobs_submitted(2) 73 | -------------------------------------------------------------------------------- /tests/test_pbs_batch_job.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from pbs4py import BatchJob 5 | 6 | test_directory = os.path.dirname(os.path.abspath(__file__)) 7 | 8 | 9 | @pytest.fixture 10 | def job(): 11 | return BatchJob('job_test', ['ls']) 12 | 13 | 14 | def test_job_directory_context_manager(job: BatchJob): 15 | cwd = os.getcwd() 16 | os.chdir(test_directory) 17 | 18 | test_file = 'empty_file' 19 | assert not os.path.exists(test_file) 20 | with job: 21 | assert os.path.exists(test_file) 22 | 23 | assert not os.path.exists(test_file) 24 | 25 | os.chdir(cwd) 26 | 27 | 28 | def test_job_state_before_launch(job: BatchJob): 29 | assert 
job.get_pbs_job_state() == '' 30 | -------------------------------------------------------------------------------- /tests/test_pbs_header.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from typing import List 4 | from pbs4py import PBS 5 | 6 | test_directory = os.path.dirname(os.path.abspath(__file__)) 7 | test_profile = f'{test_directory}/testing_bashrc' 8 | 9 | 10 | def check_list_of_strings(actual: List[str], expected: List[str]): 11 | assert len(actual) == len(expected) 12 | for a, e in zip(actual, expected): 13 | assert a == e 14 | 15 | 16 | @pytest.fixture 17 | def pbs_header_test(): 18 | queue_name = 'queue' 19 | ncpus_per_node = 5 20 | queue_node_limit = 10 21 | time = 24 22 | hashbang = '#!/usr/bin/tcsh' 23 | pbs_header_test = PBS(queue_name=queue_name, ncpus_per_node=ncpus_per_node, 24 | queue_node_limit=queue_node_limit, time=time, 25 | profile_filename=test_profile) 26 | pbs_header_test.hashbang = hashbang 27 | pbs_header_test.requested_number_of_nodes = 2 28 | return pbs_header_test 29 | 30 | 31 | def test_walltime_line(pbs_header_test: PBS): 32 | pbs_header_test.time = 5 33 | line = pbs_header_test._create_walltime_line_of_header() 34 | assert line == '#PBS -l walltime=5:00:00' 35 | 36 | 37 | def test_log_line(pbs_header_test: PBS): 38 | job_name = 'test' 39 | line = pbs_header_test._create_log_name_line_of_header(job_name) 40 | assert line == '#PBS -o test_pbs.log' 41 | 42 | 43 | def test_join_output_line(pbs_header_test: PBS): 44 | line = pbs_header_test._create_header_line_to_join_standard_and_error_output() 45 | assert line == '#PBS -j oe' 46 | 47 | 48 | def test_rerunnable_line(pbs_header_test: PBS): 49 | line = pbs_header_test._create_header_line_to_set_that_job_is_not_rerunnable() 50 | assert line == '#PBS -r n' 51 | 52 | 53 | def test_select_line_with_no_model_or_mem_defined(pbs_header_test: PBS): 54 | header = pbs_header_test._create_select_line_of_header() 55 | expected = "#PBS -l select=2:ncpus=5:mpiprocs=5" 56 | assert header == expected 57 | 58 | 59 | def test_select_line_with_model_defined(pbs_header_test: PBS): 60 | pbs_header_test.model = 'bro' 61 | header = pbs_header_test._create_select_line_of_header() 62 | expected = "#PBS -l select=2:ncpus=5:mpiprocs=5:model=bro" 63 | assert header == expected 64 | 65 | 66 | def test_select_line_with_gpus(pbs_header_test: PBS): 67 | pbs_header_test.ngpus_per_node = 2 68 | header = pbs_header_test._create_select_line_of_header() 69 | expected = "#PBS -l select=2:ncpus=5:ngpus=2:mpiprocs=5" 70 | assert header == expected 71 | 72 | 73 | def test_select_line_with_mpiprocs_defined(pbs_header_test: PBS): 74 | pbs_header_test.mpiprocs_per_node = 3 75 | header = pbs_header_test._create_select_line_of_header() 76 | expected = "#PBS -l select=2:ncpus=5:mpiprocs=3" 77 | assert header == expected 78 | 79 | 80 | def test_select_line_with_mem_defined(pbs_header_test: PBS): 81 | pbs_header_test.mem = '245gb' 82 | header = pbs_header_test._create_select_line_of_header() 83 | expected = "#PBS -l select=2:ncpus=5:mpiprocs=5:mem=245gb" 84 | assert header == expected 85 | 86 | 87 | def test_pbs_header_with_group_name_not_defined(pbs_header_test: PBS): 88 | header = pbs_header_test._create_group_list_header_line() 89 | expected = [] 90 | check_list_of_strings(header, expected) 91 | 92 | 93 | def test_pbs_header_with_group_name_defined(pbs_header_test: PBS): 94 | pbs_header_test.group_list = 'n1337' 95 | header = pbs_header_test._create_group_list_header_line() 96 | 
expected = ["#PBS -W group_list=n1337"] 97 | check_list_of_strings(header, expected) 98 | 99 | 100 | def test_pbs_header_email_option(pbs_header_test: PBS): 101 | pbs_header_test.mail_options = 'be' 102 | pbs_header_test.mail_list = 'kevin@nasa.gov' 103 | header = pbs_header_test._create_mail_options_header_lines() 104 | expected = ['#PBS -m be', '#PBS -M kevin@nasa.gov'] 105 | check_list_of_strings(header, expected) 106 | 107 | 108 | def test_job_line_of_header(pbs_header_test: PBS): 109 | job_name = 'test_job' 110 | assert '#PBS -N test_job' == pbs_header_test._create_job_line_of_header(job_name) 111 | 112 | 113 | def test_queue_line_of_header(pbs_header_test: PBS): 114 | pbs_header_test.queue_name = 'K4-standard' 115 | assert '#PBS -q K4-standard' == pbs_header_test._create_queue_line_of_header() 116 | 117 | 118 | def test_array_range_line_of_header_default_is_off(pbs_header_test: PBS): 119 | assert [] == pbs_header_test._create_array_range_header_line() 120 | 121 | 122 | def test_array_range_line_of_header(pbs_header_test: PBS): 123 | pbs_header_test.array_range = '1-24' 124 | assert ['#PBS -J 1-24'] == pbs_header_test._create_array_range_header_line() 125 | pbs_header_test.array_range = None 126 | assert [] == pbs_header_test._create_array_range_header_line() 127 | 128 | 129 | def test_job_dependency_line_of_header(pbs_header_test: PBS): 130 | assert [] == pbs_header_test._create_job_dependencies_header_line(dependency=None) 131 | assert [ 132 | '#PBS -W depend=afterok:a.1234'] == pbs_header_test._create_job_dependencies_header_line(dependency='a.1234') 133 | 134 | pbs_header_test.dependency_type = 'before' 135 | assert [ 136 | '#PBS -W depend=before:b.4321'] == pbs_header_test._create_job_dependencies_header_line(dependency='b.4321') 137 | -------------------------------------------------------------------------------- /tests/test_pbs_regression.py: -------------------------------------------------------------------------------- 1 | import os 2 | import filecmp 3 | from pbs4py import PBS 4 | 5 | test_directory = os.path.dirname(os.path.abspath(__file__)) 6 | test_profile = f'{test_directory}/testing_bashrc' 7 | 8 | 9 | def modify_golden_file_to_have_right_path_for_profile(golden_file: str, profile_filename: str): 10 | with open(golden_file, 'r') as fh: 11 | golden_file_contents = fh.readlines() 12 | golden_file_with_profile = [] 13 | for line in golden_file_contents: 14 | if line == 'source_line\n': 15 | golden_file_with_profile.append(f'source {profile_filename}\n') 16 | else: 17 | golden_file_with_profile.append(line) 18 | golden_mod = f'{test_directory}/test_output_files/golden_mod.pbs' 19 | with open(golden_mod, 'w') as fh: 20 | fh.writelines(golden_file_with_profile) 21 | return golden_mod 22 | 23 | 24 | def test_write_job_file_regression_check(): 25 | golden_file = f'{test_directory}/pbs_test_files/golden0.pbs' 26 | queue_name = 'queue' 27 | ncpus_per_node = 5 28 | queue_node_limit = 10 29 | time = 24 30 | hashbang = '#!/usr/bin/bash' 31 | pbs = PBS(queue_name=queue_name, ncpus_per_node=ncpus_per_node, 32 | queue_node_limit=queue_node_limit, time=time, 33 | profile_filename=test_profile) 34 | pbs.hashbang = hashbang 35 | 36 | job_name = 'test_job' 37 | job_body = ['command1', 'command2'] 38 | pbs_file = f'{test_directory}/test_output_files/test.pbs' 39 | pbs.write_job_file(pbs_file, job_name, job_body) 40 | 41 | golden_mod = modify_golden_file_to_have_right_path_for_profile( 42 | golden_file, pbs.profile_filename) 43 | 44 | assert filecmp.cmp(pbs_file, golden_mod) 45 | 46 | 47 
| if __name__ == '__main__': 48 | test_write_job_file_regression_check() 49 | -------------------------------------------------------------------------------- /tests/test_slurm_header.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from pbs4py.slurm import SLURM 4 | 5 | test_directory = os.path.dirname(os.path.abspath(__file__)) 6 | test_profile = f'{test_directory}/testing_bashrc' 7 | 8 | 9 | @pytest.fixture 10 | def slurm_header_test(): 11 | queue_name = 'queue' 12 | ncpus_per_node = 5 13 | queue_node_limit = 20 14 | time = 12 15 | hashbang = '#!/usr/bin/tcsh' 16 | pbs_header_test = SLURM(queue_name=queue_name, ncpus_per_node=ncpus_per_node, 17 | queue_node_limit=queue_node_limit, time=time, 18 | profile_filename=test_profile) 19 | pbs_header_test.hashbang = hashbang 20 | pbs_header_test.requested_number_of_nodes = 4 21 | return pbs_header_test 22 | 23 | 24 | def test_job_line(slurm_header_test: SLURM): 25 | line = slurm_header_test._create_job_line_of_header("dog") 26 | assert line == "#SBATCH --job-name=dog" 27 | 28 | 29 | def test_queue_line(slurm_header_test: SLURM): 30 | line = slurm_header_test._create_queue_line_of_header() 31 | assert line == "#SBATCH --partition=queue" 32 | 33 | 34 | def test_nodes_line(slurm_header_test: SLURM): 35 | line = slurm_header_test._create_nodes_line_of_header() 36 | assert line == "#SBATCH --nodes=4" 37 | 38 | 39 | def test_tasks_per_node_line(slurm_header_test: SLURM): 40 | line = slurm_header_test._create_tasks_per_node_line_of_header() 41 | assert line == "#SBATCH --ntasks-per-node=5" 42 | 43 | 44 | def test_walltime_line(slurm_header_test: SLURM): 45 | slurm_header_test.time = 16 46 | line = slurm_header_test._create_walltime_line_of_header() 47 | assert line == "#SBATCH --time=16:00:00" 48 | 49 | 50 | def test_log_line(slurm_header_test: SLURM): 51 | line = slurm_header_test._create_log_name_line_of_header("dog") 52 | assert line == "#SBATCH --output=qlog_dog" 53 | 54 | 55 | def test_error_log_line(slurm_header_test: SLURM): 56 | line = slurm_header_test._create_header_line_to_error_output("dog") 57 | assert line == "#SBATCH --error=err_dog" 58 | 59 | 60 | def test_not_rerunnable_line(slurm_header_test: SLURM): 61 | line = slurm_header_test._create_header_line_to_set_that_job_is_not_rerunnable() 62 | assert line == "#SBATCH --no-requeue" 63 | 64 | 65 | def test_account_line(slurm_header_test: SLURM): 66 | lines = slurm_header_test._create_account_header_line() 67 | assert len(lines) == 0 68 | 69 | slurm_header_test.account = "a123" 70 | lines = slurm_header_test._create_account_header_line() 71 | assert len(lines) == 1 72 | assert lines[0] == "#SBATCH --account=a123" 73 | 74 | 75 | def test_array_range_header_line(slurm_header_test: SLURM): 76 | lines = slurm_header_test._create_array_range_header_line() 77 | assert len(lines) == 0 78 | 79 | slurm_header_test.array_range = '1-2' 80 | lines = slurm_header_test._create_array_range_header_line() 81 | assert len(lines) == 1 82 | assert lines[0] == "#SBATCH --array=1-2" 83 | 84 | 85 | def test_mail_options_lines(slurm_header_test: SLURM): 86 | lines = slurm_header_test._create_mail_options_header_lines() 87 | assert len(lines) == 0 88 | 89 | slurm_header_test.mail_options = "BEGIN" 90 | slurm_header_test.mail_list = "test@nasa.gov" 91 | lines = slurm_header_test._create_mail_options_header_lines() 92 | assert len(lines) == 2 93 | assert lines[0] == "#SBATCH --mail-type=BEGIN" 94 | assert lines[1] == "#SBATCH 
--mail-user=test@nasa.gov" 95 | 96 | 97 | def test_dependency_lines(slurm_header_test: SLURM): 98 | lines = slurm_header_test._create_job_dependencies_header_line(None) 99 | assert len(lines) == 0 100 | 101 | lines = slurm_header_test._create_job_dependencies_header_line("a123") 102 | assert len(lines) == 1 103 | assert lines[0] == "#SBATCH --dependency=afterok:a123" 104 | 105 | 106 | def test_nodelist_line(slurm_header_test: SLURM): 107 | lines = slurm_header_test._create_nodelist_header_line() 108 | assert len(lines) == 0 109 | 110 | slurm_header_test.nodelist = '1,2,3,4' 111 | lines = slurm_header_test._create_nodelist_header_line() 112 | assert len(lines) == 1 113 | assert lines[0] == "#SBATCH --nodelist=1,2,3,4" 114 | -------------------------------------------------------------------------------- /tests/test_slurm_regression.py: -------------------------------------------------------------------------------- 1 | import os 2 | import filecmp 3 | from pbs4py.slurm import SLURM 4 | 5 | test_directory = os.path.dirname(os.path.abspath(__file__)) 6 | test_profile = f'{test_directory}/testing_bashrc' 7 | 8 | 9 | def modify_golden_file_to_have_right_path_for_profile(golden_file: str, profile_filename: str): 10 | with open(golden_file, 'r') as fh: 11 | golden_file_contents = fh.readlines() 12 | golden_file_with_profile = [] 13 | for line in golden_file_contents: 14 | if line == 'source_line\n': 15 | golden_file_with_profile.append(f'source {profile_filename}\n') 16 | else: 17 | golden_file_with_profile.append(line) 18 | golden_mod = f'{test_directory}/test_output_files/golden_mod.slurm' 19 | with open(golden_mod, 'w') as fh: 20 | fh.writelines(golden_file_with_profile) 21 | return golden_mod 22 | 23 | 24 | def test_write_job_file_regression_check(): 25 | golden_file = f'{test_directory}/pbs_test_files/golden0.slurm' 26 | queue_name = 'queue' 27 | ncpus_per_node = 20 28 | queue_node_limit = 10 29 | time = 13 30 | hashbang = '#!/usr/bin/bash' 31 | requested_number_of_nodes = 4 32 | pbs = SLURM(queue_name=queue_name, ncpus_per_node=ncpus_per_node, 33 | queue_node_limit=queue_node_limit, time=time, 34 | profile_filename=test_profile, 35 | requested_number_of_nodes=requested_number_of_nodes) 36 | pbs.hashbang = hashbang 37 | 38 | job_name = 'test_job' 39 | job_body = ['command1', 'command2'] 40 | pbs_file = f'{test_directory}/test_output_files/test.slurm' 41 | pbs.write_job_file(pbs_file, job_name, job_body) 42 | 43 | golden_mod = modify_golden_file_to_have_right_path_for_profile( 44 | golden_file, pbs.profile_filename) 45 | 46 | assert filecmp.cmp(pbs_file, golden_mod) 47 | 48 | 49 | if __name__ == '__main__': 50 | test_write_job_file_regression_check() 51 | -------------------------------------------------------------------------------- /tests/testing_bashrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/pbs4py/19a130db07d21358fd02954ef79ca38c61b8c811/tests/testing_bashrc --------------------------------------------------------------------------------
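--------------------------------------------------------------------------------
Taken together, the tests above pin down the public surface of pbs4py: preconfigured PBS launchers (test_pbs.py), job script generation (test_pbs_regression.py), MPI command construction (test_launch_base.py), batch submission (test_pbs_batch.py), and qstat parsing (the FakeKJob tests). The sketch below assembles only calls exercised by those tests into a hypothetical launch script; the profile path, job names, commands, and the PBSJob import path are illustrative assumptions, not values taken from the repository.

# Hypothetical usage sketch assembled from the test suite above; paths,
# job names, and commands are placeholders, not repository values.
from pbs4py import PBS, PBSBatch, BatchJob
from pbs4py.job import PBSJob  # import path assumed from the pbs4py/job.py layout

# Preconfigured launcher for the K4 cluster, as exercised in test_pbs.py.
pbs = PBS.k4(profile_filename='/path/to/a_bashrc')
pbs.requested_number_of_nodes = 2

# Write a job script without submitting it, as in test_pbs_regression.py.
pbs.write_job_file('demo.pbs', 'demo_job', ['command1', 'command2'])

# Build an MPI command line; test_launch_base.py exercises this on the
# Launcher base class, so PBS inheriting that helper is an assumption here.
mpi_command = pbs.create_mpi_command('solver', 'solver_output', openmp_threads=5)

# Submit a set of jobs in separate directories and wait for them to finish,
# mirroring the calls that test_pbs_batch.py drives through mocks.
jobs = [BatchJob(f'sample{i}', ['echo Hello World!']) for i in range(3)]
batch = PBSBatch(pbs, jobs, use_separate_directories=True)
batch.create_directories()
batch.launch_all_jobs()
batch.wait_for_all_jobs_to_finish(check_frequency_in_secs=30)

# Query qstat attributes of a submitted job, as in the FakeKJob tests.
job = PBSJob(jobs[0].id)
print(job.name, job.state, job.workdir, job.ncpus_per_node)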