├── .flake8 ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.rst ├── docs ├── Makefile ├── conf.py ├── devguide │ ├── #dev_docs.rst# │ ├── .#dev_docs.rst │ ├── changelog.rst │ ├── design.rst │ ├── dev_docs.rst │ └── packaging.rst ├── index.rst ├── libsubmit_art │ ├── README.txt │ ├── multi_node.png │ ├── multi_node.svg │ ├── multi_worker.png │ ├── multi_worker.svg │ ├── single_worker.png │ └── single_worker.svg ├── quick │ └── quickstart.rst ├── reference.rst ├── stubs │ ├── libsubmit.providers.aws.aws.EC2Provider.rst │ ├── libsubmit.providers.cobalt.cobalt.Cobalt.rst │ ├── libsubmit.providers.condor.condor.Condor.rst │ ├── libsubmit.providers.googlecloud.googlecloud.GoogleCloud.rst │ ├── libsubmit.providers.gridEngine.gridEngine.GridEngine.rst │ ├── libsubmit.providers.jetstream.jetstream.Jetstream.rst │ ├── libsubmit.providers.local.local.Local.rst │ ├── libsubmit.providers.provider_base.ExecutionProvider.rst │ ├── libsubmit.providers.slurm.slurm.Slurm.rst │ └── libsubmit.providers.torque.torque.Torque.rst └── userguide │ ├── configuring.rst │ ├── index.rst │ └── overview.rst ├── libsubmit ├── __init__.py ├── channels │ ├── __init__.py │ ├── channel_base.py │ ├── errors.py │ ├── local │ │ ├── __init__.py │ │ └── local.py │ ├── ssh │ │ ├── __init__.py │ │ └── ssh.py │ └── ssh_il │ │ ├── __init__.py │ │ └── ssh_il.py ├── error.py ├── launchers │ ├── __init__.py │ └── launchers.py ├── providers │ ├── __init__.py │ ├── aws │ │ ├── __init__.py │ │ ├── aws.py │ │ └── template.py │ ├── azure │ │ ├── __init__.py │ │ ├── azure.py │ │ ├── azureconf.json │ │ └── deployer.py │ ├── cluster_provider.py │ ├── cobalt │ │ ├── __init__.py │ │ ├── cobalt.py │ │ └── template.py │ ├── condor │ │ ├── __init__.py │ │ ├── condor.py │ │ └── template.py │ ├── googlecloud │ │ ├── __init__.py │ │ └── googlecloud.py │ ├── grid_engine │ │ ├── __init__.py │ │ ├── grid_engine.py │ │ └── template.py │ ├── jetstream │ │ ├── __init__.py │ │ ├── jetstream.py │ │ └── setup_first_time.sh │ ├── kubernetes │ │ ├── __init__.py │ │ ├── kube.py │ │ └── template.py │ ├── local │ │ ├── __init__.py │ │ └── local.py │ ├── provider_base.py │ ├── slurm │ │ ├── __init__.py │ │ ├── slurm.py │ │ └── template.py │ └── torque │ │ ├── __init__.py │ │ ├── template.py │ │ └── torque.py ├── tests │ ├── setup_path.sh │ ├── test_channels │ │ ├── remote_run.sh │ │ ├── test_channels.py │ │ ├── test_local_channel.py │ │ ├── test_scp_1.py │ │ ├── test_ssh_1.py │ │ ├── test_ssh_errors.py │ │ ├── test_ssh_file_transport.py │ │ └── test_ssh_interactive.py │ ├── test_integration │ │ └── test_ssh │ │ │ ├── test_ssh_beagle.py │ │ │ ├── test_ssh_condor_earth.py │ │ │ ├── test_ssh_cori.py │ │ │ └── test_ssh_swan.py │ └── test_providers │ │ └── ec2 │ │ └── test_ec2.py ├── utils.py └── version.py ├── requirements.txt ├── setup.py └── test-requirements.txt /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # D203: 1 blank line required before class docstring 3 | # E124: closing bracket does not match visual indentation 4 | # E126: continuation line over-indented for hanging indent 5 | # F403: ‘from module import *’ used; unable to detect undefined names 6 | # F405: name may be undefined, or defined from star imports: module 7 | # Ignoring the next one for valid tests 8 | # F811: redefinition of unused name from line N 9 | # This one is bad. Sometimes ordering matters, conditional imports 10 | # setting env vars necessary etc. 
11 | # E402: module level import not at top of file
12 | ignore = D203, E124, E126, F403, F405, F811, E402, W605
13 | max-line-length = 160
14 | exclude = parsl/executors/serialize/, parsl/libsubmit/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Emacs temp files
  2 | *~
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | env/
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | 
 31 | # PyInstaller
 32 | # Usually these files are written by a python script from a template
 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # dotenv
 86 | .env
 87 | 
 88 | # virtualenv
 89 | .venv
 90 | venv/
 91 | ENV/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # emacs buffers
107 | \#*
108 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |     - "3.4"
 4 |     - "3.5"
 5 |     - "3.6"
 6 | 
 7 | # command to install dependencies
 8 | install:
 9 |     - pip install -r requirements.txt
10 |     - pip install flake8
11 |     - python setup.py install
12 | 
13 | # command to run tests
14 | script:
15 |     - pip install -r test-requirements.txt
16 |     - flake8 libsubmit/
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Libsubmit - Scheduler abstraction
 2 | =================================
 3 | |licence| |build-status| |docs|
 4 | 
 5 | **Libsubmit** provides a uniform interface to submit arbitrary bash scripts to a
 6 | variety of execution systems such as clouds, grids, clusters, and supercomputers.
 7 | This library is designed to simplify submission of pilot systems such as ipython-parallel
 8 | to a variety of compute resources.
 9 | 
10 | The latest version available on PyPI is v0.4.1.
11 | 
12 | .. |licence| image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg
13 |    :target: https://github.com/Parsl/libsubmit/blob/master/LICENSE
14 |    :alt: Apache Licence V2.0
15 | .. |build-status| image:: https://travis-ci.org/Parsl/libsubmit.svg?branch=master
16 |    :target: https://travis-ci.org/Parsl/libsubmit
17 |    :alt: Build status
18 | .. |docs| image:: https://readthedocs.org/projects/libsubmit/badge/?version=latest
19 |    :target: http://libsubmit.readthedocs.io/en/latest/?badge=latest
20 |    :alt: Documentation Status
21 | 
22 | 
23 | Note
24 | ^^^^
25 | 
26 | As of December 20th, 2018 (Parsl v0.7.0), the libsubmit repository has been merged into Parsl
27 | to reduce overheads on maintenance with respect to documentation, testing, and release
28 | synchronization. The components offered by libsubmit are now available in Parsl as:
29 | `parsl.channels`, `parsl.launchers` and `parsl.providers`.
30 | 
31 | 
32 | Documentation
33 | =============
34 | 
35 | Developer documentation for libsubmit is available `here <http://libsubmit.readthedocs.io/en/latest/>`_.
36 | Since libsubmit is designed primarily to be used by `Parsl <https://github.com/Parsl/parsl>`_ as its resource provider, most of the user documentation is blended into the Parsl documentation `here <http://parsl.readthedocs.io>`_.
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python3 -m sphinx
 7 | SPHINXPROJ    = libsubmit
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # libsubmit documentation build configuration file, created by
  5 | # sphinx-quickstart on Mon Oct  2 13:39:42 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | import os
 21 | import sys
 22 | sys.path.insert(0, os.path.abspath('../'))
 23 | 
 24 | 
 25 | 
 26 | # -- General configuration ------------------------------------------------
 27 | 
 28 | # If your documentation needs a minimal Sphinx version, state it here.
 29 | #
 30 | # needs_sphinx = '1.0'
 31 | 
 32 | # Add any Sphinx extension module names here, as strings. They can be
 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
34 | extensions = [ 35 | 'nbsphinx', 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.autosummary', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.linkcode', 40 | 'sphinx.ext.napoleon' 41 | ] 42 | 43 | 44 | def linkcode_resolve(domain, info): 45 | if domain != 'py': 46 | return None 47 | if not info['module']: 48 | return None 49 | filename = info['module'].replace('.', '/') 50 | return "http://github.com/Parsl/libsubmit/blob/master/{}.py".format(filename) 51 | 52 | intersphinx_mapping = { 53 | 'python': ('https://docs.python.org/3', None), 54 | } 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ['_templates'] 58 | 59 | # The suffix(es) of source filenames. 60 | # You can specify multiple suffix as a list of string: 61 | # 62 | # source_suffix = ['.rst', '.md'] 63 | source_suffix = '.rst' 64 | 65 | # The master toctree document. 66 | master_doc = 'index' 67 | 68 | # General information about the project. 69 | project = u'libsubmit' 70 | copyright = u'2017, Yadu Nand Babuji' 71 | author = u'Yadu Nand Babuji' 72 | 73 | # The version info for the project you're documenting, acts as replacement for 74 | # |version| and |release|, also used in various other places throughout the 75 | # built documents. 76 | # 77 | # The short X.Y version. 78 | version = u'0.1.0' 79 | # The full version, including alpha/beta/rc tags. 80 | release = u'0.1.0' 81 | 82 | # The language for content autogenerated by Sphinx. Refer to documentation 83 | # for a list of supported languages. 84 | # 85 | # This is also used if you do content translation via gettext catalogs. 86 | # Usually you set "language" from the command line for these cases. 87 | language = None 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # This patterns also effect to html_static_path and html_extra_path 92 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = 'sphinx' 96 | 97 | # If true, `todo` and `todoList` produce output, else they produce nothing. 98 | todo_include_todos = False 99 | 100 | 101 | # -- Options for HTML output ---------------------------------------------- 102 | 103 | # The theme to use for HTML and HTML Help pages. See the documentation for 104 | # a list of builtin themes. 105 | # 106 | #html_theme = 'alabaster' 107 | html_theme = 'sphinx_rtd_theme' 108 | 109 | # Theme options are theme-specific and customize the look and feel of a theme 110 | # further. For a list of options available for each theme, see the 111 | # documentation. 112 | # 113 | # html_theme_options = {} 114 | 115 | # Add any paths that contain custom static files (such as style sheets) here, 116 | # relative to this directory. They are copied after the builtin static files, 117 | # so a file named "default.css" will overwrite the builtin "default.css". 118 | html_static_path = ['_static'] 119 | 120 | 121 | # -- Options for HTMLHelp output ------------------------------------------ 122 | 123 | # Output file base name for HTML help builder. 124 | htmlhelp_basename = 'libsubmitdoc' 125 | 126 | 127 | # -- Options for LaTeX output --------------------------------------------- 128 | 129 | latex_elements = { 130 | # The paper size ('letterpaper' or 'a4paper'). 131 | # 132 | # 'papersize': 'letterpaper', 133 | 134 | # The font size ('10pt', '11pt' or '12pt'). 
135 | #
136 | # 'pointsize': '10pt',
137 | 
138 | # Additional stuff for the LaTeX preamble.
139 | #
140 | # 'preamble': '',
141 | 
142 | # Latex figure (float) alignment
143 | #
144 | # 'figure_align': 'htbp',
145 | }
146 | 
147 | # Grouping the document tree into LaTeX files. List of tuples
148 | # (source start file, target name, title,
149 | #  author, documentclass [howto, manual, or own class]).
150 | latex_documents = [
151 |     (master_doc, 'libsubmit.tex', u'libsubmit Documentation',
152 |      u'Yadu Nand Babuji', 'manual'),
153 | ]
154 | 
155 | 
156 | # -- Options for manual page output ---------------------------------------
157 | 
158 | # One entry per manual page. List of tuples
159 | # (source start file, name, description, authors, manual section).
160 | man_pages = [
161 |     (master_doc, 'libsubmit', u'libsubmit Documentation',
162 |      [author], 1)
163 | ]
164 | 
165 | 
166 | # -- Options for Texinfo output -------------------------------------------
167 | 
168 | # Grouping the document tree into Texinfo files. List of tuples
169 | # (source start file, target name, title, author,
170 | #  dir menu entry, description, category)
171 | texinfo_documents = [
172 |     (master_doc, 'libsubmit', u'libsubmit Documentation',
173 |      author, 'libsubmit', 'One line description of project.',
174 |      'Miscellaneous'),
175 | ]
176 | 
177 | 
178 | 
--------------------------------------------------------------------------------
/docs/devguide/changelog.rst:
--------------------------------------------------------------------------------
 1 | Changelog
 2 | =========
 3 | 
 4 | Libsubmit 0.4.1
 5 | ---------------
 6 | 
 7 | Released. June 18th, 2018.
 8 | This release folds in massive contributions from @annawoodard.
 9 | 
10 | New functionality
11 | ^^^^^^^^^^^^^^^^^
12 | 
13 | * Several code cleanups, doc improvements, and consistent naming
14 | 
15 | * All providers have the initialization and actual start of resources decoupled.
16 | 
17 | 
18 | 
19 | Libsubmit 0.4.0
20 | ---------------
21 | 
22 | Released. May 15th, 2018.
23 | This release folds in contributions from @ahayschi, @annawoodard, @yadudoc
24 | 
25 | New functionality
26 | ^^^^^^^^^^^^^^^^^
27 | 
28 | * Several enhancements and fixes to the AWS cloud provider (#44, #45, #50)
29 | 
30 | * Added support for python3.4
31 | 
32 | 
33 | Bug Fixes
34 | ^^^^^^^^^
35 | 
36 | * Condor jobs left in queue with X state at end of completion `issue#26 <https://github.com/Parsl/libsubmit/issues/26>`_
37 | 
38 | * Worker launches on Cori seem to fail from broken ENV `issue#27 <https://github.com/Parsl/libsubmit/issues/27>`_
39 | 
40 | * EC2 provider throwing an exception at initial run `issue#46 <https://github.com/Parsl/libsubmit/issues/46>`_
41 | 
--------------------------------------------------------------------------------
/docs/devguide/design.rst:
--------------------------------------------------------------------------------
 1 | Design
 2 | ======
 3 | 
 4 | Under construction.
 5 | 
--------------------------------------------------------------------------------
/docs/devguide/dev_docs.rst:
--------------------------------------------------------------------------------
 1 | Developer documentation
 2 | ***********************
 3 | 
 4 | .. automodule:: libsubmit
 5 |    :no-undoc-members:
 6 | 
 7 | .. autofunction:: set_stream_logger
 8 | 
 9 | .. autofunction:: set_file_logger
10 | 
11 | ExecutionProviders
12 | ------------------
13 | 
14 | An execution provider is an adapter to various types of execution resources. The providers abstract
15 | away the interfaces provided by various systems to request, monitor, and cancel compute resources.
16 | 
17 | .. autoclass:: libsubmit.execution_provider_base.ExecutionProvider
18 |    :members: __init__, submit, status, cancel, scaling_enabled, channels_required
19 | 
20 | 
21 | Slurm
22 | ^^^^^
23 | 
24 | .. autoclass:: libsubmit.providers.slurm.slurm.Slurm
25 |    :members: __init__, submit, status, cancel, _status, scaling_enabled, _write_submit_script, current_capacity, channels_required
26 | 
27 | Cobalt
28 | ^^^^^^
29 | 
30 | .. autoclass:: libsubmit.providers.cobalt.cobalt.Cobalt
31 |    :members: __init__, submit, status, cancel, _status, scaling_enabled, _write_submit_script, current_capacity, channels_required
32 | 
33 | Condor
34 | ^^^^^^
35 | 
36 | .. autoclass:: libsubmit.providers.condor.condor.Condor
37 |    :members: __init__, submit, status, cancel, _status, scaling_enabled, _write_submit_script, current_capacity, channels_required
38 | 
39 | Torque
40 | ^^^^^^
41 | 
42 | .. autoclass:: libsubmit.providers.torque.torque.Torque
43 |    :members: __init__, submit, status, cancel, _status, scaling_enabled, _write_submit_script, current_capacity, channels_required
44 | 
45 | Local
46 | ^^^^^
47 | 
48 | .. autoclass:: libsubmit.providers.local.local.Local
49 |    :members: __init__, submit, status, cancel, scaling_enabled, current_capacity, channels_required
50 | 
51 | AWS
52 | ^^^
53 | 
54 | .. autoclass:: libsubmit.providers.aws.aws.EC2Provider
55 |    :members: __init__, submit, status, cancel, scaling_enabled, current_capacity, channels_required, create_vpc, read_state_file, write_state_file, create_session, security_group
56 | 
57 | 
58 | 
59 | Channels
60 | --------
61 | 
62 | For certain resources, such as campus clusters or supercomputers at research laboratories, access
63 | may require authentication. For instance, some resources allow access to their job schedulers only from
64 | their login nodes, which require you to authenticate through SSH, GSI-SSH, and sometimes even
65 | two-factor authentication. Channels are simple abstractions that enable the ExecutionProvider component to talk
66 | to the resource managers of compute facilities. The simplest channel, *LocalChannel*, simply executes commands
67 | locally in a shell, while the *SshChannel* authenticates you to remote systems.
68 | 
69 | .. autoclass:: libsubmit.channels.channel_base.Channel
70 |    :members: execute_wait, script_dir, execute_no_wait, push_file, close
71 | 
72 | LocalChannel
73 | ^^^^^^^^^^^^
74 | .. autoclass:: libsubmit.channels.local.local.LocalChannel
75 |    :members: __init__, execute_wait, execute_no_wait, push_file, script_dir, close
76 | 
77 | SshChannel
78 | ^^^^^^^^^^^^
79 | .. autoclass:: libsubmit.channels.ssh.ssh.SshChannel
80 |    :members: __init__, execute_wait, execute_no_wait, push_file, pull_file, script_dir, close
81 | 
82 | SshILChannel
83 | ^^^^^^^^^^^^
84 | .. autoclass:: libsubmit.channels.ssh_il.ssh_il.SshILChannel
85 |    :members: __init__, execute_wait, execute_no_wait, push_file, pull_file, script_dir, close
86 | 
87 | 
88 | 
89 | Launchers
90 | ---------
91 | 
92 | Launchers are essentially wrappers for user-submitted scripts as they are submitted to
93 | a specific execution resource.
94 | 
95 | .. autofunction:: libsubmit.launchers.singleNodeLauncher
96 | 
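97 | Example
98 | -------
99 | 
100 | A minimal sketch of driving a channel directly (assuming a local shell and a
101 | writable ``/tmp/scripts``; the same calls work on any concrete Channel):
102 | 
103 | .. code-block:: python
104 | 
105 |     from libsubmit.channels.local.local import LocalChannel
106 | 
107 |     channel = LocalChannel(script_dir='/tmp/scripts')
108 |     retcode, stdout, stderr = channel.execute_wait('echo hello', walltime=10)
109 |     assert retcode == 0
110 | 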
--------------------------------------------------------------------------------
/docs/devguide/packaging.rst:
--------------------------------------------------------------------------------
 1 | Packaging
 2 | ---------
 3 | 
 4 | Currently packaging is managed by Yadu.
 5 | 
 6 | Here are the steps:
 7 | 
 8 | .. code:: bash
 9 | 
10 |    # Depending on permission all of the following might have to be run as root.
11 |    sudo su
12 | 
13 |    # Make sure to have twine installed
14 |    pip3 install twine
15 | 
16 |    # Create a source distribution
17 |    python3 setup.py sdist
18 | 
19 |    # Create a wheel package, which is a prebuilt package
20 |    python3 setup.py bdist_wheel
21 | 
22 |    # Upload the package with twine
23 |    # This step will ask for username and password for the PyPi account.
24 |    twine upload dist/*
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. libsubmit documentation master file, created by
 2 |    sphinx-quickstart on Mon Oct  2 13:39:42 2017.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to libsubmit's documentation!
 7 | =====================================
 8 | 
 9 | Libsubmit is responsible for managing execution resources with a Local Resource
10 | Manager (LRM). For instance, campus clusters and supercomputers generally have
11 | schedulers such as Slurm, PBS, or Condor. Clouds, on the other hand, have API
12 | interfaces that allow much more fine-grained composition of an execution environment.
13 | An execution provider abstracts these resources and provides a single uniform
14 | interface to them.
15 | 
16 | This module provides the following functionality:
17 | 
18 | 1. A standard interface to schedulers
19 | 2. Support for submitting, monitoring and cancelling jobs
20 | 3. A modular design, making it simple to add support for new resources.
21 | 4. Support for pushing files from the client side to resources.
22 | 
23 | 
24 | .. toctree::
25 | 
26 |    quick/quickstart
27 |    userguide/index
28 |    reference
29 |    devguide/changelog
30 |    devguide/dev_docs
31 |    devguide/packaging
32 | 
33 | 
34 | Indices and tables
35 | ==================
36 | 
37 | * :ref:`genindex`
38 | * :ref:`modindex`
39 | * :ref:`search`
--------------------------------------------------------------------------------
/docs/libsubmit_art/README.txt:
--------------------------------------------------------------------------------
1 | Editable diagrams embedded in this google doc:
2 | https://docs.google.com/document/d/193LBq7H-dtxrYUER7oZqs0ZlOcLa2fGFPGLOHRS1c5U/edit?usp=sharing
3 | 
--------------------------------------------------------------------------------
/docs/libsubmit_art/multi_node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/docs/libsubmit_art/multi_node.png
--------------------------------------------------------------------------------
/docs/libsubmit_art/multi_worker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/docs/libsubmit_art/multi_worker.png
--------------------------------------------------------------------------------
/docs/libsubmit_art/single_worker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/docs/libsubmit_art/single_worker.png
--------------------------------------------------------------------------------
/docs/libsubmit_art/single_worker.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/docs/libsubmit_art/single_worker.svg
--------------------------------------------------------------------------------
/docs/quick/quickstart.rst:
--------------------------------------------------------------------------------
 1 | Quickstart
 2 | ==========
 3 | 
 4 | Libsubmit is an adapter to a variety of computational resources such as clouds, campus clusters, and supercomputers. This Python module is designed to simplify and expose
 5 | a uniform interface to a seemingly diverse class of resource schedulers. This library
 6 | originated in Parsl, the parallel scripting library, and is designed to bring dynamic
 7 | resource management capabilities to it.
 8 | 
 9 | 
10 | Installing
11 | ----------
12 | 
13 | Libsubmit is now available on PyPI, but first make sure you have Python 3.5+:
14 | 
15 | >>> python3 --version
16 | 
17 | 
18 | Installing on Linux
19 | ^^^^^^^^^^^^^^^^^^^
20 | 
21 | 1. Install Libsubmit::
22 | 
23 |    $ python3 -m pip install libsubmit
24 | 
25 | 
26 | 2. Libsubmit supports a variety of computational resources via specific libraries. You might only need a subset of these, which can be installed by specifying the resource names::
27 | 
28 |    $ python3 -m pip install libsubmit[<resource1>,<resource2>,<resource3>]
29 | 
30 | 
31 | Installing on Mac OS
32 | ^^^^^^^^^^^^^^^^^^^^
33 | 
34 | 1. Install Conda and set up Python 3.6 following the instructions `here `_::
35 | 
36 |    $ conda create --name libsubmit_py36 python=3.6
37 |    $ source activate libsubmit_py36
38 | 
39 | 2. Install Libsubmit::
40 | 
41 |    $ python3 -m pip install libsubmit[<optional_packages>]
42 | 
43 | 
44 | For Developers
45 | --------------
46 | 
47 | 1. Download Libsubmit::
48 | 
49 |    $ git clone https://github.com/Parsl/libsubmit
50 | 
51 | 2. Install::
52 | 
53 |    $ cd libsubmit
54 |    $ python3 setup.py install
55 | 
56 | 3. 
Use Libsubmit! 57 | 58 | Requirements 59 | ============ 60 | 61 | Libsubmit requires the following : 62 | 63 | * Python 3.5+ 64 | * paramiko 65 | * ipyparallel 66 | * boto3 - for AWS 67 | * azure, haikunator - for Azure 68 | * python-novaclient - for jetstream 69 | 70 | For testing: 71 | 72 | * nose 73 | * coverage 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /docs/reference.rst: -------------------------------------------------------------------------------- 1 | Reference guide 2 | *************** 3 | 4 | .. autosummary:: 5 | :toctree: stubs 6 | :nosignatures: 7 | 8 | libsubmit.channels.local.local.LocalChannel 9 | libsubmit.channels.ssh.ssh.SshChannel 10 | libsubmit.providers.aws.aws.EC2Provider 11 | libsubmit.providers.azureProvider.azureProvider.AzureProvider 12 | libsubmit.providers.cobalt.cobalt.Cobalt 13 | libsubmit.providers.condor.condor.Condor 14 | libsubmit.providers.googlecloud.googlecloud.GoogleCloud 15 | libsubmit.providers.gridEngine.gridEngine.GridEngine 16 | libsubmit.providers.jetstream.jetstream.Jetstream 17 | libsubmit.providers.local.local.Local 18 | libsubmit.providers.sge.sge.GridEngine 19 | libsubmit.providers.slurm.slurm.Slurm 20 | libsubmit.providers.torque.torque.Torque 21 | libsubmit.providers.provider_base.ExecutionProvider 22 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.aws.aws.EC2Provider.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.aws.aws.EC2Provider 2 | ======================================= 3 | 4 | .. currentmodule:: libsubmit.providers.aws.aws 5 | 6 | .. autoclass:: EC2Provider 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~EC2Provider.__init__ 17 | ~EC2Provider.cancel 18 | ~EC2Provider.config_route_table 19 | ~EC2Provider.create_session 20 | ~EC2Provider.create_vpc 21 | ~EC2Provider.get_instance_state 22 | ~EC2Provider.goodbye 23 | ~EC2Provider.initialize_boto_client 24 | ~EC2Provider.ipyparallel_configuration 25 | ~EC2Provider.read_state_file 26 | ~EC2Provider.security_group 27 | ~EC2Provider.show_summary 28 | ~EC2Provider.shut_down_instance 29 | ~EC2Provider.spin_up_instance 30 | ~EC2Provider.status 31 | ~EC2Provider.submit 32 | ~EC2Provider.teardown 33 | ~EC2Provider.write_state_file 34 | ~EC2Provider.xstr 35 | 36 | 37 | 38 | 39 | 40 | .. rubric:: Attributes 41 | 42 | .. autosummary:: 43 | 44 | ~EC2Provider.channels_required 45 | ~EC2Provider.current_capacity 46 | ~EC2Provider.scaling_enabled 47 | 48 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.cobalt.cobalt.Cobalt.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.cobalt.cobalt.Cobalt 2 | ======================================== 3 | 4 | .. currentmodule:: libsubmit.providers.cobalt.cobalt 5 | 6 | .. autoclass:: Cobalt 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~Cobalt.__init__ 17 | ~Cobalt.cancel 18 | ~Cobalt.status 19 | ~Cobalt.submit 20 | 21 | 22 | 23 | 24 | 25 | .. rubric:: Attributes 26 | 27 | .. 
autosummary:: 28 | 29 | ~Cobalt.channels_required 30 | ~Cobalt.current_capacity 31 | ~Cobalt.scaling_enabled 32 | 33 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.condor.condor.Condor.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.condor.condor.Condor 2 | ======================================== 3 | 4 | .. currentmodule:: libsubmit.providers.condor.condor 5 | 6 | .. autoclass:: Condor 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~Condor.__init__ 17 | ~Condor.cancel 18 | ~Condor.status 19 | ~Condor.submit 20 | 21 | 22 | 23 | 24 | 25 | .. rubric:: Attributes 26 | 27 | .. autosummary:: 28 | 29 | ~Condor.channels_required 30 | ~Condor.current_capacity 31 | ~Condor.scaling_enabled 32 | 33 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.googlecloud.googlecloud.GoogleCloud.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.googlecloud.googlecloud.GoogleCloud 2 | ======================================================= 3 | 4 | .. currentmodule:: libsubmit.providers.googlecloud.googlecloud 5 | 6 | .. autoclass:: GoogleCloud 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~GoogleCloud.__init__ 17 | ~GoogleCloud.bye 18 | ~GoogleCloud.cancel 19 | ~GoogleCloud.create_instance 20 | ~GoogleCloud.delete_instance 21 | ~GoogleCloud.get_correct_zone 22 | ~GoogleCloud.status 23 | ~GoogleCloud.submit 24 | 25 | 26 | 27 | 28 | 29 | .. rubric:: Attributes 30 | 31 | .. autosummary:: 32 | 33 | ~GoogleCloud.channels_required 34 | ~GoogleCloud.current_capacity 35 | ~GoogleCloud.scaling_enabled 36 | 37 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.gridEngine.gridEngine.GridEngine.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.gridEngine.gridEngine.GridEngine 2 | ==================================================== 3 | 4 | .. currentmodule:: libsubmit.providers.gridEngine.gridEngine 5 | 6 | .. autoclass:: GridEngine 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~GridEngine.__init__ 17 | ~GridEngine.cancel 18 | ~GridEngine.execute_wait 19 | ~GridEngine.get_configs 20 | ~GridEngine.status 21 | ~GridEngine.submit 22 | 23 | 24 | 25 | 26 | 27 | .. rubric:: Attributes 28 | 29 | .. autosummary:: 30 | 31 | ~GridEngine.channels_required 32 | ~GridEngine.current_capacity 33 | ~GridEngine.scaling_enabled 34 | 35 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.jetstream.jetstream.Jetstream.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.jetstream.jetstream.Jetstream 2 | ================================================= 3 | 4 | .. currentmodule:: libsubmit.providers.jetstream.jetstream 5 | 6 | .. autoclass:: Jetstream 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. 
autosummary:: 15 | 16 | ~Jetstream.__init__ 17 | ~Jetstream.scale_in 18 | ~Jetstream.scale_out 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.local.local.Local.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.local.local.Local 2 | ===================================== 3 | 4 | .. currentmodule:: libsubmit.providers.local.local 5 | 6 | .. autoclass:: Local 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~Local.__init__ 17 | ~Local.cancel 18 | ~Local.status 19 | ~Local.submit 20 | 21 | 22 | 23 | 24 | 25 | .. rubric:: Attributes 26 | 27 | .. autosummary:: 28 | 29 | ~Local.channels_required 30 | ~Local.current_capacity 31 | ~Local.scaling_enabled 32 | 33 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.provider_base.ExecutionProvider.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.provider\_base.ExecutionProvider 2 | ==================================================== 3 | 4 | .. currentmodule:: libsubmit.providers.provider_base 5 | 6 | .. autoclass:: ExecutionProvider 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~ExecutionProvider.cancel 17 | ~ExecutionProvider.status 18 | ~ExecutionProvider.submit 19 | 20 | 21 | 22 | 23 | 24 | .. rubric:: Attributes 25 | 26 | .. autosummary:: 27 | 28 | ~ExecutionProvider.channels_required 29 | ~ExecutionProvider.scaling_enabled 30 | 31 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.slurm.slurm.Slurm.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.slurm.slurm.Slurm 2 | ===================================== 3 | 4 | .. currentmodule:: libsubmit.providers.slurm.slurm 5 | 6 | .. autoclass:: Slurm 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~Slurm.__init__ 17 | ~Slurm.cancel 18 | ~Slurm.execute_wait 19 | ~Slurm.get_configs 20 | ~Slurm.status 21 | ~Slurm.submit 22 | 23 | 24 | 25 | 26 | 27 | .. rubric:: Attributes 28 | 29 | .. autosummary:: 30 | 31 | ~Slurm.channels_required 32 | ~Slurm.current_capacity 33 | ~Slurm.scaling_enabled 34 | 35 | -------------------------------------------------------------------------------- /docs/stubs/libsubmit.providers.torque.torque.Torque.rst: -------------------------------------------------------------------------------- 1 | libsubmit.providers.torque.torque.Torque 2 | ======================================== 3 | 4 | .. currentmodule:: libsubmit.providers.torque.torque 5 | 6 | .. autoclass:: Torque 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~Torque.__init__ 17 | ~Torque.cancel 18 | ~Torque.status 19 | ~Torque.submit 20 | 21 | 22 | 23 | 24 | 25 | .. rubric:: Attributes 26 | 27 | .. 
autosummary::
28 | 
29 |       ~Torque.channels_required
30 |       ~Torque.current_capacity
31 |       ~Torque.scaling_enabled
32 | 
33 | 
--------------------------------------------------------------------------------
/docs/userguide/configuring.rst:
--------------------------------------------------------------------------------
 1 | Configuration
 2 | =============
 3 | 
 4 | The primary mode by which you interact with libsubmit is by instantiating an ExecutionProvider
 5 | with a configuration data structure, and optional Channel objects if the ExecutionProvider requires them.
 6 | 
 7 | The configuration data structure expected by an ExecutionProvider, as well as provider-specific options, is
 8 | described below.
 9 | 
10 | The config structure looks like this:
11 | 
12 | .. code-block:: python
13 | 
14 |     config = { "poolName" : <string>,
15 |                "provider" : <string>,
16 |                "scriptDir" : <string>,
17 |                "minBlocks" : <int>,
18 |                "maxBlocks" : <int>,
19 |                "initBlocks" : <int>,
20 |                "block" : { # Specify the shape of the block
21 |                    "nodes" : <int>,
22 |                    "taskBlocks" : <int>,
23 |                    "walltime" : <string>,
24 |                    "options" : { # These are provider specific options
25 |                        "partition" : <string>,
26 |                        "account" : <string>,
27 |                        "overrides" : <string>
28 |                    }
29 |                }
30 |     }
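31 | 
32 | For example, a filled-in config for a hypothetical Slurm allocation (every
33 | value below is an illustrative placeholder):
34 | 
35 | .. code-block:: python
36 | 
37 |     config = { "poolName" : "debug_pool",
38 |                "provider" : "slurm",
39 |                "scriptDir" : ".scripts",
40 |                "minBlocks" : 0,
41 |                "maxBlocks" : 2,
42 |                "initBlocks" : 1,
43 |                "block" : { "nodes" : 1,
44 |                            "taskBlocks" : 4,
45 |                            "walltime" : "00:30:00",
46 |                            "options" : { "partition" : "debug",
47 |                                          "account" : "my_allocation",
48 |                                          "overrides" : "#SBATCH --constraint=haswell"
49 |                            }
50 |                }
51 |     }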
--------------------------------------------------------------------------------
/docs/userguide/index.rst:
--------------------------------------------------------------------------------
1 | User guide
2 | ==========
3 | 
4 | .. toctree::
5 |    :maxdepth: 5
6 | 
7 |    overview
8 |    configuring
--------------------------------------------------------------------------------
/docs/userguide/overview.rst:
--------------------------------------------------------------------------------
1 | Overview
2 | ========
3 | 
4 | Under construction. Please refer to the developer documentation while this section
5 | is being built.
6 | 
--------------------------------------------------------------------------------
/libsubmit/__init__.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Libsubmit
  3 | =========
  4 | 
  5 | Uniform interface to a diverse and multi-lingual set of computational resources.
  6 | 
  7 | '''
  8 | import logging
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | from libsubmit.version import VERSION
 12 | from libsubmit.providers import LocalProvider
 13 | 
 14 | from libsubmit.providers import CobaltProvider
 15 | from libsubmit.providers import CondorProvider
 16 | from libsubmit.providers import GridEngineProvider
 17 | from libsubmit.providers import SlurmProvider
 18 | from libsubmit.providers import TorqueProvider
 19 | 
 20 | from libsubmit.providers import AWSProvider
 21 | from libsubmit.providers import AzureProvider
 22 | from libsubmit.providers import GoogleCloudProvider
 23 | from libsubmit.providers import JetstreamProvider
 24 | 
 25 | from libsubmit.providers import KubernetesProvider
 26 | 
 27 | from libsubmit.channels import SSHChannel
 28 | from libsubmit.channels import SSHInteractiveLoginChannel
 29 | from libsubmit.channels import LocalChannel
 30 | 
 31 | from libsubmit.launchers import SimpleLauncher, SingleNodeLauncher, SrunLauncher, \
 32 |     AprunLauncher, SrunMPILauncher
 33 | 
 34 | 
 35 | __author__ = 'Yadu Nand Babuji'
 36 | __version__ = VERSION
 37 | 
 38 | __all__ = ['LocalProvider',
 39 |            'CobaltProvider',
 40 |            'CondorProvider',
 41 |            'GridEngineProvider',
 42 |            'SlurmProvider',
 43 |            'TorqueProvider',
 44 |            'AWSProvider',
 45 |            'AzureProvider',
 46 |            'GoogleCloudProvider',
 47 |            'JetstreamProvider',
 48 |            'KubernetesProvider',
 49 |            'LocalChannel',
 50 |            'SSHChannel',
 51 |            'SSHInteractiveLoginChannel',
 52 |            'SimpleLauncher',
 53 |            'SingleNodeLauncher',
 54 |            'SrunLauncher',
 55 |            'AprunLauncher',
 56 |            'SrunMPILauncher']
 57 | 
 58 | 
 59 | 
 60 | def set_stream_logger(name='libsubmit', level=logging.DEBUG, format_string=None):
 61 |     '''
 62 |     Add a stream log handler
 63 | 
 64 |     Args:
 65 |          - name (string) : Set the logger name.
 66 |          - level (logging.LEVEL) : Set to logging.DEBUG by default.
 67 |          - format_string (string) : Set to None by default.
 68 | 
 69 |     Returns:
 70 |          - None
 71 |     '''
 72 | 
 73 |     if format_string is None:
 74 |         format_string = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
 75 | 
 76 |     logger = logging.getLogger(name)
 77 |     logger.setLevel(level)
 78 |     handler = logging.StreamHandler()
 79 |     handler.setLevel(level)
 80 |     formatter = logging.Formatter(format_string)
 81 |     handler.setFormatter(formatter)
 82 |     logger.addHandler(handler)
 83 | 
 84 | 
 85 | def set_file_logger(filename, name='libsubmit', level=logging.DEBUG, format_string=None):
 86 |     ''' Add a file log handler
 87 | 
 88 |     Args:
 89 |         - filename (string): Name of the file to write logs to
 90 |         - name (string): Logger name
 91 |         - level (logging.LEVEL): Set the logging level.
 92 |         - format_string (string): Set the format string
 93 | 
 94 |     Returns:
 95 |         - None
 96 |     '''
 97 | 
 98 |     if format_string is None:
 99 |         format_string = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
100 | 
101 |     logger = logging.getLogger(name)
102 |     logger.setLevel(level)
103 |     handler = logging.FileHandler(filename)
104 |     handler.setLevel(level)
105 |     formatter = logging.Formatter(format_string)
106 |     handler.setFormatter(formatter)
107 |     logger.addHandler(handler)
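108 | 
109 | 
110 | # Usage sketch (a hypothetical caller; nothing here runs on import):
111 | #
112 | #   import logging
113 | #   import libsubmit
114 | #   libsubmit.set_stream_logger(level=logging.INFO)
115 | #   libsubmit.set_file_logger('/tmp/libsubmit.log', level=logging.DEBUG)
116 | 
117 | 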
118 | class NullHandler(logging.Handler):
119 |     ''' Set up default logging to /dev/null, since this is a library.
120 | 
121 |     '''
122 | 
123 |     def emit(self, record):
124 |         pass
125 | 
126 | 
127 | logging.getLogger('libsubmit').addHandler(NullHandler())
128 | 
--------------------------------------------------------------------------------
/libsubmit/channels/__init__.py:
--------------------------------------------------------------------------------
1 | from libsubmit.channels.ssh.ssh import SSHChannel
2 | from libsubmit.channels.local.local import LocalChannel
3 | from libsubmit.channels.ssh_il.ssh_il import SSHInteractiveLoginChannel
4 | 
5 | __all__ = ['SSHChannel', 'LocalChannel', 'SSHInteractiveLoginChannel']
--------------------------------------------------------------------------------
/libsubmit/channels/channel_base.py:
--------------------------------------------------------------------------------
 1 | from abc import ABCMeta, abstractmethod, abstractproperty
 2 | 
 3 | 
 4 | class Channel(metaclass=ABCMeta):
 5 |     """ Define the interface to all channels. Channels are usually called via the execute_wait function.
 6 |     For channels that execute remotely, a push_file function allows you to copy over files.
 7 | 
 8 |     .. code:: python
 9 | 
10 |                            +------------------
11 |                            |
12 |           cmd, wtime    ------->|  execute_wait
13 |           (ec, stdout, stderr)<-|---+
14 |                            |
15 |           cmd, wtime    ------->|  execute_no_wait
16 |           (ec, stdout, stderr)<-|---+
17 |                            |
18 |           src, dst_dir  ------->|  push_file
19 |              dst_path  <--------|----+
20 |                            |
21 |           dst_script_dir <------|  script_dir
22 |                            |
23 |                            +-------------------
24 | 
25 |     """
26 | 
27 |     @abstractmethod
28 |     def execute_wait(self, cmd, walltime, envs={}, *args, **kwargs):
29 |         ''' Executes the cmd, with a defined walltime.
30 | 
31 |         Args:
32 |             - cmd (string): Command string to execute over the channel
33 |             - walltime (int) : Timeout in seconds
34 | 
35 |         KWargs:
36 |             - envs (dict) : Environment variables to push to the remote side
37 | 
38 |         Returns:
39 |             - (exit_code, stdout, stderr) (int, string, string)
40 |         '''
41 |         pass
42 | 
43 |     @abstractproperty
44 |     def script_dir(self):
45 |         ''' This is a property. Returns the directory assigned for storing all internal scripts such as
46 |         scheduler submit scripts. This is usually where error logs from the scheduler would reside on the
47 |         channel destination side.
48 | 
49 |         Args:
50 |             - None
51 | 
52 |         Returns:
53 |             - Channel script dir
54 |         '''
55 |         pass
56 | 
57 |     @abstractmethod
58 |     def execute_no_wait(self, cmd, walltime, envs={}, *args, **kwargs):
59 |         ''' Optional. This is infrequently used.
60 | 
61 |         Args:
62 |             - cmd (string): Command string to execute over the channel
63 |             - walltime (int) : Timeout in seconds
64 | 
65 |         KWargs:
66 |             - envs (dict) : Environment variables to push to the remote side
67 | 
68 |         Returns:
69 |             - (exit_code(None), stdout, stderr) (int, file-like, file-like)
70 |         '''
71 |         pass
72 | 
73 |     @abstractmethod
74 |     def push_file(self, source, dest_dir):
75 |         ''' Channel will take care of moving the file from source to the destination
76 |         directory
77 | 
78 |         Args:
79 |             source (string) : Full filepath of the file to be moved
80 |             dest_dir (string) : Absolute path of the directory to move to
81 | 
82 |         Returns:
83 |             destination_path (string)
84 |         '''
85 |         pass
86 | 
87 |     @abstractmethod
88 |     def close(self):
89 |         ''' Closes the channel. Clean out any auth credentials.
90 | 
91 |         Args:
92 |             None
93 | 
94 |         Returns:
95 |             Bool
96 | 
97 |         '''
98 |         pass
99 | 
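100 | 
101 | # Sketch of a minimal concrete Channel (hypothetical, illustration only;
102 | # a real subclass must implement every abstract method and property above):
103 | #
104 | #   class EchoChannel(Channel):
105 | #       def execute_wait(self, cmd, walltime, envs={}, *args, **kwargs):
106 | #           return (0, cmd, '')   # pretend every command echoes back cleanly
107 | #       # ... plus execute_no_wait, push_file, close and script_dir ...
108 | 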
--------------------------------------------------------------------------------
/libsubmit/channels/errors.py:
--------------------------------------------------------------------------------
  1 | ''' Exceptions raised by channels.
  2 | '''
  3 | 
  4 | 
  5 | class ChannelError(Exception):
  6 |     """ Base class for all channel exceptions.
  7 | 
  8 |     Only to be invoked when a more specific error is not available.
  9 |     """
 10 |     def __repr__(self):
 11 |         return "Hostname:{0}, Reason:{1}".format(self.hostname, self.reason)
 12 | 
 13 |     def __str__(self):
 14 |         return self.__repr__()
 15 | 
 16 | 
 17 | class BadHostKeyException(ChannelError):
 18 |     ''' SSH channel could not be created since server's host keys could not
 19 |     be verified
 20 | 
 21 |     Contains:
 22 |     reason(string)
 23 |     e (paramiko exception object)
 24 |     hostname (string)
 25 |     '''
 26 | 
 27 |     def __init__(self, e, hostname):
 28 |         super().__init__()
 29 |         self.reason = "SSH channel could not be created since server's host keys could not be verified"
 30 |         self.hostname = hostname
 31 |         self.e = e
 32 | 
 33 | 
 34 | class BadScriptPath(ChannelError):
 35 |     ''' Raised when the remote script directory is inaccessible or missing.
 36 | 
 37 |     Contains:
 38 |     reason(string)
 39 |     e (paramiko exception object)
 40 |     hostname (string)
 41 |     '''
 42 | 
 43 |     def __init__(self, e, hostname):
 44 |         super().__init__()
 45 |         self.reason = "Inaccessible remote script dir. Specify script_dir"
 46 |         self.hostname = hostname
 47 |         self.e = e
 48 | 
 49 | 
 50 | class BadPermsScriptPath(ChannelError):
 51 |     ''' User does not have permissions to access the script_dir on the remote site
 52 | 
 53 |     Contains:
 54 |     reason(string)
 55 |     e (paramiko exception object)
 56 |     hostname (string)
 57 |     '''
 58 | 
 59 |     def __init__(self, e, hostname):
 60 |         super().__init__()
 61 |         self.reason = "User does not have permissions to access the script_dir"
 62 |         self.hostname = hostname
 63 |         self.e = e
 64 | 
 65 | 
 66 | class FileExists(ChannelError):
 67 |     ''' Push or pull of a file over the channel fails because a file of that name
 68 |     already exists on the destination.
 69 | 
 70 |     Contains:
 71 |     reason(string)
 72 |     e (paramiko exception object)
 73 |     hostname (string)
 74 |     '''
 75 | 
 76 |     def __init__(self, e, hostname, filename=None):
 77 |         super().__init__()
 78 |         self.reason = "File name collision in channel transport phase: {}".format(filename)
 79 |         self.hostname = hostname
 80 |         self.e = e
 81 | 
 82 | 
 83 | class AuthException(ChannelError):
 84 |     ''' Raised when authentication to the remote server fails.
 85 | 
 86 |     Contains:
 87 |     reason(string)
 88 |     e (paramiko exception object)
 89 |     hostname (string)
 90 |     '''
 91 | 
 92 |     def __init__(self, e, hostname):
 93 |         super().__init__()
 94 |         self.reason = "Authentication to remote server failed"
 95 |         self.hostname = hostname
 96 |         self.e = e
 97 | 
 98 | 
 99 | class SSHException(ChannelError):
100 |     ''' Raised when any other error occurs while connecting or establishing an SSH session.
101 | 
102 |     Contains:
103 |     reason(string)
104 |     e (paramiko exception object)
105 |     hostname (string)
106 |     '''
107 | 
108 |     def __init__(self, e, hostname):
109 |         super().__init__()
110 |         self.reason = "Error connecting or establishing an SSH session"
111 |         self.hostname = hostname
112 |         self.e = e
113 | 
114 | 
115 | class FileCopyException(ChannelError):
116 |     ''' File copy operation failed
117 | 
118 |     Contains:
119 |     reason(string)
120 |     e (paramiko exception object)
121 |     hostname (string)
122 |     '''
123 | 
124 |     def __init__(self, e, hostname):
125 |         super().__init__()
126 |         self.reason = "File copy failed due to {0}".format(e)
127 |         self.hostname = hostname
128 |         self.e = e
129 | 
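130 | 
131 | # Usage sketch: channel operations raise subclasses of ChannelError, so a
132 | # caller can catch the base class (the host and paths here are hypothetical):
133 | #
134 | #   try:
135 | #       channel.push_file('/tmp/job.sh', '/home/user/scripts')
136 | #   except ChannelError as e:
137 | #       print(e)   # -> "Hostname:<host>, Reason:<reason>"
138 | 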
--------------------------------------------------------------------------------
/libsubmit/channels/local/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/channels/local/__init__.py
--------------------------------------------------------------------------------
/libsubmit/channels/local/local.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import errno
  3 | import logging
  4 | import os
  5 | import shutil
  6 | import subprocess
  7 | 
  8 | from libsubmit.channels.channel_base import Channel
  9 | from libsubmit.channels.errors import *
 10 | from libsubmit.utils import RepresentationMixin
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | class LocalChannel(Channel, RepresentationMixin):
 16 |     ''' This is not even really a channel: opening a local shell is cheap
 17 |     and done so infrequently that no persistent channel is needed.
 18 |     '''
 19 | 
 20 |     def __init__(self, userhome=".", envs={}, script_dir="./.scripts", **kwargs):
 21 |         ''' Initialize the local channel. script_dir is required but set to a default.
 22 | 
 23 |         KwArgs:
 24 |             - userhome (string): (default='.') This is provided as a way to override and set a specific userhome
 25 |             - envs (dict) : A dictionary of env variables to be set when launching the shell
 26 |             - script_dir (string): (default="./.scripts") Directory to place scripts
 27 |         '''
 28 |         self.userhome = os.path.abspath(userhome)
 29 |         self.hostname = "localhost"
 30 |         self.envs = envs
 31 |         local_env = os.environ.copy()
 32 |         self._envs = copy.deepcopy(local_env)
 33 |         self._envs.update(envs)
 34 |         self._script_dir = os.path.abspath(script_dir)
 35 |         try:
 36 |             os.makedirs(self._script_dir)
 37 |         except OSError as e:
 38 |             if e.errno != errno.EEXIST:
 39 |                 logger.error("Failed to create script_dir : {0}".format(script_dir))
 40 |                 raise BadScriptPath(e, self.hostname)
 41 | 
 42 |     @property
 43 |     def script_dir(self):
 44 |         return self._script_dir
 45 | 
 46 |     def execute_wait(self, cmd, walltime, envs={}):
 47 |         ''' Synchronously execute a commandline string on the shell.
 48 | 
 49 |         Args:
 50 |             - cmd (string) : Commandline string to execute
 51 |             - walltime (int) : walltime in seconds, this is not really used now.
 52 | 
 53 |         Kwargs:
 54 |             - envs (dict) : Dictionary of env variables. This will be used
 55 |               to override the envs set at channel initialization.
 56 | 
 57 |         Returns:
 58 |             - retcode : Return code from the execution, -1 on fail
 59 |             - stdout  : stdout string
 60 |             - stderr  : stderr string
 61 | 
 62 |         Raises:
 63 |             None.
 64 |         '''
 65 |         retcode = -1
 66 |         stdout = None
 67 |         stderr = None
 68 | 
 69 |         current_env = copy.deepcopy(self._envs)
 70 |         current_env.update(envs)
 71 | 
 72 |         try:
 73 |             proc = subprocess.Popen(
 74 |                 cmd,
 75 |                 stdout=subprocess.PIPE,
 76 |                 stderr=subprocess.PIPE,
 77 |                 cwd=self.userhome,
 78 |                 env=current_env,
 79 |                 shell=True
 80 |             )
 81 |             proc.wait(timeout=walltime)
 82 |             stdout = proc.stdout.read()
 83 |             stderr = proc.stderr.read()
 84 |             retcode = proc.returncode
 85 | 
 86 |         except Exception as e:
 87 |             print("Caught exception : {0}".format(e))
 88 |             logger.warn("Execution of command [%s] failed due to \n %s ", cmd, e)
 89 |             # Set retcode to non-zero so that this can be handled in the provider.
 90 |             if retcode == 0:
 91 |                 retcode = -1
 92 |             return (retcode, None, None)
 93 | 
 94 |         return (retcode, stdout.decode("utf-8"), stderr.decode("utf-8"))
 95 | 
 96 |     def execute_no_wait(self, cmd, walltime, envs={}):
 97 |         ''' Asynchronously execute a commandline string on the shell.
 98 | 
 99 |         Args:
100 |             - cmd (string) : Commandline string to execute
101 |             - walltime (int) : walltime in seconds, this is not really used now.
102 | 
103 |         Returns:
104 | 
105 |             - pid : Process id of the spawned process
106 |             - proc : The subprocess.Popen object for the spawned process
107 | 
108 |         Raises:
109 |             None.
110 |         '''
111 |         current_env = copy.deepcopy(self._envs)
112 |         current_env.update(envs)
113 | 
114 |         try:
115 |             proc = subprocess.Popen(
116 |                 cmd,
117 |                 stdout=subprocess.PIPE,
118 |                 stderr=subprocess.PIPE,
119 |                 cwd=self.userhome,
120 |                 env=current_env,
121 |                 shell=True,
122 |                 preexec_fn=os.setpgrp
123 |             )
124 |             pid = proc.pid
125 | 
126 |         except Exception as e:
127 |             print("Caught exception : {0}".format(e))
128 |             logger.warn("Execution of command [%s] failed due to \n %s ", cmd, e)
129 |             raise  # re-raise; otherwise pid would be unbound below
130 | 
131 |         return pid, proc
132 | 
133 |     def push_file(self, source, dest_dir):
134 |         ''' If the source file's dirpath is the same as dest_dir, a copy
135 |         is not necessary, and nothing is done. Otherwise a copy is made.
136 | 
137 |         Args:
138 |             - source (string) : Path to the source file
139 |             - dest_dir (string) : Path to the directory to which the file is to be copied
140 | 
141 |         Returns:
142 |             - destination_path (String) : Absolute path of the destination file
143 | 
144 |         Raises:
145 |             - FileCopyException : If file copy failed.
146 |         '''
147 | 
148 |         local_dest = dest_dir + '/' + os.path.basename(source)
149 | 
150 |         # Only attempt to copy if the target dir and source dir are different
151 |         if os.path.dirname(source) != dest_dir:
152 |             try:
153 |                 shutil.copyfile(source, local_dest)
154 |                 os.chmod(local_dest, 0o777)
155 | 
156 |             except OSError as e:
157 |                 raise FileCopyException(e, self.hostname)
158 | 
159 |         return local_dest
160 | 
161 |     def close(self):
162 |         ''' There's nothing to close here, and this really doesn't do anything.
163 | 
164 |         Returns:
165 |             - False, because it really did not "close" this channel.
166 |         '''
167 |         return False
168 | 
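169 | 
170 | # Usage sketch for both execution modes (assuming './.scripts' is writable):
171 | #
172 | #   ch = LocalChannel()
173 | #   rc, out, err = ch.execute_wait('echo hello', walltime=5)   # blocks, returns strings
174 | #   pid, proc = ch.execute_no_wait('sleep 60', walltime=5)     # returns immediately
175 | #   proc.poll()   # None while the command is still running
176 | 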
166 | ''' 167 | return False 168 | -------------------------------------------------------------------------------- /libsubmit/channels/ssh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/channels/ssh/__init__.py -------------------------------------------------------------------------------- /libsubmit/channels/ssh/ssh.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import getpass 3 | import logging 4 | import os 5 | 6 | import paramiko 7 | from libsubmit.channels.errors import * 8 | from libsubmit.utils import RepresentationMixin 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class SSHChannel(RepresentationMixin): 14 | ''' SSH persistent channel. This enables remote execution on sites 15 | accessible via ssh. It is assumed that the user has setup host keys 16 | so as to ssh to the remote host. Which goes to say that the following 17 | test on the commandline should work : 18 | 19 | >>> ssh @ 20 | 21 | ''' 22 | 23 | def __init__(self, hostname, username=None, password=None, script_dir=None, envs=None, **kwargs): 24 | ''' Initialize a persistent connection to the remote system. 25 | We should know at this point whether ssh connectivity is possible 26 | 27 | Args: 28 | - hostname (String) : Hostname 29 | 30 | KWargs: 31 | - username (string) : Username on remote system 32 | - password (string) : Password for remote system 33 | - script_dir (string) : Full path to a script dir where 34 | generated scripts could be sent to. 35 | - envs (dict) : A dictionary of environment variables to be set when executing commands 36 | 37 | Raises: 38 | ''' 39 | 40 | self.hostname = hostname 41 | self.username = username 42 | self.password = password 43 | self.kwargs = kwargs 44 | 45 | self.ssh_client = paramiko.SSHClient() 46 | self.ssh_client.load_system_host_keys() 47 | self.ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 48 | 49 | if script_dir: 50 | self._script_dir = script_dir 51 | else: 52 | self._script_dir = "/tmp/{0}/scripts/".format(getpass.getuser()) 53 | 54 | self.envs = {} 55 | if envs is not None: 56 | self.envs = envs 57 | 58 | try: 59 | self.ssh_client.connect( 60 | hostname, 61 | username=username, 62 | password=password, 63 | allow_agent=True 64 | ) 65 | t = self.ssh_client.get_transport() 66 | self.sftp_client = paramiko.SFTPClient.from_transport(t) 67 | 68 | except paramiko.BadHostKeyException as e: 69 | raise BadHostKeyException(e, self.hostname) 70 | 71 | except paramiko.AuthenticationException as e: 72 | raise AuthException(e, self.hostname) 73 | 74 | except paramiko.SSHException as e: 75 | raise SSHException(e, self.hostname) 76 | 77 | except Exception as e: 78 | raise SSHException(e, self.hostname) 79 | 80 | @property 81 | def script_dir(self): 82 | return self._script_dir 83 | 84 | def prepend_envs(self, cmd, env={}): 85 | env.update(self.envs) 86 | 87 | if len(env.keys()) > 0: 88 | env_vars = ' '.join(['{}={}'.format(key, value) for key, value in env.items()]) 89 | return 'env {0} {1}'.format(env_vars, cmd) 90 | return cmd 91 | 92 | def execute_wait(self, cmd, walltime=2, envs={}): 93 | ''' Synchronously execute a commandline string on the shell. 94 | 95 | Args: 96 | - cmd (string) : Commandline string to execute 97 | - walltime (int) : walltime in seconds, this is not really used now. 
98 | 99 | Kwargs: 100 | - envs (dict) : Dictionary of env variables 101 | 102 | Returns: 103 | - retcode : Return code from the execution, -1 on fail 104 | - stdout : stdout string 105 | - stderr : stderr string 106 | 107 | Raises: 108 | None. 109 | ''' 110 | 111 | # Execute the command 112 | stdin, stdout, stderr = self.ssh_client.exec_command( 113 | self.prepend_envs(cmd, envs), bufsize=-1, timeout=walltime 114 | ) 115 | # Block on exit status from the command 116 | exit_status = stdout.channel.recv_exit_status() 117 | return exit_status, stdout.read().decode("utf-8"), stderr.read().decode("utf-8") 118 | 119 | def execute_no_wait(self, cmd, walltime=2, envs={}): 120 | ''' Execute asynchronousely without waiting for exitcode 121 | 122 | Args: 123 | - cmd (string): Commandline string to be executed on the remote side 124 | - walltime (int): timeout to exec_command 125 | 126 | KWargs: 127 | - envs (dict): A dictionary of env variables 128 | 129 | Returns: 130 | - None, stdout (readable stream), stderr (readable stream) 131 | 132 | Raises: 133 | - ChannelExecFailed (reason) 134 | ''' 135 | 136 | # Execute the command 137 | stdin, stdout, stderr = self.ssh_client.exec_command( 138 | self.prepend_envs(cmd, envs), bufsize=-1, timeout=walltime 139 | ) 140 | # Block on exit status from the command 141 | return None, stdout, stderr 142 | 143 | def push_file(self, local_source, remote_dir): 144 | ''' Transport a local file to a directory on a remote machine 145 | 146 | Args: 147 | - local_source (string): Path 148 | - remote_dir (string): Remote path 149 | 150 | Returns: 151 | - str: Path to copied file on remote machine 152 | 153 | Raises: 154 | - BadScriptPath : if script path on the remote side is bad 155 | - BadPermsScriptPath : You do not have perms to make the channel script dir 156 | - FileCopyException : FileCopy failed. 157 | 158 | ''' 159 | remote_dest = remote_dir + '/' + os.path.basename(local_source) 160 | 161 | try: 162 | self.sftp_client.mkdir(remote_dir) 163 | except IOError as e: 164 | if e.errno is None: 165 | logger.info( 166 | "Copying {0} into existing directory {1}".format(local_source, remote_dir) 167 | ) 168 | else: 169 | logger.exception("Pushing {0} to {1} failed".format(local_source, remote_dir)) 170 | if e.errno == 2: 171 | raise BadScriptPath(e, self.hostname) 172 | elif e.errno == 13: 173 | raise BadPermsScriptPath(e, self.hostname) 174 | else: 175 | logger.exception("File push failed due to SFTP client failure") 176 | raise FileCopyException(e, self.hostname) 177 | 178 | try: 179 | self.sftp_client.put(local_source, remote_dest, confirm=True) 180 | # Set perm because some systems require the script to be executable 181 | self.sftp_client.chmod(remote_dest, 0o777) 182 | except Exception as e: 183 | logger.exception("File push from local source {} to remote destination {} failed".format( 184 | local_source, remote_dest)) 185 | raise FileCopyException(e, self.hostname) 186 | 187 | return remote_dest 188 | 189 | def pull_file(self, remote_source, local_dir): 190 | ''' Transport file on the remote side to a local directory 191 | 192 | Args: 193 | - remote_source (string): remote_source 194 | - local_dir (string): Local directory to copy to 195 | 196 | 197 | Returns: 198 | - str: Local path to file 199 | 200 | Raises: 201 | - FileExists : Name collision at local directory. 202 | - FileCopyException : FileCopy failed. 
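        Example (editorial addition, not part of the original docstring; the
        host and paths are placeholders):

            channel = SSHChannel('login.example.edu', username='me')
            local_copy = channel.pull_file('/home/me/job.stdout', '/tmp/results')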
203 |         '''
204 | 
205 |         local_dest = local_dir + '/' + os.path.basename(remote_source)
206 | 
207 |         try:
208 |             os.makedirs(local_dir)
209 |         except OSError as e:
210 |             if e.errno != errno.EEXIST:
211 |                 logger.exception("Failed to create local_dir: {0}".format(local_dir))
212 |                 raise BadScriptPath(e, self.hostname)
213 | 
214 |         # Easier to check this than to waste time trying to pull file and
215 |         # realize there's a problem.
216 |         if os.path.exists(local_dest):
217 |             logger.error("Remote file copy will overwrite a local file:{0}".format(local_dest))
218 |             raise FileExists(None, self.hostname, filename=local_dest)
219 | 
220 |         try:
221 |             self.sftp_client.get(remote_source, local_dest)
222 |         except Exception as e:
223 |             logger.exception("File pull failed")
224 |             raise FileCopyException(e, self.hostname)
225 | 
226 |         return local_dest
227 | 
228 |     def close(self):
229 |         return self.ssh_client.close()
--------------------------------------------------------------------------------
/libsubmit/channels/ssh_il/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/channels/ssh_il/__init__.py
--------------------------------------------------------------------------------
/libsubmit/channels/ssh_il/ssh_il.py:
--------------------------------------------------------------------------------
1 | import getpass
2 | import logging
3 | 
4 | import paramiko
5 | from libsubmit.channels.ssh.ssh import SSHChannel
6 | 
7 | logger = logging.getLogger(__name__)
8 | 
9 | 
10 | class SSHInteractiveLoginChannel(SSHChannel):
11 |     """SSH persistent channel. This enables remote execution on sites
12 |     accessible via ssh. This channel supports interactive login and is appropriate when
13 |     keys are not set up.
14 |     """
15 | 
16 |     def __init__(self, hostname, username=None, password=None, script_dir=None, envs=None, **kwargs):
17 |         ''' Initialize a persistent connection to the remote system.
18 |         We should know at this point whether ssh connectivity is possible.
19 | 
20 |         Args:
21 |             - hostname (String) : Hostname
22 | 
23 |         KWargs:
24 |             - username (string) : Username on remote system
25 |             - password (string) : Password for remote system
26 |             - script_dir (string) : Full path to a script dir where
27 |               generated scripts could be sent to.
28 | - envs (dict) : A dictionary of env variables to be set when executing commands 29 | 30 | Raises: 31 | ''' 32 | self.hostname = hostname 33 | self.username = username 34 | self.password = password 35 | self.kwargs = kwargs 36 | 37 | self.ssh_client = paramiko.SSHClient() 38 | self.ssh_client.load_system_host_keys() 39 | self.ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 40 | 41 | if script_dir: 42 | self._script_dir = script_dir 43 | else: 44 | self._script_dir = "/tmp/{0}/scripts/".format(getpass.getuser()) 45 | 46 | self.envs = {} 47 | if envs is not None: 48 | self.envs = envs 49 | 50 | try: 51 | self.ssh_client.connect( 52 | hostname, username=username, password=password, allow_agent=True 53 | ) 54 | 55 | except Exception: 56 | logger.debug("Caught the SSHException in SSHInteractive") 57 | pass 58 | ''' 59 | except paramiko.BadHostKeyException as e: 60 | raise BadHostKeyException(e, self.hostname) 61 | 62 | except paramiko.AuthenticationException as e: 63 | raise AuthException(e, self.hostname) 64 | 65 | except paramiko.SSHException as e: 66 | logger.debug("Caught the SSHException in SSHInteractive") 67 | pass 68 | 69 | except Exception as e: 70 | raise SSHException(e, self.hostname) 71 | ''' 72 | 73 | transport = self.ssh_client.get_transport() 74 | 75 | il_password = getpass.getpass('Enter {0} Logon password :'.format(hostname)) 76 | transport.auth_password(username, il_password) 77 | 78 | self.sftp_client = paramiko.SFTPClient.from_transport(transport) 79 | -------------------------------------------------------------------------------- /libsubmit/error.py: -------------------------------------------------------------------------------- 1 | class ConfigurationError(Exception): 2 | """Error raised when a class constructor has not been initialized correctly.""" 3 | pass 4 | 5 | 6 | class ExecutionProviderException(Exception): 7 | """ Base class for all exceptions 8 | Only to be invoked when only a more specific error is not available. 
9 | 
10 |     """
11 |     pass
12 | 
13 | 
14 | class SchedulerMissingArgs(ExecutionProviderException):
15 |     ''' Error raised when the template used to compose the submit script to the local resource manager is missing required arguments
16 |     '''
17 | 
18 |     def __init__(self, missing_keywords, sitename):
19 |         self.missing_keywords = missing_keywords
20 |         self.sitename = sitename
21 | 
22 |     def __repr__(self):
23 |         return "SchedulerMissingArgs: Pool:{0} Arg:{1}".format(self.sitename, self.missing_keywords)
24 | 
25 | 
26 | class ScriptPathError(ExecutionProviderException):
27 |     ''' Error raised when the submit script could not be written out to the script path
28 |     '''
29 | 
30 |     def __init__(self, script_path, reason):
31 |         self.script_path = script_path
32 |         self.reason = reason
33 | 
34 |     def __repr__(self):
35 |         return "Unable to write submit script:{0} Reason:{1}".format(self.script_path, self.reason)
36 | 
37 | 
38 | class BadLauncher(ExecutionProviderException):
39 |     ''' Error raised when a non-callable object is provided as the launcher
40 |     '''
41 | 
42 |     def __init__(self, launcher, reason):
43 |         self.launcher = launcher
44 |         self.reason = reason
45 | 
46 |     def __repr__(self):
47 |         return "Bad Launcher provided:{0} Reason:{1}".format(self.launcher, self.reason)
48 | 
49 | 
50 | class OptionalModuleMissing(ExecutionProviderException):
51 |     ''' Error raised when a required module is missing for an optional/extra provider
52 |     '''
53 | 
54 |     def __init__(self, module_names, reason):
55 |         self.module_names = module_names
56 |         self.reason = reason
57 | 
58 |     def __repr__(self):
59 |         return "Unable to initialize provider. Missing: {0}, Reason: {1}".format(
60 |             self.module_names, self.reason
61 |         )
62 | 
63 | 
64 | class ChannelRequired(ExecutionProviderException):
65 |     ''' Execution provider requires a channel.
66 |     '''
67 | 
68 |     def __init__(self, provider, reason):
69 |         self.provider = provider
70 |         self.reason = reason
71 | 
72 |     def __repr__(self):
73 |         return "Unable to initialize provider. Provider: {0}, Reason: {1}".format(
74 |             self.provider, self.reason
75 |         )
76 | 
77 | 
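# --- Illustrative sketch (editorial addition, not part of the original file) ---
# How provider code is expected to surface these errors; the site name and
# missing keyword below are made up:
#
#     try:
#         raise SchedulerMissingArgs(['walltime'], 'my_site')
#     except ExecutionProviderException as e:
#         print(repr(e))  # -> SchedulerMissingArgs: Pool:my_site Arg:['walltime']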
78 | class ScaleOutFailed(ExecutionProviderException):
79 |     ''' Generic catch-all: scale out failed in the submit phase on the provider side.
80 |     '''
81 | 
82 |     def __init__(self, provider, reason):
83 |         self.provider = provider
84 |         self.reason = reason
85 | 
86 |     def __repr__(self):
87 |         return "Unable to scale out provider: {0}, Reason: {1}".format(
88 |             self.provider, self.reason
89 |         )
90 | 
--------------------------------------------------------------------------------
/libsubmit/launchers/__init__.py:
--------------------------------------------------------------------------------
1 | from libsubmit.launchers.launchers import SimpleLauncher, SingleNodeLauncher, \
2 |     SrunLauncher, AprunLauncher, SrunMPILauncher, \
3 |     GnuParallelLauncher, MpiExecLauncher
4 | 
5 | __all__ = ['SimpleLauncher',
6 |            'SingleNodeLauncher',
7 |            'SrunLauncher',
8 |            'AprunLauncher',
9 |            'SrunMPILauncher',
10 |            'GnuParallelLauncher',
11 |            'MpiExecLauncher']
--------------------------------------------------------------------------------
/libsubmit/launchers/launchers.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractmethod
2 | 
3 | from libsubmit.utils import RepresentationMixin
4 | 
5 | 
6 | class Launcher(RepresentationMixin, metaclass=ABCMeta):
7 |     """ Launcher base class to enforce launcher interface
8 |     """
9 |     @abstractmethod
10 |     def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None):
11 |         """ Wraps the command with the Launcher calls.
12 |         *MUST* be implemented by the concrete child classes
13 |         """
14 |         pass
15 | 
16 | 
17 | class SimpleLauncher(Launcher):
18 |     """ Does no wrapping. Just returns the command as-is
19 |     """
20 | 
21 |     def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None):
22 |         """
23 |         Args:
24 |             - command (string): The command string to be launched
25 |             - tasks_per_node (int), nodes_per_block (int) : Accepted for interface compatibility; unused here.
26 | 
27 |         KWargs:
28 |             - walltime (int) : This is not used by this launcher.
29 |         """
30 |         return command
31 | 
32 | 
33 | class SingleNodeLauncher(Launcher):
34 |     """ Worker launcher that wraps the user's command with the framework to
35 |     launch multiple command invocations in parallel. This wrapper sets the
36 |     bash env variable CORES to the number of cores on the machine. By setting
37 |     task_blocks to an integer or to a bash expression the number of invocations
38 |     of the command to be launched can be controlled.
39 |     """
40 |     def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None):
41 |         """
42 |         Args:
43 |             - command (string): The command string to be launched
44 |             - tasks_per_node (int), nodes_per_block (int) : Used to size the number of invocations.
45 | 
46 |         KWargs:
47 |             - walltime (int) : This is not used by this launcher.
48 |         """
49 |         task_blocks = tasks_per_node * nodes_per_block
50 | 
51 |         x = '''export CORES=$(getconf _NPROCESSORS_ONLN)
52 | echo "Found cores : $CORES"
53 | WORKERCOUNT={1}
54 | 
55 | CMD ( ) {{
56 | {0}
57 | }}
58 | for COUNT in $(seq 1 1 $WORKERCOUNT)
59 | do
60 |     echo "Launching worker: $COUNT"
61 |     CMD &
62 | done
63 | wait
64 | echo "All workers done"
65 | '''.format(command, task_blocks)
66 |         return x
67 | 
68 | 
69 | class GnuParallelLauncher(Launcher):
70 |     """ Worker launcher that wraps the user's command with the framework to
71 |     launch multiple command invocations via GNU parallel sshlogin.
72 | 
73 |     This wrapper sets the bash env variable CORES to the number of cores on the
74 |     machine.
75 | 76 | This launcher makes the following assumptions: 77 | - GNU parallel is installed and can be located in $PATH 78 | - Paswordless SSH login is configured between the controller node and the 79 | target nodes. 80 | - The provider makes available the $PBS_NODEFILE environment variable 81 | """ 82 | def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None): 83 | """ 84 | Args: 85 | - command (string): The command string to be launched 86 | - task_block (string) : bash evaluated string. 87 | 88 | KWargs: 89 | - walltime (int) : This is not used by this launcher. 90 | """ 91 | task_blocks = tasks_per_node * nodes_per_block 92 | 93 | x = '''export CORES=$(getconf _NPROCESSORS_ONLN) 94 | echo "Found cores : $CORES" 95 | WORKERCOUNT={3} 96 | 97 | # Deduplicate the nodefile 98 | SSHLOGINFILE="$JOBNAME.nodes" 99 | if [ -z "$PBS_NODEFILE" ]; then 100 | echo "localhost" > $SSHLOGINFILE 101 | else 102 | sort -u $PBS_NODEFILE > $SSHLOGINFILE 103 | fi 104 | 105 | cat << PARALLEL_CMD_EOF > cmd_$JOBNAME.sh 106 | {0} 107 | PARALLEL_CMD_EOF 108 | chmod u+x cmd_$JOBNAME.sh 109 | 110 | #file to contain the commands to parallel 111 | PFILE=cmd_${{JOBNAME}}.sh.parallel 112 | 113 | # Truncate the file 114 | cp /dev/null $PFILE 115 | 116 | for COUNT in $(seq 1 1 $WORKERCOUNT) 117 | do 118 | echo "sh cmd_$JOBNAME.sh" >> $PFILE 119 | done 120 | 121 | parallel --env _ --joblog "$JOBNAME.sh.parallel.log" \ 122 | --sshloginfile $SSHLOGINFILE --jobs {1} < $PFILE 123 | 124 | echo "All workers done" 125 | '''.format(command, tasks_per_node, nodes_per_block, task_blocks) 126 | return x 127 | 128 | 129 | class MpiExecLauncher(Launcher): 130 | """ Worker launcher that wraps the user's command with the framework to 131 | launch multiple command invocations via mpiexec. 132 | 133 | This wrapper sets the bash env variable CORES to the number of cores on the 134 | machine. 135 | 136 | This launcher makes the following assumptions: 137 | - mpiexec is installed and can be located in $PATH 138 | - The provider makes available the $PBS_NODEFILE environment variable 139 | """ 140 | def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None): 141 | """ 142 | Args: 143 | - command (string): The command string to be launched 144 | - task_block (string) : bash evaluated string. 145 | 146 | KWargs: 147 | - walltime (int) : This is not used by this launcher. 148 | """ 149 | task_blocks = tasks_per_node * nodes_per_block 150 | 151 | x = '''export CORES=$(getconf _NPROCESSORS_ONLN) 152 | echo "Found cores : $CORES" 153 | WORKERCOUNT={3} 154 | 155 | # Deduplicate the nodefile 156 | HOSTFILE="$JOBNAME.nodes" 157 | if [ -z "$PBS_NODEFILE" ]; then 158 | echo "localhost" > $HOSTFILE 159 | else 160 | sort -u $PBS_NODEFILE > $HOSTFILE 161 | fi 162 | 163 | cat << MPIEXEC_EOF > cmd_$JOBNAME.sh 164 | {0} 165 | MPIEXEC_EOF 166 | chmod u+x cmd_$JOBNAME.sh 167 | 168 | mpiexec --bind-to none -n $WORKERCOUNT --hostfile $HOSTFILE /usr/bin/sh cmd_$JOBNAME.sh 169 | 170 | echo "All workers done" 171 | '''.format(command, tasks_per_node, nodes_per_block, task_blocks) 172 | return x 173 | 174 | 175 | class SrunLauncher(Launcher): 176 | """ Worker launcher that wraps the user's command with the SRUN launch framework 177 | to launch multiple cmd invocations in parallel on a single job allocation. 
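    Example (editorial addition; an approximation of the generated wrapper):

        SrunLauncher()("echo hello", tasks_per_node=2, nodes_per_block=1)
        # returns a bash snippet that writes the command into
        # cmd_$SLURM_JOB_NAME.sh and runs
        # `srun --ntasks 2 -l bash cmd_$SLURM_JOB_NAME.sh`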
178 | """ 179 | 180 | def __init__(self): 181 | pass 182 | 183 | def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None): 184 | """ 185 | Args: 186 | - command (string): The command string to be launched 187 | - task_block (string) : bash evaluated string. 188 | 189 | KWargs: 190 | - walltime (int) : This is not used by this launcher. 191 | """ 192 | task_blocks = tasks_per_node * nodes_per_block 193 | x = '''export CORES=$SLURM_CPUS_ON_NODE 194 | export NODES=$SLURM_JOB_NUM_NODES 195 | 196 | echo "Found cores : $CORES" 197 | echo "Found nodes : $NODES" 198 | WORKERCOUNT={1} 199 | 200 | cat << SLURM_EOF > cmd_$SLURM_JOB_NAME.sh 201 | {0} 202 | SLURM_EOF 203 | chmod a+x cmd_$SLURM_JOB_NAME.sh 204 | 205 | TASKBLOCKS={1} 206 | 207 | srun --ntasks $TASKBLOCKS -l bash cmd_$SLURM_JOB_NAME.sh 208 | 209 | echo "Done" 210 | '''.format(command, task_blocks) 211 | return x 212 | 213 | 214 | class SrunMPILauncher(Launcher): 215 | """Worker launcher that wraps the user's command with the SRUN launch framework 216 | to launch multiple cmd invocations in parallel on a single job allocation. 217 | 218 | """ 219 | def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None): 220 | """ 221 | Args: 222 | - command (string): The command string to be launched 223 | - task_block (string) : bash evaluated string. 224 | 225 | KWargs: 226 | - walltime (int) : This is not used by this launcher. 227 | """ 228 | task_blocks = tasks_per_node * nodes_per_block 229 | x = '''export CORES=$SLURM_CPUS_ON_NODE 230 | export NODES=$SLURM_JOB_NUM_NODES 231 | 232 | echo "Found cores : $CORES" 233 | echo "Found nodes : $NODES" 234 | WORKERCOUNT={1} 235 | 236 | cat << SLURM_EOF > cmd_$SLURM_JOB_NAME.sh 237 | {0} 238 | SLURM_EOF 239 | chmod a+x cmd_$SLURM_JOB_NAME.sh 240 | 241 | TASKBLOCKS={1} 242 | 243 | # If there are more taskblocks to be launched than nodes use 244 | if (( "$TASKBLOCKS" > "$NODES" )) 245 | then 246 | echo "TaskBlocks:$TASKBLOCKS > Nodes:$NODES" 247 | CORES_PER_BLOCK=$(($NODES * $CORES / $TASKBLOCKS)) 248 | for blk in $(seq 1 1 $TASKBLOCKS): 249 | do 250 | srun --ntasks $CORES_PER_BLOCK -l bash cmd_$SLURM_JOB_NAME.sh & 251 | done 252 | wait 253 | else 254 | # A Task block could be integer multiples of Nodes 255 | echo "TaskBlocks:$TASKBLOCKS <= Nodes:$NODES" 256 | NODES_PER_BLOCK=$(( $NODES / $TASKBLOCKS )) 257 | for blk in $(seq 1 1 $TASKBLOCKS): 258 | do 259 | srun --exclusive --nodes $NODES_PER_BLOCK -l bash cmd_$SLURM_JOB_NAME.sh & 260 | done 261 | wait 262 | 263 | fi 264 | 265 | 266 | echo "Done" 267 | '''.format(command, task_blocks) 268 | return x 269 | 270 | 271 | class AprunLauncher(Launcher): 272 | """ Worker launcher that wraps the user's command with the Aprun launch framework 273 | to launch multiple cmd invocations in parallel on a single job allocation 274 | 275 | """ 276 | def __init__(self, overrides=''): 277 | self.overrides = overrides 278 | 279 | def __call__(self, command, tasks_per_node, nodes_per_block, walltime=None): 280 | """ 281 | Args: 282 | - command (string): The command string to be launched 283 | - tasks_per_node (int) : Workers to launch per node 284 | - nodes_per_block (int) : Number of nodes in a block 285 | 286 | KWargs: 287 | - walltime (int) : This is not used by this launcher. 
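        Example (editorial addition; an approximation of the generated wrapper):

            AprunLauncher()("echo hello", tasks_per_node=2, nodes_per_block=2)
            # returns a bash snippet ending in
            # `aprun -n 4 -N 2  /bin/bash cmd_$JOBNAME.sh &` followed by `wait`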
288 | """ 289 | 290 | tasks_per_block = tasks_per_node * nodes_per_block 291 | x = ''' 292 | WORKERCOUNT={1} 293 | 294 | cat << APRUN_EOF > cmd_$JOBNAME.sh 295 | {0} 296 | APRUN_EOF 297 | chmod a+x cmd_$JOBNAME.sh 298 | 299 | aprun -n {tasks_per_block} -N {tasks_per_node} {overrides} /bin/bash cmd_$JOBNAME.sh & 300 | wait 301 | 302 | echo "Done" 303 | '''.format(command, tasks_per_block, 304 | tasks_per_block=tasks_per_block, 305 | tasks_per_node=tasks_per_node, 306 | overrides=self.overrides) 307 | return x 308 | 309 | 310 | if __name__ == '__main__': 311 | 312 | s = SingleNodeLauncher() 313 | wrapped = s("hello", 1, 1) 314 | print(wrapped) 315 | -------------------------------------------------------------------------------- /libsubmit/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Workstation Provider 2 | from libsubmit.providers.local.local import LocalProvider 3 | 4 | # Cluster Providers 5 | 6 | from libsubmit.providers.cobalt.cobalt import CobaltProvider 7 | from libsubmit.providers.condor.condor import CondorProvider 8 | from libsubmit.providers.grid_engine.grid_engine import GridEngineProvider 9 | from libsubmit.providers.slurm.slurm import SlurmProvider 10 | from libsubmit.providers.torque.torque import TorqueProvider 11 | 12 | # Cloud Providers 13 | from libsubmit.providers.aws.aws import AWSProvider 14 | from libsubmit.providers.googlecloud.googlecloud import GoogleCloudProvider 15 | from libsubmit.providers.azure.azure import AzureProvider 16 | from libsubmit.providers.jetstream.jetstream import JetstreamProvider 17 | 18 | # Kubernetes 19 | from libsubmit.providers.kubernetes.kube import KubernetesProvider 20 | 21 | __all__ = ['LocalProvider', 22 | 'CobaltProvider', 23 | 'CondorProvider', 24 | 'GridEngineProvider', 25 | 'SlurmProvider', 26 | 'TorqueProvider', 27 | 'AWSProvider', 28 | 'GoogleCloudProvider', 29 | 'AzureProvider', 30 | 'JetstreamProvider', 31 | 'KubernetesProvider'] 32 | -------------------------------------------------------------------------------- /libsubmit/providers/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/aws/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/aws/template.py: -------------------------------------------------------------------------------- 1 | template_string = """#!/bin/bash 2 | #sed -i 's/us-east-2\.ec2\.//g' /etc/apt/sources.list 3 | cd ~ 4 | apt-get update -y 5 | apt-get install -y python3 python3-pip libffi-dev g++ libssl-dev 6 | pip3 install numpy scipy parsl 7 | $overrides 8 | 9 | $user_script 10 | 11 | # Shutdown the instance as soon as the worker scripts exits 12 | # or times out to avoid EC2 costs. 13 | if ! 
$linger 14 | then 15 | halt 16 | fi 17 | """ 18 | -------------------------------------------------------------------------------- /libsubmit/providers/azure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/azure/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/azure/azure.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from libsubmit.error import * 5 | from libsubmit.providers.provider_base import ExecutionProvider 6 | from libsubmit.utils import RepresentationMixin 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | try: 11 | from azure.common.credentials import UserPassCredentials 12 | from libsubmit.azure.azure_deployer import Deployer 13 | 14 | except ImportError: 15 | _azure_enabled = False 16 | else: 17 | _azure_enabled = True 18 | 19 | translate_table = { 20 | 'PD': 'PENDING', 21 | 'R': 'RUNNING', 22 | 'CA': 'CANCELLED', 23 | 'CF': 'PENDING', # (configuring), 24 | 'CG': 'RUNNING', # (completing), 25 | 'CD': 'COMPLETED', 26 | 'F': 'FAILED', # (failed), 27 | 'TO': 'TIMEOUT', # (timeout), 28 | 'NF': 'FAILED', # (node failure), 29 | 'RV': 'FAILED', # (revoked) and 30 | 'SE': 'FAILED' 31 | } # (special exit state 32 | 33 | template_string = """ 34 | cd ~ 35 | sudo apt-get update -y 36 | sudo apt-get install -y python3 python3-pip ipython 37 | sudo pip3 install ipyparallel parsl 38 | """ 39 | 40 | 41 | class AzureProvider(ExecutionProvider, RepresentationMixin): 42 | """A provider for using Azure resources. 43 | 44 | Parameters 45 | ---------- 46 | profile : str 47 | Profile to be used if different from the standard Azure config file ~/.azure/config. 48 | template_file : str 49 | Location of template file for Azure instance. Default is 'templates/template.json'. 50 | walltime : str 51 | Walltime requested per block in HH:MM:SS. 52 | azure_template_file : str 53 | Path to the template file for the Azure instance. 54 | init_blocks : int 55 | Number of blocks to provision at the start of the run. Default is 1. 56 | min_blocks : int 57 | Minimum number of blocks to maintain. Default is 0. 58 | max_blocks : int 59 | Maximum number of blocks to maintain. Default is 10. 60 | nodes_per_block : int 61 | Nodes to provision per block. Default is 1. 
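    Example (editorial addition; the credential values are placeholders):

        provider = AzureProvider(subscription_id='...',
                                 username='user@example.com',
                                 password='...')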
62 |     """
63 | 
64 |     def __init__(self,
65 |                  subscription_id,
66 |                  username,
67 |                  password,
68 |                  label='azure',
69 |                  template_file='template.json',
70 |                  init_blocks=1,
71 |                  min_blocks=0,
72 |                  max_blocks=1,
73 |                  nodes_per_block=1,
74 |                  state_file=None):
75 |         if not _azure_enabled:
76 |             raise OptionalModuleMissing(['azure'], "Azure Provider requires the azure module.")
77 | 
78 |         # Imported here so that the module stays importable when the optional
79 |         # azure dependency is not installed.
80 |         from azure.mgmt.resource import ResourceManagementClient
81 |         from azure.mgmt.storage import StorageManagementClient
82 | 
83 |         self.label = label
84 |         credentials = UserPassCredentials(username, password)
85 | 
86 |         self.resource_client = ResourceManagementClient(credentials, subscription_id)
87 |         self.storage_client = StorageManagementClient(credentials, subscription_id)
88 | 
89 |         self.resource_group_name = 'my_resource_group'
90 |         # read_configs() is an assumed helper that loads the Azure config
91 |         # (see azureconf.json) for the Deployer.
92 |         self.config = self.read_configs(template_file)
93 |         self.deployer = Deployer(subscription_id, self.resource_group_name, self.config)
94 | 
95 |         self.provisioned_blocks = 0
96 |         self.resources = {}
97 |         self.instances = []
98 | 
99 |         self.init_blocks = init_blocks
100 |         self.min_blocks = min_blocks
101 |         self.max_blocks = max_blocks
102 |         self.max_nodes = max_blocks * nodes_per_block
103 | 
104 |         try:
105 |             if state_file is None:
106 |                 state_file = '.azure_{}.json'.format(self.label)
107 |             self.read_state_file(state_file)
108 | 
109 |         except Exception:
110 |             logger.info("No state file. Cannot load previous options. Creating new infrastructure.")
111 |             self.write_state_file()
112 | 
113 |     def submit(self, command='sleep 1', blocksize=1, job_name="parsl.auto"):
114 |         """Submit command to an Azure instance.
115 | 
116 |         Submit returns an ID that corresponds to the task that was just submitted.
117 | 
118 |         Parameters
119 |         ----------
120 |         command : str
121 |             Command to be invoked on the remote side.
122 |         blocksize : int
123 |             Number of blocks requested.
124 |         job_name : str
125 |             Prefix for job name.
126 | 
127 |         Returns
128 |         -------
129 |         None or str
130 |             If at capacity (no more can be provisioned), None is returned. Otherwise,
131 |             an identifier for the job is returned.
132 |         """
133 | 
134 |         job_name = "parsl.auto.{0}".format(time.time())
135 |         [instance, *rest] = self.deployer.deploy(command=command, job_name=job_name, blocksize=1)
136 | 
137 |         if not instance:
138 |             logger.error("Failed to submit request to Azure")
139 |             return None
140 | 
141 |         logger.debug("Started instance_id: {0}".format(instance.instance_id))
142 | 
143 |         state = translate_table.get(instance.state['Name'], "PENDING")
144 | 
145 |         self.resources[instance.instance_id] = {"job_id": instance.instance_id, "instance": instance, "status": state}
146 | 
147 |         return instance.instance_id
148 | 
149 |     def status(self, job_ids):
150 |         """Get the status of a list of jobs identified by their ids.
151 | 
152 |         Parameters
153 |         ----------
154 |         job_ids : list of str
155 |             Identifiers for the jobs.
156 | 
157 |         Returns
158 |         -------
159 |         list of str
160 |             Status strings for each requested job.
161 |         """
162 |         states = []
163 |         statuses = self.deployer.get_vm_status([self.resources.get(job_id) for job_id in job_ids])
164 |         for status in statuses:
165 |             states.append(translate_table.get(status.state['Name'], "PENDING"))
166 |         return states
167 | 
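    # --- Illustrative lifecycle sketch (editorial addition, not part of the
    # original file) --- how an executor is expected to drive this provider;
    # the command and returned values are placeholders:
    #
    #     job_id = provider.submit('sleep 60', blocksize=1)
    #     provider.status([job_id])   # e.g. ['PENDING'], later ['RUNNING']
    #     provider.cancel([job_id])   # -> [True]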
168 |     def cancel(self, job_ids):
169 |         """Cancel jobs specified by a list of job ids.
170 | 
171 |         Parameters
172 |         ----------
173 |         job_ids : list of str
174 |             List of identifiers of jobs which should be canceled.
175 | 
176 |         Returns
177 |         -------
178 |         list of bool
179 |             For each entry, True if the cancel operation is successful, otherwise False.
180 |         """
181 |         statuses = []
182 |         for job_id in job_ids:
183 |             try:
184 |                 self.deployer.destroy([job_id])
185 |                 statuses.append(True)
186 |             except Exception as e:
187 |                 logger.error("Failed to cancel {}".format(repr(job_id)))
188 |                 logger.error(e)
189 |                 statuses.append(False)
190 |         return statuses
191 | 
192 |     @property
193 |     def scaling_enabled(self):
194 |         return True
195 | 
196 |     @property
197 |     def current_capacity(self):
198 |         """Returns the current blocksize."""
199 |         return len(self.instances)
200 | 
201 | 
202 | if __name__ == '__main__':
203 |     config = open("azureconf.json")
--------------------------------------------------------------------------------
/libsubmit/providers/azure/azureconf.json:
--------------------------------------------------------------------------------
1 | {
2 |     "site": "azure",
3 |     "nodeGranularity": 1,
4 |     "maxNodes": 5,
5 |     "AMIID": "ami-ae90b6cb",
6 |     "logFile": "azureprovider.log",
7 |     "username": "",
8 |     "pass": "",
9 |     "subscriptionId": "",
10 |     "location": "eastus",
11 |     "AZURE_CLIENT_ID": "0",
12 |     "AZURE_CLIENT_SECRET": "",
13 |     "AZURE_TENANT_ID": "0",
14 |     "azureTemplateFile": "template.json"
15 | }
16 | 
--------------------------------------------------------------------------------
/libsubmit/providers/azure/deployer.py:
--------------------------------------------------------------------------------
1 | """A deployer class to deploy a template on Azure"""
2 | import json
3 | import os.path
4 | 
5 | from azure.common.credentials import ServicePrincipalCredentials
6 | from azure.mgmt.resource import ResourceManagementClient
7 | from azure.mgmt.resource.resources.models import DeploymentMode
8 | # Assumed dependency: haikunator generates short human-readable unique names.
9 | from haikunator import Haikunator
10 | 
11 | 
12 | class Deployer(object):
13 |     """ Initialize the deployer class with subscription, resource group and public key.
14 | 
15 |     :raises IOError: If the public key path cannot be read (access or not exists)
16 |     :raises KeyError: If the AZURE_CLIENT_ID, AZURE_CLIENT_SECRET or AZURE_TENANT_ID env
17 |         variables are not defined
18 |     """
19 |     config = ""
20 | 
21 |     def __init__(self, subscription_id, resource_group, config,
22 |                  pub_ssh_key_path='~/.ssh/id_rsa.pub'):
23 |         self.config = config
24 |         self.subscription_id = subscription_id
25 |         self.resource_group = resource_group
26 |         self.name_generator = Haikunator()
27 |         self.dns_label_prefix = self.name_generator.haikunate()
28 |         self.location = self.config['location']
29 | 
30 |         pub_ssh_key_path = os.path.expanduser(pub_ssh_key_path)
31 |         # Will raise if file does not exist or permissions are insufficient
32 |         with open(pub_ssh_key_path, 'r') as pub_ssh_file_fd:
33 |             self.pub_ssh_key = pub_ssh_file_fd.read()
34 |         self.credentials = ServicePrincipalCredentials(
35 |             client_id=self.config['AZURE_CLIENT_ID'],
36 |             secret=self.config['AZURE_CLIENT_SECRET'],
37 |             tenant=self.config['AZURE_TENANT_ID']
38 |         )
39 |         self.client = ResourceManagementClient(
40 |             self.credentials, self.subscription_id)
41 | 
42 |     def deploy(self, job_name, command='', blocksize=1):
43 |         """Deploy the template to a resource group."""
44 |         instances = []
45 |         self.client.resource_groups.create_or_update(
46 |             self.resource_group,
47 |             {
48 |                 'location': self.location,
49 | 
50 |             }
51 |         )
52 | 
53 |         template_path = os.path.join(os.path.dirname(
54 |             __file__), 'templates', 'template.json')
55 |         with open(template_path, 'r') as template_file_fd:
56 |             template = json.load(template_file_fd)
57 | 
58 |         parameters = {
59 |             'sshKeyData': self.pub_ssh_key,
60 |             'vmName': 'azure-deployment-sample-vm',
61 |             'dnsLabelPrefix': self.dns_label_prefix
62 |         }
63 |         parameters = {k: {'value': v} for k, v in parameters.items()}
64 | 
65 |         deployment_properties = {
66 |             'mode': DeploymentMode.incremental,
67 |             'template': template,
68 |             'parameters': parameters
69 |         }
70 |         for i in range(blocksize):
71 |             deployment_async_operation = self.client.deployments.create_or_update(
72 |                 self.resource_group,
73 |                 'azure-sample',
74 |                 deployment_properties
75 |             )
76 |             instances.append(deployment_async_operation.wait())
77 |         return instances
78 | 
79 |     def destroy(self, job_ids):
80 |         """Destroy the resource group (all deployments in it are torn down)."""
81 |         for job_id in job_ids:
82 |             self.client.resource_groups.delete(self.resource_group)
83 | 
84 |     def get_vm(self, resource_group_name, vm_name):
85 |         '''
86 |         You need to retry this just in case the credentials token expires;
87 |         that's where the decorator comes in.
88 |         This will return all the data about the virtual machine.
89 |         '''
90 |         return self.client.virtual_machines.get(
91 |             resource_group_name, vm_name, expand='instanceView')
92 | 
93 |     def get_vm_status(self, vm_name, rgn):
94 |         '''
95 |         This will just return the status of the virtual machine.
96 |         Sometimes the status may be unknown, as shown by the azure portal;
97 |         in that case statuses[1] doesn't exist, hence retrying on IndexError.
98 |         Also, it may take on the order of minutes for the status to become
99 |         available, so the decorator will bang on it forever.
100 |         '''
101 |         rgn = rgn if rgn else self.resource_group
102 |         return self.client.virtual_machines.get(
103 |             rgn, vm_name).instance_view.statuses[1].display_status
104 | 
--------------------------------------------------------------------------------
/libsubmit/providers/cluster_provider.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from string import Template
4 | 
5 | 
import libsubmit.error as ep_error 6 | from libsubmit.providers.provider_base import ExecutionProvider 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class ClusterProvider(ExecutionProvider): 12 | """ This class defines behavior common to all cluster/supercompute-style scheduler systems. 13 | 14 | Parameters 15 | ---------- 16 | label : str 17 | Label for this provider. 18 | channel : Channel 19 | Channel for accessing this provider. Possible channels include 20 | :class:`~libsubmit.channels.LocalChannel` (the default), 21 | :class:`~libsubmit.channels.SSHChannel`, or 22 | :class:`~libsubmit.channels.SSHInteractiveLoginChannel`. 23 | script_dir : str 24 | Relative or absolute path to a directory where intermediate scripts are placed. 25 | walltime : str 26 | Walltime requested per block in HH:MM:SS. 27 | launcher : str 28 | FIXME 29 | cmd_timeout : int 30 | Timeout for commands made to the scheduler in seconds 31 | 32 | .. code:: python 33 | 34 | +------------------ 35 | | 36 | script_string ------->| submit 37 | id <--------|---+ 38 | | 39 | [ ids ] ------->| status 40 | [statuses] <--------|----+ 41 | | 42 | [ ids ] ------->| cancel 43 | [cancel] <--------|----+ 44 | | 45 | [True/False] <--------| scaling_enabled 46 | | 47 | +------------------- 48 | """ 49 | 50 | def __init__(self, 51 | label, 52 | channel, 53 | script_dir, 54 | nodes_per_block, 55 | tasks_per_node, 56 | init_blocks, 57 | min_blocks, 58 | max_blocks, 59 | parallelism, 60 | walltime, 61 | launcher, 62 | cmd_timeout=10): 63 | 64 | self._scaling_enabled = True 65 | self.label = label 66 | self.channel = channel 67 | self.tasks_per_block = nodes_per_block * tasks_per_node 68 | self.nodes_per_block = nodes_per_block 69 | self.tasks_per_node = tasks_per_node 70 | self.init_blocks = init_blocks 71 | self.min_blocks = min_blocks 72 | self.max_blocks = max_blocks 73 | self.parallelism = parallelism 74 | self.provisioned_blocks = 0 75 | self.launcher = launcher 76 | self.walltime = walltime 77 | self.cmd_timeout = cmd_timeout 78 | if not callable(self.launcher): 79 | raise(ep_error.BadLauncher(self.launcher, 80 | "Launcher for executor:{} is of type:{}. Expects a libsubmit.launcher.launcher.Launcher or callable".format( 81 | label, 82 | type(self.launcher)))) 83 | 84 | self.script_dir = script_dir 85 | if not os.path.exists(self.script_dir): 86 | os.makedirs(self.script_dir) 87 | 88 | # Dictionary that keeps track of jobs, keyed on job_id 89 | self.resources = {} 90 | 91 | def execute_wait(self, cmd, timeout=None): 92 | t = self.cmd_timeout 93 | if timeout is not None: 94 | t = timeout 95 | return self.channel.execute_wait(cmd, t) 96 | 97 | def _write_submit_script(self, template, script_filename, job_name, configs): 98 | """Generate submit script and write it to a file. 
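        (Editorial example, not part of the original docstring.) The template is
        filled using string.Template semantics, e.g.:

            Template("echo $jobname").substitute(jobname="parsl.auto.0")
            # -> "echo parsl.auto.0"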
99 | 
100 |         Args:
101 |               - template (string) : The template string to be used for writing the submit script
102 |               - script_filename (string) : Name of the submit script
103 |               - job_name (string) : job name
104 |               - configs (dict) : configs that get pushed into the template
105 | 
106 |         Returns:
107 |               - True: on success
108 | 
109 |         Raises:
110 |               SchedulerMissingArgs : If template is missing args
111 |               ScriptPathError : Unable to write submit script out
112 |         """
113 | 
114 |         try:
115 |             submit_script = Template(template).substitute(jobname=job_name, **configs)
116 |             # submit_script = Template(template).safe_substitute(jobname=job_name, **configs)
117 |             with open(script_filename, 'w') as f:
118 |                 f.write(submit_script)
119 | 
120 |         except KeyError as e:
121 |             logger.error("Missing keys for submit script : %s", e)
122 |             raise (ep_error.SchedulerMissingArgs(e.args, self.label))
123 | 
124 |         except IOError as e:
125 |             logger.error("Failed writing to submit script: %s", script_filename)
126 |             raise (ep_error.ScriptPathError(script_filename, e))
127 |         except Exception as e:
128 |             logger.error("Template : %s", template)
129 |             logger.error("Args : %s", job_name)
130 |             logger.error("Kwargs : %s", configs)
131 |             logger.error("Uncategorized error: %s", e)
132 |             raise (e)
133 | 
134 |         return True
135 | 
136 |     def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
137 |         ''' The submit method takes the command string to be executed upon
138 |         instantiation of a resource, most often to start a pilot (such as an IPP engine
139 |         or even Swift-T engines).
140 | 
141 |         Args :
142 |              - cmd_string (str) : The bash command string to be executed.
143 |              - blocksize (int) : Blocksize to be requested
144 | 
145 |         KWargs:
146 |              - job_name (str) : Human friendly name to be assigned to the job request
147 | 
148 |         Returns:
149 |              - A job identifier, this could be an integer, string etc
150 | 
151 |         Raises:
152 |              - ExecutionProviderExceptions or its subclasses
153 |         '''
154 |         raise NotImplementedError
155 | 
156 |     def _status(self):
157 |         raise NotImplementedError
158 | 
159 |     def status(self, job_ids):
160 |         """ Get the status of a list of jobs identified by the job identifiers
161 |         returned from the submit request.
162 | 
163 |         Args:
164 |              - job_ids (list) : A list of job identifiers
165 | 
166 |         Returns:
167 |              - A list of status from ['PENDING', 'RUNNING', 'CANCELLED', 'COMPLETED',
168 |                'FAILED', 'TIMEOUT'] corresponding to each job_id in the job_ids list.
169 | 
170 |         Raises:
171 |              - ExecutionProviderException or its subclasses
172 | 
173 |         """
174 |         if job_ids:
175 |             self._status()
176 |         return [self.resources[jid]['status'] for jid in job_ids]
177 | 
178 |     def cancel(self, job_ids):
179 |         """ Cancels the resources identified by the job_ids provided by the user.
180 | 
181 |         Args:
182 |              - job_ids (list): A list of job identifiers
183 | 
184 |         Returns:
185 |              - A list of status from cancelling the job which can be True, False
186 | 
187 |         Raises:
188 |              - ExecutionProviderException or its subclasses
189 |         """
190 | 
191 |         raise NotImplementedError
192 | 
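    # --- Illustrative sketch (editorial addition, not part of the original file) ---
    # A concrete scheduler provider subclasses ClusterProvider and fills in the
    # abstract pieces; the class name and commands below are placeholders:
    #
    #     class MyScheduler(ClusterProvider):
    #         def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
    #             ...  # write a script via self._write_submit_script, then qsub/sbatch it
    #         def _status(self):
    #             ...  # refresh self.resources[job_id]['status'] for tracked jobs
    #         def cancel(self, job_ids):
    #             ...  # qdel/scancel the listed jobs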
193 |     @property
194 |     def scaling_enabled(self):
195 |         """ The callers of ParslExecutors need to differentiate between Executors
196 |         and Executors wrapped in a resource provider
197 | 
198 |         Returns:
199 |               - Status (Bool)
200 |         """
201 |         return self._scaling_enabled
202 | 
203 |     @property
204 |     def current_capacity(self):
205 |         """ Returns the currently provisioned blocks.
206 |         This may need to return more information in the future:
207 |         { minsize, maxsize, current_requested }
208 |         """
209 |         return self.provisioned_blocks
--------------------------------------------------------------------------------
/libsubmit/providers/cobalt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/cobalt/__init__.py
--------------------------------------------------------------------------------
/libsubmit/providers/cobalt/cobalt.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import time
4 | 
5 | import libsubmit.error as ep_error
6 | from libsubmit.channels import LocalChannel
7 | from libsubmit.launchers import AprunLauncher
8 | from libsubmit.providers.cobalt.template import template_string
9 | from libsubmit.providers.cluster_provider import ClusterProvider
10 | from libsubmit.utils import RepresentationMixin, wtime_to_minutes
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | translate_table = {
15 |     'QUEUED': 'PENDING',
16 |     'STARTING': 'PENDING',
17 |     'RUNNING': 'RUNNING',
18 |     'EXITING': 'COMPLETED',
19 |     'KILLING': 'COMPLETED'
20 | }
21 | 
22 | 
23 | class CobaltProvider(ClusterProvider, RepresentationMixin):
24 |     """ Cobalt Execution Provider
25 | 
26 |     This provider uses Cobalt to submit (qsub), obtain the status of (qstat), and cancel (qdel)
27 |     jobs. The script to be used is created from a template file in this
28 |     same module.
29 | 
30 |     Parameters
31 |     ----------
32 |     channel : Channel
33 |         Channel for accessing this provider. Possible channels include
34 |         :class:`~libsubmit.channels.LocalChannel` (the default),
35 |         :class:`~libsubmit.channels.SSHChannel`, or
36 |         :class:`~libsubmit.channels.SSHInteractiveLoginChannel`.
37 |     label : str
38 |         Label for this provider.
39 |     script_dir : str
40 |         Relative or absolute path to a directory where intermediate scripts are placed.
41 |     nodes_per_block : int
42 |         Nodes to provision per block.
43 |     tasks_per_node : int
44 |         Tasks to run per node.
45 |     min_blocks : int
46 |         Minimum number of blocks to maintain.
47 |     max_blocks : int
48 |         Maximum number of blocks to maintain.
49 |     walltime : str
50 |         Walltime requested per block in HH:MM:SS.
51 |     account : str
52 |         Account that the job will be charged against.
53 |     queue : str
54 |         Cobalt queue to request blocks from.
55 |     overrides : str
56 |         String to append to the Cobalt submit script on the scheduler.
57 |     launcher : Launcher
58 |         Launcher for this provider.
Possible launchers include 59 | :class:`~libsubmit.launchers.AprunLauncher` (the default) or, 60 | :class:`~libsubmit.launchers.SingleNodeLauncher` 61 | """ 62 | def __init__(self, 63 | channel=LocalChannel(), 64 | label='cobalt', 65 | script_dir='parsl_scripts', 66 | nodes_per_block=1, 67 | tasks_per_node=1, 68 | init_blocks=0, 69 | min_blocks=0, 70 | max_blocks=10, 71 | parallelism=1, 72 | walltime="00:10:00", 73 | account=None, 74 | queue=None, 75 | overrides='', 76 | launcher=AprunLauncher(), 77 | cmd_timeout=10): 78 | super().__init__(label, 79 | channel=channel, 80 | script_dir=script_dir, 81 | nodes_per_block=nodes_per_block, 82 | tasks_per_node=tasks_per_node, 83 | init_blocks=init_blocks, 84 | min_blocks=min_blocks, 85 | max_blocks=max_blocks, 86 | parallelism=parallelism, 87 | walltime=walltime, 88 | launcher=launcher, 89 | cmd_timeout=cmd_timeout) 90 | 91 | self.account = account 92 | self.queue = queue 93 | self.overrides = overrides 94 | 95 | def _status(self): 96 | """ Internal: Do not call. Returns the status list for a list of job_ids 97 | 98 | Args: 99 | self 100 | 101 | Returns: 102 | [status...] : Status list of all jobs 103 | """ 104 | 105 | jobs_missing = list(self.resources.keys()) 106 | 107 | retcode, stdout, stderr = super().execute_wait("qstat -u $USER") 108 | 109 | # Execute_wait failed. Do no update. 110 | if retcode != 0: 111 | return 112 | 113 | for line in stdout.split('\n'): 114 | if line.startswith('='): 115 | continue 116 | 117 | parts = line.upper().split() 118 | if parts and parts[0] != 'JOBID': 119 | job_id = parts[0] 120 | 121 | if job_id not in self.resources: 122 | continue 123 | 124 | status = translate_table.get(parts[4], 'UNKNOWN') 125 | 126 | self.resources[job_id]['status'] = status 127 | jobs_missing.remove(job_id) 128 | 129 | # squeue does not report on jobs that are not running. So we are filling in the 130 | # blanks for missing jobs, we might lose some information about why the jobs failed. 131 | for missing_job in jobs_missing: 132 | if self.resources[missing_job]['status'] in ['RUNNING', 'KILLING', 'EXITING']: 133 | self.resources[missing_job]['status'] = translate_table['EXITING'] 134 | 135 | def submit(self, command, blocksize, job_name="parsl.auto"): 136 | """ Submits the command onto an Local Resource Manager job of blocksize parallel elements. 137 | Submit returns an ID that corresponds to the task that was just submitted. 138 | 139 | If tasks_per_node < 1 : ! This is illegal. tasks_per_node should be integer 140 | 141 | If tasks_per_node == 1: 142 | A single node is provisioned 143 | 144 | If tasks_per_node > 1 : 145 | tasks_per_node * blocksize number of nodes are provisioned. 146 | 147 | Args: 148 | - command :(String) Commandline invocation to be made on the remote side. 149 | - blocksize :(float) 150 | 151 | Kwargs: 152 | - job_name (String): Name for job, must be unique 153 | 154 | Returns: 155 | - None: At capacity, cannot provision more 156 | - job_id: (string) Identifier for the job 157 | 158 | """ 159 | 160 | if self.provisioned_blocks >= self.max_blocks: 161 | logger.warn("[%s] at capacity, cannot add more blocks now", self.label) 162 | return None 163 | 164 | # Note: Fix this later to avoid confusing behavior. 
165 | # We should always allocate blocks in integer counts of node_granularity 166 | if blocksize < self.nodes_per_block: 167 | blocksize = self.nodes_per_block 168 | 169 | account_opt = '-A {}'.format(self.account) if self.account is not None else '' 170 | 171 | job_name = "parsl.{0}.{1}".format(job_name, time.time()) 172 | 173 | script_path = "{0}/{1}.submit".format(self.script_dir, job_name) 174 | script_path = os.path.abspath(script_path) 175 | 176 | job_config = {} 177 | job_config["overrides"] = self.overrides 178 | 179 | logger.debug("Requesting blocksize:%s nodes_per_block:%s tasks_per_node:%s", 180 | blocksize, self.nodes_per_block, self.tasks_per_node) 181 | 182 | # Wrap the command 183 | job_config["user_script"] = self.launcher(command, self.tasks_per_node, self.nodes_per_block) 184 | 185 | queue_opt = '-q {}'.format(self.queue) if self.queue is not None else '' 186 | 187 | logger.debug("Writing submit script") 188 | self._write_submit_script(template_string, script_path, job_name, job_config) 189 | 190 | channel_script_path = self.channel.push_file(script_path, self.channel.script_dir) 191 | 192 | command = 'qsub -n {0} {1} -t {2} {3} {4}'.format( 193 | self.nodes_per_block, queue_opt, wtime_to_minutes(self.walltime), account_opt, channel_script_path) 194 | logger.debug("Executing {}".format(command)) 195 | 196 | retcode, stdout, stderr = super().execute_wait(command) 197 | 198 | # TODO : FIX this block 199 | if retcode != 0: 200 | logger.error("Failed command: {0}".format(command)) 201 | logger.error("Launch failed stdout:\n{0} \nstderr:{1}\n".format(stdout, stderr)) 202 | 203 | logger.debug("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip()) 204 | 205 | job_id = None 206 | 207 | if retcode == 0: 208 | # We should be getting only one line back 209 | job_id = stdout.strip() 210 | self.resources[job_id] = {'job_id': job_id, 'status': 'PENDING', 'blocksize': blocksize} 211 | else: 212 | logger.error("Submission of command to scale_out failed: {0}".format(stderr)) 213 | raise (ep_error.ScaleOutFailed(self.__class__, "Request to submit job to local scheduler failed")) 214 | 215 | logger.debug("Returning job id : {0}".format(job_id)) 216 | return job_id 217 | 218 | def cancel(self, job_ids): 219 | """ Cancels the jobs specified by a list of job ids 220 | 221 | Args: 222 | job_ids : [ ...] 223 | 224 | Returns : 225 | [True/False...] : If the cancel operation fails the entire list will be False. 
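            Example (editorial addition; the job id is a placeholder):
                provider.cancel(['123456'])  # -> [True]; the tracked status becomes 'COMPLETED'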
226 | """ 227 | 228 | job_id_list = ' '.join(job_ids) 229 | retcode, stdout, stderr = super().execute_wait("qdel {0}".format(job_id_list)) 230 | rets = None 231 | if retcode == 0: 232 | for jid in job_ids: 233 | self.resources[jid]['status'] = translate_table['KILLING'] # Setting state to cancelled 234 | rets = [True for i in job_ids] 235 | else: 236 | rets = [False for i in job_ids] 237 | 238 | return rets 239 | -------------------------------------------------------------------------------- /libsubmit/providers/cobalt/template.py: -------------------------------------------------------------------------------- 1 | template_string = '''#!/bin/bash -e 2 | $overrides 3 | 4 | echo "Starting Cobalt job script" 5 | 6 | echo "----Cobalt Nodefile: -----" 7 | cat $$COBALT_NODEFILE 8 | echo "--------------------------" 9 | 10 | export JOBNAME="${jobname}" 11 | 12 | $user_script 13 | 14 | echo "End of Cobalt job" 15 | ''' 16 | -------------------------------------------------------------------------------- /libsubmit/providers/condor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/condor/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/condor/template.py: -------------------------------------------------------------------------------- 1 | template_string = ''' 2 | universe = vanilla 3 | should_transfer_files = YES 4 | when_to_transfer_output = ON_EXIT_OR_EVICT 5 | Transfer_Executable = false 6 | transfer_input_files = ${input_files} 7 | machine_count = ${nodes} 8 | output = ${submit_script_dir}/${job_name}.stdout 9 | error = ${submit_script_dir}/${job_name}.stderr 10 | executable = /bin/bash 11 | arguments = ${job_script} 12 | requirements = ${requirements} 13 | +projectname = ${project} 14 | leave_in_queue = TRUE 15 | environment = "${environment}" 16 | 17 | ${overrides} 18 | 19 | queue 20 | 21 | ''' 22 | 23 | # for later, 24 | # if we want to remove on preemption, this might work: 25 | # PERIODIC_REMOVE = (NumJobstarts > 1) 26 | # or if the pilot can trap signals, then we can send a special exit code on 27 | # sigterm/sigkill and remove that way. 
but then we still need to be careful in
28 | # cases where the worker dies, for example -- in that case no signal is sent
29 | 
--------------------------------------------------------------------------------
/libsubmit/providers/googlecloud/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/googlecloud/__init__.py
--------------------------------------------------------------------------------
/libsubmit/providers/googlecloud/googlecloud.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | import logging
3 | import os
4 | 
5 | logger = logging.getLogger(__name__)
6 | 
7 | try:
8 |     import googleapiclient.discovery
9 | 
10 | except ImportError:
11 |     _google_enabled = False
12 | else:
13 |     _google_enabled = True
14 | 
15 | translate_table = {
16 |     'PENDING': 'PENDING',
17 |     'PROVISIONING': 'PENDING',
18 |     "STAGING": "PENDING",
19 |     'RUNNING': 'RUNNING',
20 |     'DONE': 'COMPLETED',
21 |     'STOPPING': 'COMPLETED',
22 |     'STOPPED': 'COMPLETED',
23 |     'TERMINATED': 'COMPLETED',
24 |     'SUSPENDING': 'COMPLETED',
25 |     'SUSPENDED': 'COMPLETED',
26 | }
27 | 
28 | 
29 | class GoogleCloudProvider():
30 |     """A provider for using resources from the Google Compute Engine.
31 | 
32 |     Parameters
33 |     ----------
34 |     project_id : str
35 |         Project ID from Google compute engine.
36 |     key_file : str
37 |         Path to authorization private key json file. This is required for auth.
38 |         A new one can be generated here: https://console.cloud.google.com/apis/credentials
39 |     region : str
40 |         Region in which to start instances
41 |     os_project : str
42 |         OS project code for Google compute engine.
43 |     os_family : str
44 |         OS family to request.
45 |     label : str
46 |         A label for this executor. Default is 'google_cloud'.
47 |     google_version : str
48 |         Google compute engine version to use. Possibilities include 'v1' (default) or 'beta'.
49 |     instance_type : str
50 |         Google Compute Engine machine type, e.g. 'n1-standard-1' (the default).
51 |     script_dir : str
52 |         Relative or absolute path to a directory where intermediate scripts are placed.
53 |     init_blocks : int
54 |         Number of blocks to provision immediately. Default is 1.
55 |     min_blocks : int
56 |         Minimum number of blocks to maintain. Default is 0.
57 |     max_blocks : int
58 |         Maximum number of blocks to maintain. Default is 10.
59 |     parallelism : float
60 |         Ratio of provisioned task slots to active tasks. A parallelism value of 1 represents aggressive
61 |         scaling where as many resources as possible are used; parallelism close to 0 represents
62 |         the opposite situation in which as few resources as possible (i.e., min_blocks) are used.
63 | 
64 |     ..
code:: python 65 | 66 | +------------------ 67 | | 68 | script_string ------->| submit 69 | id <--------|---+ 70 | | 71 | [ ids ] ------->| status 72 | [statuses] <--------|----+ 73 | | 74 | [ ids ] ------->| cancel 75 | [cancel] <--------|----+ 76 | | 77 | [True/False] <--------| scaling_enabled 78 | | 79 | +------------------- 80 | """ 81 | 82 | def __init__(self, 83 | project_id, 84 | key_file, 85 | region, 86 | os_project, 87 | os_family, 88 | label='google_cloud', 89 | google_version='v1', 90 | instance_type='n1-standard-1', 91 | script_dir='parsl_scripts', 92 | init_blocks=1, 93 | min_blocks=0, 94 | max_blocks=10, 95 | parallelism=1): 96 | self.project_id = project_id 97 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file 98 | self.zone = self.get_zone(region) 99 | self.os_project = os_project 100 | self.os_family = os_family 101 | self.label = label 102 | self.client = googleapiclient.discovery.build('compute', google_version) 103 | self.instance_type = instance_type 104 | self.script_dir = script_dir 105 | if not os.path.exists(self.script_dir): 106 | os.makedirs(self.script_dir) 107 | self.init_blocks = init_blocks 108 | self.min_blocks = min_blocks 109 | self.max_blocks = max_blocks 110 | self.parallelism = parallelism 111 | self.num_instances = 0 112 | 113 | # Dictionary that keeps track of jobs, keyed on job_id 114 | self.resources = {} 115 | self.provisioned_blocks = 0 116 | atexit.register(self.bye) 117 | 118 | def submit(self, command="", blocksize=1, job_name="parsl.auto"): 119 | ''' The submit method takes the command string to be executed upon 120 | instantiation of a resource most often to start a pilot. 121 | 122 | Args : 123 | - command (str) : The bash command string to be executed. 124 | - blocksize (int) : Blocksize to be requested 125 | 126 | KWargs: 127 | - job_name (str) : Human friendly name to be assigned to the job request 128 | 129 | Returns: 130 | - A job identifier, this could be an integer, string etc 131 | 132 | Raises: 133 | - ExecutionProviderException or its subclasses 134 | ''' 135 | instance, name = self.create_instance(command=command) 136 | self.provisioned_blocks += 1 137 | self.resources[name] = {"job_id": name, "status": translate_table[instance['status']]} 138 | return name 139 | 140 | def status(self, job_ids): 141 | ''' Get the status of a list of jobs identified by the job identifiers 142 | returned from the submit request. 143 | 144 | Args: 145 | - job_ids (list) : A list of job identifiers 146 | 147 | Returns: 148 | - A list of status from ['PENDING', 'RUNNING', 'CANCELLED', 'COMPLETED', 149 | 'FAILED', 'TIMEOUT'] corresponding to each job_id in the job_ids list. 150 | 151 | Raises: 152 | - ExecutionProviderException or its subclasses 153 | 154 | ''' 155 | statuses = [] 156 | for job_id in job_ids: 157 | instance = self.client.instances().get(instance=job_id, project=self.project_id, zone=self.zone).execute() 158 | self.resources[job_id]['status'] = translate_table[instance['status']] 159 | statuses.append(translate_table[instance['status']]) 160 | return statuses 161 | 162 | def cancel(self, job_ids): 163 | ''' Cancels the resources identified by the job_ids provided by the user. 
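Together with ``submit`` and ``status`` above, this completes the
        provider lifecycle. A minimal end-to-end sketch, assuming valid
        credentials (the project, key file, and image values here are
        hypothetical):

        .. code:: python

            provider = GoogleCloudProvider(project_id='my-project',       # hypothetical project
                                           key_file='/path/to/key.json',  # hypothetical key file
                                           region='us-central1',
                                           os_project='debian-cloud',
                                           os_family='debian-9')
            job = provider.submit(command='sleep 60')  # boots one instance
            print(provider.status([job]))              # e.g. ['PENDING'] or ['RUNNING']
            provider.cancel([job])                     # deletes the instance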
164 | 165 | Args: 166 | - job_ids (list): A list of job identifiers 167 | 168 | Returns: 169 | - A list of status from cancelling the job which can be True, False 170 | 171 | Raises: 172 | - ExecutionProviderException or its subclasses 173 | ''' 174 | statuses = [] 175 | for job_id in job_ids: 176 | try: 177 | self.delete_instance(job_id) 178 | statuses.append(True) 179 | self.provisioned_blocks -= 1 180 | except Exception: 181 | statuses.append(False) 182 | return statuses 183 | 184 | @property 185 | def scaling_enabled(self): 186 | ''' Scaling is enabled 187 | 188 | Returns: 189 | - Status (Bool) 190 | ''' 191 | return True 192 | 193 | @property 194 | def current_capacity(self): 195 | """Returns the number of currently provisioned blocks.""" 196 | return self.provisioned_blocks 197 | 198 | def bye(self): 199 | self.cancel([i for i in list(self.resources)]) 200 | 201 | def create_instance(self, command=""): 202 | name = "parslauto{}".format(self.num_instances) 203 | self.num_instances += 1 204 | compute = self.client 205 | project = self.project_id 206 | image_response = compute.images().getFromFamily( 207 | project=self.os_project, family=self.os_family).execute() 208 | source_disk_image = image_response['selfLink'] 209 | 210 | # Configure the machine 211 | machine_type = "zones/{}/machineTypes/{}".format(self.zone, self.instance_type) 212 | startup_script = command 213 | 214 | config = { 215 | 'name': name, 216 | 'machineType': machine_type, 217 | 218 | # Specify the boot disk and the image to use as a source. 219 | 'disks': [{ 220 | 'boot': True, 221 | 'autoDelete': True, 222 | 'initializeParams': { 223 | 'sourceImage': source_disk_image, 224 | } 225 | }], 226 | 'networkInterfaces': [{ 227 | 'network': 'global/networks/default', 228 | 'accessConfigs': [{ 229 | 'type': 'ONE_TO_ONE_NAT', 230 | 'name': 'External NAT' 231 | }] 232 | }], 233 | 'serviceAccounts': [{ 234 | 'email': 235 | 'default', 236 | 'scopes': [ 237 | 'https://www.googleapis.com/auth/devstorage.read_write', 238 | 'https://www.googleapis.com/auth/logging.write' 239 | ] 240 | }], 241 | 'metadata': { 242 | 'items': [{ 243 | # Startup script is automatically executed by the 244 | # instance upon startup. 
245 | 'key': 'startup-script', 246 | 'value': startup_script 247 | }] 248 | } 249 | } 250 | 251 | return compute.instances().insert(project=project, zone=self.zone, body=config).execute(), name 252 | 253 | def get_zone(self, region): 254 | res = self.client.zones().list(project=self.project_id).execute() 255 | for zone in res['items']: 256 | if region in zone['name'] and zone['status'] == "UP": 257 | return zone["name"] 258 | 259 | def delete_instance(self, name): 260 | 261 | compute = self.client 262 | project = self.project_id 263 | zone = self.zone 264 | 265 | return compute.instances().delete(project=project, zone=zone, instance=name).execute() 266 | -------------------------------------------------------------------------------- /libsubmit/providers/grid_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/grid_engine/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/grid_engine/grid_engine.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | 5 | from libsubmit.channels import LocalChannel 6 | from libsubmit.providers.cluster_provider import ClusterProvider 7 | from libsubmit.providers.grid_engine.template import template_string 8 | from libsubmit.launchers import SingleNodeLauncher 9 | from libsubmit.utils import RepresentationMixin, wtime_to_minutes 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | translate_table = { 14 | 'qw': 'PENDING', 15 | 'hqw': 'PENDING', 16 | 'hrwq': 'PENDING', 17 | 'r': 'RUNNING', 18 | 's': 'FAILED', # suspended 19 | 'ts': 'FAILED', 20 | 't': 'FAILED', # Suspended by alarm 21 | 'eqw': 'FAILED', # Error states 22 | 'ehqw': 'FAILED', # .. 23 | 'ehrqw': 'FAILED', # .. 24 | 'd': 'COMPLETED', 25 | 'dr': 'COMPLETED', 26 | 'dt': 'COMPLETED', 27 | 'drt': 'COMPLETED', 28 | 'ds': 'COMPLETED', 29 | 'drs': 'COMPLETED', 30 | } 31 | 32 | 33 | class GridEngineProvider(ClusterProvider, RepresentationMixin): 34 | """A provider for the Grid Engine scheduler. 35 | 36 | Parameters 37 | ---------- 38 | channel : Channel 39 | Channel for accessing this provider. Possible channels include 40 | :class:`~libsubmit.channels.LocalChannel` (the default), 41 | :class:`~libsubmit.channels.SSHChannel`, or 42 | :class:`~libsubmit.channels.SSHInteractiveLoginChannel`. 43 | label : str 44 | Label for this provider. 45 | script_dir : str 46 | Relative or absolute path to a directory where intermediate scripts are placed. 47 | nodes_per_block : int 48 | Nodes to provision per block. 49 | tasks_per_node : int 50 | Tasks to run per node. 51 | min_blocks : int 52 | Minimum number of blocks to maintain. 53 | max_blocks : int 54 | Maximum number of blocks to maintain. 55 | parallelism : float 56 | Ratio of provisioned task slots to active tasks. A parallelism value of 1 represents aggressive 57 | scaling where as many resources as possible are used; parallelism close to 0 represents 58 | the opposite situation in which as few resources as possible (i.e., min_blocks) are used. 59 | walltime : str 60 | Walltime requested per block in HH:MM:SS. 61 | overrides : str 62 | String to prepend to the #$ blocks in the submit script to the scheduler. 63 | launcher : Launcher 64 | Launcher for this provider. 
Possible launchers include 65 | :class:`~libsubmit.launchers.SingleNodeLauncher` (the default). 66 | """ 67 | 68 | def __init__(self, 69 | channel=LocalChannel(), 70 | label='grid_engine', 71 | script_dir='parsl_scripts', 72 | nodes_per_block=1, 73 | tasks_per_node=1, 74 | init_blocks=1, 75 | min_blocks=0, 76 | max_blocks=10, 77 | parallelism=1, 78 | walltime="00:10:00", 79 | overrides='', 80 | launcher=SingleNodeLauncher()): 81 | super().__init__(label, 82 | channel, 83 | script_dir, 84 | nodes_per_block, 85 | tasks_per_node, 86 | init_blocks, 87 | min_blocks, 88 | max_blocks, 89 | parallelism, 90 | walltime, 91 | launcher) 92 | self.overrides = overrides 93 | 94 | if launcher in ['srun', 'srun_mpi']: 95 | logger.warning("Use of {} launcher is usually appropriate for Slurm providers. " 96 | "Recommended options include 'single_node' or 'aprun'.".format(launcher)) 97 | 98 | def get_configs(self, command): 99 | """Compose a dictionary with information for writing the submit script.""" 100 | 101 | logger.debug("Requesting one block with {} nodes per block and {} tasks per node".format( 102 | self.nodes_per_block, self.tasks_per_node)) 103 | 104 | job_config = {} 105 | job_config["submit_script_dir"] = self.channel.script_dir 106 | job_config["nodes"] = self.nodes_per_block 107 | job_config["walltime"] = wtime_to_minutes(self.walltime) 108 | job_config["overrides"] = self.overrides 109 | job_config["user_script"] = command 110 | 111 | job_config["user_script"] = self.launcher(command, 112 | self.tasks_per_node, 113 | self.nodes_per_block) 114 | return job_config 115 | 116 | def submit(self, command="", blocksize=1, job_name="parsl.auto"): 117 | ''' The submit method takes the command string to be executed upon 118 | instantiation of a resource most often to start a pilot (such as IPP engine 119 | or even Swift-T engines). 120 | 121 | Args : 122 | - command (str) : The bash command string to be executed. 123 | - blocksize (int) : Blocksize to be requested 124 | 125 | KWargs: 126 | - job_name (str) : Human friendly name to be assigned to the job request 127 | 128 | Returns: 129 | - A job identifier, this could be an integer, string etc 130 | 131 | Raises: 132 | - ExecutionProviderException or its subclasses 133 | ''' 134 | 135 | # Note: Fix this later to avoid confusing behavior. 136 | # We should always allocate blocks in integer counts of node_granularity 137 | if blocksize < self.nodes_per_block: 138 | blocksize = self.nodes_per_block 139 | 140 | # Set job name 141 | job_name = "{0}.{1}".format(job_name, time.time()) 142 | 143 | # Set script path 144 | script_path = "{0}/{1}.submit".format(self.script_dir, job_name) 145 | script_path = os.path.abspath(script_path) 146 | 147 | job_config = self.get_configs(command) 148 | 149 | logger.debug("Writing submit script") 150 | self._write_submit_script(template_string, script_path, job_name, job_config) 151 | 152 | channel_script_path = self.channel.push_file(script_path, self.channel.script_dir) 153 | cmd = "qsub -terse {0}".format(channel_script_path) 154 | retcode, stdout, stderr = super().execute_wait(cmd, 10) 155 | 156 | if retcode == 0: 157 | for line in stdout.split('\n'): 158 | job_id = line.strip() 159 | if not job_id: 160 | continue 161 | self.resources[job_id] = {'job_id': job_id, 'status': 'PENDING', 'blocksize': blocksize} 162 | return job_id 163 | else: 164 | print("[WARNING!!] 
Submission of command to scale_out failed") 165 | logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip()) 166 | 167 | def _status(self): 168 | ''' Get the status of a list of jobs identified by the job identifiers 169 | returned from the submit request. 170 | 171 | Returns: 172 | - A list of status from ['PENDING', 'RUNNING', 'CANCELLED', 'COMPLETED', 173 | 'FAILED', 'TIMEOUT'] corresponding to each job_id in the job_ids list. 174 | 175 | Raises: 176 | - ExecutionProviderException or its subclasses 177 | 178 | ''' 179 | 180 | cmd = "qstat" 181 | 182 | retcode, stdout, stderr = super().execute_wait(cmd) 183 | 184 | # Execute_wait failed. Do no update 185 | if retcode != 0: 186 | return 187 | 188 | jobs_missing = list(self.resources.keys()) 189 | for line in stdout.split('\n'): 190 | parts = line.split() 191 | if parts and parts[0].lower().lower() != 'job-id' \ 192 | and not parts[0].startswith('----'): 193 | job_id = parts[0] 194 | status = translate_table.get(parts[4].lower(), 'UNKNOWN') 195 | if job_id in self.resources: 196 | self.resources[job_id]['status'] = status 197 | jobs_missing.remove(job_id) 198 | 199 | # Filling in missing blanks for jobs that might have gone missing 200 | # we might lose some information about why the jobs failed. 201 | for missing_job in jobs_missing: 202 | if self.resources[missing_job]['status'] in ['PENDING', 'RUNNING']: 203 | self.resources[missing_job]['status'] = 'COMPLETED' 204 | 205 | def cancel(self, job_ids): 206 | ''' Cancels the resources identified by the job_ids provided by the user. 207 | 208 | Args: 209 | - job_ids (list): A list of job identifiers 210 | 211 | Returns: 212 | - A list of status from cancelling the job which can be True, False 213 | 214 | Raises: 215 | - ExecutionProviderException or its subclasses 216 | ''' 217 | 218 | job_id_list = ' '.join(job_ids) 219 | cmd = "qdel {}".format(job_id_list) 220 | retcode, stdout, stderr = super().execute_wait(cmd, 3) 221 | 222 | rets = None 223 | if retcode == 0: 224 | for jid in job_ids: 225 | self.resources[jid]['status'] = "COMPLETED" 226 | rets = [True for i in job_ids] 227 | else: 228 | rets = [False for i in job_ids] 229 | 230 | return rets 231 | -------------------------------------------------------------------------------- /libsubmit/providers/grid_engine/template.py: -------------------------------------------------------------------------------- 1 | template_string = """#!/bin/bash 2 | #$$ -S /bin/bash 3 | #$$ -o ${submit_script_dir}/${jobname}.submit.stdout 4 | #$$ -e ${submit_script_dir}/${jobname}.submit.stderr 5 | #$$ -cwd 6 | #$$ -l h_rt=${walltime} 7 | $overrides 8 | 9 | export JOBNAME="${jobname}" 10 | 11 | $user_script 12 | """ 13 | -------------------------------------------------------------------------------- /libsubmit/providers/jetstream/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/jetstream/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/jetstream/jetstream.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import configparser 3 | import logging 4 | import os 5 | 6 | from libsubmit.error import * 7 | 8 | try: 9 | from novaclient import api_versions 10 | from novaclient import client 11 | 12 | except ImportError: 13 | _nova_enabled = False 14 | else: 15 | 
_nova_enabled = True 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | setup_script = '''#!/bin/bash 20 | LOG=/root/userdata.logs 21 | echo "Userdata logs " > $LOG 22 | apt-get update &>> $LOG 23 | yes | aptdcon --hide-terminal --install python3-pip &>> $LOG 24 | pip3 install jupyter ipyparallel parsl &>> $LOG 25 | cat <<EOF > ipcontroller-engine.json 26 | {engine_config} 27 | EOF 28 | ipengine --file=ipcontroller-engine.json &>> $LOG 29 | ''' 30 | 31 | 32 | class JetstreamProvider(object): 33 | def __init__(self, config, poolname): 34 | self.config = config 35 | self.blocks = {} 36 | self.pool = poolname 37 | controller_file = "~/.ipython/profile_default/security/ipcontroller-engine.json" 38 | 39 | if not _nova_enabled: 40 | raise OptionalModuleMissing(['python-novaclient'], 41 | "Jetstream Provider requires the python-novaclient module.") 42 | 43 | self.client = client.Client( 44 | api_versions.APIVersion("2.0"), 45 | config['sites.jetstream']['OS_USERNAME'], 46 | config['sites.jetstream']['OS_PASSWORD'], 47 | project_id=config['sites.jetstream']['OS_PROJECT_ID'], 48 | project_name=config['sites.jetstream']['OS_PROJECT_NAME'], 49 | auth_url=config['sites.jetstream']['OS_AUTH_URL'], 50 | insecure=False, 51 | region_name=config['sites.jetstream']['OS_REGION_NAME'], 52 | user_domain_name=config['sites.jetstream']['OS_USER_DOMAIN_NAME']) 53 | 54 | api_version = api_versions.get_api_version("2.0") 55 | api_version = api_versions.discover_version(self.client, api_version) 56 | client.discover_extensions(api_version) 57 | 58 | logger.debug(self.client.has_neutron()) 59 | self.server_manager = self.client.servers 60 | 61 | try: 62 | with open(os.path.expanduser(controller_file), 'r') as f: 63 | self.engine_config = f.read() 64 | 65 | except FileNotFoundError: 66 | logger.error("No controller_file found at : %s. Cannot proceed", controller_file) 67 | exit(-1) 68 | 69 | except Exception as e: 70 | 71 | logger.error("Caught exception while reading from the ipcontroller_engine.json") 72 | raise e 73 | 74 | try: 75 | # Check if the authentication worked by forcing a call 76 | self.server_manager.list() 77 | 78 | except Exception as e: 79 | logger.error("Caught exception : %s", e) 80 | raise e 81 | 82 | flavors = self.client.flavors.list() 83 | 84 | try: 85 | self.flavor = [f for f in flavors if f.name == config['sites.jetstream.{0}'.format(poolname)]['flavor']][0] 86 | except Exception as e: 87 | logger.error("Caught exception : %s", e) 88 | raise e 89 | 90 | self.sec_groups = ast.literal_eval(config['sites.jetstream.{0}'.format(poolname)]['sec_groups']) 91 | self.nics = ast.literal_eval(config['sites.jetstream.{0}'.format(poolname)]['nics']) 92 | 93 | def scale_out(self, blocks=1, block_size=1): 94 | ''' Scale out the existing resources. 
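        KWargs:
            - blocks (int) : Number of blocks to add; only ``blocks == 1`` is handled.
            - block_size (int) : Number of instances to boot per block.

        A sketch of driving this provider, following the ``__main__`` block at
        the bottom of this file (the config path is hypothetical):

        .. code:: python

            import configparser

            cfg = configparser.ConfigParser()
            cfg.read('/path/to/parsl.config')           # hypothetical config path
            provider = JetstreamProvider(cfg, 'pool1')  # 'pool1' must match a config section
            provider.scale_out(blocks=1, block_size=2)  # boot two instances
            provider.scale_in(machines=2)               # tear them down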
95 | ''' 96 | self.config['sites.jetstream.{0}'.format(self.pool)]['flavor'] 97 | count = 0 98 | if blocks == 1: 99 | block_id = len(self.blocks) 100 | self.blocks[block_id] = [] 101 | for instance_id in range(0, block_size): 102 | instances = self.server_manager.create( 103 | 'parsl-{0}-{1}'.format(block_id, instance_id), # Name 104 | self.client.images.get('87e08a17-eae2-4ce4-9051-c561d9a54bde'), # Image_id 105 | self.client.flavors.list()[0], 106 | min_count=1, 107 | max_count=1, 108 | userdata=setup_script.format(engine_config=self.engine_config), 109 | key_name='TG-MCB090174-api-key', 110 | security_groups=['global-ssh'], 111 | nics=[{ 112 | "net-id": '724a50cf-7f11-4b3b-a884-cd7e6850e39e', 113 | "net-name": 'PARSL-priv-net', 114 | "v4-fixed-ip": '' 115 | }]) 116 | self.blocks[block_id].extend([instances]) 117 | count += 1 118 | 119 | return count 120 | 121 | def scale_in(self, blocks=0, machines=0, strategy=None): 122 | ''' Scale in resources 123 | ''' 124 | count = 0 125 | instances = self.client.servers.list() 126 | for instance in instances[0:machines]: 127 | print("Deleting : ", instance) 128 | instance.delete() 129 | count += 1 130 | 131 | return count 132 | 133 | 134 | if __name__ == '__main__': 135 | 136 | Config = configparser.ConfigParser() 137 | Config.read('/home/yadu/.ssh/parsl.config') 138 | 139 | print(Config['sites.jetstream']['OS_AUTH_URL']) 140 | foo = JetstreamProvider(Config, 'pool1') 141 | 142 | # foo.scale_out(blocks=1, block_size=2) 143 | foo.scale_in(machines=2) 144 | # foo.scale_out(blocks=1, block_size=2) 145 | -------------------------------------------------------------------------------- /libsubmit/providers/jetstream/setup_first_time.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | check_tools() { 4 | 5 | which nova 6 | if [[ $? != 0 ]] 7 | then 8 | echo "nova is missing. 
Try apt-get install nova" 9 | fi 10 | 11 | } 12 | 13 | setup_secgroups() { 14 | echo "Setting up sec groups" 15 | nova secgroup-create global-ssh "ssh & icmp enabled" 16 | nova secgroup-add-rule global-ssh tcp 22 22 0.0.0.0/0 17 | nova secgroup-add-rule global-ssh icmp -1 -1 0.0.0.0/0 18 | 19 | } 20 | 21 | setup_keypair() { 22 | ssh-keygen -b 2048 -t rsa -f ${OS_PROJECT_NAME}-api-key -P "" 23 | nova keypair-add --pub-key ${OS_PROJECT_NAME}-api-key.pub ${OS_PROJECT_NAME}-api-key 24 | } 25 | 26 | setup_network() { 27 | 28 | parsl_net="PARSL-priv-net" 29 | neutron net-create $parsl_net 30 | neutron net-list 31 | neutron subnet-create $parsl_net 10.0.0.0/24 --name parsl-api-subnet1 32 | neutron net-list 33 | neutron router-create parsl-api-router 34 | neutron router-interface-add parsl-api-router parsl-api-subnet1 35 | neutron router-gateway-set parsl-api-router public 36 | neutron router-show parsl-api-router 37 | } 38 | 39 | check_tools 40 | #setup_secgroups 41 | #setup_keypair 42 | setup_network 43 | nova boot parsl-executor-001 --flavor m1.small --image 87e08a17-eae2-4ce4-9051-c561d9a54bde --key-name TG-MCB090174-api-key --security-groups global-ssh --nic net-name=PARSL-priv-net 44 | -------------------------------------------------------------------------------- /libsubmit/providers/kubernetes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/kubernetes/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/kubernetes/kube.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from libsubmit.providers.kubernetes.template import template_string 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | from libsubmit.error import * 8 | from libsubmit.providers.provider_base import ExecutionProvider 9 | 10 | try: 11 | from kubernetes import client, config 12 | config.load_kube_config() 13 | _kubernetes_enabled = True 14 | except (ImportError, NameError, FileNotFoundError): 15 | _kubernetes_enabled = False 16 | 17 | 18 | class KubernetesProvider(ExecutionProvider): 19 | """ Kubernetes execution provider: 20 | 21 | TODO: put in a config 22 | """ 23 | 24 | def __repr__(self): 25 | return "<Kubernetes Execution Provider for site:{0}>".format(self.sitename) 26 | 27 | def __init__(self, config, channel=None): 28 | """ Initialize the Kubernetes execution provider class 29 | 30 | Args: 31 | - Config (dict): Dictionary with all the config options. 
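              A sketch of the expected shape, inferred from the keys read in
              the constructor body below (all values hypothetical):

              .. code:: python

                  config = {
                      'site': 'k8s-site',
                      'execution': {
                          'namespace': 'default',
                          'image': 'python:3.6',   # container image to launch
                          'block': {
                              'initBlocks': 1,
                              'minBlocks': 0,
                              'maxBlocks': 10,
                              'options': {'overrides': ''},
                          },
                          # optional keys: 'security', 'secret'
                      },
                  }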
32 | 33 | KWargs : 34 | - channel (channel object) : default=None A channel object 35 | """ 36 | 37 | self.channel = channel 38 | 39 | if not _kubernetes_enabled: 40 | raise OptionalModuleMissing(['kubernetes'], 41 | "Kubernetes provider requires kubernetes module and config.") 42 | 43 | self.kube_client = client.ExtensionsV1beta1Api() 44 | 45 | self.config = config 46 | self.sitename = self.config['site'] 47 | self.namespace = self.config['execution']['namespace'] 48 | self.image = self.config['execution']['image'] 49 | 50 | self.init_blocks = self.config["execution"]["block"]["initBlocks"] 51 | self.min_blocks = self.config["execution"]["block"]["minBlocks"] 52 | self.max_blocks = self.config["execution"]["block"]["maxBlocks"] 53 | 54 | self.user_id = None 55 | self.group_id = None 56 | self.run_as_non_root = None 57 | if 'security' in self.config['execution']: 58 | self.user_id = self.config["execution"]['security']["user_id"] 59 | self.group_id = self.config["execution"]['security']["group_id"] 60 | self.run_as_non_root = self.config["execution"]['security']["run_as_non_root"] 61 | 62 | self.secret = None 63 | if 'secret' in self.config['execution']: 64 | self.secret = self.config['execution']['secret'] 65 | 66 | # Dictionary that keeps track of jobs, keyed on job_id 67 | self.resources = {} 68 | 69 | def submit(self, cmd_string, blocksize, job_name="parsl.auto"): 70 | """ Submit a job 71 | 72 | Args: 73 | - cmd_string :(String) - Name of the container to initiate 74 | - blocksize :(float) - Number of replicas 75 | 76 | Kwargs: 77 | - job_name (String): Name for job, must be unique 78 | 79 | Returns: 80 | - None: At capacity, cannot provision more 81 | - job_id: (string) Identifier for the job 82 | 83 | """ 84 | if not self.resources: 85 | job_name = "{0}-{1}".format(job_name, time.time()).split(".")[0] 86 | 87 | self.deployment_name = '{}-{}-deployment'.format(job_name, 88 | str(time.time()).split('.')[0]) 89 | 90 | formatted_cmd = template_string.format(command=cmd_string, 91 | overrides=self.config["execution"]["block"]["options"].get("overrides", '')) 92 | 93 | print("Creating replicas :", self.init_blocks) 94 | self.deployment_obj = self._create_deployment_object(job_name, 95 | self.image, 96 | self.deployment_name, 97 | cmd_string=formatted_cmd, 98 | replicas=self.init_blocks) 99 | logger.debug("Deployment name :{}".format(self.deployment_name)) 100 | self._create_deployment(self.deployment_obj) 101 | self.resources[self.deployment_name] = {'status': 'RUNNING', 102 | 'pods': self.init_blocks} 103 | 104 | return self.deployment_name 105 | 106 | def status(self, job_ids): 107 | """ Get the status of a list of jobs identified by the job identifiers 108 | returned from the submit request. 109 | 110 | Args: 111 | - job_ids (list) : A list of job identifiers 112 | 113 | Returns: 114 | - A list of status from ['PENDING', 'RUNNING', 'CANCELLED', 'COMPLETED', 115 | 'FAILED', 'TIMEOUT'] corresponding to each job_id in the job_ids list. 116 | 117 | Raises: 118 | - ExecutionProviderExceptions or its subclasses 119 | 120 | """ 121 | self._status() 122 | # This is a hack 123 | return ['RUNNING' for jid in job_ids] 124 | 125 | def cancel(self, job_ids): 126 | """ Cancels the jobs specified by a list of job ids 127 | 128 | Args: 129 | job_ids : [ ...] 130 | 131 | Returns : 132 | [True/False...] : If the cancel operation fails the entire list will be False. 
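        A sketch of use, with a hypothetical pilot command (the identifier is
        the deployment name returned by ``submit`` above):

        .. code:: python

            name = provider.submit('ipengine', 1)  # hypothetical command
            provider.cancel([name])                # deletes the deployment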
133 | """ 134 | for job in job_ids: 135 | logger.debug("Terminating job/proc_id : {0}".format(job)) 136 | # Here we are assuming that for local, the job_ids are the process id's 137 | self._delete_deployment(job) 138 | 139 | self.resources[job]['status'] = 'CANCELLED' 140 | rets = [True for i in job_ids] 141 | 142 | return rets 143 | 144 | def _status(self): 145 | """ Internal: Do not call. Returns the status list for a list of job_ids 146 | 147 | Args: 148 | self 149 | 150 | Returns: 151 | [status...] : Status list of all jobs 152 | """ 153 | 154 | jobs_ids = list(self.resources.keys()) 155 | # TODO: fix this 156 | return jobs_ids 157 | # do something to get the deployment's status 158 | 159 | def _create_deployment_object(self, job_name, job_image, 160 | deployment_name, port=80, 161 | replicas=1, 162 | cmd_string=None, 163 | engine_json_file='~/.ipython/profile_default/security/ipcontroller-engine.json', 164 | engine_dir='.'): 165 | """ Create a kubernetes deployment for the job. 166 | 167 | Args: 168 | - job_name (string) : Name of the job and deployment 169 | - job_image (string) : Docker image to launch 170 | 171 | KWargs: 172 | - port (integer) : Container port 173 | - replicas : Number of replica containers to maintain 174 | 175 | Returns: 176 | - True: The deployment object to launch 177 | """ 178 | 179 | # sorry, quick hack that doesn't pass this stuff through to test it works. 180 | # TODO it also doesn't only add what is set :( 181 | security_context = None 182 | if 'security' in self.config['execution']: 183 | security_context = client.V1SecurityContext(run_as_group=self.group_id, 184 | run_as_user=self.user_id, 185 | run_as_non_root=self.run_as_non_root) 186 | # self.user_id = None 187 | # self.group_id = None 188 | # self.run_as_non_root = None 189 | # Create the enviornment variables and command to initiate IPP 190 | environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA") 191 | 192 | launch_args = ["-c", "{0}; /app/deploy.sh;".format(cmd_string)] 193 | print(launch_args) 194 | 195 | # Configureate Pod template container 196 | container = None 197 | if security_context: 198 | container = client.V1Container( 199 | name=job_name, 200 | image=job_image, 201 | ports=[client.V1ContainerPort(container_port=port)], 202 | command=['/bin/bash'], 203 | args=launch_args, 204 | env=[environment_vars], 205 | security_context=security_context) 206 | else: 207 | container = client.V1Container( 208 | name=job_name, 209 | image=job_image, 210 | ports=[client.V1ContainerPort(container_port=port)], 211 | command=['/bin/bash'], 212 | args=launch_args, 213 | env=[environment_vars]) 214 | # Create a secret to enable pulling images from secure repositories 215 | secret = None 216 | if self.secret: 217 | secret = client.V1LocalObjectReference(name=self.secret) 218 | 219 | # Create and configurate a spec section 220 | template = client.V1PodTemplateSpec( 221 | metadata=client.V1ObjectMeta(labels={"app": job_name}), 222 | spec=client.V1PodSpec(containers=[container], image_pull_secrets=[secret])) 223 | 224 | # Create the specification of deployment 225 | spec = client.ExtensionsV1beta1DeploymentSpec(replicas=replicas, 226 | template=template) 227 | 228 | # Instantiate the deployment object 229 | deployment = client.ExtensionsV1beta1Deployment( 230 | api_version="extensions/v1beta1", 231 | kind="Deployment", 232 | metadata=client.V1ObjectMeta(name=deployment_name), 233 | spec=spec) 234 | 235 | return deployment 236 | 237 | def _create_deployment(self, deployment): 238 | """ Create the 
kubernetes deployment """ 239 | 240 | api_response = self.kube_client.create_namespaced_deployment( 241 | body=deployment, 242 | namespace=self.namespace) 243 | 244 | logger.debug("Deployment created. status='{0}'".format(str(api_response.status))) 245 | 246 | def _delete_deployment(self, deployment_name): 247 | """ Delete deployment """ 248 | 249 | api_response = self.kube_client.delete_namespaced_deployment( 250 | name=deployment_name, 251 | namespace=self.namespace, 252 | body=client.V1DeleteOptions( 253 | propagation_policy='Foreground', 254 | grace_period_seconds=5)) 255 | 256 | logger.debug("Deployment deleted. status='{0}'".format( 257 | str(api_response.status))) 258 | 259 | @property 260 | def scaling_enabled(self): 261 | return False 262 | 263 | @property 264 | def channels_required(self): 265 | return False 266 | -------------------------------------------------------------------------------- /libsubmit/providers/kubernetes/template.py: -------------------------------------------------------------------------------- 1 | template_string = """{overrides} 2 | 3 | {command} 4 | """ 5 | -------------------------------------------------------------------------------- /libsubmit/providers/local/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/local/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/local/local.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import signal 4 | import time 5 | 6 | import libsubmit.error as ep_error 7 | from libsubmit.channels import LocalChannel 8 | from libsubmit.launchers import SingleNodeLauncher 9 | from libsubmit.providers.provider_base import ExecutionProvider 10 | from libsubmit.utils import RepresentationMixin 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | translate_table = { 15 | 'PD': 'PENDING', 16 | 'R': 'RUNNING', 17 | 'CA': 'CANCELLED', 18 | 'CF': 'PENDING', # (configuring), 19 | 'CG': 'RUNNING', # (completing), 20 | 'CD': 'COMPLETED', 21 | 'F': 'FAILED', 22 | 'TO': 'TIMEOUT', 23 | 'NF': 'FAILED', # (node failure), 24 | 'RV': 'FAILED', # (revoked) and 25 | 'SE': 'FAILED' 26 | } # (special exit state 27 | 28 | 29 | class LocalProvider(ExecutionProvider, RepresentationMixin): 30 | """ Local Execution Provider 31 | 32 | This provider is used to provide execution resources from the localhost. 33 | 34 | Parameters 35 | ---------- 36 | 37 | min_blocks : int 38 | Minimum number of blocks to maintain. 39 | max_blocks : int 40 | Maximum number of blocks to maintain. 41 | parallelism : float 42 | Ratio of provisioned task slots to active tasks. A parallelism value of 1 represents aggressive 43 | scaling where as many resources as possible are used; parallelism close to 0 represents 44 | the opposite situation in which as few resources as possible (i.e., min_blocks) are used. 
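    A minimal sketch of typical use (the command is arbitrary):

    .. code:: python

        provider = LocalProvider(init_blocks=1)
        job = provider.submit('sleep 60', blocksize=1)  # runs via bash from script_dir
        print(provider.status([job]))                   # e.g. ['RUNNING']
        provider.cancel([job])                          # kills the process group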
45 | """ 46 | 47 | def __init__(self, 48 | channel=LocalChannel(), 49 | label='local', 50 | script_dir='parsl_scripts', 51 | tasks_per_node=1, 52 | nodes_per_block=1, 53 | launcher=SingleNodeLauncher(), 54 | init_blocks=4, 55 | min_blocks=0, 56 | max_blocks=10, 57 | walltime="00:15:00", 58 | parallelism=1): 59 | self.channel = channel 60 | self.label = label 61 | if not os.path.exists(script_dir): 62 | os.makedirs(script_dir) 63 | self.script_dir = script_dir 64 | self.provisioned_blocks = 0 65 | self.nodes_per_block = nodes_per_block 66 | self.tasks_per_node = tasks_per_node 67 | self.launcher = launcher 68 | self.init_blocks = init_blocks 69 | self.min_blocks = min_blocks 70 | self.max_blocks = max_blocks 71 | self.parallelism = parallelism 72 | self.walltime = walltime 73 | 74 | # Dictionary that keeps track of jobs, keyed on job_id 75 | self.resources = {} 76 | 77 | def status(self, job_ids): 78 | ''' Get the status of a list of jobs identified by their ids. 79 | 80 | Args: 81 | - job_ids (List of ids) : List of identifiers for the jobs 82 | 83 | Returns: 84 | - List of status codes. 85 | 86 | ''' 87 | 88 | logging.debug("Checking status of : {0}".format(job_ids)) 89 | for job_id in self.resources: 90 | poll_code = self.resources[job_id]['proc'].poll() 91 | if self.resources[job_id]['status'] in ['COMPLETED', 'FAILED']: 92 | continue 93 | 94 | if poll_code is None: 95 | self.resources[job_id]['status'] = 'RUNNING' 96 | elif poll_code == 0 and self.resources[job_id]['status'] != 'RUNNING': 97 | self.resources[job_id]['status'] = 'COMPLETED' 98 | elif poll_code < 0 and self.resources[job_id]['status'] != 'RUNNING': 99 | self.resources[job_id]['status'] = 'FAILED' 100 | 101 | return [self.resources[jid]['status'] for jid in job_ids] 102 | 103 | def _write_submit_script(self, script_string, script_filename): 104 | ''' 105 | Load the template string with config values and write the generated submit script to 106 | a submit script file. 107 | 108 | Args: 109 | - template_string (string) : The template string to be used for the writing submit script 110 | - script_filename (string) : Name of the submit script 111 | 112 | Returns: 113 | - True: on success 114 | 115 | Raises: 116 | SchedulerMissingArgs : If template is missing args 117 | ScriptPathError : Unable to write submit script out 118 | ''' 119 | 120 | try: 121 | with open(script_filename, 'w') as f: 122 | f.write(script_string) 123 | 124 | except KeyError as e: 125 | logger.error("Missing keys for submit script : %s", e) 126 | raise (ep_error.SchedulerMissingArgs(e.args, self.label)) 127 | 128 | except IOError as e: 129 | logger.error("Failed writing to submit script: %s", script_filename) 130 | raise (ep_error.ScriptPathError(script_filename, e)) 131 | 132 | return True 133 | 134 | def submit(self, command, blocksize, job_name="parsl.auto"): 135 | ''' Submits the command onto an Local Resource Manager job of blocksize parallel elements. 136 | Submit returns an ID that corresponds to the task that was just submitted. 137 | 138 | If tasks_per_node < 1: 139 | 1/tasks_per_node is provisioned 140 | 141 | If tasks_per_node == 1: 142 | A single node is provisioned 143 | 144 | If tasks_per_node > 1 : 145 | tasks_per_node * blocksize number of nodes are provisioned. 146 | 147 | Args: 148 | - command :(String) Commandline invocation to be made on the remote side. 
149 | - blocksize :(float) - Not really used for local 150 | 151 | Kwargs: 152 | - job_name (String): Name for job, must be unique 153 | 154 | Returns: 155 | - None: At capacity, cannot provision more 156 | - job_id: (string) Identifier for the job 157 | 158 | ''' 159 | 160 | job_name = "{0}.{1}".format(job_name, time.time()) 161 | 162 | # Set script path 163 | script_path = "{0}/{1}.sh".format(self.script_dir, job_name) 164 | script_path = os.path.abspath(script_path) 165 | 166 | wrap_command = self.launcher(command, self.tasks_per_node, self.nodes_per_block) 167 | 168 | self._write_submit_script(wrap_command, script_path) 169 | 170 | job_id, proc = self.channel.execute_no_wait('bash {0}'.format(script_path), 3) 171 | self.resources[job_id] = {'job_id': job_id, 'status': 'RUNNING', 'blocksize': blocksize, 'proc': proc} 172 | 173 | return job_id 174 | 175 | def cancel(self, job_ids): 176 | ''' Cancels the jobs specified by a list of job ids 177 | 178 | Args: 179 | job_ids : [ ...] 180 | 181 | Returns : 182 | [True/False...] : If the cancel operation fails the entire list will be False. 183 | ''' 184 | 185 | for job in job_ids: 186 | logger.debug("Terminating job/proc_id : {0}".format(job)) 187 | # Here we are assuming that for local, the job_ids are the process id's 188 | proc = self.resources[job]['proc'] 189 | os.killpg(os.getpgid(proc.pid), signal.SIGTERM) 190 | self.resources[job]['status'] = 'CANCELLED' 191 | rets = [True for i in job_ids] 192 | 193 | return rets 194 | 195 | @property 196 | def scaling_enabled(self): 197 | return True 198 | 199 | @property 200 | def current_capacity(self): 201 | return len(self.resources) 202 | 203 | 204 | if __name__ == "__main__": 205 | 206 | print("Nothing here") 207 | -------------------------------------------------------------------------------- /libsubmit/providers/provider_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod, abstractproperty 2 | 3 | 4 | class ExecutionProvider(metaclass=ABCMeta): 5 | """ Define the strict interface for all Execution Provider 6 | 7 | .. code:: python 8 | 9 | +------------------ 10 | | 11 | script_string ------->| submit 12 | id <--------|---+ 13 | | 14 | [ ids ] ------->| status 15 | [statuses] <--------|----+ 16 | | 17 | [ ids ] ------->| cancel 18 | [cancel] <--------|----+ 19 | | 20 | [True/False] <--------| scaling_enabled 21 | | 22 | +------------------- 23 | """ 24 | 25 | @abstractmethod 26 | def submit(self, command, blocksize, job_name="parsl.auto"): 27 | ''' The submit method takes the command string to be executed upon 28 | instantiation of a resource most often to start a pilot (such as IPP engine 29 | or even Swift-T engines). 30 | 31 | Args : 32 | - command (str) : The bash command string to be executed. 33 | - blocksize (int) : Blocksize to be requested 34 | 35 | KWargs: 36 | - job_name (str) : Human friendly name to be assigned to the job request 37 | 38 | Returns: 39 | - A job identifier, this could be an integer, string etc 40 | 41 | Raises: 42 | - ExecutionProviderException or its subclasses 43 | ''' 44 | 45 | pass 46 | 47 | @abstractmethod 48 | def status(self, job_ids): 49 | ''' Get the status of a list of jobs identified by the job identifiers 50 | returned from the submit request. 
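        A provider-agnostic polling loop can be built on this method alone; a
        sketch (any concrete provider can stand in):

        .. code:: python

            import time

            def wait_all(provider, job_ids, interval=5):
                # Poll until every job has left the PENDING/RUNNING states.
                while True:
                    states = provider.status(job_ids)
                    if all(s not in ('PENDING', 'RUNNING') for s in states):
                        return states
                    time.sleep(interval)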
51 | 52 | Args: 53 | - job_ids (list) : A list of job identifiers 54 | 55 | Returns: 56 | - A list of status from ['PENDING', 'RUNNING', 'CANCELLED', 'COMPLETED', 57 | 'FAILED', 'TIMEOUT'] corresponding to each job_id in the job_ids list. 58 | 59 | Raises: 60 | - ExecutionProviderException or its subclasses 61 | 62 | ''' 63 | 64 | pass 65 | 66 | @abstractmethod 67 | def cancel(self, job_ids): 68 | ''' Cancels the resources identified by the job_ids provided by the user. 69 | 70 | Args: 71 | - job_ids (list): A list of job identifiers 72 | 73 | Returns: 74 | - A list of status from cancelling the job which can be True, False 75 | 76 | Raises: 77 | - ExecutionProviderException or its subclasses 78 | ''' 79 | 80 | pass 81 | 82 | @abstractproperty 83 | def scaling_enabled(self): 84 | ''' The callers of ParslExecutors need to differentiate between Executors 85 | and Executors wrapped in a resource provider 86 | 87 | Returns: 88 | - Status (Bool) 89 | ''' 90 | 91 | pass 92 | -------------------------------------------------------------------------------- /libsubmit/providers/slurm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/slurm/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/slurm/slurm.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | 5 | from libsubmit.channels import LocalChannel 6 | from libsubmit.launchers import SingleNodeLauncher 7 | from libsubmit.providers.cluster_provider import ClusterProvider 8 | from libsubmit.providers.slurm.template import template_string 9 | from libsubmit.utils import RepresentationMixin, wtime_to_minutes 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | translate_table = { 14 | 'PD': 'PENDING', 15 | 'R': 'RUNNING', 16 | 'CA': 'CANCELLED', 17 | 'CF': 'PENDING', # (configuring), 18 | 'CG': 'RUNNING', # (completing), 19 | 'CD': 'COMPLETED', 20 | 'F': 'FAILED', # (failed), 21 | 'TO': 'TIMEOUT', # (timeout), 22 | 'NF': 'FAILED', # (node failure), 23 | 'RV': 'FAILED', # (revoked) and 24 | 'SE': 'FAILED' 25 | } # (special exit state 26 | 27 | 28 | class SlurmProvider(ClusterProvider, RepresentationMixin): 29 | """Slurm Execution Provider 30 | 31 | This provider uses sbatch to submit, squeue for status and scancel to cancel 32 | jobs. The sbatch script to be used is created from a template file in this 33 | same module. 34 | 35 | Parameters 36 | ---------- 37 | partition : str 38 | Slurm partition to request blocks from. 39 | label : str 40 | Label for this provider. 41 | channel : Channel 42 | Channel for accessing this provider. Possible channels include 43 | :class:`~libsubmit.channels.LocalChannel` (the default), 44 | :class:`~libsubmit.channels.SSHChannel`, or 45 | :class:`~libsubmit.channels.SSHInteractiveLoginChannel`. 46 | script_dir : str 47 | Relative or absolute path to a directory where intermediate scripts are placed. 48 | nodes_per_block : int 49 | Nodes to provision per block. 50 | tasks_per_node : int 51 | Tasks to run per node. 52 | min_blocks : int 53 | Minimum number of blocks to maintain. 54 | max_blocks : int 55 | Maximum number of blocks to maintain. 56 | parallelism : float 57 | Ratio of provisioned task slots to active tasks. 
A parallelism value of 1 represents aggressive 58 | scaling where as many resources as possible are used; parallelism close to 0 represents 59 | the opposite situation in which as few resources as possible (i.e., min_blocks) are used. 60 | walltime : str 61 | Walltime requested per block in HH:MM:SS. 62 | overrides : str 63 | String to prepend to the #SBATCH blocks in the submit script to the scheduler. 64 | launcher : Launcher 65 | Launcher for this provider. Possible launchers include 66 | :class:`~libsubmit.launchers.SingleNodeLauncher` (the default), 67 | :class:`~libsubmit.launchers.SrunLauncher`, or 68 | :class:`~libsubmit.launchers.AprunLauncher` 69 | """ 70 | 71 | def __init__(self, 72 | partition, 73 | label='slurm', 74 | channel=LocalChannel(), 75 | script_dir='parsl_scripts', 76 | nodes_per_block=1, 77 | tasks_per_node=1, 78 | init_blocks=1, 79 | min_blocks=0, 80 | max_blocks=10, 81 | parallelism=1, 82 | walltime="00:10:00", 83 | overrides='', 84 | cmd_timeout=10, 85 | launcher=SingleNodeLauncher()): 86 | super().__init__(label, 87 | channel, 88 | script_dir, 89 | nodes_per_block, 90 | tasks_per_node, 91 | init_blocks, 92 | min_blocks, 93 | max_blocks, 94 | parallelism, 95 | walltime, 96 | cmd_timeout=cmd_timeout, 97 | launcher=launcher) 98 | self.partition = partition 99 | self.overrides = overrides 100 | 101 | def _status(self): 102 | ''' Internal: Do not call. Returns the status list for a list of job_ids 103 | 104 | Args: 105 | self 106 | 107 | Returns: 108 | [status...] : Status list of all jobs 109 | ''' 110 | job_id_list = ','.join(self.resources.keys()) 111 | cmd = "squeue --job {0}".format(job_id_list) 112 | 113 | retcode, stdout, stderr = super().execute_wait(cmd) 114 | 115 | # Execute_wait failed. Do no update 116 | if retcode != 0: 117 | return 118 | 119 | jobs_missing = list(self.resources.keys()) 120 | for line in stdout.split('\n'): 121 | parts = line.split() 122 | if parts and parts[0] != 'JOBID': 123 | job_id = parts[0] 124 | status = translate_table.get(parts[4], 'UNKNOWN') 125 | self.resources[job_id]['status'] = status 126 | jobs_missing.remove(job_id) 127 | 128 | # squeue does not report on jobs that are not running. So we are filling in the 129 | # blanks for missing jobs, we might lose some information about why the jobs failed. 130 | for missing_job in jobs_missing: 131 | if self.resources[missing_job]['status'] in ['PENDING', 'RUNNING']: 132 | self.resources[missing_job]['status'] = 'COMPLETED' 133 | 134 | def submit(self, command, blocksize, job_name="parsl.auto"): 135 | """Submit the command as a slurm job of blocksize parallel elements. 136 | 137 | Parameters 138 | ---------- 139 | command : str 140 | Command to be made on the remote side. 141 | blocksize : int 142 | Not implemented. 143 | job_name : str 144 | Name for the job (must be unique). 
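        As a sketch of what gets submitted: the ``${...}`` placeholders in
        ``libsubmit/providers/slurm/template.py`` follow ``string.Template``
        semantics (an assumption based on the ``$``/``$$`` escaping used by
        the templates), so the rendered script can be previewed with
        hypothetical values:

        .. code:: python

            from string import Template
            from libsubmit.providers.slurm.template import template_string

            body = Template(template_string).substitute(
                jobname='parsl.auto.1', submit_script_dir='/tmp',
                nodes=2, partition='debug', walltime='10',  # wtime_to_minutes('00:10:00')
                tasks_per_node=1, overrides='', user_script='echo hello')
            # body starts with '#!/bin/bash' followed by the #SBATCH header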
145 | 146 | Returns 147 | ------- 148 | None or str 149 | If at capacity, returns None; otherwise, a string identifier for the job 150 | """ 151 | 152 | if self.provisioned_blocks >= self.max_blocks: 153 | logger.warn("Slurm provider '{}' is at capacity (no more blocks will be added)".format(self.label)) 154 | return None 155 | 156 | job_name = "{0}.{1}".format(job_name, time.time()) 157 | 158 | script_path = "{0}/{1}.submit".format(self.script_dir, job_name) 159 | script_path = os.path.abspath(script_path) 160 | 161 | logger.debug("Requesting one block with {} nodes".format(self.nodes_per_block)) 162 | 163 | job_config = {} 164 | job_config["submit_script_dir"] = self.channel.script_dir 165 | job_config["nodes"] = self.nodes_per_block 166 | job_config["tasks_per_node"] = self.tasks_per_node 167 | job_config["walltime"] = wtime_to_minutes(self.walltime) 168 | job_config["overrides"] = self.overrides 169 | job_config["partition"] = self.partition 170 | job_config["user_script"] = command 171 | 172 | # Wrap the command 173 | job_config["user_script"] = self.launcher(command, 174 | self.tasks_per_node, 175 | self.nodes_per_block) 176 | 177 | logger.debug("Writing submit script") 178 | self._write_submit_script(template_string, script_path, job_name, job_config) 179 | 180 | channel_script_path = self.channel.push_file(script_path, self.channel.script_dir) 181 | 182 | retcode, stdout, stderr = super().execute_wait("sbatch {0}".format(channel_script_path)) 183 | 184 | job_id = None 185 | if retcode == 0: 186 | for line in stdout.split('\n'): 187 | if line.startswith("Submitted batch job"): 188 | job_id = line.split("Submitted batch job")[1].strip() 189 | self.resources[job_id] = {'job_id': job_id, 'status': 'PENDING', 'blocksize': blocksize} 190 | else: 191 | print("Submission of command to scale_out failed") 192 | logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip()) 193 | return job_id 194 | 195 | def cancel(self, job_ids): 196 | ''' Cancels the jobs specified by a list of job ids 197 | 198 | Args: 199 | job_ids : [ ...] 200 | 201 | Returns : 202 | [True/False...] : If the cancel operation fails the entire list will be False. 
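        For example, a sketch:

        .. code:: python

            jobs = [provider.submit('sleep 600', 1) for _ in range(2)]
            provider.cancel(jobs)  # -> [True, True] if scancel returned 0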
203 | ''' 204 | 205 | job_id_list = ' '.join(job_ids) 206 | retcode, stdout, stderr = super().execute_wait("scancel {0}".format(job_id_list)) 207 | rets = None 208 | if retcode == 0: 209 | for jid in job_ids: 210 | self.resources[jid]['status'] = translate_table['CA'] # Setting state to cancelled 211 | rets = [True for i in job_ids] 212 | else: 213 | rets = [False for i in job_ids] 214 | 215 | return rets 216 | 217 | def _test_add_resource(self, job_id): 218 | self.resources[job_id] = {'job_id': job_id, 'status': 'PENDING', 'size': 1} 219 | return True 220 | 221 | 222 | if __name__ == "__main__": 223 | 224 | print("None") 225 | -------------------------------------------------------------------------------- /libsubmit/providers/slurm/template.py: -------------------------------------------------------------------------------- 1 | template_string = '''#!/bin/bash 2 | 3 | #SBATCH --job-name=${jobname} 4 | #SBATCH --output=${submit_script_dir}/${jobname}.submit.stdout 5 | #SBATCH --error=${submit_script_dir}/${jobname}.submit.stderr 6 | #SBATCH --nodes=${nodes} 7 | #SBATCH --partition=${partition} 8 | #SBATCH --time=${walltime} 9 | #SBATCH --ntasks-per-node=${tasks_per_node} 10 | 11 | $overrides 12 | 13 | export JOBNAME="${jobname}" 14 | 15 | $user_script 16 | ''' 17 | -------------------------------------------------------------------------------- /libsubmit/providers/torque/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Parsl/libsubmit/5c2cbf0c31365050a83b98a93b77edf6b065adea/libsubmit/providers/torque/__init__.py -------------------------------------------------------------------------------- /libsubmit/providers/torque/template.py: -------------------------------------------------------------------------------- 1 | template_string = '''#!/bin/bash 2 | 3 | #PBS -S /bin/bash 4 | #PBS -N ${jobname} 5 | #PBS -m n 6 | #PBS -k eo 7 | #PBS -l walltime=$walltime 8 | #PBS -l nodes=${nodes_per_block}:ppn=${tasks_per_node} 9 | #PBS -o ${submit_script_dir}/${jobname}.submit.stdout 10 | #PBS -e ${submit_script_dir}/${jobname}.submit.stderr 11 | #PBS -v WORKER_LOGGING_LEVEL 12 | ${overrides} 13 | 14 | export JOBNAME="${jobname}" 15 | 16 | ${user_script} 17 | 18 | ''' 19 | -------------------------------------------------------------------------------- /libsubmit/providers/torque/torque.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | 5 | from libsubmit.channels import LocalChannel 6 | from libsubmit.launchers import AprunLauncher 7 | from libsubmit.providers.torque.template import template_string 8 | from libsubmit.providers.cluster_provider import ClusterProvider 9 | from libsubmit.utils import RepresentationMixin 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | # From the man pages for qstat for PBS/Torque systems 14 | translate_table = { 15 | 'R': 'RUNNING', 16 | 'C': 'COMPLETED', # Completed after having run 17 | 'E': 'COMPLETED', # Exiting after having run 18 | 'H': 'HELD', # Held 19 | 'Q': 'PENDING', # Queued, and eligible to run 20 | 'W': 'PENDING', # Job is waiting for its execution time (-a option) to be reached 21 | 'S': 'HELD' 22 | } # Suspended 23 | 24 | 25 | class TorqueProvider(ClusterProvider, RepresentationMixin): 26 | """Torque Execution Provider 27 | 28 | This provider uses qsub to submit, qstat for status, and qdel to cancel 29 | jobs. 
The submit script to be used is created from a template file in this 30 | same module. 31 | 32 | Parameters 33 | ---------- 34 | channel : Channel 35 | Channel for accessing this provider. Possible channels include 36 | :class:`~libsubmit.channels.LocalChannel` (the default), 37 | :class:`~libsubmit.channels.SSHChannel`, or 38 | :class:`~libsubmit.channels.SSHInteractiveLoginChannel`. 39 | account : str 40 | Account the job will be charged against. 41 | queue : str 42 | Torque queue to request blocks from. 43 | label : str 44 | Label for this provider. 45 | script_dir : str 46 | Relative or absolute path to a directory where intermediate scripts are placed. 47 | nodes_per_block : int 48 | Nodes to provision per block. 49 | tasks_per_node : int 50 | Tasks to run per node. 51 | init_blocks : int 52 | Number of blocks to provision at the start of the run. Default is 1. 53 | min_blocks : int 54 | Minimum number of blocks to maintain. Default is 0. 55 | max_blocks : int 56 | Maximum number of blocks to maintain. 57 | parallelism : float 58 | Ratio of provisioned task slots to active tasks. A parallelism value of 1 represents aggressive 59 | scaling where as many resources as possible are used; parallelism close to 0 represents 60 | the opposite situation in which as few resources as possible (i.e., min_blocks) are used. 61 | walltime : str 62 | Walltime requested per block in HH:MM:SS. 63 | overrides : str 64 | String to prepend to the Torque submit script. 65 | launcher : Launcher 66 | Launcher for this provider. Possible launchers include 67 | :class:`~libsubmit.launchers.AprunLauncher` (the default), or 68 | :class:`~libsubmit.launchers.SingleNodeLauncher`. 69 | 70 | """ 71 | def __init__(self, 72 | channel=LocalChannel(), 73 | account=None, 74 | queue=None, 75 | overrides='', 76 | label='torque', 77 | script_dir='parsl_scripts', 78 | nodes_per_block=1, 79 | tasks_per_node=1, 80 | init_blocks=1, 81 | min_blocks=0, 82 | max_blocks=100, 83 | parallelism=1, 84 | launcher=AprunLauncher(), 85 | walltime="00:20:00"): 86 | super().__init__(label, 87 | channel, 88 | script_dir, 89 | nodes_per_block, 90 | tasks_per_node, 91 | init_blocks, 92 | min_blocks, 93 | max_blocks, 94 | parallelism, 95 | walltime, 96 | launcher) 97 | 98 | self.account = account 99 | self.queue = queue 100 | self.overrides = overrides 101 | self.provisioned_blocks = 0 102 | 103 | self.script_dir = script_dir 104 | if not os.path.exists(self.script_dir): 105 | os.makedirs(self.script_dir) 106 | 107 | # Dictionary that keeps track of jobs, keyed on job_id 108 | self.resources = {} 109 | 110 | def _status(self): 111 | ''' Internal: Do not call. Returns the status list for a list of job_ids 112 | 113 | Args: 114 | self 115 | 116 | Returns: 117 | [status...] : Status list of all jobs 118 | ''' 119 | 120 | job_id_list = ' '.join(self.resources.keys()) 121 | 122 | jobs_missing = list(self.resources.keys()) 123 | 124 | retcode, stdout, stderr = self.channel.execute_wait("qstat {0}".format(job_id_list), 3) 125 | for line in stdout.split('\n'): 126 | parts = line.split() 127 | if not parts or parts[0].upper().startswith('JOB') or parts[0].startswith('---'): 128 | continue 129 | job_id = parts[0] 130 | status = translate_table.get(parts[4], 'UNKNOWN') 131 | self.resources[job_id]['status'] = status 132 | jobs_missing.remove(job_id) 133 | 134 | # qstat does not report on jobs that are no longer in the queue. So we are filling in the 135 | # blanks for missing jobs, we might lose some information about why the jobs failed. 
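        # (A sketch of the qstat output parsed above: a line such as
        #      123.mgr   parsl.auto.1   user   00:00:10   R   batch
        #  splits so that parts[0] is the job id and parts[4] is the
        #  single-letter state looked up in translate_table.)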
136 | for missing_job in jobs_missing: 137 | if self.resources[missing_job]['status'] in ['PENDING', 'RUNNING']: 138 | self.resources[missing_job]['status'] = translate_table['E'] 139 | 140 | def submit(self, command, blocksize, job_name="parsl.auto"): 141 | ''' Submits the command onto an Local Resource Manager job of blocksize parallel elements. 142 | Submit returns an ID that corresponds to the task that was just submitted. 143 | 144 | If tasks_per_node < 1 : ! This is illegal. tasks_per_node should be integer 145 | 146 | If tasks_per_node == 1: 147 | A single node is provisioned 148 | 149 | If tasks_per_node > 1 : 150 | tasks_per_node * blocksize number of nodes are provisioned. 151 | 152 | Args: 153 | - command :(String) Commandline invocation to be made on the remote side. 154 | - blocksize :(float) 155 | 156 | Kwargs: 157 | - job_name (String): Name for job, must be unique 158 | 159 | Returns: 160 | - None: At capacity, cannot provision more 161 | - job_id: (string) Identifier for the job 162 | 163 | ''' 164 | 165 | if self.provisioned_blocks >= self.max_blocks: 166 | logger.warn("[%s] at capacity, cannot add more blocks now", self.label) 167 | return None 168 | 169 | # Note: Fix this later to avoid confusing behavior. 170 | # We should always allocate blocks in integer counts of node_granularity 171 | if blocksize < self.nodes_per_block: 172 | blocksize = self.nodes_per_block 173 | 174 | # Set job name 175 | job_name = "parsl.{0}.{1}".format(job_name, time.time()) 176 | 177 | # Set script path 178 | script_path = "{0}/{1}.submit".format(self.script_dir, job_name) 179 | script_path = os.path.abspath(script_path) 180 | 181 | logger.debug("Requesting blocksize:%s nodes_per_block:%s tasks_per_node:%s", blocksize, self.nodes_per_block, 182 | self.tasks_per_node) 183 | 184 | job_config = {} 185 | # TODO : script_path might need to change to accommodate script dir set via channels 186 | job_config["submit_script_dir"] = self.channel.script_dir 187 | job_config["nodes"] = self.nodes_per_block 188 | job_config["task_blocks"] = self.nodes_per_block * self.tasks_per_node 189 | job_config["nodes_per_block"] = self.nodes_per_block 190 | job_config["tasks_per_node"] = self.tasks_per_node 191 | job_config["walltime"] = self.walltime 192 | job_config["overrides"] = self.overrides 193 | job_config["user_script"] = command 194 | 195 | # Wrap the command 196 | job_config["user_script"] = self.launcher(command, 197 | self.tasks_per_node, 198 | self.nodes_per_block) 199 | 200 | logger.debug("Writing submit script") 201 | self._write_submit_script(template_string, script_path, job_name, job_config) 202 | 203 | channel_script_path = self.channel.push_file(script_path, self.channel.script_dir) 204 | 205 | submit_options = '' 206 | if self.queue is not None: 207 | submit_options = '{0} -q {1}'.format(submit_options, self.queue) 208 | if self.account is not None: 209 | submit_options = '{0} -A {1}'.format(submit_options, self.account) 210 | 211 | launch_cmd = "qsub {0} {1}".format(submit_options, channel_script_path) 212 | retcode, stdout, stderr = self.channel.execute_wait(launch_cmd, 10) 213 | 214 | job_id = None 215 | if retcode == 0: 216 | for line in stdout.split('\n'): 217 | if line.strip(): 218 | job_id = line.strip() 219 | self.resources[job_id] = {'job_id': job_id, 'status': 'PENDING', 'blocksize': blocksize} 220 | else: 221 | message = "Command '{}' failed with return code {}".format(launch_cmd, retcode) 222 | if (stdout is not None) and (stderr is not None): 223 | message += 
"\nstderr:{}\nstdout{}".format(stderr.strip(), stdout.strip()) 224 | logger.error(message) 225 | 226 | return job_id 227 | 228 | def cancel(self, job_ids): 229 | ''' Cancels the jobs specified by a list of job ids 230 | 231 | Args: 232 | job_ids : [ ...] 233 | 234 | Returns : 235 | [True/False...] : If the cancel operation fails the entire list will be False. 236 | ''' 237 | 238 | job_id_list = ' '.join(job_ids) 239 | retcode, stdout, stderr = self.channel.execute_wait("qdel {0}".format(job_id_list), 3) 240 | rets = None 241 | if retcode == 0: 242 | for jid in job_ids: 243 | self.resources[jid]['status'] = translate_table['E'] # Setting state to exiting 244 | rets = [True for i in job_ids] 245 | else: 246 | rets = [False for i in job_ids] 247 | 248 | return rets 249 | 250 | 251 | if __name__ == "__main__": 252 | 253 | print("None") 254 | -------------------------------------------------------------------------------- /libsubmit/tests/setup_path.sh: -------------------------------------------------------------------------------- 1 | export PATH=$PWD:$PATH 2 | export PYTHONPATH=$PWD/../../:$PYTHONPATH 3 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/remote_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Hostname: $HOSTNAME" 3 | echo "Cpu info -----" 4 | cat /proc/cpuinfo 5 | echo "Done----------" 6 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/test_channels.py: -------------------------------------------------------------------------------- 1 | from libsubmit.channels.local.local import LocalChannel 2 | 3 | 4 | def test_local(): 5 | 6 | channel = LocalChannel(None, None) 7 | 8 | ec, out, err = channel.execute_wait('echo "pwd: $PWD"', 2) 9 | 10 | assert ec == 0, "Channel execute failed" 11 | print("Stdout: ", out) 12 | print("Stderr: ", err) 13 | 14 | 15 | if __name__ == "__main__": 16 | 17 | test_local() 18 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/test_local_channel.py: -------------------------------------------------------------------------------- 1 | from libsubmit.channels.local.local import LocalChannel 2 | 3 | 4 | def test_env(): 5 | ''' Regression testing for issue #27 6 | ''' 7 | 8 | lc = LocalChannel() 9 | rc, stdout, stderr = lc.execute_wait("env", 1) 10 | 11 | stdout = stdout.split('\n') 12 | x = [l for l in stdout if l.startswith("PATH=")] 13 | assert x, "PATH not found" 14 | 15 | x = [l for l in stdout if l.startswith("HOME=")] 16 | assert x, "HOME not found" 17 | 18 | print("RC:{} \nSTDOUT:{} \nSTDERR:{}".format(rc, stdout, stderr)) 19 | 20 | 21 | def test_env_mod(): 22 | ''' Testing for env update at execute time. 
23 | ''' 24 | 25 | lc = LocalChannel() 26 | rc, stdout, stderr = lc.execute_wait("env", 1, {'TEST_ENV': 'fooo'}) 27 | 28 | stdout = stdout.split('\n') 29 | x = [l for l in stdout if l.startswith("PATH=")] 30 | assert x, "PATH not found" 31 | 32 | x = [l for l in stdout if l.startswith("HOME=")] 33 | assert x, "HOME not found" 34 | 35 | x = [l for l in stdout if l.startswith("TEST_ENV=fooo")] 36 | assert x, "User set env missing" 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | test_env() 42 | test_env_mod() 43 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/test_scp_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from libsubmit.channels.ssh.ssh import SSHChannel as SSH 4 | 5 | 6 | def connect_and_list(hostname, username): 7 | out = '' 8 | conn = SSH(hostname, username=username) 9 | conn.push_file(os.path.abspath('remote_run.sh'), '/home/davidk/') 10 | # ec, out, err = conn.execute_wait("ls /tmp/remote_run.sh; bash /tmp/remote_run.sh") 11 | conn.close() 12 | return out 13 | 14 | 15 | script = '''#!/bin/bash 16 | echo "Hostname: $HOSTNAME" 17 | echo "Cpu info -----" 18 | cat /proc/cpuinfo 19 | echo "Done----------" 20 | ''' 21 | 22 | 23 | def test_connect_1(): 24 | with open('remote_run.sh', 'w') as f: 25 | f.write(script) 26 | 27 | sites = { 28 | 'midway': { 29 | 'url': 'midway.rcc.uchicago.edu', 30 | 'uname': 'yadunand' 31 | }, 32 | 'swift': { 33 | 'url': 'swift.rcc.uchicago.edu', 34 | 'uname': 'yadunand' 35 | }, 36 | 'cori': { 37 | 'url': 'cori.nersc.gov', 38 | 'uname': 'yadunand' 39 | } 40 | } 41 | 42 | for site in sites.values(): 43 | out = connect_and_list(site['url'], site['uname']) 44 | print("Sitename :{0} hostname:{1}".format(site['url'], out)) 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | test_connect_1() 50 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/test_ssh_1.py: -------------------------------------------------------------------------------- 1 | from libsubmit.channels.ssh.ssh import SSHChannel as SSH 2 | 3 | 4 | def connect_and_list(hostname, username): 5 | conn = SSH(hostname, username=username) 6 | ec, out, err = conn.execute_wait("echo $HOSTNAME") 7 | conn.close() 8 | return out 9 | 10 | 11 | def test_midway(): 12 | ''' Test ssh channels to midway 13 | ''' 14 | url = 'midway.rcc.uchicago.edu' 15 | uname = 'yadunand' 16 | out = connect_and_list(url, uname) 17 | print("Sitename :{0} hostname:{1}".format(url, out)) 18 | 19 | 20 | def test_beagle(): 21 | ''' Test ssh channels to beagle 22 | ''' 23 | url = 'login04.beagle.ci.uchicago.edu' 24 | uname = 'yadunandb' 25 | out = connect_and_list(url, uname) 26 | print("Sitename :{0} hostname:{1}".format(url, out)) 27 | 28 | 29 | def test_osg(): 30 | ''' Test ssh connectivity to osg 31 | ''' 32 | url = 'login.osgconnect.net' 33 | uname = 'yadunand' 34 | out = connect_and_list(url, uname) 35 | print("Sitename :{0} hostname:{1}".format(url, out)) 36 | 37 | 38 | def test_cori(): 39 | ''' Test ssh connectivity to cori 40 | ''' 41 | url = 'cori.nersc.gov' 42 | uname = 'yadunand' 43 | out = connect_and_list(url, uname) 44 | print("Sitename :{0} hostname:{1}".format(url, out)) 45 | 46 | 47 | if __name__ == "__main__": 48 | 49 | pass 50 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/test_ssh_errors.py: 
-------------------------------------------------------------------------------- 1 | from libsubmit.channels.errors import * 2 | from libsubmit.channels.ssh.ssh import SSHChannel as SSH 3 | 4 | 5 | def connect_and_list(hostname, username): 6 | conn = SSH(hostname, username=username) 7 | ec, out, err = conn.execute_wait("echo $HOSTNAME") 8 | conn.close() 9 | return out 10 | 11 | 12 | def test_error_1(): 13 | try: 14 | connect_and_list("bad.url.gov", "ubuntu") 15 | except Exception as e: 16 | assert type(e) == SSHException, "Expected SSHException, got :{0}".format(e) 17 | 18 | 19 | def test_error_2(): 20 | try: 21 | connect_and_list("swift.rcc.uchicago.edu", "mango") 22 | except SSHException: 23 | print("Caught the right exception") 24 | else: 25 | raise Exception("Expected SSHException, but no exception was raised") 26 | 27 | 28 | def test_error_3(): 29 | ''' This should connect without raising 30 | ''' 31 | try: 32 | connect_and_list("login.mcs.anl.gov", "yadunand") 33 | except AuthException as e: 34 | print("Caught exception : ", e) 35 | else: 36 | pass # connection succeeded, which is the expected outcome 37 | 38 | 39 | def test_error_4(): 40 | ''' This should raise a BadHostKeyException 41 | ''' 42 | try: 43 | connect_and_list("edison.nersc.gov", "yadunand") 44 | except BadHostKeyException as e: 45 | print("Caught exception BadHostKeyException: ", e) 46 | else: 47 | assert False, "Expected BadHostKeyException, but no exception was raised" 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | tests = [test_error_1, test_error_2, test_error_3, test_error_4] 53 | 54 | for test in tests: 55 | print("---------Running : {0}---------------".format(test)) 56 | test() 57 | print("----------------------DONE--------------------------") 58 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/test_ssh_file_transport.py: -------------------------------------------------------------------------------- 1 | import libsubmit 2 | from libsubmit.channels.ssh.ssh import SSHChannel as SSH 3 | 4 | 5 | def connect_and_list(hostname, username): 6 | conn = SSH(hostname, username=username) 7 | ec, out, err = conn.execute_wait("echo $HOSTNAME") 8 | conn.close() 9 | return out 10 | 11 | 12 | def test_push(conn, fname="test001.txt"): 13 | 14 | with open(fname, 'w') as f: 15 | f.write("Hello from parsl.ssh testing\n") 16 | 17 | conn.push_file(fname, "/tmp") 18 | ec, out, err = conn.execute_wait("ls /tmp/{0}".format(fname)) 19 | print(ec, out, err) 20 | 21 | 22 | def test_pull(conn, fname="test001.txt"): 23 | 24 | local = "foo" 25 | conn.pull_file("/tmp/{0}".format(fname), local) 26 | 27 | with open("{0}/{1}".format(local, fname), 'r') as f: 28 | print(f.readlines()) 29 | 30 | 31 | if __name__ == "__main__": 32 | 33 | libsubmit.set_stream_logger() 34 | 35 | # This is for testing 36 | conn = SSH("midway.rcc.uchicago.edu", username="yadunand") 37 | 38 | test_push(conn) 39 | test_pull(conn) 40 | 41 | conn.close() 42 | -------------------------------------------------------------------------------- /libsubmit/tests/test_channels/test_ssh_interactive.py: -------------------------------------------------------------------------------- 1 | import libsubmit 2 | from libsubmit.channels.ssh_il.ssh_il import SSHInteractiveLoginChannel as SSH 3 | 4 | 5 | def connect_and_list(hostname, username): 6 | conn = SSH(hostname, username=username) 7 | ec, out, err = conn.execute_wait("echo $HOSTNAME") 8 | conn.close() 9 | return out 10 | 11 | 12 | def test_cooley(): 13 | ''' Test the interactive ssh channel to cooley 14 | ''' 15 | url =
'cooley.alcf.anl.gov' 16 | uname = 'yadunand' 17 | out = connect_and_list(url, uname) 18 | print("Sitename :{0} hostname:{1}".format(url, out)) 19 | return 20 | 21 | 22 | if __name__ == "__main__": 23 | libsubmit.set_stream_logger() 24 | test_cooley() 25 | -------------------------------------------------------------------------------- /libsubmit/tests/test_integration/test_ssh/test_ssh_beagle.py: -------------------------------------------------------------------------------- 1 | import libsubmit 2 | from libsubmit import SSHChannel 3 | from libsubmit import Torque 4 | import time 5 | 6 | 7 | def test_1(): 8 | 9 | torque_config = { 10 | "site": "Beagle.CI", 11 | "execution": { 12 | "executor": "ipp", 13 | "provider": "torque", 14 | "channel": "ssh", 15 | "block": { 16 | "initBlocks": 1, 17 | "maxBlocks": 1, 18 | "minBlocks": 0, 19 | "taskBlocks": 1, 20 | "nodes": 1, 21 | "walltime": "00:25:00", 22 | "options": { 23 | "partition": "debug", 24 | "overrides": '''#SBATCH --constraint=haswell''' 25 | } 26 | } 27 | } 28 | } 29 | 30 | channel = SSHChannel( 31 | "login4.beagle.ci.uchicago.edu", 32 | "yadunandb", 33 | script_dir="/lustre/beagle2/yadunand/parsl_scripts") 34 | ec, out, err = channel.execute_wait("which qsub; echo $HOSTNAME; pwd") 35 | print("Stdout : ", out) 36 | 37 | provider = Torque(config=torque_config, channel=channel) 38 | 39 | x = provider.submit('''echo "sleeping" 40 | sleep 120 41 | echo "Done sleeping" ''', 1) 42 | time.sleep(3) 43 | 44 | y = provider.submit('''echo "sleeping" 45 | sleep 120 46 | echo "Done sleeping" ''', 1) 47 | time.sleep(3) 48 | 49 | stats = provider.status([x, y]) 50 | 51 | x = provider.cancel([x, y]) 52 | print(stats) 53 | print("Cancel stats : ", x) 54 | 55 | 56 | if __name__ == "__main__": 57 | libsubmit.set_stream_logger() 58 | test_1() 59 | -------------------------------------------------------------------------------- /libsubmit/tests/test_integration/test_ssh/test_ssh_condor_earth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import libsubmit 3 | from libsubmit import SSHChannel, Condor 4 | import time 5 | 6 | 7 | def test_1(): 8 | config = { 9 | "site": "T3_US_NotreDame", 10 | "execution": { 11 | "script_dir": ".scripts", 12 | "environment": { 13 | 'CONDOR_CONFIG': '/opt/condor/RedHat6/etc/condor_config', 14 | 'CONDOR_LOCATION': '/opt/condor/RedHat6', 15 | 'PATH': '/opt/condor/RedHat6/bin:${PATH}' 16 | }, 17 | "block": { 18 | "environment": { 19 | 'foo': 'spacey "quoted" value', 20 | 'bar': "this 'works' too", 21 | 'baz': 2 22 | }, 23 | "nodes": 1, 24 | "walltime": "01:00:00", 25 | "options": { 26 | "project": "cms.org.nd", 27 | "condor_overrides": "", 28 | "requirements": "" 29 | } 30 | } 31 | } 32 | } 33 | channel = SSHChannel("earth.crc.nd.edu", os.environ['USER']) 34 | 35 | ec, out, err = channel.execute_wait("printenv", envs=config['execution']['environment']) 36 | print("current env:", out) 37 | 38 | ec, out, err = channel.execute_wait("which condor_submit", envs=config['execution']['environment']) 39 | print('which condor_submit? 
', out) 40 | 41 | provider = Condor(config=config, channel=channel) 42 | 43 | ids = provider.submit('''echo "sleeping" 44 | sleep 120 45 | echo "Done sleeping" ''', 1) 46 | time.sleep(3) 47 | 48 | ids += provider.submit('''echo "sleeping" 49 | sleep 120 50 | echo "Done sleeping" ''', 1) 51 | time.sleep(3) 52 | 53 | stats = provider.status(ids) 54 | print(stats) 55 | 56 | provider.cancel(ids) 57 | 58 | 59 | if __name__ == "__main__": 60 | libsubmit.set_stream_logger() 61 | test_1() 62 | -------------------------------------------------------------------------------- /libsubmit/tests/test_integration/test_ssh/test_ssh_cori.py: -------------------------------------------------------------------------------- 1 | import libsubmit 2 | from libsubmit import SSHChannel, Slurm 3 | import time 4 | 5 | 6 | def test_1(): 7 | 8 | slurm_config = { 9 | "site": "Cori/Nersc", 10 | "execution": { 11 | "executor": "ipp", 12 | "provider": "slurm", 13 | "channel": "local", 14 | "options": { 15 | "init_parallelism": 1, 16 | "max_parallelism": 1, 17 | "min_parallelism": 0, 18 | "tasks_per_node": 1, 19 | "node_granularity": 1, 20 | "partition": "debug", 21 | "walltime": "00:25:00", 22 | "submit_script_dir": ".scripts", 23 | "overrides": '''#SBATCH --constraint=haswell''' 24 | } 25 | } 26 | } 27 | 28 | channel = SSHChannel( 29 | "cori.nersc.gov", 30 | "yadunand", 31 | channel_script_dir="/global/homes/y/yadunand/parsl_scripts") 32 | ec, out, err = channel.execute_wait("which sbatch; echo $HOSTNAME; pwd") 33 | print("Stdout: ", out) 34 | 35 | provider = Slurm(config=slurm_config, channel=channel) 36 | 37 | x = provider.submit('''echo "sleeping" 38 | sleep 120 39 | echo "Done sleeping" ''', 1) 40 | time.sleep(3) 41 | 42 | y = provider.submit('''echo "sleeping" 43 | sleep 120 44 | echo "Done sleeping" ''', 1) 45 | time.sleep(3) 46 | 47 | stats = provider.status([x, y]) 48 | 49 | provider.cancel([x, y]) 50 | print(stats) 51 | 52 | 53 | if __name__ == "__main__": 54 | libsubmit.set_stream_logger() 55 | test_1() 56 | -------------------------------------------------------------------------------- /libsubmit/tests/test_integration/test_ssh/test_ssh_swan.py: -------------------------------------------------------------------------------- 1 | import libsubmit 2 | from libsubmit import SSHChannel 3 | from libsubmit import Torque 4 | import time 5 | 6 | 7 | def test_1(): 8 | 9 | torque_config = { 10 | "site": "Swan.CrayPN", 11 | "execution": { 12 | "executor": "ipp", 13 | "provider": "torque", 14 | "channel": "ssh", 15 | "block": { 16 | "initBlocks": 1, 17 | "maxBlocks": 1, 18 | "minBlocks": 0, 19 | "taskBlocks": 1, 20 | "nodes": 1, 21 | "walltime": "00:25:00", 22 | "options": { 23 | "partition": "debug", 24 | "queue": "ivb12", 25 | } 26 | } 27 | } 28 | } 29 | 30 | channel = SSHChannel("swan.cray.com", "p01953", script_dir="parsl_scripts") 31 | ec, out, err = channel.execute_wait("which qsub; echo $HOSTNAME; pwd") 32 | print("Stdout: ", out) 33 | 34 | provider = Torque(config=torque_config, channel=channel) 35 | 36 | x = provider.submit('''echo "sleeping" 37 | sleep 120 38 | echo "Done sleeping" ''', 1) 39 | time.sleep(3) 40 | 41 | y = provider.submit('''echo "sleeping" 42 | sleep 120 43 | echo "Done sleeping" ''', 1) 44 | time.sleep(3) 45 | 46 | stats = provider.status([x, y]) 47 | 48 | print("Trying to cancel : {0} {1}".format(x, y)) 49 | provider.cancel([x, y]) 50 | print(stats) 51 | 52 | 53 | if __name__ == "__main__": 54 | libsubmit.set_stream_logger() 55 | test_1() 56 | 
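The four SSH integration tests above all exercise the same provider lifecycle: open a channel, construct a provider, submit a block, poll its status, and cancel it. Below is a minimal sketch of that pattern using the keyword-based Torque constructor documented earlier in this listing; the hostname, username, queue name, and script directories are placeholder assumptions, and the remote side is assumed to have qsub on its PATH.

import time

import libsubmit
from libsubmit import SSHChannel, Torque

libsubmit.set_stream_logger()

# Placeholder host, user, and script_dir; substitute real values.
channel = SSHChannel("cluster.example.edu",
                     username="someuser",
                     script_dir="/home/someuser/parsl_scripts")

provider = Torque(channel=channel,
                  queue="debug",            # assumed queue name
                  nodes_per_block=1,
                  tasks_per_node=1,
                  init_blocks=1,
                  max_blocks=1,
                  walltime="00:05:00")

job_id = provider.submit('echo "sleeping"; sleep 60; echo "done"', 1)
time.sleep(3)

print(provider.status([job_id]))   # e.g. ['PENDING'] or ['RUNNING']
print(provider.cancel([job_id]))   # [True] once qdel succeeds

Note that submit() prefixes the job name with "parsl." plus a timestamp and returns the raw job id echoed by qsub, so the returned id can be passed straight back to status() and cancel().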
-------------------------------------------------------------------------------- /libsubmit/tests/test_providers/ec2/test_ec2.py: -------------------------------------------------------------------------------- 1 | import libsubmit 2 | from libsubmit import EC2Provider as ec2 3 | import time 4 | 5 | 6 | def test_1(): 7 | 8 | config = { 9 | "site": "ec2", 10 | "auth": { 11 | "foo": "foo" 12 | }, 13 | "execution": { 14 | "executor": "ipp", 15 | "provider": "ec2", 16 | "channel": None, 17 | "block": { 18 | "initBlocks": 1, 19 | "maxBlocks": 1, 20 | "minBlocks": 0, 21 | "taskBlocks": 1, 22 | "nodes": 1, 23 | "walltime": "00:25:00", 24 | "options": { 25 | "region": "us-east-2", 26 | "imageId": 'ami-82f4dae7', 27 | "stateFile": "awsproviderstate.json", 28 | "keyName": "parsl.test" 29 | } 30 | } 31 | } 32 | } 33 | 34 | provider = ec2(config=config, channel=None) 35 | 36 | x = provider.submit('''echo "sleeping" 37 | sleep 120 38 | echo "Done sleeping" ''', 1) 39 | time.sleep(3) 40 | 41 | y = provider.submit('''echo "sleeping" 42 | sleep 120 43 | echo "Done sleeping" ''', 1) 44 | time.sleep(3) 45 | 46 | stats = provider.status([x, y]) 47 | 48 | provider.cancel([x, y]) 49 | print(stats) 50 | 51 | 52 | def test_2(): 53 | 54 | config = { 55 | "site": "ec2", 56 | "auth": { 57 | "profile": "default" 58 | }, 59 | "execution": { 60 | "executor": "ipp", 61 | "provider": "ec2", 62 | "channel": None, 63 | "block": { 64 | "initBlocks": 1, 65 | "maxBlocks": 1, 66 | "minBlocks": 0, 67 | "taskBlocks": 1, 68 | "nodes": 1, 69 | "walltime": "00:25:00", 70 | "options": { 71 | "region": "us-east-2", 72 | "imageId": 'ami-82f4dae7', 73 | "stateFile": "awsproviderstate.json", 74 | "keyName": "parsl.test" 75 | } 76 | } 77 | } 78 | } 79 | 80 | provider = ec2(config=config, channel=None) 81 | 82 | x = provider.submit('''echo "sleeping" 83 | sleep 120 84 | echo "Done sleeping" ''', 1) 85 | time.sleep(3) 86 | 87 | y = provider.submit('''echo "sleeping" 88 | sleep 120 89 | echo "Done sleeping" ''', 1) 90 | time.sleep(3) 91 | 92 | print("X : ", x) 93 | print("Y : ", y) 94 | stats = provider.status([x, y]) 95 | print("Status : ", stats) 96 | 97 | provider.cancel([x, y]) 98 | 99 | # provider.teardown() 100 | 101 | 102 | if __name__ == "__main__": 103 | libsubmit.set_stream_logger() 104 | # test_1 () 105 | test_2() 106 | -------------------------------------------------------------------------------- /libsubmit/utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | def wtime_to_minutes(time_string): 5 | ''' wtime_to_minutes 6 | 7 | Convert standard wallclock time string to minutes. 8 | 9 | Args: 10 | - Time_string in HH:MM:SS format 11 | 12 | Returns: 13 | (int) minutes 14 | 15 | ''' 16 | hours, mins, seconds = time_string.split(':') 17 | return int(hours) * 60 + int(mins) + 1 18 | 19 | 20 | class RepresentationMixin(object): 21 | """A mixin class for adding a __repr__ method. 22 | 23 | The __repr__ method will return a string equivalent to the code used to instantiate 24 | the child class, with any defaults included explicitly. The __max_width__ class variable 25 | controls the maximum width of the representation string. If this width is exceeded, 26 | the representation string will be split up, with one argument or keyword argument per line. 27 | 28 | Any arguments or keyword arguments in the constructor must be defined as attributes, or 29 | an AttributeError will be raised. 
30 | 31 | Examples 32 | -------- 33 | >>> from libsubmit.utils import RepresentationMixin 34 | >>> class Foo(RepresentationMixin): 35 | def __init__(self, first, second, third='three', fourth='fourth'): 36 | self.first = first 37 | self.second = second 38 | self.third = third 39 | self.fourth = fourth 40 | >>> bar = Foo(1, 'two', fourth='baz') 41 | >>> bar 42 | Foo(1, 'two', third='three', fourth='baz') 43 | """ 44 | __max_width__ = 80 45 | 46 | def __repr__(self): 47 | argspec = inspect.getfullargspec(self.__init__) # getargspec is deprecated; getfullargspec is a drop-in here 48 | if len(argspec.args) > 1 and argspec.defaults: 49 | defaults = dict(zip(reversed(argspec.args), reversed(argspec.defaults))) 50 | else: 51 | defaults = {} 52 | 53 | for arg in argspec.args[1:]: 54 | if not hasattr(self, arg): 55 | template = 'class {} uses {} in the constructor, but does not define it as an attribute' 56 | raise AttributeError(template.format(self.__class__.__name__, arg)) 57 | 58 | args = [getattr(self, a) for a in argspec.args[1:len(argspec.args) - len(defaults)]] 59 | kwargs = {key: getattr(self, key) for key in defaults} 60 | 61 | def assemble_multiline(args, kwargs): 62 | def indent(text): 63 | lines = text.splitlines() 64 | if len(lines) <= 1: 65 | return text 66 | return "\n".join(" " + l for l in lines).strip() 67 | args = ["\n {},".format(indent(repr(a))) for a in args] 68 | kwargs = ["\n {}={}".format(k, indent(repr(v))) 69 | for k, v in sorted(kwargs.items())] 70 | 71 | info = "".join(args) + ", ".join(kwargs) 72 | return self.__class__.__name__ + "({}\n)".format(info) 73 | 74 | def assemble_line(args, kwargs): 75 | kwargs = ['{}={}'.format(k, repr(v)) for k, v in sorted(kwargs.items())] 76 | 77 | info = ", ".join([repr(a) for a in args] + kwargs) 78 | return self.__class__.__name__ + "({})".format(info) 79 | 80 | if len(assemble_line(args, kwargs)) <= self.__class__.__max_width__: 81 | return assemble_line(args, kwargs) 82 | else: 83 | return assemble_multiline(args, kwargs) 84 | -------------------------------------------------------------------------------- /libsubmit/version.py: -------------------------------------------------------------------------------- 1 | ''' Set module version 2 | <Major>.<Minor>.<maintenance>[-alpha/beta/..]
3 | ''' 4 | VERSION = '0.5.1-a3' 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | paramiko 2 | boto3 3 | azure-mgmt>=2.0.0 4 | python-novaclient 5 | google-api-python-client 6 | google-auth 7 | nbsphinx 8 | kubernetes>=6.0.0 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('libsubmit/version.py') as f: 4 | exec(f.read()) 5 | 6 | install_requires = [ 7 | 'paramiko' 8 | ] 9 | 10 | tests_require = [ 11 | 'paramiko', 12 | 'mock>=1.0.0', 13 | 'nose', 14 | 'pytest' 15 | ] 16 | 17 | setup( 18 | name='libsubmit', 19 | version=VERSION, 20 | description='Uniform interface to clouds, clusters, grids and supercomputers.', 21 | long_description='Submit, track and cancel arbitrary bash scripts on compute resources', 22 | url='https://github.com/Parsl/libsubmit', 23 | author='Yadu Nand Babuji', 24 | author_email='yadu@uchicago.edu', 25 | license='Apache 2.0', 26 | download_url = 'https://github.com/Parsl/libsubmit/archive/master.zip', 27 | package_data={'': ['LICENSE']}, 28 | packages=find_packages(), 29 | install_requires=install_requires, 30 | extras_require = { 31 | 'aws' : ['boto3'], 32 | 'azure' : ['azure-mgmt>=2.0.0', 'haikunator'], 33 | 'jetstream' : ['python-novaclient'] 34 | }, 35 | classifiers = [ 36 | # Maturity 37 | 'Development Status :: 3 - Alpha', 38 | # Intended audience 39 | 'Intended Audience :: Developers', 40 | # Licence, must match with licence above 41 | 'License :: OSI Approved :: Apache Software License', 42 | # Python versions supported 43 | 'Programming Language :: Python :: 3.5', 44 | 'Programming Language :: Python :: 3.6', 45 | ], 46 | keywords = ['Workflows', 'Scientific computing'], 47 | ) 48 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | nose 3 | pytest 4 | coverage 5 | mock>=1.0.0 6 | --------------------------------------------------------------------------------
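As a closing illustration of the two helpers defined in libsubmit/utils.py above, here is a small sketch; it assumes only that libsubmit is importable from this tree, and the Block class is a made-up example.

from libsubmit.utils import RepresentationMixin, wtime_to_minutes

# wtime_to_minutes always pads the result with one extra minute, so a
# 20-minute walltime becomes 21 scheduler minutes.
assert wtime_to_minutes("00:20:00") == 21
assert wtime_to_minutes("01:05:30") == 66   # 65 minutes + 1 of padding

# RepresentationMixin rebuilds the constructor call, defaults included,
# provided every constructor argument is stored as an attribute.
class Block(RepresentationMixin):
    def __init__(self, nodes, walltime="00:20:00"):
        self.nodes = nodes
        self.walltime = walltime

print(repr(Block(4)))   # Block(4, walltime='00:20:00')

The extras declared in setup.py work the usual way: for example, pip install libsubmit[aws] additionally pulls in boto3, and libsubmit[azure] adds azure-mgmt and haikunator.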