├── .gitattributes ├── copyright_header.txt ├── example_computer_setup ├── computers │ ├── ssh_transport.yaml │ ├── localhost.yaml │ └── my-cluster.yaml ├── README.md ├── new_profile.sh └── add_extras.py ├── environment.yml ├── CODE_OF_CONDUCT.md ├── SUPPORT.md ├── aiida_dynamic_workflows ├── _static_version.py ├── __init__.py ├── common │ ├── serialize.py │ ├── __init__.py │ ├── array.py │ └── mapspec.py ├── utils.py ├── step.py ├── samples.py ├── control.py ├── parsers.py ├── schedulers.py ├── _version.py ├── query.py ├── report.py ├── workchains.py ├── data.py ├── engine.py └── workflow.py ├── .github └── workflows │ └── ci-style.yaml ├── .pre-commit-config.yaml ├── setup.py ├── LICENSE ├── .gitignore ├── README.md ├── SECURITY.md ├── setup.cfg └── examples ├── 04-deleting-data.md ├── 01-calculations.md ├── 02-workflows.md └── 03-failures.md /.gitattributes: -------------------------------------------------------------------------------- 1 | aiida_dynamic_workflows/_static_version.py export-subst 2 | -------------------------------------------------------------------------------- /copyright_header.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 2 | Licensed under the MIT License. 3 | -------------------------------------------------------------------------------- /example_computer_setup/computers/ssh_transport.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # You can tweak these settings 3 | timeout: 120 # SSH connections can live for 2 minutes 4 | safe_interval: 0 # Can make as many SSH connections as we like 5 | compress: true # Compress files for transfer 6 | key_policy: AutoAddPolicy # Automatically adds missing hosts to known_hosts 7 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: dynamic-workflows-dev 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # General dependencies 6 | - python=3.9 7 | - graphviz 8 | # dev dependencies 9 | - pre-commit 10 | - ipykernel # for running notebooks 11 | # used in example notebooks 12 | - loky 13 | - toolz 14 | - pandas 15 | - numpy 16 | -------------------------------------------------------------------------------- /example_computer_setup/computers/localhost.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | description: "Local machine" 3 | label: "localhost" 4 | hostname: "localhost" 5 | transport: core.local 6 | scheduler: core.direct 7 | work_dir: "/home/{username}/.aiida_run" 8 | mpirun_command: "mpirun -np {tot_num_mpiprocs}" 9 | mpiprocs_per_machine: "1" 10 | shebang: "#!/bin/bash" 11 | prepend_text: " " 12 | append_text: " " 13 | -------------------------------------------------------------------------------- /example_computer_setup/README.md: -------------------------------------------------------------------------------- 1 | # Aiida profiles for use with aiida-dynamic-workflows 2 | 3 | aiida-dynamic-workflows assumes Conda is used to manage the Python environments 4 | on the Computers. 5 | 6 | To get started, modify the `hostname` and `work_dir` keys in `computers/my-cluster.yaml` 7 | to point to a Slurm cluster. 8 | Then run `./new_profile.sh <profile_name>` to create a new Aiida profile with that 9 | Computer set up.
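As a quick sanity check, a first run might look like the following sketch (the profile name `dev` is only an example; `my-cluster` is the label defined in `computers/my-cluster.yaml`):

```bash
# Create a new profile and register all computers defined in computers/
./new_profile.sh dev

# Confirm that the computers were set up and configured
verdi -p dev computer list
verdi -p dev computer show my-cluster
```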
10 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please file a GitHub Issue. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/_static_version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT License. 4 | 5 | # This file will be overwritten by setup.py when a source or binary 6 | # distribution is made. The magic value "__use_git__" is interpreted by 7 | # version.py. 8 | 9 | version = "__use_git__" 10 | 11 | # These values are only set if the distribution was created with 'git archive' 12 | refnames = "HEAD -> main, tag: v0.1.0" 13 | git_hash = "06e78f5" 14 | -------------------------------------------------------------------------------- /example_computer_setup/computers/my-cluster.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | description: "Cluster with Slurm installeed" 3 | label: "my-cluster" 4 | hostname: "headnode.mycluster.whatever" 5 | transport: core.ssh 6 | scheduler: "dynamic_workflows.slurm" 7 | work_dir: "/home/{username}/.aiida_run" 8 | mpirun_command: "mpirun -np {tot_num_mpiprocs}" 9 | mpiprocs_per_machine: "1" 10 | shebang: "#!/bin/bash" 11 | prepend_text: " " 12 | append_text: " " 13 | # Extra properties 14 | extras: 15 | # note: will be autodetected if not specified 16 | conda_dir: "/home/{username}/miniconda3" 17 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from . 
import ( 6 | calculations, 7 | common, 8 | control, 9 | data, 10 | engine, 11 | parsers, 12 | query, 13 | report, 14 | utils, 15 | workflow, 16 | ) 17 | from ._version import __version__ # noqa: F401 18 | from .samples import input_samples 19 | from .step import step 20 | 21 | __all__ = [ 22 | "calculations", 23 | "common", 24 | "control", 25 | "data", 26 | "engine", 27 | "input_samples", 28 | "parsers", 29 | "report", 30 | "query", 31 | "step", 32 | "utils", 33 | "workflow", 34 | "__version__", 35 | ] 36 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/common/serialize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import cloudpickle 6 | 7 | 8 | def read(name, opener=open): 9 | """Load file contents as a bytestring.""" 10 | with opener(name, "rb") as f: 11 | return f.read() 12 | 13 | 14 | loads = cloudpickle.loads 15 | dumps = cloudpickle.dumps 16 | 17 | 18 | def load(name, opener=open): 19 | """Load a cloudpickled object from the named file.""" 20 | with opener(name, "rb") as f: 21 | return cloudpickle.load(f) 22 | 23 | 24 | def dump(obj, name, opener=open): 25 | """Dump an object to the named file using cloudpickle.""" 26 | with opener(name, "wb") as f: 27 | cloudpickle.dump(obj, f) 28 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | # Common code used both by the plugin and by the runtime that wraps usercode. 6 | 7 | import importlib.resources 8 | 9 | from .array import FileBasedObjectArray 10 | from .mapspec import MapSpec 11 | from .serialize import dump, load 12 | 13 | __all__ = ["dump", "load", "FileBasedObjectArray", "MapSpec", "package_module_contents"] 14 | 15 | 16 | def package_module_contents(): 17 | """Yield (filename, contents) pairs for each module in this subpackage.""" 18 | for filename in importlib.resources.contents(__package__): 19 | if filename.endswith(".py"): 20 | yield filename, importlib.resources.read_text(__package__, filename) 21 | -------------------------------------------------------------------------------- /.github/workflows/ci-style.yaml: -------------------------------------------------------------------------------- 1 | name: continuous-integration-style 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | 9 | pre-commit: 10 | 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 30 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 3.9 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.9' 21 | 22 | # remove after aiida 2.0 is released 23 | - name: Install AiiDA development version 24 | run: pip install git+https://github.com/aiidateam/aiida-core.git@1890bab724956220c306bd9794457a5657739174 25 | 26 | - name: Install python dependencies 27 | run: | 28 | pip install pre-commit 29 | pip install -e . 
30 | pip freeze 31 | 32 | - name: Run pre-commit 33 | run: 34 | pre-commit run --all-files || ( git status --short ; git diff ; exit 1 ) 35 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: mixed-line-ending 8 | args: ['--fix=lf'] 9 | - repo: https://gitlab.com/pycqa/flake8 10 | rev: 4.0.1 11 | hooks: 12 | - id: flake8 13 | - repo: https://github.com/Lucas-C/pre-commit-hooks 14 | rev: v1.1.13 15 | hooks: 16 | - id: insert-license 17 | files: \.py$ 18 | args: 19 | - --license-filepath 20 | - copyright_header.txt 21 | - repo: https://github.com/ambv/black 22 | rev: 22.3.0 23 | hooks: 24 | - id: black 25 | language_version: python3.9 26 | - repo: https://github.com/timothycrosley/isort 27 | rev: 5.10.1 28 | hooks: 29 | - id: isort 30 | - repo: https://github.com/pycqa/pydocstyle 31 | rev: 6.1.1 32 | hooks: 33 | - id: pydocstyle 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from setuptools import setup 6 | 7 | 8 | def get_version_and_cmdclass(package_path): 9 | """Load version.py module without importing the whole package. 10 | 11 | Template code from miniver 12 | """ 13 | from importlib.util import module_from_spec, spec_from_file_location 14 | import os 15 | 16 | spec = spec_from_file_location("version", os.path.join(package_path, "_version.py")) 17 | module = module_from_spec(spec) 18 | spec.loader.exec_module(module) 19 | return module.__version__, module.cmdclass 20 | 21 | 22 | version, cmdclass = get_version_and_cmdclass("aiida_dynamic_workflows") 23 | 24 | # All other options are specified in 'setup.cfg'; the version has to be 25 | # determined dynamically from git tags (using 'miniver'), so it needs 26 | # to be done here. 27 | setup( 28 | version=version, 29 | cmdclass=cmdclass, 30 | ) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /example_computer_setup/new_profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | profile=$1 5 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 6 | 7 | if ! command -v verdi &> /dev/null 8 | then 9 | echo "'verdi' command not found: did you activate the Conda environment where Aiida is installed?" 10 | exit 1 11 | fi 12 | 13 | if [ -z "$profile" ] 14 | then 15 | echo "Usage: new_profile.sh " 16 | exit 1 17 | fi 18 | 19 | # Ensure profile is lowercase only 20 | 21 | function lowered () { 22 | echo $1 | tr '[:upper:]' '[:lower:]' 23 | } 24 | 25 | if [ "$profile" != "$(lowered $profile)" ] 26 | then 27 | echo "Profile name '$profile' is not lowercase" 28 | exit 1 29 | fi 30 | 31 | verdi quicksetup --profile $profile 32 | 33 | for config_file in "$SCRIPT_DIR"/computers/*.yaml; do 34 | computer=$(basename $config_file .yaml) 35 | 36 | # -n to use default values that are not included 37 | # in the config file (this includes "username"). 38 | verdi --profile $profile computer setup -n --config $config_file 39 | 40 | if [ $computer = localhost ]; then 41 | verdi --profile $profile computer configure core.local $computer -n --safe-interval 0 42 | else 43 | verdi --profile $profile computer configure core.ssh $computer -n --config "$SCRIPT_DIR/computers/ssh_transport.yaml" 44 | fi 45 | 46 | "$SCRIPT_DIR/add_extras.py" --profile $profile --config $config_file 47 | 48 | done 49 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from __future__ import annotations 5 | 6 | import asyncio 7 | from concurrent.futures import ThreadPoolExecutor 8 | from functools import partial 9 | from pathlib import Path 10 | import shutil 11 | from typing import Iterable 12 | 13 | from IPython.display import Image 14 | import aiida 15 | import graphviz 16 | from tqdm import tqdm 17 | 18 | 19 | def block_until_done(chain: aiida.orm.WorkChainNode, interval=1) -> int: 20 | """Block a running chain until an exit code is set. 21 | 22 | Parameters 23 | ---------- 24 | chain : aiida.orm.WorkChainNode 25 | interval : int, optional 26 | Checking interval, by default 1 27 | 28 | Returns 29 | ------- 30 | int 31 | Exit code. 32 | """ 33 | loop = asyncio.get_event_loop() 34 | 35 | async def wait_until_done(chain: aiida.orm.WorkChainNode) -> None: 36 | while chain.exit_status is None: 37 | await asyncio.sleep(interval) 38 | 39 | coro = wait_until_done(chain) 40 | loop.run_until_complete(coro) 41 | return chain.exit_status 42 | 43 | 44 | def render_png(g: graphviz.Digraph) -> Image: 45 | """Render 'graphviz.Digraph' as png.""" 46 | return Image(g.render(format="png")) 47 | 48 | 49 | def parallel_rmtree(dirs: Iterable[str | Path], with_tqdm: bool = True): 50 | """Apply 'shutil.rmtree' to 'dirs' in parallel using a thread pool.""" 51 | # Threadpool executor, as this task is IO bound. 
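# (A thread pool is enough here: shutil.rmtree spends its time in filesystem
# syscalls, which release the GIL, so deletions really do proceed concurrently;
# a process pool would only add pickling and startup overhead.)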
52 | rmtree = partial(shutil.rmtree, ignore_errors=True) 53 | with ThreadPoolExecutor() as tp: 54 | it = tp.map(rmtree, dirs) 55 | if with_tqdm: 56 | it = tqdm(it, total=len(dirs)) 57 | # Bare 'for' loop to force the map to complete. 58 | for _ in it: 59 | pass 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /example_computer_setup/add_extras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) Microsoft Corporation. 4 | # Licensed under the MIT License. 
5 | 6 | 7 | import aiida 8 | from aiida.cmdline.utils import echo 9 | import aiida.orm 10 | import click 11 | import yaml 12 | 13 | 14 | @click.command() 15 | @click.option("--profile", help="Aiida profile") 16 | @click.option("--config", required=True, help="Config file for computer") 17 | def main(profile, config): 18 | """Add extra properties to the computer defined in 'config'.""" 19 | aiida.load_profile(profile) 20 | 21 | with open(config) as f: 22 | config = yaml.safe_load(f) 23 | 24 | label = config["label"] 25 | 26 | echo.echo_info(f"Adding extra properties to computer {label}") 27 | 28 | extras = config.get("extras", dict()) 29 | 30 | computer = aiida.orm.load_computer(label) 31 | for k, v in extras.items(): 32 | computer.set_property(k, str(v)) 33 | computer.store() 34 | 35 | echo.echo_success(f"Added the following properties to {label}: {extras}") 36 | 37 | if "conda_dir" not in extras: 38 | echo.echo_info(f"Setting the conda directory for computer {label}") 39 | conda_dir = get_conda_dir(computer) 40 | computer.set_property("conda_dir", conda_dir) 41 | computer.store() 42 | 43 | echo.echo_success(f"Set the Conda directory on {label} to '{conda_dir}'") 44 | else: 45 | conda_dir = extras["conda_dir"] 46 | with computer.get_transport() as t: 47 | if not t.isdir(conda_dir): 48 | echo.echo_warning(f"'{conda_dir}' is not a directory on {label}") 49 | 50 | 51 | def get_conda_dir(computer): 52 | """Return the Conda directory for the given computer. 53 | 54 | First we try to determine the Conda directory automatically by 55 | activating the "base" environment and getting $CONDA_PREFIX. 56 | 57 | If that fails we simply prompt the user. 58 | """ 59 | label = computer.label 60 | with computer.get_transport() as t: 61 | rv, stdout, stderr = t.exec_command_wait( 62 | "set -e; conda activate base; echo $CONDA_PREFIX" 63 | ) 64 | conda_dir = stdout.strip() or None 65 | if not conda_dir: 66 | echo.echo_warning( 67 | "Failed to automatically determine Conda directory " 68 | f"for {label} (the computer said: '{stderr}')" 69 | ) 70 | 71 | while not conda_dir: 72 | x = click.prompt(f"Enter your conda directory for {label}") 73 | if t.isdir(x): 74 | conda_dir = x 75 | else: 76 | echo.echo_warning(f"'{x}' is not a directory on {label}") 77 | return conda_dir 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aiida-dynamic-workflows 2 | An AiiDA plugin for dynamically composing workflows from Python functions that run as CalcJobs. 3 | 4 | **This is experimental, pre-alpha software**. 5 | 6 | 7 | ## Prerequisites 8 | An environment where the _development_ version of AiiDA is installed. 9 | This plugin makes use of a bugfix on the development branch, which will 10 | not be included in an AiiDA release until v2.0. 11 | 12 | 13 | ## Installing 14 | As pre-alpha software, this package is **not** released on PyPI. 15 | Currently the only way to install the plugin is to clone the 16 | repository and use `pip`: 17 | ```bash 18 | pip install -e . 19 | ``` 20 | 21 | 22 | ## Initialization 23 | This plugin uses Conda for managing Python environments on remote computers. 24 | Any Computers that you use with this plugin must have a `conda_dir` property 25 | that contains an absolute path to the Conda directory on the machine 26 | (typically something like `/home/{username}/miniconda3`. 
27 | The `add_extras.py` script in `example_computer_setup/` can help you with this. 28 | 29 | 30 | ## Examples 31 | The [`examples/`](./examples) directory contains Jupyter notebooks that illustrate the main 32 | features of `aiida-dynamic-workflows`. The notebooks are in Markdown format, and so require 33 | the Jupyter plugin [jupytext](https://jupytext.readthedocs.io/en/latest/) in order to run them. 34 | 35 | 36 | ## Contributing 37 | 38 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 39 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 40 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 41 | 42 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 43 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 44 | provided by the bot. You will only need to do this once across all repos using our CLA. 45 | 46 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 47 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 48 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 49 | 50 | ## Trademarks 51 | 52 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 53 | trademarks or logos is subject to and must follow 54 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 55 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 56 | Any use of third-party trademarks or logos is subject to those third parties' policies. 57 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/step.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import copy 6 | from typing import Any, Callable, Dict, Optional, Tuple, Union 7 | 8 | import toolz 9 | 10 | from .data import PyFunction 11 | 12 | __all__ = ["step"] 13 | 14 | 15 | @toolz.curry 16 | def step( 17 | f: Callable, 18 | *, 19 | returns: Union[str, Tuple[str]] = "_return_value", 20 | resources: Optional[Dict[str, Any]] = None, 21 | ) -> PyFunction: 22 | """Construct a PyFunction from a Python function. 23 | 24 | This function is commonly used as a decorator. 25 | 26 | Parameters 27 | ---------- 28 | f 29 | The function to transform into a PyFunction. 30 | returns 31 | The name of the output of this function. 32 | If multiple names are provided, then 'f' is assumed to return 33 | as many values (as a tuple) as there are names. 34 | resources 35 | Optional specification of computational resources that this 36 | function needs. Possible resources are: "memory", "cores". 37 | "memory" must be a string containing an integer value followed 38 | by one of the following suffixes: "kB", "MB", "GB". 39 | "cores" must be a positive integer. 40 | 41 | Examples 42 | -------- 43 | >>> f = step(lambda x, y: x + y, returns="sum") 44 | >>> 45 | >>> @step(returns="other_sum", resources={"memory": "10GB", "cores": 2}) 46 | ... def g(x: int, y: int) -> int: 47 | ... return x + y 48 | ... 49 | >>> @step(returns=("a", "b")) 50 | ... def h(x): 51 | ... return (x + 1, x + 2) 52 | ... 53 | >>> 54 | """ 55 | # TODO: First query the Aiida DB to see if this function already exists. 56 | # This will require having a good hash for Python functions. 57 | # This is a hard problem.
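# (For instance, the cloudpickle bytes are not a stable identifier: they can
# differ across Python/cloudpickle versions even for unchanged source, so a
# source- or AST-based hash would likely be needed.)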
58 | if resources: 59 | _validate_resources(resources) 60 | 61 | node = PyFunction(func=f, returns=returns, resources=resources) 62 | node.store() 63 | return node 64 | 65 | 66 | def _validate_resources(resources) -> None: 67 | resources = copy.deepcopy(resources) 68 | if "memory" in resources: 69 | _validate_memory(resources.pop("memory")) 70 | if "cores" in resources: 71 | _validate_cores(resources.pop("cores")) 72 | if resources: 73 | raise ValueError(f"Unexpected resource specifications: {list(resources)}") 74 | 75 | 76 | def _validate_memory(memory: str): 77 | mem, unit = memory[:-2], memory[-2:] 78 | if not mem.isnumeric(): 79 | raise ValueError(f"Expected an integer amount of memory, got: '{mem}'") 80 | elif int(mem) == 0: 81 | raise ValueError("Cannot specify zero memory") 82 | valid_units = ("kB", "MB", "GB") 83 | if unit not in valid_units: 84 | raise ValueError( 85 | f"Invalid memory unit: '{unit}' (expected one of {valid_units})." 86 | ) 87 | 88 | 89 | def _validate_cores(cores: int): 90 | if int(cores) != cores: 91 | raise ValueError(f"Expected an integer number of cores, got: {cores}") 92 | elif cores <= 0: 93 | raise ValueError(f"Expected a positive number of cores, got: {cores}") 94 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = aiida_dynamic_workflows 3 | description = AiiDA plugins for dynamically defining workflows that execute Python functions 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | url = http://github.com/microsoft/aiida-dynamic-workflows 7 | author = Microsoft Quantum 8 | license = MIT 9 | license_file = LICENSE 10 | classifiers = 11 | Development Status :: 2 - Pre-Alpha 12 | License :: OSI Approved :: MIT License 13 | Framework :: AiiDA 14 | Intended Audience :: Science/Research 15 | Intended Audience :: Developers 16 | Programming Language :: Python :: 3 :: Only 17 | Topic :: Software Development :: Libraries :: Python Modules 18 | Operating System :: POSIX :: Linux 19 | keywords = aiida 20 | 21 | [options] 22 | packages = find: 23 | python_requires = >=3.8,<4 24 | install_requires = 25 | # TODO: Update dependency to Aiida 2.0, when it is released 26 | aiida-core==2.0.0b1 27 | toolz 28 | cloudpickle 29 | numpy 30 | graphviz 31 | tqdm 32 | setup_requires = 33 | reentry 34 | include_package_data = True 35 | reentry_register = true 36 | 37 | [options.entry_points] 38 | aiida.calculations = 39 | dynamic_workflows.PyCalcJob = aiida_dynamic_workflows.calculations:PyCalcJob 40 | dynamic_workflows.PyMapJob = aiida_dynamic_workflows.calculations:PyMapJob 41 | dynamic_workflows.merge_remote_arrays = aiida_dynamic_workflows.calculations:merge_remote_arrays 42 | aiida.parsers = 43 | dynamic_workflows.PyCalcParser = aiida_dynamic_workflows.parsers:PyCalcParser 44 | dynamic_workflows.PyMapParser = aiida_dynamic_workflows.parsers:PyMapParser 45 | aiida.data = 46 | dynamic_workflows.PyData = aiida_dynamic_workflows.data:PyData 47 | dynamic_workflows.PyArray = aiida_dynamic_workflows.data:PyArray 48 | dynamic_workflows.PyRemoteData = aiida_dynamic_workflows.data:PyRemoteData 49 | dynamic_workflows.PyRemoteArray = aiida_dynamic_workflows.data:PyRemoteArray 50 | dynamic_workflows.PyOutline = aiida_dynamic_workflows.data:PyOutline 51 | dynamic_workflows.PyFunction = aiida_dynamic_workflows.data:PyFunction 52 | dynamic_workflows.Nil = aiida_dynamic_workflows.data:Nil 53 |
dynamic_workflows.PyException = aiida_dynamic_workflows.data:PyException 54 | aiida.node = 55 | process.workflow.dynamic_workflows.WorkChainNode = aiida_dynamic_workflows.workchains:WorkChainNode 56 | aiida.schedulers = 57 | dynamic_workflows.slurm = aiida_dynamic_workflows.schedulers:SlurmSchedulerWithJobArray 58 | aiida.workflows = 59 | dynamic_workflows.PyWorkChain = aiida_dynamic_workflows.workflow:PyWorkChain 60 | dynamic_workflows.RestartedPyMapJob = aiida_dynamic_workflows.workchains:RestartedPyMapJob 61 | dynamic_workflows.RestartedPyCalcJob = aiida_dynamic_workflows.workchains:RestartedPyCalcJob 62 | 63 | [pydocstyle] 64 | inherit = False 65 | convention = numpy 66 | add-ignore = D100,D104,D105 67 | match = (?!test).*\.py 68 | 69 | [isort] 70 | force_sort_within_sections=True 71 | profile=black 72 | 73 | [flake8] 74 | ignore = E203, E266, W503 75 | max-line-length = 88 76 | max-complexity = 18 77 | select = B,C,E,F,W,T4,B9 78 | 79 | [coverage:run] 80 | omit = 81 | */tests/* 82 | */ipynb_filter.py 83 | */_static_version.py 84 | */_version.py 85 | */setup.py 86 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/samples.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import itertools 6 | from typing import Dict, Iterable, Optional, Tuple 7 | 8 | import aiida.orm 9 | import toolz 10 | 11 | from .calculations import PyCalcJob, PyMapJob 12 | from .common import MapSpec 13 | from .data import PyRemoteArray, from_aiida_type 14 | 15 | 16 | def input_samples(result: PyRemoteArray) -> Iterable[Dict]: 17 | """Return an iterable of samples, given a result from a PyMapJob. 18 | 19 | Parameters 20 | ---------- 21 | result 22 | The array resulting from the execution of a PyMapJob. 23 | 24 | Returns 25 | ------- 26 | An iterable of dictionaries, ordered as 'result' (flattened, if 27 | 'result' is a >1D array). Each dictionary has the same keys (the 28 | names of the parameters that produced 'result'). 29 | 30 | Examples 31 | -------- 32 | >>> import pandas as pd 33 | >>> # In the following we assume 'charge' is a PyRemoteArray output from a PyMapJob. 34 | >>> df = pd.DataFrame(input_samples(charge)) 35 | >>> # Add a 'charge' column showing the result associated with each sample. 36 | >>> df.assign(charge=charge.reshape(-1)) 37 | """ 38 | if result.creator is None: 39 | raise ValueError( 40 | "Cannot generate sample plan from data that was not produced from a CalcJob" 41 | ) 42 | job = result.creator 43 | if not issubclass(job.process_class, PyMapJob): 44 | raise TypeError("Expected data that was produced from a MapJob") 45 | output_axes = MapSpec.from_string(job.attributes["mapspec"]).output.axes 46 | sp = _parameter_spec(result) 47 | 48 | consts = {k: from_aiida_type(v) for k, (v, axes) in sp.items() if axes is None} 49 | mapped = { 50 | k: (from_aiida_type(v), axes) for k, (v, axes) in sp.items() if axes is not None 51 | } 52 | 53 | # This could be done more efficiently if we return instead a dictionary of arrays. 
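# (For example, one object array per parameter filled in a single pass; a plain
# generator of dicts is kept here so that pd.DataFrame(input_samples(...)), as
# in the docstring above, keeps working unchanged.)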
54 | 55 | for el in itertools.product(*map(range, result.shape)): 56 | el = dict(zip(output_axes, el)) 57 | d = {k: v[tuple(el[ax] for ax in axes)] for k, (v, axes) in mapped.items()} 58 | yield toolz.merge(consts, d) 59 | 60 | 61 | def _parameter_spec(result: aiida.orm.Data, axes: Optional[Tuple[str]] = None) -> Dict: 62 | """Return a dictionary specifying the parameters that produced a given 'result'. 63 | 64 | Parameters 65 | ---------- 66 | result 67 | Data produced from a PyCalcJob or PyMapJob. 68 | axes 69 | Labels for each axis of 'result', used to rename input axis labels. 70 | 71 | Returns 72 | ------- 73 | Dictionary mapping parameter names (strings) to pairs: (Aiida node, axis names). 74 | """ 75 | job = result.creator 76 | job_type = job.process_class 77 | 78 | if not issubclass(job_type, PyCalcJob): 79 | raise TypeError(f"Don't know what to do with {job_type}") 80 | 81 | if issubclass(job_type, PyMapJob): 82 | mapspec = MapSpec.from_string(job.attributes["mapspec"]) 83 | if axes: 84 | assert len(axes) == len(mapspec.output.axes) 85 | translation = dict(zip(mapspec.output.axes, axes)) 86 | else: 87 | translation = dict() 88 | input_axes = { 89 | spec.name: [translation.get(ax, ax) for ax in spec.axes] 90 | for spec in mapspec.inputs 91 | } 92 | else: 93 | input_axes = dict() 94 | assert axes is None 95 | 96 | kwargs = job.inputs.kwargs if hasattr(job.inputs, "kwargs") else {} 97 | # Inputs that were _not_ created by another CalcJob are the parameters we seek. 98 | parameters = {k: (v, input_axes.get(k)) for k, v in kwargs.items() if not v.creator} 99 | # Inputs that _were_ created by another Calcjob need to have 100 | # _their_ inputs inspected, in turn. 101 | other_inputs = [(v, input_axes.get(k)) for k, v in kwargs.items() if v.creator] 102 | upstream_params = [_parameter_spec(v, ax) for v, ax in other_inputs] 103 | 104 | return toolz.merge(parameters, *upstream_params) 105 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/control.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import subprocess 6 | import time 7 | from typing import Optional, Union 8 | 9 | from aiida import get_config_option 10 | from aiida.cmdline.commands.cmd_process import process_kill, process_pause, process_play 11 | from aiida.cmdline.utils import common, daemon, echo 12 | from aiida.engine.daemon.client import get_daemon_client 13 | from aiida.orm import ProcessNode, load_node 14 | 15 | 16 | def kill(process: Union[ProcessNode, int, str], timeout: int = 5) -> bool: 17 | """Kill the specified process. 18 | 19 | Params 20 | ------ 21 | process 22 | The process to kill. 23 | timeout 24 | Timeout (in seconds) to wait for confirmation that the process was killed. 25 | 26 | Returns 27 | ------- 28 | True only if the process is now terminated. 29 | """ 30 | process = _ensure_process_node(process) 31 | process_kill.callback([process], timeout=timeout, wait=True) 32 | return process.is_terminated 33 | 34 | 35 | def pause(process: Union[ProcessNode, int, str], timeout: int = 5) -> bool: 36 | """Pause the specified process. 37 | 38 | Paused processes will not continue execution, and can be unpaused later. 39 | 40 | Params 41 | ------ 42 | process 43 | The process to kill. 44 | timeout 45 | Timeout (in seconds) to wait for confirmation that the process was killed. 
46 | 47 | Returns 48 | ------- 49 | True only if the process is now paused. 50 | """ 51 | process = _ensure_process_node(process) 52 | if process.is_terminated: 53 | raise RuntimeError("Cannot pause terminated process {process.pk}.") 54 | process_pause.callback([process], all_entries=False, timeout=timeout, wait=True) 55 | return process.paused 56 | 57 | 58 | def unpause(process: Union[ProcessNode, int, str], timeout: int = 5) -> bool: 59 | """Unpause the specified process. 60 | 61 | Params 62 | ------ 63 | process 64 | The process to kill. 65 | timeout 66 | Timeout (in seconds) to wait for confirmation that the process was killed. 67 | 68 | Returns 69 | ------- 70 | True only if the process is now unpaused. 71 | """ 72 | process = _ensure_process_node(process) 73 | if process.is_terminated: 74 | raise RuntimeError("Cannot unpause terminated process {process.pk}.") 75 | process_play.callback([process], all_entries=False, timeout=timeout, wait=True) 76 | return not process.paused 77 | 78 | 79 | def ensure_daemon_restarted(n_workers: Optional[int] = None): 80 | """Restart the daemon (if it is running), or start it (if it is stopped). 81 | 82 | Parameters 83 | ---------- 84 | n_workers 85 | The number of daemon workers to start. If not provided, the default 86 | number of workers for this profile is used. 87 | 88 | Notes 89 | ----- 90 | If the daemon is running this is equivalent to running 91 | 'verdi daemon restart --reset', i.e. we fully restart the daemon, including 92 | the circus controller. This ensures that any changes in the environment are 93 | properly picked up by the daemon. 94 | """ 95 | client = get_daemon_client() 96 | n_workers = n_workers or get_config_option("daemon.default_workers") 97 | 98 | if client.is_daemon_running: 99 | echo.echo("Stopping the daemon...", nl=False) 100 | response = client.stop_daemon(wait=True) 101 | retcode = daemon.print_client_response_status(response) 102 | if retcode: 103 | raise RuntimeError(f"Problem restarting Aiida daemon: {response['status']}") 104 | 105 | echo.echo("Starting the daemon...", nl=False) 106 | 107 | # We have to run this in a subprocess because it daemonizes, and we do not 108 | # want to daemonize _this_ process. 109 | command = [ 110 | "verdi", 111 | "-p", 112 | client.profile.name, 113 | "daemon", 114 | "start-circus", 115 | str(n_workers), 116 | ] 117 | try: 118 | currenv = common.get_env_with_venv_bin() 119 | subprocess.check_output(command, env=currenv, stderr=subprocess.STDOUT) 120 | except subprocess.CalledProcessError as exception: 121 | echo.echo("FAILED", fg="red", bold=True) 122 | raise RuntimeError("Failed to start the daemon") from exception 123 | 124 | time.sleep(1) 125 | response = client.get_status() 126 | 127 | retcode = daemon.print_client_response_status(response) 128 | if retcode: 129 | raise RuntimeError(f"Problem starting Aiida daemon: {response['status']}") 130 | 131 | 132 | def _ensure_process_node(node_or_id: Union[ProcessNode, int, str]) -> ProcessNode: 133 | if isinstance(node_or_id, ProcessNode): 134 | return node_or_id 135 | else: 136 | return load_node(node_or_id) 137 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/parsers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | """Aiida Parsers for interpreting the output of arbitrary Python functions.""" 5 | 6 | import os.path 7 | 8 | import aiida.engine 9 | import aiida.parsers 10 | 11 | from . import common 12 | from .common import MapSpec 13 | from .data import PyRemoteArray, PyRemoteData, array_shape 14 | 15 | # TODO: unify 'PyCalcParser' and 'PyMapParser': they are identical except 16 | # for the type of the outputs (PyRemoteData vs. PyRemoteArray). 17 | 18 | 19 | class PyCalcParser(aiida.parsers.Parser): 20 | """Parser for a PyCalcJob.""" 21 | 22 | def parse(self, **kwargs): # noqa: D102 23 | 24 | calc = self.node 25 | 26 | def retrieve(value_file): 27 | # No actual retrieval occurs; we just store a reference 28 | # to the remote value. 29 | return PyRemoteData.from_remote_data( 30 | calc.outputs.remote_folder, 31 | value_file, 32 | ) 33 | 34 | exception_file = "__exception__.pickle" 35 | remote_folder = calc.outputs["remote_folder"] 36 | remote_files = remote_folder.listdir() 37 | has_exception = exception_file in remote_files 38 | 39 | exit_code = None 40 | 41 | # If any data was produced we create the appropriate outputs. 42 | # If something went wrong the exit code will still be non-zero. 43 | output_folder = remote_folder.listdir("__return_values__") 44 | for r in calc.inputs.func.returns: 45 | filename = f"{r}.pickle" 46 | path = os.path.join("__return_values__", filename) 47 | if filename in output_folder: 48 | self.out(f"return_values.{r}", retrieve(path)) 49 | else: 50 | exit_code = self.exit_codes.MISSING_OUTPUT 51 | 52 | try: 53 | job_infos = calc.computer.get_scheduler().parse_detailed_job_info( 54 | calc.get_detailed_job_info() 55 | ) 56 | except AttributeError: 57 | pass 58 | else: 59 | (job_info,) = job_infos 60 | if job_info["State"] == "FAILED": 61 | exit_code = self.exit_codes.NONZERO_EXIT_CODE 62 | 63 | if has_exception: 64 | self.out("exception", retrieve(exception_file)) 65 | exit_code = self.exit_codes.USER_CODE_RAISED 66 | 67 | if exit_code is not None: 68 | calc.set_exit_status(exit_code.status) 69 | calc.set_exit_message(exit_code.message) 70 | return exit_code 71 | 72 | 73 | class PyMapParser(aiida.parsers.Parser): 74 | """Parser for a PyMapJob.""" 75 | 76 | def parse(self, **kwargs): # noqa: D102 77 | 78 | calc = self.node 79 | 80 | mapspec = MapSpec.from_string(calc.get_option("mapspec")) 81 | mapped_parameter_shapes = { 82 | k: array_shape(v) 83 | for k, v in calc.inputs.kwargs.items() 84 | if k in mapspec.parameters 85 | } 86 | expected_shape = mapspec.shape(mapped_parameter_shapes) 87 | remote_folder = calc.outputs["remote_folder"] 88 | has_exceptions = bool(remote_folder.listdir("__exceptions__")) 89 | 90 | def retrieve(return_value_name): 91 | return PyRemoteArray( 92 | computer=calc.computer, 93 | remote_path=os.path.join( 94 | calc.outputs.remote_folder.get_remote_path(), 95 | return_value_name, 96 | ), 97 | shape=expected_shape, 98 | filename_template=common.array.filename_template, 99 | ) 100 | 101 | exit_code = None 102 | 103 | # If any data was produced we create the appropriate outputs. 104 | # Users can still tell something went wrong from the exit code. 
105 | for r in calc.inputs.func.returns: 106 | path = os.path.join("__return_values__", r) 107 | has_data = remote_folder.listdir(path) 108 | if has_data: 109 | self.out(f"return_values.{r}", retrieve(path)) 110 | else: 111 | exit_code = self.exit_codes.MISSING_OUTPUT 112 | 113 | try: 114 | job_infos = calc.computer.get_scheduler().parse_detailed_job_info( 115 | calc.get_detailed_job_info() 116 | ) 117 | except AttributeError: 118 | pass 119 | else: 120 | if any(j["State"] == "FAILED" for j in job_infos): 121 | exit_code = self.exit_codes.NONZERO_EXIT_CODE 122 | 123 | if has_exceptions: 124 | self.out("exception", retrieve("__exceptions__")) 125 | exit_code = self.exit_codes.USER_CODE_RAISED 126 | 127 | if exit_code is not None: 128 | calc.set_exit_status(exit_code.status) 129 | calc.set_exit_message(exit_code.message) 130 | return exit_code 131 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/common/array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import concurrent.futures 6 | import functools 7 | import itertools 8 | import operator 9 | import pathlib 10 | from typing import Any, List, Sequence, Tuple 11 | 12 | import numpy as np 13 | 14 | from . import serialize 15 | 16 | filename_template = "__{:d}__.pickle" 17 | 18 | 19 | class FileBasedObjectArray: 20 | """Array interface to a folder of files on disk. 21 | 22 | __getitem__ returns "np.ma.masked" for non-existant files. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | folder, 28 | shape, 29 | strides=None, 30 | filename_template=filename_template, 31 | ): 32 | self.folder = pathlib.Path(folder).absolute() 33 | self.shape = tuple(shape) 34 | self.strides = _make_strides(self.shape) if strides is None else tuple(strides) 35 | self.filename_template = str(filename_template) 36 | 37 | @property 38 | def size(self) -> int: 39 | """Return number of elements in the array.""" 40 | return functools.reduce(operator.mul, self.shape, 1) 41 | 42 | @property 43 | def rank(self) -> int: 44 | """Return the rank of the array.""" 45 | return len(self.shape) 46 | 47 | def _normalize_key(self, key: Tuple[int, ...]) -> Tuple[int, ...]: 48 | if not isinstance(key, tuple): 49 | key = (key,) 50 | if len(key) != self.rank: 51 | raise IndexError( 52 | f"too many indices for array: array is {self.rank}-dimensional, " 53 | "but {len(key)} were indexed" 54 | ) 55 | 56 | if any(isinstance(k, slice) for k in key): 57 | raise NotImplementedError("Cannot yet slice subarrays") 58 | 59 | normalized_key = [] 60 | for axis, k in enumerate(key): 61 | axis_size = self.shape[axis] 62 | normalized_k = k if k >= 0 else (axis_size - k) 63 | if not (0 <= normalized_k < axis_size): 64 | raise IndexError( 65 | "index {k} is out of bounds for axis {axis} with size {axis_size}" 66 | ) 67 | normalized_key.append(k) 68 | 69 | return tuple(normalized_key) 70 | 71 | def _index_to_file(self, index: int) -> pathlib.Path: 72 | """Return the filename associated with the given index.""" 73 | return self.folder / self.filename_template.format(index) 74 | 75 | def _key_to_file(self, key: Tuple[int, ...]) -> pathlib.Path: 76 | """Return the filename associated with the given key.""" 77 | index = sum(k * s for k, s in zip(key, self.strides)) 78 | return self._index_to_file(index) 79 | 80 | def _files(self): 81 | """Yield all the filenames that constitute the data in this array.""" 82 | return 
map(self._key_to_file, itertools.product(*map(range, self.shape))) 83 | 84 | def __getitem__(self, key): 85 | key = self._normalize_key(key) 86 | if any(isinstance(x, slice) for x in key): 87 | # XXX: need to figure out strides in order to implement this. 88 | raise NotImplementedError("Cannot yet slice subarrays") 89 | 90 | f = self._key_to_file(key) 91 | if not f.is_file(): 92 | return np.ma.core.masked 93 | return serialize.load(f) 94 | 95 | def to_array(self) -> np.ma.core.MaskedArray: 96 | """Return a masked numpy array containing all the data. 97 | 98 | The returned numpy array has dtype "object" and a mask for 99 | masking out missing data. 100 | """ 101 | items = _load_all(map(self._index_to_file, range(self.size))) 102 | mask = [not self._index_to_file(i).is_file() for i in range(self.size)] 103 | return np.ma.array(items, mask=mask, dtype=object).reshape(self.shape) 104 | 105 | def dump(self, key, value): 106 | """Dump 'value' into the file associated with 'key'. 107 | 108 | Examples 109 | -------- 110 | >>> arr = FileBasedObjectArray(...) 111 | >>> arr.dump((2, 1, 5), dict(a=1, b=2)) 112 | """ 113 | key = self._normalize_key(key) 114 | if not any(isinstance(x, slice) for x in key): 115 | return serialize.dump(value, self._key_to_file(key)) 116 | 117 | raise NotImplementedError("Cannot yet dump subarrays") 118 | 119 | 120 | def _tails(seq): 121 | while seq: 122 | seq = seq[1:] 123 | yield seq 124 | 125 | 126 | def _make_strides(shape): 127 | return tuple(functools.reduce(operator.mul, s, 1) for s in _tails(shape)) 128 | 129 | 130 | def _load_all(filenames: Sequence[str]) -> List[Any]: 131 | def maybe_read(f): 132 | return serialize.read(f) if f.is_file() else None 133 | 134 | def maybe_load(x): 135 | return serialize.loads(x) if x is not None else None 136 | 137 | # Delegate file reading to the threadpool but deserialize sequentially, 138 | # as this is pure Python and CPU bound 139 | with concurrent.futures.ThreadPoolExecutor() as tex: 140 | return [maybe_load(x) for x in tex.map(maybe_read, filenames)] 141 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/schedulers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from collections.abc import Mapping 6 | import datetime 7 | from typing import List, Optional, T 8 | 9 | from aiida.common.lang import type_check 10 | from aiida.schedulers import JobInfo, JobState 11 | from aiida.schedulers.plugins.slurm import SlurmScheduler 12 | import toolz 13 | 14 | __all__ = ["SlurmSchedulerWithJobArray"] 15 | 16 | 17 | class SlurmSchedulerWithJobArray(SlurmScheduler): 18 | """A Slurm scheduler that reports only a single JobInfo for job arrays.""" 19 | 20 | def _parse_joblist_output(self, retval, stdout, stderr): 21 | # Aiida assumes that there is a single job associated with each call 22 | # to 'sbatch', but this is not true in the case of job arrays. 23 | # In order to meet this requirement we merge the JobInfos for each job 24 | # in the array. 25 | return merge_job_arrays(super()._parse_joblist_output(retval, stdout, stderr)) 26 | 27 | # Return only the necessary fields for 'parse_output' to do its job. 28 | # Our fat array jobs mean the response from 'sacct' can be pretty huge. 
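# With --parsable2, sacct prints one pipe-separated row per array element,
# e.g. (hypothetical job): 1234_7|0:0|COMPLETED|None|00:03:21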
29 | _detailed_job_info_fields = [ 30 | "JobID", 31 | "ExitCode", 32 | "State", 33 | "Reason", 34 | "CPUTime", 35 | ] 36 | 37 | def _get_detailed_job_info_command(self, job_id): 38 | fields = ",".join(self._detailed_job_info_fields) 39 | # --parsable2 separates fields with pipes, with no trailing pipe 40 | return f"sacct --format={fields} --parsable2 --jobs={job_id}" 41 | 42 | @classmethod 43 | def parse_detailed_job_info(cls, detailed_job_info): 44 | """Parse output from 'sacct', issued after the completion of the job.""" 45 | type_check(detailed_job_info, dict) 46 | 47 | retval = detailed_job_info["retval"] 48 | if retval != 0: 49 | stderr = detailed_job_info["stderr"] 50 | raise ValueError(f"Error code {retval} returned by 'sacct': {stderr}") 51 | 52 | try: 53 | detailed_stdout = detailed_job_info["stdout"] 54 | except KeyError: 55 | raise ValueError( 56 | "the `detailed_job_info` does not contain the required key `stdout`." 57 | ) 58 | 59 | type_check(detailed_stdout, str) 60 | 61 | lines = detailed_stdout.splitlines() 62 | 63 | try: 64 | fields, *job_infos = lines 65 | except IndexError: 66 | raise ValueError("`detailed_job_info.stdout` does not contain enough lines") 67 | fields = fields.split("|") 68 | 69 | if fields != cls._detailed_job_info_fields: 70 | raise ValueError( 71 | "Fields returned by 'sacct' do not match fields specified." 72 | ) 73 | 74 | # Parse the individual job outputs 75 | job_infos = [dict(zip(fields, info.split("|"))) for info in job_infos] 76 | # Each job has a 'batch' entry also, which we ignore 77 | job_infos = [j for j in job_infos if not j["JobID"].endswith(".batch")] 78 | 79 | return job_infos 80 | 81 | def parse_output(self, detailed_job_info, stdout, stderr): 82 | """Parse output from 'sacct', issued after the completion of the job.""" 83 | from aiida.engine import CalcJob 84 | 85 | job_infos = self.parse_detailed_job_info(detailed_job_info) 86 | 87 | # TODO: figure out how to return richer information to the calcjob, so 88 | # that a workchain could in principle reschedule with only the 89 | # failed jobs. 
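# (Slurm can resubmit a subset of an array, e.g. 'sbatch --array=3,7 job.sh',
# so exposing the indices of the failed elements would be enough for that.)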
90 | if any(j["State"] == "OUT_OF_MEMORY" for j in job_infos): 91 | return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY 92 | if any(j["State"] == "TIMEOUT" for j in job_infos): 93 | return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME 94 | 95 | 96 | def merge_job_arrays(jobs: List[JobInfo]) -> List[JobInfo]: 97 | """Merge JobInfos from jobs in the same Slurm Array into a single JobInfo.""" 98 | mergers = { 99 | "job_id": toolz.compose(job_array_id, toolz.first), 100 | "dispatch_time": min, 101 | "finish_time": toolz.compose( 102 | max, 103 | toolz.curried.map(with_default(datetime.datetime.min)), 104 | ), 105 | "job_state": total_job_state, 106 | "raw_data": toolz.identity, 107 | } 108 | 109 | job_array_id_from_info = toolz.compose( 110 | job_array_id, toolz.functoolz.attrgetter("job_id") 111 | ) 112 | 113 | return [ 114 | merge_with_functions(*jobs, mergers=mergers, factory=JobInfo) 115 | for jobs in toolz.groupby(job_array_id_from_info, jobs).values() 116 | ] 117 | 118 | 119 | def total_job_state(states: List[JobState]) -> JobState: 120 | # Order is important here 121 | possible_states = [ 122 | JobState.UNDETERMINED, 123 | JobState.RUNNING, 124 | JobState.SUSPENDED, 125 | JobState.QUEUED_HELD, 126 | JobState.QUEUED, 127 | ] 128 | for ps in possible_states: 129 | if any(state == ps for state in states): 130 | return ps 131 | 132 | if all(state == JobState.DONE for state in states): 133 | return JobState.DONE 134 | else: 135 | raise RuntimeError("Invalid state encountered") 136 | 137 | 138 | def job_array_id(job_id: str) -> str: 139 | """Return the ID of the associated array job. 140 | 141 | If the provided job is not part of a job array then 142 | the job ID is returned. 143 | """ 144 | return toolz.first(job_id.split("_")) 145 | 146 | 147 | @toolz.curry 148 | def with_default(default: T, v: Optional[T]) -> T: 149 | """Return 'v' if it is not 'None', otherwise return 'default'.""" 150 | return default if v is None else v 151 | 152 | 153 | def merge_with_functions(*dicts, mergers, factory=dict): 154 | """Merge 'dicts', using 'mergers'. 155 | 156 | Parameters 157 | ---------- 158 | *dicts 159 | The dictionaries / mappings to merge 160 | mergers 161 | Mapping from keys in 'dicts' to functions. Each function 162 | accepts a list of values and returns a single value. 163 | factory 164 | Function that returns a new instance of the mapping 165 | type that we would like returned 166 | 167 | Examples 168 | -------- 169 | >>> merge_with_functions( 170 | ... {"a": 1, "b": 10, "c": "hello"}, 171 | ... {"a": 5, "b": 20, "c": "goodbye"}, 172 | ... mergers={"a": min, "b": max}, 173 | ... ) 174 | {"a": 1, "b": 20, "c": "goodbye"} 175 | """ 176 | if len(dicts) == 1 and not isinstance(dicts[0], Mapping): 177 | dicts = dicts[0] 178 | 179 | result = factory() 180 | for d in dicts: 181 | for k, v in d.items(): 182 | if k not in result: 183 | result[k] = [v] 184 | else: 185 | result[k].append(v) 186 | return toolz.itemmap( 187 | lambda kv: (kv[0], mergers.get(kv[0], toolz.last)(kv[1])), result, factory 188 | ) 189 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/_version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT License. 
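# Version-from-git machinery based on the 'miniver' template (see the note in
# setup.py); the package version is derived from git tags at build time.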
4 | 5 | 6 | from collections import namedtuple 7 | import os 8 | import subprocess 9 | 10 | from setuptools.command.build_py import build_py as build_py_orig 11 | from setuptools.command.sdist import sdist as sdist_orig 12 | 13 | Version = namedtuple("Version", ("release", "dev", "labels")) 14 | 15 | # No public API 16 | __all__ = [] 17 | 18 | package_root = os.path.dirname(os.path.realpath(__file__)) 19 | package_name = os.path.basename(package_root) 20 | distr_root = os.path.dirname(package_root) 21 | # If the package is inside a "src" directory the 22 | # distribution root is 1 level up. 23 | if os.path.split(distr_root)[1] == "src": 24 | _package_root_inside_src = True 25 | distr_root = os.path.dirname(distr_root) 26 | else: 27 | _package_root_inside_src = False 28 | 29 | STATIC_VERSION_FILE = "_static_version.py" 30 | 31 | 32 | def get_version(version_file=STATIC_VERSION_FILE): 33 | version_info = get_static_version_info(version_file) 34 | version = version_info["version"] 35 | if version == "__use_git__": 36 | version = get_version_from_git() 37 | if not version: 38 | version = get_version_from_git_archive(version_info) 39 | if not version: 40 | version = Version("unknown", None, None) 41 | return pep440_format(version) 42 | else: 43 | return version 44 | 45 | 46 | def get_static_version_info(version_file=STATIC_VERSION_FILE): 47 | version_info = {} 48 | with open(os.path.join(package_root, version_file), "rb") as f: 49 | exec(f.read(), {}, version_info) 50 | return version_info 51 | 52 | 53 | def version_is_from_git(version_file=STATIC_VERSION_FILE): 54 | return get_static_version_info(version_file)["version"] == "__use_git__" 55 | 56 | 57 | def pep440_format(version_info): 58 | release, dev, labels = version_info 59 | 60 | version_parts = [release] 61 | if dev: 62 | if release.endswith("-dev") or release.endswith(".dev"): 63 | version_parts.append(dev) 64 | else: # prefer PEP440 over strict adhesion to semver 65 | version_parts.append(".dev{}".format(dev)) 66 | 67 | if labels: 68 | version_parts.append("+") 69 | version_parts.append(".".join(labels)) 70 | 71 | return "".join(version_parts) 72 | 73 | 74 | def get_version_from_git(): 75 | try: 76 | p = subprocess.Popen( 77 | ["git", "rev-parse", "--show-toplevel"], 78 | cwd=distr_root, 79 | stdout=subprocess.PIPE, 80 | stderr=subprocess.PIPE, 81 | ) 82 | except OSError: 83 | return 84 | if p.wait() != 0: 85 | return 86 | if not os.path.samefile(p.communicate()[0].decode().rstrip("\n"), distr_root): 87 | # The top-level directory of the current Git repository is not the same 88 | # as the root directory of the distribution: do not extract the 89 | # version from Git. 90 | return 91 | 92 | # git describe --first-parent does not take into account tags from branches 93 | # that were merged-in. The '--long' flag gets us the 'dev' version and 94 | # git hash, '--always' returns the git hash even if there are no tags. 
95 | for opts in [["--first-parent"], []]: 96 | try: 97 | p = subprocess.Popen( 98 | ["git", "describe", "--long", "--always"] + opts, 99 | cwd=distr_root, 100 | stdout=subprocess.PIPE, 101 | stderr=subprocess.PIPE, 102 | ) 103 | except OSError: 104 | return 105 | if p.wait() == 0: 106 | break 107 | else: 108 | return 109 | 110 | description = ( 111 | p.communicate()[0] 112 | .decode() 113 | .strip("v") # Tags can have a leading 'v', but the version should not 114 | .rstrip("\n") 115 | .rsplit("-", 2) # Split the latest tag, commits since tag, and hash 116 | ) 117 | 118 | try: 119 | release, dev, git = description 120 | except ValueError: # No tags, only the git hash 121 | # prepend 'g' to match with format returned by 'git describe' 122 | git = "g{}".format(*description) 123 | release = "unknown" 124 | dev = None 125 | 126 | labels = [] 127 | if dev == "0": 128 | dev = None 129 | else: 130 | labels.append(git) 131 | 132 | try: 133 | p = subprocess.Popen(["git", "diff", "--quiet"], cwd=distr_root) 134 | except OSError: 135 | labels.append("confused") # This should never happen. 136 | else: 137 | if p.wait() == 1: 138 | labels.append("dirty") 139 | 140 | return Version(release, dev, labels) 141 | 142 | 143 | # TODO: change this logic when there is a git pretty-format 144 | # that gives the same output as 'git describe'. 145 | # Currently we can only tell the tag the current commit is 146 | # pointing to, or its hash (with no version info) 147 | # if it is not tagged. 148 | def get_version_from_git_archive(version_info): 149 | try: 150 | refnames = version_info["refnames"] 151 | git_hash = version_info["git_hash"] 152 | except KeyError: 153 | # These fields are not present if we are running from an sdist. 154 | # Execution should never reach here, though 155 | return None 156 | 157 | if git_hash.startswith("$Format") or refnames.startswith("$Format"): 158 | # variables not expanded during 'git archive' 159 | return None 160 | 161 | VTAG = "tag: v" 162 | refs = set(r.strip() for r in refnames.split(",")) 163 | version_tags = set(r[len(VTAG) :] for r in refs if r.startswith(VTAG)) 164 | if version_tags: 165 | release, *_ = sorted(version_tags) # prefer e.g. "2.0" over "2.0rc1" 166 | return Version(release, dev=None, labels=None) 167 | else: 168 | return Version("unknown", dev=None, labels=["g{}".format(git_hash)]) 169 | 170 | 171 | __version__ = get_version() 172 | 173 | 174 | # The following section defines a module global 'cmdclass', 175 | # which can be used from setup.py. The 'package_name' and 176 | # '__version__' module globals are used (but not modified). 177 | 178 | 179 | def _write_version(fname): 180 | # This could be a hard link, so try to delete it first. Is there any way 181 | # to do this atomically together with opening? 
182 | try: 183 | os.remove(fname) 184 | except OSError: 185 | pass 186 | with open(fname, "w") as f: 187 | f.write( 188 | "# This file has been created by setup.py.\n" 189 | "version = '{}'\n".format(__version__) 190 | ) 191 | 192 | 193 | class _build_py(build_py_orig): 194 | def run(self): 195 | super().run() 196 | _write_version(os.path.join(self.build_lib, package_name, STATIC_VERSION_FILE)) 197 | 198 | 199 | class _sdist(sdist_orig): 200 | def make_release_tree(self, base_dir, files): 201 | super().make_release_tree(base_dir, files) 202 | if _package_root_inside_src: 203 | p = os.path.join("src", package_name) 204 | else: 205 | p = package_name 206 | _write_version(os.path.join(base_dir, p, STATIC_VERSION_FILE)) 207 | 208 | 209 | cmdclass = dict(sdist=_sdist, build_py=_build_py) 210 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/query.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from __future__ import annotations 5 | 6 | import datetime 7 | import itertools 8 | import multiprocessing 9 | from pathlib import Path 10 | 11 | import aiida.common 12 | import aiida.engine 13 | import aiida.manage.configuration 14 | import aiida.orm 15 | 16 | from .data import PyRemoteArray, PyRemoteData 17 | from .workflow import PyWorkChain 18 | 19 | 20 | def workflows() -> aiida.orm.QueryBuilder: 21 | """Return an Aiida database query that will return all workflows.""" 22 | q = aiida.orm.QueryBuilder() 23 | q.append(cls=PyWorkChain, tag="flow") 24 | q.order_by({"flow": [{"ctime": {"order": "desc"}}]}) 25 | return q 26 | 27 | 28 | def running_workflows() -> aiida.orm.QueryBuilder: 29 | """Return an Aiida database query that will return all running workflows.""" 30 | r = workflows() 31 | r.add_filter( 32 | "flow", 33 | { 34 | "attributes.process_state": { 35 | "in": [ 36 | aiida.engine.ProcessState.RUNNING.value, 37 | aiida.engine.ProcessState.WAITING.value, 38 | ], 39 | } 40 | }, 41 | ) 42 | return r 43 | 44 | 45 | def recent_workflows( 46 | days: int = 0, hours: int = 0, minutes: int = 0 47 | ) -> aiida.orm.QueryBuilder: 48 | """Return an Aiida database query for all recently started workflows. 49 | 50 | Parameters 51 | ---------- 52 | days, hours, minutes 53 | Any workflows started more recently than this many days/minutes/hours 54 | will be included in the result of the query. 55 | """ 56 | delta = aiida.common.timezone.now() - datetime.timedelta( 57 | days=days, hours=hours, minutes=minutes 58 | ) 59 | r = workflows() 60 | r.add_filter("flow", {"ctime": {">": delta}}) 61 | return r 62 | 63 | 64 | def remote_files( 65 | profile: str | None = None, 66 | root: str | Path | None = None, 67 | ) -> set[Path]: 68 | """Return the paths of all RemoteData for the given profile. 69 | 70 | Parameters 71 | ---------- 72 | profile 73 | The profile name for which to return the UUIDs. 74 | If not provided, runs on the currently loaded profile. 75 | root 76 | If provided, return only sub-paths of this root path. 77 | 78 | Notes 79 | ----- 80 | As Paths are returned without any information about what computer 81 | the path refers to, this function is only useful in environments 82 | where the Paths are globally unique. 
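
    Examples
    --------
    Illustrative output only; the actual paths depend on the computer's
    configured 'work_dir'.

    >>> remote_files(root="/path/to/my-user/.aiida_run")
    {PosixPath('/path/to/my-user/.aiida_run/00/24/abc2-899c-4106-8c8e-74638dbdd71c')}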
83 | """ 84 | if profile: 85 | aiida.load_profile(profile) 86 | 87 | # PyRemoteData and PyRemoteArray are not in the 'data.core.remote' 88 | # plugin path, so 'query.append' does not include them when querying 89 | # for 'aiida.orm.RemoteData', despite the fact that they do subclass it. 90 | remote_data = [aiida.orm.RemoteData, PyRemoteArray, PyRemoteData] 91 | 92 | query = aiida.orm.QueryBuilder() 93 | query.append(cls=remote_data, project="attributes.remote_path", tag="files") 94 | if root: 95 | root = Path(root).absolute() 96 | query.add_filter("files", {"attributes.remote_path": {"like": f"{root}%"}}) 97 | 98 | return {Path(p) for p, in query.iterall()} 99 | 100 | 101 | # Needs to be importable to be used with multiprocessing in 'referenced_remote_files' 102 | def _run_on_q(f, q, *args): 103 | try: 104 | r = f(*args) 105 | except Exception as e: 106 | q.put(("error", e)) 107 | else: 108 | q.put(("ok", r)) 109 | 110 | 111 | def referenced_remote_files(root: str | Path | None = None) -> set[Path]: 112 | """Return the paths of all RemoteData for all profiles. 113 | 114 | Parameters 115 | ---------- 116 | root 117 | If provided, return only sub-paths of this root path. 118 | 119 | Notes 120 | ----- 121 | As Paths are returned without any information about what computer 122 | the path refers to, this function is only useful in environments 123 | where the Paths are globally unique. 124 | """ 125 | # Loading different AiiDA profiles requires starting a fresh Python interpreter. 126 | # For this reason we cannot use concurrent.futures, and must use bare 127 | # multiprocessing. 128 | # TODO: revisit whether this is necessary when AiiDA 2.0 is released 129 | ctx = multiprocessing.get_context("spawn") 130 | q = ctx.Queue() 131 | profiles = aiida.manage.configuration.get_config().profile_names 132 | procs = [ 133 | ctx.Process(target=_run_on_q, args=(remote_files, q, p, root)) for p in profiles 134 | ] 135 | for proc in procs: 136 | proc.start() 137 | for proc in procs: 138 | proc.join() 139 | 140 | results = [q.get() for _ in range(q.qsize())] 141 | if errors := [e for status, e in results if status != "ok"]: 142 | raise ValueError(f"One or more processes errored: {errors}") 143 | 144 | return set(itertools.chain.from_iterable(r for _, r in results)) 145 | 146 | 147 | def referenced_work_directories(root: str | Path) -> set[Path]: 148 | """Return all calcjob working directories referenced in the AiiDA database. 149 | 150 | Notes 151 | ----- 152 | As Paths are returned without any information about what computer 153 | the path refers to, this function is only useful in environments 154 | where the Paths are globally unique. 155 | """ 156 | root = Path(root).absolute() 157 | # aiiDA shards working directory paths like '/path/to/.aiida_run/ab/cd/1234-...' 158 | # so we add 3 subdirectories onto the root to get to the working directories. 159 | n = len(root.parts) + 3 160 | return {Path(*p.parts[:n]) for p in referenced_remote_files(root)} 161 | 162 | 163 | def existing_work_directories(root: str | Path) -> set[Path]: 164 | """Return all calcjob working directories under 'root' that exist on disk. 165 | 166 | Notes 167 | ----- 168 | As Paths are returned without any information about what computer 169 | the path refers to, this function is only useful in environments 170 | where the Paths are globally unique. 
171 | 172 | Examples 173 | -------- 174 | >>> work_directories("/path/to/my-user/.aiida_run") 175 | {PosixPath('/path/to/my-user/.aiida_run/00/24/ab.c2-899c-4106-8c8e-74638dbdd71c')} 176 | """ 177 | root = Path(root).absolute() 178 | # aiiDA shards working directory paths like '/path/to/.aiida_run/ab/cd/1234-...' 179 | # so we add glob 3 subdirectories onto the root to get to the working directories. 180 | return {Path(p) for p in root.glob("*/*/*")} 181 | 182 | 183 | def unreferenced_work_directories(root: str | Path) -> set[Path]: 184 | """Return all unreferenced calcjob working directories under 'root'. 185 | 186 | i.e. return all calcjob working directories that exist on disk, but are 187 | not referenced in the AiiDA database. 188 | 189 | Notes 190 | ----- 191 | As Paths are returned without any information about what computer 192 | the path refers to, this function is only useful in environments 193 | where the Paths are globally unique. 194 | 195 | Examples 196 | -------- 197 | >>> unreferenced_work_directories("/path/to/my-user/.aiida_run") 198 | {PosixPath('/path/to/my-user/.aiida_run/00/24/abc2-899c-4106-8c8e-74638dbdd71c')} 199 | """ 200 | root = Path(root).absolute() 201 | 202 | return existing_work_directories(root) - referenced_work_directories(root) 203 | 204 | 205 | def computer_work_directory(computer: str | aiida.orm.Computer) -> Path: 206 | """Return the work directory for 'computer'. 207 | 208 | Like 'computer.get_workdir()', except that '{username}' template 209 | parameters are replaced with actual usernames. 210 | 211 | Parameters 212 | ---------- 213 | computer 214 | A Computer instance, or a computer label. 215 | """ 216 | if not isinstance(computer, aiida.orm.Computer): 217 | computer = aiida.orm.load_computer(computer) 218 | 219 | with computer.get_transport() as t: 220 | return Path(computer.get_workdir().format(username=t.whoami())) 221 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/common/mapspec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
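#
# A quick usage sketch (illustrative only; the MapSpec docstrings below contain
# the authoritative examples):
#
#     spec = MapSpec.from_string("a[i, j], b[i, j], c[k] -> q[i, j, k]")
#     spec.shape({"a": (3, 4), "b": (3, 4), "c": (5,)})  # -> (3, 4, 5)
#     spec.output_key((3, 4, 5), 37)                     # -> (1, 3, 2)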
3 | 4 | 5 | from __future__ import annotations 6 | 7 | from dataclasses import dataclass 8 | import functools 9 | import re 10 | from typing import Dict, List, Optional, Tuple, Union 11 | 12 | from .array import _make_strides 13 | 14 | 15 | @dataclass(frozen=True) 16 | class ArraySpec: 17 | """Specification for a named array, with some axes indexed by named indices.""" 18 | 19 | name: str 20 | axes: Tuple[Optional[str]] 21 | 22 | def __post_init__(self): 23 | if not self.name.isidentifier(): 24 | raise ValueError( 25 | f"Array name '{self.name}' is not a valid Python identifier" 26 | ) 27 | for i in self.axes: 28 | if not (i is None or i.isidentifier()): 29 | raise ValueError(f"Index name '{i}' is not a valid Python identifier") 30 | 31 | def __str__(self) -> str: 32 | indices = [":" if x is None else x for x in self.axes] 33 | return f"{self.name}[{', '.join(indices)}]" 34 | 35 | @property 36 | def indices(self) -> Tuple[str]: 37 | """Return the names of the indices for this array spec.""" 38 | return tuple(x for x in self.axes if x is not None) 39 | 40 | @property 41 | def rank(self) -> int: 42 | """Return the rank of this array spec.""" 43 | return len(self.axes) 44 | 45 | def validate(self, shape: Tuple[int, ...]): 46 | """Raise an exception if 'shape' is not compatible with this array spec.""" 47 | if len(shape) != self.rank: 48 | raise ValueError( 49 | f"Expecting array of rank {self.rank}, but got array of shape {shape}" 50 | ) 51 | 52 | 53 | @dataclass(frozen=True) 54 | class MapSpec: 55 | """Specification for how to map input axes to output axes. 56 | 57 | Examples 58 | -------- 59 | >>> mapped = MapSpec.from_string("a[i, j], b[i, j], c[k] -> q[i, j, k]") 60 | >>> partial_reduction = MapSpec.from_string("a[i, :], b[:, k] -> q[i, k]") 61 | """ 62 | 63 | inputs: Tuple[ArraySpec] 64 | output: ArraySpec 65 | 66 | def __post_init__(self): 67 | if any(x is None for x in self.output.axes): 68 | raise ValueError("Output array must have all axes indexed (no ':').") 69 | 70 | output_indices = set(self.output.indices) 71 | input_indices = functools.reduce( 72 | set.union, (x.indices for x in self.inputs), set() 73 | ) 74 | 75 | if extra_indices := output_indices - input_indices: 76 | raise ValueError( 77 | "Output array has indices that do not appear " 78 | f"in the input: {extra_indices}" 79 | ) 80 | if unused_indices := input_indices - output_indices: 81 | raise ValueError( 82 | "Input array have indices that do not appear " 83 | f"in the output: {unused_indices}" 84 | ) 85 | 86 | @property 87 | def parameters(self) -> Tuple[str, ...]: 88 | """Return the parameter names of this mapspec.""" 89 | return tuple(x.name for x in self.inputs) 90 | 91 | @property 92 | def indices(self) -> Tuple[str, ...]: 93 | """Return the index names for this MapSpec.""" 94 | return self.output.indices 95 | 96 | def shape(self, shapes: Dict[str, Tuple[int, ...]]) -> Tuple[int, ...]: 97 | """Return the shape of the output of this MapSpec. 98 | 99 | Parameters 100 | ---------- 101 | shapes 102 | Shapes of the inputs, keyed by name. 103 | """ 104 | input_names = set(x.name for x in self.inputs) 105 | 106 | if extra_names := set(shapes.keys()) - input_names: 107 | raise ValueError( 108 | f"Got extra array {extra_names} that are not accepted by this map." 
109 | ) 110 | if missing_names := input_names - set(shapes.keys()): 111 | raise ValueError( 112 | f"Inputs expected by this map were not provided: {missing_names}" 113 | ) 114 | 115 | # Each individual array is of the appropriate rank 116 | for x in self.inputs: 117 | x.validate(shapes[x.name]) 118 | 119 | # Shapes match between array sharing a named index 120 | 121 | def get_dim(array, index): 122 | axis = array.axes.index(index) 123 | return shapes[array.name][axis] 124 | 125 | shape = [] 126 | for index in self.output.indices: 127 | relevant_arrays = [x for x in self.inputs if index in x.indices] 128 | dim, *rest = [get_dim(x, index) for x in relevant_arrays] 129 | if any(dim != x for x in rest): 130 | raise ValueError( 131 | f"Dimension mismatch for arrays {relevant_arrays} " 132 | f"along {index} axis." 133 | ) 134 | shape.append(dim) 135 | 136 | return tuple(shape) 137 | 138 | def output_key(self, shape: Tuple[int, ...], linear_index: int) -> Tuple[int, ...]: 139 | """Return a key used for indexing the output of this map. 140 | 141 | Parameters 142 | ---------- 143 | shape 144 | The shape of the map output. 145 | linear_index 146 | The index of the element for which to return the key. 147 | 148 | Examples 149 | -------- 150 | >>> spec = MapSpec.from_string("x[i, j], y[j, :, k] -> z[i, j, k]") 151 | >>> spec.output_key((5, 2, 3), 23) 152 | (3, 1, 2) 153 | """ 154 | if len(shape) != len(self.indices): 155 | raise ValueError( 156 | f"Expected a shape of length {len(self.indices)}, got {shape}" 157 | ) 158 | return tuple( 159 | (linear_index // stride) % dim 160 | for stride, dim in zip(_make_strides(shape), shape) 161 | ) 162 | 163 | def input_keys( 164 | self, 165 | shape: Tuple[int, ...], 166 | linear_index: int, 167 | ) -> Dict[str, Tuple[Union[slice, int]]]: 168 | """Return keys for indexing inputs of this map. 169 | 170 | Parameters 171 | ---------- 172 | shape 173 | The shape of the map output. 174 | linear_index 175 | The index of the element for which to return the keys. 
176 | 177 | Examples 178 | -------- 179 | >>> spec = MapSpec("x[i, j], y[j, :, k] -> z[i, j, k]") 180 | >>> spec.input_keys((5, 2, 3), 23) 181 | {'x': (3, 1), 'y': (1, slice(None, None, None), 2)} 182 | """ 183 | output_key = self.output_key(shape, linear_index) 184 | if len(output_key) != len(self.indices): 185 | raise ValueError( 186 | f"Expected a key of shape {len(self.indices)}, got {output_key}" 187 | ) 188 | ids = dict(zip(self.indices, output_key)) 189 | return { 190 | x.name: tuple(slice(None) if ax is None else ids[ax] for ax in x.axes) 191 | for x in self.inputs 192 | } 193 | 194 | def __str__(self) -> str: 195 | return f"{', '.join(map(str, self.inputs))} -> {self.output}" 196 | 197 | @classmethod 198 | def from_string(cls, expr): 199 | """Construct an MapSpec from a string.""" 200 | try: 201 | in_, out_ = expr.split("->") 202 | except ValueError: 203 | raise ValueError(f"Expected expression of form 'a -> b', but got '{expr}''") 204 | 205 | inputs = _parse_indexed_arrays(in_) 206 | outputs = _parse_indexed_arrays(out_) 207 | if len(outputs) != 1: 208 | raise ValueError(f"Expected a single output, but got {len(outputs)}") 209 | (output,) = outputs 210 | 211 | return cls(inputs, output) 212 | 213 | def to_string(self) -> str: 214 | """Return a faithful representation of a MapSpec as a string.""" 215 | return str(self) 216 | 217 | 218 | def _parse_index_string(index_string) -> List[Optional[str]]: 219 | indices = [idx.strip() for idx in index_string.split(",")] 220 | return [i if i != ":" else None for i in indices] 221 | 222 | 223 | def _parse_indexed_arrays(expr) -> List[ArraySpec]: 224 | array_pattern = r"(\w+?)\[(.+?)\]" 225 | return [ 226 | ArraySpec(name, _parse_index_string(indices)) 227 | for name, indices in re.findall(array_pattern, expr) 228 | ] 229 | -------------------------------------------------------------------------------- /examples/04-deleting-data.md: -------------------------------------------------------------------------------- 1 | # Deleting data 2 | 3 | 4 | This notebook provides guidance on how to delete data that you no longer need. 
5 | 6 | 7 | As usual we first import AiiDA and aiida_dynamic_workflows: 8 | 9 | ```python 10 | import aiida 11 | aiida.load_profile() 12 | 13 | aiida.__version__ 14 | ``` 15 | 16 | ```python 17 | import aiida_dynamic_workflows 18 | import aiida_dynamic_workflows.workflow 19 | import aiida_dynamic_workflows.report 20 | 21 | aiida_dynamic_workflows.control.ensure_daemon_restarted() 22 | aiida_dynamic_workflows.__version__ 23 | ``` 24 | 25 | Next we define a utility function for watching processes as they evolve: 26 | 27 | ```python 28 | import datetime 29 | import time 30 | 31 | import ipywidgets as widgets 32 | 33 | def wait(p, timeout=2): 34 | out = widgets.Output() 35 | while not p.is_terminated: 36 | out.clear_output(wait=True) 37 | print(f"last updated @ {datetime.datetime.now()}") 38 | print(aiida_dynamic_workflows.report.progress(p)) 39 | time.sleep(timeout) 40 | out.clear_output(wait=True) 41 | print(f"Finished @ {p.mtime}") 42 | print(aiida_dynamic_workflows.report.progress(p)) 43 | ``` 44 | 45 | Now we create a small workflow, for illustrative purposes: 46 | 47 | ```python 48 | @aiida_dynamic_workflows.step(returns="c") 49 | def add(a, b): 50 | return a + b 51 | 52 | @aiida_dynamic_workflows.step(returns="z") 53 | def mul(c, y): 54 | return c * y 55 | 56 | 57 | workflow = ( 58 | aiida_dynamic_workflows.workflow 59 | .new_workflow("test") 60 | .then(add) 61 | .then(mul) 62 | .returning("c", "z") 63 | ) 64 | 65 | local = aiida_dynamic_workflows.engine.execution_environment("py39", "localhost") 66 | ``` 67 | 68 | ```python 69 | from functools import partial 70 | import random 71 | 72 | rand = partial(random.randint, 0, 1000) 73 | 74 | flow = aiida_dynamic_workflows.workflow.build(workflow.on(local), a=rand(), b=rand(), y=rand()) 75 | ``` 76 | 77 | And we run it: 78 | 79 | ```python 80 | run = aiida.engine.submit(flow) 81 | wait(run) 82 | ``` 83 | 84 | ## Deleting nodes from the AiiDA database 85 | 86 | 87 | Let's say that that you wish to delete the two runs from the database. 88 | 89 | AiiDA provides the following functionality of deleting the nodes from the database: 90 | 91 | ```python 92 | marked_pks, are_deleted = aiida.tools.delete_nodes([run.id]) 93 | ``` 94 | 95 | This function returns two things: 96 | 1. The first is a set containing the IDs of the nodes that were deleted (or not) 97 | 2. The second is a boolean value that is True if the nodes were actually deleted 98 | 99 | 100 | The first thing to notice is that `marked_pks` contains many more nodes than the ones we explicitly marked for deletion: 101 | 102 | ```python 103 | len(marked_pks) 104 | ``` 105 | 106 | This is because AiiDA tries to maintain the integrity of the provenance graph. 107 | 108 | If we delete the Workflow nodes then the calculation nodes that were created by the workflow, as well as all the produced data nodes, must also be deleted. 109 | 110 | 111 | We see that the above invocation did not actually delete anything: 112 | 113 | ```python 114 | are_deleted 115 | ``` 116 | 117 | This is a safety feature; to have `delete_nodes` actually delete, we must pass `dry_run=False`: 118 | 119 | ```python 120 | marked_pks, are_deleted = aiida.tools.delete_nodes([run.id], dry_run=False) 121 | ``` 122 | 123 | ```python 124 | are_deleted 125 | ``` 126 | 127 | ## Deleting the remote data 128 | 129 | 130 | Deleting the nodes from the AiiDA database is a good first step, however a typical workflow has all the intermediate data stored as `PyRemoteData` and `PyRemoteArray`. 
This means that the actual data is stored in a file on some remote filesystem (cluster NFS); only a _reference_ to the file is stored in the AiiDA database. 131 | 132 | Once we have deleted the nodes from the database we also need to ensure we remove the data from the remote filesystem, to avoid filling up our disk with unwanted data. 133 | 134 | aiida_dynamic_workflows provides the following tools for achieving this. 135 | 136 | 137 | #### `aiida_dynamic_workflows.query.unreferenced_work_directories` 138 | 139 | 140 | This function returns all CalcJob working directories that are not referenced by any RemoteData in _any profile_ in the AiiDA database. 141 | 142 | It expects a path that will be used as a root directory for the search (i.e. only paths under this root will be returned). 143 | 144 | To help with this there is `computer_work_directory`, which returns the CalcJob working directory root for the named computer: 145 | 146 | ```python 147 | from aiida_dynamic_workflows.query import unreferenced_work_directories, computer_work_directory 148 | ``` 149 | 150 | ```python 151 | unreferenced_paths = unreferenced_work_directories(computer_work_directory("localhost")) 152 | ``` 153 | 154 | We see that there are a few paths that are unreferenced by the AiiDA database: 155 | 156 | ```python 157 | unreferenced_paths 158 | ``` 159 | 160 | As these paths are not referenced by any RemoteData in the AiiDA database, they may safely be removed without invalidating the AiiDA provenance graph. 161 | 162 | As these are just plain old paths, they may be removed by any method you wish (e.g. export them to a file `to-remove.txt` and run `cat to-remove.txt | parallel rm -r {}`). 163 | However, aiida_dynamic_workflows has a convenient tool for exactly this: 164 | 165 | ```python 166 | aiida_dynamic_workflows.utils.parallel_rmtree(unreferenced_paths) 167 | ``` 168 | 169 | After removing these paths, we should see that there are no more unreferenced work directories: 170 | 171 | ```python 172 | unreferenced_work_directories(computer_work_directory("localhost")) 173 | ``` 174 | 175 | ## Preserving cached data 176 | 177 | 178 | Let's run the calculation again, twice this time: 179 | 180 | ```python 181 | original_run = aiida.engine.submit(flow) 182 | wait(original_run) 183 | ``` 184 | 185 | ```python 186 | cached_run = aiida.engine.submit(flow) 187 | wait(cached_run) 188 | ``` 189 | 190 | We see that the calculations in the second run are created from the calculations in the first run: 191 | 192 | ```python 193 | for c in original_run.called: 194 | print(c.inputs.func.name, c.uuid) 195 | ``` 196 | 197 | Indeed, we see that the data nodes for the two runs point to the same location on the remote storage: 198 | 199 | ```python 200 | original_data_paths = {k: v.get_remote_path() for k, v in original_run.outputs.return_values.items()} 201 | print(original_data_paths) 202 | ``` 203 | 204 | ```python 205 | cached_data_paths = {k: v.get_remote_path() for k, v in cached_run.outputs.return_values.items()} 206 | print(cached_data_paths) 207 | ``` 208 | 209 | ```python 210 | assert original_data_paths == cached_data_paths 211 | ``` 212 | 213 | If we delete only the original run, we therefore need to keep the remote data around, as it is still referenced by the cached run.
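As an aside, the `remote_files` helper in `aiida_dynamic_workflows.query` lists every remote path that the database still references. The snippet below is only an illustrative sketch (it assumes that `get_remote_path()` returns the same path that is stored in each node's `remote_path` attribute), but it shows how we could cross-check that the shared data is indeed still referenced:

```python
from pathlib import Path

from aiida_dynamic_workflows.query import computer_work_directory, remote_files

# All remote paths still referenced by the currently loaded profile,
# restricted to the localhost work directory.
referenced = remote_files(root=computer_work_directory("localhost"))

# The output files shared by the two runs should be among them.
assert all(Path(p) in referenced for p in cached_data_paths.values())
```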
214 | 215 | **Let's verify that this is what happens.** 216 | 217 | 218 | First let's check that there is not any unreferenced data already: 219 | 220 | ```python 221 | assert not unreferenced_work_directories(computer_work_directory("localhost")) 222 | ``` 223 | 224 | and let's check that removing the original run is not going to remove any nodes associated with the cached run: 225 | 226 | ```python 227 | marked_pks, are_deleted = aiida.tools.delete_nodes([original_run.id]) 228 | for n in marked_pks: 229 | print(repr(aiida.orm.load_node(n))) 230 | ``` 231 | 232 | ```python 233 | aiida_dynamic_workflows.report.graph(cached_run, as_png=True) 234 | ``` 235 | 236 | We indeed see that there is no overlap; only the nodes from `original_run` are going to be deleted. 237 | 238 | 239 | Let's actually delete them: 240 | 241 | ```python 242 | marked_pks, are_deleted = aiida.tools.delete_nodes([original_run.id], dry_run=False) 243 | assert are_deleted 244 | ``` 245 | 246 | Let's now check that, indeed, the data is still referenced: 247 | 248 | ```python 249 | assert not unreferenced_work_directories(computer_work_directory("localhost")) 250 | ``` 251 | 252 | Success! 253 | 254 | 255 | If we now delete the cached run: 256 | 257 | ```python 258 | _, are_deleted = aiida.tools.delete_nodes([cached_run.id], dry_run=False) 259 | assert are_deleted 260 | ``` 261 | 262 | We should see that the data is now unreferenced: 263 | 264 | ```python 265 | unrefd = unreferenced_work_directories(computer_work_directory("localhost")) 266 | print(unrefd) 267 | assert unrefd 268 | assert {str(x) for x in unrefd} == set(cached_data_paths.values()) 269 | ``` 270 | 271 | And so we can safely delete them: 272 | 273 | ```python 274 | aiida_dynamic_workflows.utils.parallel_rmtree(unrefd) 275 | ``` 276 | -------------------------------------------------------------------------------- /examples/01-calculations.md: -------------------------------------------------------------------------------- 1 | # Running individual calculations with aiida-dynamic-workflows 2 | 3 | 4 | This notebook shows how to define and run individual calculations with aiida-dynamic-workflows, and how to _manually chain the results_ from one calculation into the next one. Chaining individual calculations together in a _workflow_ will be shown in the next notebook. 5 | 6 | 7 | ### This example assumes you already have Aiida set up, as well as the relevant codes/computers 8 | 9 | 10 | If that's not your case, check out the `example_computer_setup` directory. 11 | 12 | 13 | ### The imports 14 | 15 | 16 | First things first we must import `aiida` and call `aiida.load_profile`. 17 | 18 | This loads the default Aiida profile. Each Aiida profile has a separate database for storing calculations and data, 19 | as well as separate daemons for submitting calculations. 20 | 21 | ```python 22 | import aiida 23 | 24 | aiida.load_profile() 25 | aiida.__version__ 26 | ``` 27 | 28 | Next we must import the plugin. 29 | 30 | Additionally we call `ensure_daemon_restarted()` to ensure that the Aiida daemon has loaded the latest version of the plugin. 31 | Failing to restart the daemon when aiida-dynamic-workflows is updated can give strange results, as the environment in the notebook and the environment on the daemon will differ. After a restart the daemon will continue processing any running calculations (so nothing will be lost). 
32 | 33 | ```python 34 | import aiida_dynamic_workflows as flows 35 | 36 | flows.control.ensure_daemon_restarted() 37 | flows.__version__ 38 | ``` 39 | 40 | # First define the execution environment 41 | 42 | 43 | We create an execution environment that uses the Conda environment `py39` on `my-cluster`, and will submit calculations to the `some-queue` queue. 44 | 45 | ```python 46 | cluster_env = flows.engine.execution_environment( 47 | "py39", # conda environment 48 | "my-cluster", # computer name 49 | queue=("some-queue", 24), # queue and num. cores per machine 50 | ) 51 | ``` 52 | 53 | We can also create an execution environment that uses the Conda environment on _this_ machine: 54 | 55 | ```python 56 | local_env = flows.engine.execution_environment("py39", "localhost") 57 | ``` 58 | 59 | Let's use the cluster execution environment going forward. 60 | 61 | ```python 62 | env = cluster_env 63 | ``` 64 | 65 | ## Then define some functions to run 66 | 67 | ```python 68 | @flows.step(returns="x_plus_y") 69 | def add(x: int, y: int): 70 | return x + y 71 | ``` 72 | 73 | ```python 74 | @flows.step(returns="z") 75 | def multiply(x: int, y: int) -> int: 76 | return x * y 77 | ``` 78 | 79 | ### Can be used as ordinary Python functions 80 | 81 | ```python 82 | add(1, 2) 83 | ``` 84 | 85 | ```python 86 | multiply(1, 2) 87 | ``` 88 | 89 | ```python 90 | multiply(add(3, 4), 5) 91 | ``` 92 | 93 | ### But they are really objects in the Aiida data store 94 | 95 | ```python 96 | add 97 | ``` 98 | 99 | ## We can submit them using Aiida 100 | 101 | 102 | We first build the calculation: 103 | 104 | ```python 105 | z = flows.engine.apply(add, x=1, y=2) 106 | z 107 | ``` 108 | 109 | We see that `engine.apply` produces a kind of specification for what to run. 110 | 111 | This is not yet enough to be able to run the thing: we need to specify _where_ to run it. 112 | 113 | We do this with the `on` method, which expects an execution environment: 114 | 115 | ```python 116 | z.on(env) 117 | ``` 118 | 119 | Note that the specification returned from the `on` method now contains a `queue_name`, and a `code` (which includes a reference to the cluster to run on). 120 | 121 | 122 | Finally we will actually run this specification. 123 | 124 | Even though the notebook is blocked, execution of `add` is actually happening _on the cluster_. 125 | 126 | ```python 127 | r = aiida.engine.run(z.on(env)) 128 | ``` 129 | 130 | If the execution of the cell above is hanging for too long, you may want to drop to the command line and inspect the running processes, e.g. using via `verdi process list`. 131 | The (verbose) daemon logs should be showing "copying file/folder" + Slurm-related stuff. 132 | 133 | If you're having trouble with remote execution, feel free to continue through the rest of the tutorial on your local computer by setting `env=local_env`. 134 | 135 | ```python 136 | %%time 137 | r["return_values"]["x_plus_y"].fetch_value() 138 | ``` 139 | 140 | This is good for debugging, but typically you don't want to block the notebook. 
141 | 142 | Instead of `run` you can use `submit` to get a daemon worker to do the waiting for you: 143 | 144 | ```python 145 | r_submitted = aiida.engine.submit(z.on(env)) 146 | ``` 147 | 148 | ```python 149 | r_submitted 150 | ``` 151 | 152 | ```python 153 | print(flows.report.progress(r_submitted)) 154 | flows.report.graph(r_submitted) 155 | ``` 156 | 157 | Only a _reference to a file on the cluster_ is returned: 158 | 159 | ```python 160 | remote_value = r_submitted.outputs.return_values.x_plus_y 161 | remote_value 162 | ``` 163 | 164 | ```python 165 | remote_value.pickle_path 166 | ``` 167 | 168 | ```python 169 | %%time 170 | remote_value.fetch_value() 171 | ``` 172 | 173 | ## We can pass the output `PyRemoteData` as an _input_ to the next calculation 174 | 175 | ```python 176 | r_pass_as_remote_value = aiida.engine.run( 177 | flows.engine 178 | .apply(multiply, x=remote_value, y=2) 179 | .on(env) 180 | ) 181 | ``` 182 | 183 | ```python 184 | actual_return_value = r_pass_as_remote_value["return_values"]["z"] 185 | ``` 186 | 187 | ```python 188 | %%time 189 | actual_return_value.fetch_value() 190 | ``` 191 | 192 | ## We can also do maps, which will make use of Slurm Job arrays 193 | 194 | ```python 195 | import numpy as np 196 | 197 | xs = np.arange(100).reshape(10, 10) 198 | ys = np.arange(100, 200).reshape(10, 5, 2) 199 | ``` 200 | 201 | ```python 202 | z = ( 203 | flows.engine 204 | .map_(add, "x[i, j], y[j, k, l] -> z[i, j, k, l]") 205 | .on(env, max_concurrent_machines=2) 206 | ) 207 | ``` 208 | 209 | ```python 210 | %%time 211 | r_map = aiida.engine.submit(z.finalize(x=xs, y=ys)) 212 | ``` 213 | 214 | ```python 215 | print(flows.report.progress(r_map)) 216 | flows.report.graph(r_map) 217 | ``` 218 | 219 | ```python 220 | remote_mapped_values = r_map.outputs.return_values.x_plus_y 221 | remote_mapped_values.shape 222 | ``` 223 | 224 | Each element in the `map` is in its own Slurm job (in a single job array), _and they all write to separate files_. 225 | 226 | ```python 227 | %%time 228 | a = remote_mapped_values.fetch_value() 229 | a 230 | ``` 231 | 232 | `.fetch_value()` uses the default Aiida transport, and so is quite inefficient for loading many files (as in this example) 233 | 234 | Passing `local_files=True` is useful when the Aiida working directory on `my-cluster` is actually mounted locally on the machine where this notebook is running. 235 | 236 | ```python 237 | remote_mapped_values.get_remote_path() 238 | ``` 239 | 240 | ```python 241 | %%time 242 | a = remote_mapped_values.fetch_value(local_files=True) 243 | a 244 | ``` 245 | 246 | The loading operation is, consequently, several times faster. 247 | 248 | 249 | ## We can use the output of _that_ map as the input to another, of course! 
250 | 251 | ```python 252 | chained_map = ( 253 | flows.engine 254 | .map_( 255 | multiply, 256 | "x[i, j, k, l] -> z[i, j, k, l]", # Now we only map over 'x'; 'y' is treated single value 257 | max_concurrent_machines=1, 258 | ).on(env) 259 | .finalize(x=remote_mapped_values, y=5) 260 | ) 261 | ``` 262 | 263 | ```python 264 | chained_map_job = aiida.engine.submit(chained_map) 265 | ``` 266 | 267 | ```python 268 | print(flows.report.progress(chained_map_job)) 269 | flows.report.graph(chained_map_job) 270 | ``` 271 | 272 | ```python 273 | rv = chained_map_job.outputs.return_values.z 274 | print(rv.shape) 275 | %time rv.fetch_value(local_files=True) 276 | ``` 277 | 278 | ## And then perform a reduction 279 | 280 | ```python 281 | @flows.step 282 | def reduce(xs: "FileBasedObjectArray"): 283 | return np.sum(xs.to_array()) 284 | ``` 285 | 286 | ```python 287 | r = aiida.engine.submit(flows.engine.apply(reduce, xs=rv).on(env)) 288 | ``` 289 | 290 | ```python 291 | print(flows.report.progress(r)) 292 | flows.report.graph(r) 293 | ``` 294 | 295 | ```python 296 | r.outputs.return_values._return_value.fetch_value() 297 | ``` 298 | 299 | # Defining the resource requirements for functions 300 | 301 | 302 | You can specify that functions need a certain amount of resources to run by passing a `resources` dictionary to `step`. 303 | 304 | 305 | Currently only `memory` and `cores` may be specified; these are passed to Slurm using the `--mem` and `--cpus-per-task` flags. 306 | 307 | For example, the following function declares that it requires 6 cores to run, and a total of `25GB` of memory (for the whole thing, not per core). 308 | 309 | 310 | A single instance of the function will run on this allocation, so we may use whatever method we wish to distribute work over the cores. In this example we are using `loky` to perform a simple map-reduce, but you could also use, e.g. an OpenMP-enabled BLAS to distribute a matrix computation over the cores. 311 | 312 | ```python 313 | import loky 314 | import time 315 | 316 | @flows.step(returns=("z", "elapsed_time"), resources=dict(memory="25GB", cores=6)) 317 | def f_on_several_cores(xs: list) -> list: 318 | 319 | def go(x): 320 | time.sleep(5) 321 | return x ** 2 322 | 323 | with loky.ProcessPoolExecutor(6) as ex: 324 | start = time.time() 325 | r = sum(ex.map(go, xs)) 326 | return r, f"execution time: {time.time() - start:.2f}s" 327 | ``` 328 | 329 | ```python 330 | r = aiida.engine.submit( 331 | flows.engine.apply( 332 | f_on_several_cores, xs=list(range(18)) 333 | ).on(cluster_env) 334 | ) 335 | ``` 336 | 337 | ```python 338 | print(flows.report.progress(r)) 339 | flows.report.graph(r) 340 | ``` 341 | 342 | ```python 343 | r.outputs.return_values.z.fetch_value() 344 | ``` 345 | 346 | ```python 347 | r.outputs.return_values.elapsed_time.fetch_value() 348 | ``` 349 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/report.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from collections import Counter 6 | import textwrap 7 | from typing import Union 8 | 9 | from IPython.display import Image 10 | import aiida.cmdline.utils.common as cmd 11 | from aiida.cmdline.utils.query.formatting import format_relative_time 12 | import aiida.orm 13 | from aiida.tools.visualization import Graph 14 | import graphviz 15 | 16 | from . 
import query 17 | from .calculations import PyCalcJob, PyMapJob, num_mapjob_tasks 18 | from .data import PyRemoteArray, PyRemoteData 19 | from .utils import render_png 20 | from .workchains import RestartedPyCalcJob, RestartedPyMapJob 21 | from .workflow import PyWorkChain 22 | 23 | __all__ = [ 24 | "log", 25 | "graph", 26 | "progress", 27 | "running_workflows", 28 | "recent_workflows", 29 | ] 30 | 31 | 32 | ProcessType = Union[aiida.orm.ProcessNode, int, str] 33 | 34 | 35 | def log(proc: ProcessType) -> str: 36 | """Return the output of 'verdi process report' for the given process. 37 | 38 | Parameters 39 | ---------- 40 | proc 41 | The Aiida node for the process, or a numeric ID, or a UUID. 42 | """ 43 | proc = _ensure_process_node(proc) 44 | if isinstance(proc, aiida.orm.CalcJobNode): 45 | return cmd.get_calcjob_report(proc) 46 | elif isinstance(proc, aiida.orm.WorkChainNode): 47 | return cmd.get_workchain_report(proc, levelname="REPORT") 48 | elif isinstance(proc, (aiida.orm.CalcFunctionNode, aiida.orm.WorkFunctionNode)): 49 | return cmd.get_process_function_report(proc) 50 | else: 51 | raise TypeError(f"Cannot get report for processes of type '{type(proc)}'") 52 | 53 | 54 | def graph( 55 | proc: ProcessType, size=(20, 20), as_png=False 56 | ) -> Union[graphviz.Digraph, Image]: 57 | """Return a graph visualization of a calculation or workflow. 58 | 59 | Parameters 60 | ---------- 61 | proc 62 | The Aiida node for the process, or a numeric ID, or a UUID. 63 | """ 64 | proc = _ensure_process_node(proc) 65 | graph = Graph( 66 | graph_attr={"size": ",".join(map(str, size)), "rankdir": "LR"}, 67 | node_sublabel_fn=_node_sublabel, 68 | ) 69 | graph.recurse_descendants(proc, include_process_inputs=True) 70 | if as_png: 71 | return render_png(graph.graphviz) 72 | return graph.graphviz 73 | 74 | 75 | def progress(proc: ProcessType) -> str: 76 | """Return a progress report of the given calculation or workflow. 77 | 78 | Parameters 79 | ---------- 80 | proc 81 | The Aiida node for the process, or a numeric ID, or a UUID. 82 | """ 83 | proc = _ensure_process_node(proc) 84 | if isinstance(proc, aiida.orm.CalcJobNode): 85 | return _calcjob_progress(proc) 86 | elif isinstance(proc, aiida.orm.WorkChainNode): 87 | if issubclass(proc.process_class, PyWorkChain): 88 | return _workflow_progress(proc) 89 | elif issubclass(proc.process_class, (RestartedPyCalcJob, RestartedPyMapJob)): 90 | return _restarted_calcjob_progress(proc) 91 | elif isinstance(proc, (aiida.orm.CalcFunctionNode, aiida.orm.WorkFunctionNode)): 92 | return _function_progress(proc) 93 | else: 94 | raise TypeError( 95 | "Cannot get a progress report for processes of type '{type(proc)}'" 96 | ) 97 | 98 | 99 | def running_workflows() -> str: 100 | """Return a progress report of the running workflows.""" 101 | r = _flatten(query.running_workflows().iterall()) 102 | return "\n\n".join(map(_workflow_progress, r)) 103 | 104 | 105 | def recent_workflows(days: int = 0, hours: int = 0, minutes: int = 0) -> str: 106 | """Return a progress report of all workflows that were started recently. 107 | 108 | This also includes workflows that are already complete. 109 | 110 | Parameters 111 | ---------- 112 | days, hours, minutes 113 | Any workflows started more recently than this many days/minutes/hours 114 | will be included in the result of the query. 
115 | """ 116 | r = _flatten(query.recent_workflows(**locals()).iterall()) 117 | return "\n\n".join(map(_workflow_progress, r)) 118 | 119 | 120 | def _flatten(xs): 121 | for ys in xs: 122 | yield from ys 123 | 124 | 125 | def _workflow_progress(p: aiida.orm.WorkChainNode) -> str: 126 | assert issubclass(p.process_class, PyWorkChain) 127 | lines = [ 128 | # This is a _single_ output line 129 | f"{p.label or ''} (pk: {p.id}) " 130 | f"[{_process_status(p)}, created {format_relative_time(p.ctime)}]" 131 | ] 132 | for c in p.called: 133 | lines.append(textwrap.indent(progress(c), " ")) 134 | 135 | return "\n".join(lines) 136 | 137 | 138 | def _restarted_calcjob_progress(p: aiida.orm.WorkChainNode) -> str: 139 | assert issubclass(p.process_class, (RestartedPyCalcJob, RestartedPyMapJob)) 140 | lines = [ 141 | f"with_restarts({p.get_option('max_restarts')}) " 142 | f"(pk: {p.id}) [{_process_status(p)}]" 143 | ] 144 | for i, c in enumerate(p.called, 1): 145 | if c.label == p.label: 146 | # The launched process is the payload that we are running with restarts 147 | s = f"attempt {i}: {progress(c)}" 148 | else: 149 | # Some post-processing (for RestartedPyMapJob) 150 | s = progress(c) 151 | lines.append(textwrap.indent(s, " ")) 152 | 153 | return "\n".join(lines) 154 | 155 | 156 | def _calcjob_progress(p: aiida.orm.CalcJobNode) -> str: 157 | assert issubclass(p.process_class, PyCalcJob) 158 | s = p.get_state() or p.process_state 159 | 160 | # Show more detailed info while we're waiting for the Slurm job. 161 | if s == aiida.common.CalcJobState.WITHSCHEDULER: 162 | sections = [ 163 | f"created {format_relative_time(p.ctime)}", 164 | ] 165 | if p.get_scheduler_state(): 166 | sections.append(f"{p.get_scheduler_state().value} job {p.get_job_id()}") 167 | 168 | # Show total number of tasks and states of remaining tasks in mapjobs. 169 | job_states = _slurm_job_states(p) 170 | if job_states: 171 | if issubclass(p.process_class, PyMapJob): 172 | task_counts = Counter(job_states) 173 | task_states = ", ".join(f"{k}: {v}" for k, v in task_counts.items()) 174 | task_summary = f"{sum(task_counts.values())} / {num_mapjob_tasks(p)}" 175 | sections.extend( 176 | [ 177 | f"remaining tasks ({task_summary})", 178 | f"task states: {task_states}", 179 | ] 180 | ) 181 | else: 182 | sections.append(f"job state: {job_states[0]}") 183 | msg = ", ".join(sections) 184 | else: 185 | msg = _process_status(p) 186 | 187 | return f"{p.label} (pk: {p.id}) [{msg}]" 188 | 189 | 190 | def _process_status(p: aiida.orm.ProcessNode) -> str: 191 | 192 | generic_failure = ( 193 | f"failed, run 'aiida_dynamic_workflows.report.log({p.id})' " 194 | "for more information" 195 | ) 196 | 197 | if p.is_finished and not p.is_finished_ok: 198 | # 's.value' is "finished", even if the process finished with a non-zero exit 199 | # code. We prefer the more informative 'failed' + next steps. 200 | msg = generic_failure 201 | elif p.is_killed: 202 | # Process was killed: 'process_status' includes the reason why. 
203 | msg = f"killed, {p.process_status}" 204 | elif p.is_excepted: 205 | # Process failed, and the error occured in the Aiida layers 206 | msg = generic_failure 207 | elif p.is_created_from_cache: 208 | msg = ( 209 | f"{p.process_state.value} " 210 | f"(created from cache, uuid: {p.get_cache_source()})" 211 | ) 212 | elif p.is_finished_ok: 213 | msg = "success" 214 | else: 215 | try: 216 | # Calcjobs have 'get_state', which gives more fine-grained information 217 | msg = p.get_state().value 218 | except AttributeError: 219 | msg = p.process_state.value 220 | 221 | return msg 222 | 223 | 224 | def _function_progress( 225 | p: Union[aiida.orm.CalcFunctionNode, aiida.orm.WorkFunctionNode] 226 | ) -> str: 227 | return f"{p.label} (pk: {p.id}) [{p.process_state.value}]" 228 | 229 | 230 | def _slurm_job_states(process): 231 | info = process.get_last_job_info() 232 | if not info: 233 | return [] 234 | else: 235 | return [x[1] for x in info.raw_data] 236 | 237 | 238 | def _ensure_process_node( 239 | node_or_id: Union[aiida.orm.ProcessNode, int, str] 240 | ) -> aiida.orm.ProcessNode: 241 | if isinstance(node_or_id, aiida.orm.ProcessNode): 242 | return node_or_id 243 | else: 244 | return aiida.orm.load_node(node_or_id) 245 | 246 | 247 | def _node_sublabel(node): 248 | if isinstance(node, aiida.orm.CalcJobNode) and issubclass( 249 | node.process_class, PyCalcJob 250 | ): 251 | labels = [f"function: {node.inputs.func.name}"] 252 | if state := node.get_state(): 253 | labels.append(f"State: {state.value}") 254 | if (job_id := node.get_job_id()) and (state := node.get_scheduler_state()): 255 | labels.append(f"Job: {job_id} ({state.value})") 256 | if node.exit_status is not None: 257 | labels.append(f"Exit Code: {node.exit_status}") 258 | if node.exception: 259 | labels.append("excepted") 260 | return "\n".join(labels) 261 | elif isinstance(node, (PyRemoteData, PyRemoteArray)): 262 | try: 263 | create_link = node.get_incoming().one() 264 | except Exception: 265 | return aiida.tools.visualization.graph.default_node_sublabels(node) 266 | if create_link.link_label.startswith("return_values"): 267 | return create_link.link_label.split("__")[1] 268 | else: 269 | return create_link.link_label 270 | else: 271 | return aiida.tools.visualization.graph.default_node_sublabels(node) 272 | -------------------------------------------------------------------------------- /examples/02-workflows.md: -------------------------------------------------------------------------------- 1 | # Dynamic workflows 2 | 3 | This notebook shows how to compose several steps into a workflow and launch them all at once. 4 | 5 | Contrast this to [01-calculations.md](./01-calculations.md), where we waited until calculations were finished before passing their data to the next calculation. 6 | 7 | 8 | First we do the usual imports and define an execution environment 9 | 10 | ```python 11 | from dataclasses import dataclass 12 | import time 13 | 14 | import numpy as np 15 | import toolz 16 | ``` 17 | 18 | ```python 19 | import aiida 20 | aiida.load_profile() 21 | 22 | aiida.__version__ 23 | ``` 24 | 25 | ```python 26 | import aiida_dynamic_workflows as flows 27 | 28 | flows.control.ensure_daemon_restarted() 29 | flows.__version__ 30 | ``` 31 | 32 | ```python 33 | cluster_env = flows.engine.execution_environment( 34 | "py39", # conda environment 35 | "my-cluster", # computer name 36 | queue=("some-queue", 24), # queue and num. 
cores per machine 37 | ) 38 | ``` 39 | 40 | ## Step definitions 41 | 42 | 43 | Next we define a bunch of individual "steps" from Python functions. 44 | 45 | as we saw in [01-calculations.md](./01-calculations.md), this will save the pickled function in the Aiida database 46 | 47 | ```python 48 | from aiida_dynamic_workflows import step 49 | ``` 50 | 51 | ```python 52 | @dataclass(frozen=True) 53 | class Geometry: 54 | x : float 55 | y : float 56 | 57 | 58 | @dataclass(frozen=True) 59 | class Mesh: 60 | geometry : Geometry 61 | mesh_size : float 62 | 63 | @dataclass(frozen=True) 64 | class Materials: 65 | geometry: Geometry 66 | materials: list[str] 67 | 68 | @dataclass(frozen=True) 69 | class Electrostatics: 70 | mesh: Mesh 71 | materials: Materials 72 | voltages: list[float] 73 | ``` 74 | 75 | ```python 76 | @step(returns="geo") 77 | def make_geometry(x: float, y: float) -> Geometry: 78 | time.sleep(5) # do some work 79 | return Geometry(x, y) 80 | 81 | 82 | @step(returns=("mesh", "coarse_mesh")) 83 | def make_mesh( 84 | geo: Geometry, 85 | mesh_size: float, 86 | coarse_mesh_size: float, 87 | ) -> tuple[Mesh, Mesh]: 88 | time.sleep(5) # do some work 89 | return Mesh(geo, mesh_size), Mesh(geo, coarse_mesh_size) 90 | 91 | 92 | @step(returns="materials") 93 | def make_materials(geo: Geometry) -> Materials: 94 | time.sleep(5) # do some work 95 | return Materials(geo, ["a", "b", "c"]) 96 | 97 | 98 | @step(returns="electrostatics") 99 | def run_electrostatics( 100 | mesh: Mesh, materials: Materials, V_left: float, V_right: float 101 | ) -> Electrostatics: 102 | time.sleep(10) # do some work 103 | return Electrostatics(mesh, materials, [V_left, V_right]) 104 | 105 | @step(returns="charge") 106 | def get_charge(electrostatics: Electrostatics) -> float: 107 | # obviously not actually the charge; but we should return _some_ number that 108 | # is "derived" from the electrostatics. 109 | return sum(electrostatics.voltages) 110 | ``` 111 | 112 | This final step is a little special. 113 | 114 | As we shall see in a couple cell's time this step will be used on the output `get_charge`, which will be "mapped" over its inputs. 115 | 116 | As a consequence, `average_charge` will be passed a reference to an "array" of values, where each value in the array is actually stored in a separate file on disk, hence the strance type signature. 117 | 118 | ```python 119 | @step(returns="average_charge") 120 | def average_charge(charge: "FileBasedObjectArray") -> float: 121 | # .to_array() is a bit dumb; it loads in _all_ the data at once, but 122 | # this is the simplest way, and in this example the data is not so large. 
123 | return np.mean(charge.to_array()) 124 | ``` 125 | 126 | ## Composing workflows 127 | 128 | 129 | Here we compose up 2 "workflows": `model_flow` and `electrostatics_flow`: 130 | 131 | ```python 132 | from aiida_dynamic_workflows.workflow import first, concurrently, map_, new_workflow 133 | 134 | model_flow = ( 135 | new_workflow(name="model_flow") 136 | .then(make_geometry) 137 | .then( 138 | # These 2 steps will be done at the same time 139 | concurrently(make_mesh, make_materials) 140 | ) 141 | ) 142 | 143 | electrostatics_flow = ( 144 | new_workflow(name="electrostatics_flow") 145 | .then( 146 | map_( 147 | run_electrostatics, 148 | "V_left[a], V_right[b] -> electrostatics[a, b]", 149 | ) 150 | ).then( 151 | map_( 152 | get_charge, 153 | "electrostatics[i, j] -> charge[i, j]" 154 | ) 155 | ).then(average_charge) 156 | ) 157 | ``` 158 | 159 | We see that `electrostatics_flow` makes use of the `map_` function, which takes the step to execute, as well as a specification for how to map the inputs to the outputs. 160 | 161 | In the above example we see that `electrostatics_flow` expects `V_left` and `V_right` to be 1D arrays, and it will `run_electrostatics` for each pair of values in these two arrays (an "outer product"), producing a 2D array of values. 162 | 163 | The next step takes each of the elements in this 2D array and runs `get_charge` on them. 164 | 165 | The final step of `electrostatics_flow` (`average_charge`) takes the _whole 2D `charge` array_ and produces a single value. 166 | 167 | 168 | We can inspect what parameters and what outputs are produced by each flow: 169 | 170 | ```python 171 | model_flow.parameters, model_flow.all_outputs 172 | ``` 173 | 174 | ```python 175 | electrostatics_flow.parameters, electrostatics_flow.all_outputs 176 | ``` 177 | 178 | Note that the `mat_data` and `mesh` outputs from `model_flow` "line up" with parameters of the same name of `electrostatics_flow`. 179 | 180 | This enables us to `join` the two flows together: 181 | 182 | ```python 183 | total_flow = ( 184 | new_workflow(name="total_electrostatics") 185 | .join(model_flow) 186 | .join(electrostatics_flow) 187 | .returning("electrostatics", average_charge="avg_electrostatic_charge") 188 | ) 189 | ``` 190 | 191 | Invoking `returning` allows us to declare which of all the outputs should be considered "return values" of the workflow: 192 | 193 | ```python 194 | total_flow.returns 195 | ``` 196 | 197 | This is purely a convenience; all outputs produced by `total_flow` will be inspectable. 198 | 199 | 200 | We can finally visualize the workflow with `.visualize()`: 201 | 202 | ```python 203 | total_flow.visualize(as_png=True) 204 | ``` 205 | 206 | Ovals represent **data** and rectangles represent **calculations**. 207 | 208 | **grey** ovals represent _inputs_, while **white** ovals represent "intermediate" data. 209 | 210 | Any **red** rectangles indicate "map" calculations. **red** ovals represent data that is being mapped over / produced by a "map" step. 
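As an aside, the outer-product behaviour of the map specification used above can be sanity-checked directly with the `MapSpec` helper from `aiida_dynamic_workflows.common.mapspec`. This is only an illustrative sketch; the workflow itself does not require it:

```python
from aiida_dynamic_workflows.common.mapspec import MapSpec

spec = MapSpec.from_string("V_left[a], V_right[b] -> electrostatics[a, b]")

# 1D inputs of length 10 and 20 produce a 10 x 20 "electrostatics" array.
spec.shape({"V_left": (10,), "V_right": (20,)})
```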
211 | 212 | 213 | ## Running the workflow 214 | 215 | 216 | First, we create a dictionary of all the inputs required by `total_flow`: 217 | 218 | ```python 219 | total_flow.parameters 220 | ``` 221 | 222 | ```python 223 | inputs = dict( 224 | mesh_size=0.01, 225 | V_left=np.linspace(0, 2, 10), 226 | V_right=np.linspace(-0.5, 0.5, 20), 227 | x=0.1, 228 | y=0.2, 229 | coarse_mesh_size=0.05, 230 | ) 231 | ``` 232 | 233 | Then we combine the workflow and the inputs into a specification that Aiida can run: 234 | 235 | ```python 236 | ready = flows.workflow.build( 237 | total_flow.on(cluster_env), 238 | **inputs, 239 | ) 240 | ``` 241 | 242 | Note that, similarly to single calculations, the workflow has an `on` method that can be used to specify where the calculations in the workflow should be run. 243 | 244 | 245 | Finally, we submit the workflow to the Aiida daemon: 246 | 247 | ```python 248 | running_flow = aiida.engine.submit(ready) 249 | ``` 250 | 251 | ## Seeing what's happening 252 | 253 | 254 | We can print a progress report of what's going on: 255 | 256 | ```python 257 | print(flows.report.progress(running_flow)) 258 | ``` 259 | 260 | And visualize the workflow graph: 261 | 262 | ```python 263 | flows.report.graph(running_flow) 264 | ``` 265 | 266 | ### If you restart your notebook 267 | 268 | 269 | As soon as you `submit`, your workflow run is recorded in the Aiida database, so even if you restart your notebook you will not "lose" the running workflow. 270 | 271 | You can use `running_workflows()` to get a summary of the workflows that are currently running: 272 | 273 | ```python 274 | print(flows.report.running_workflows()) 275 | ``` 276 | 277 | You can also get a summary of all the workflows started recently, e.g.: 278 | 279 | ```python 280 | print(flows.report.recent_workflows(days=2)) # All workflows started in the last 2 days. 281 | ``` 282 | 283 | ## Viewing results 284 | 285 | 286 | Once the workflow has completed we can get the returned values by inspecting `outputs.return_values`: 287 | 288 | ```python 289 | running_flow.outputs.return_values 290 | ``` 291 | 292 | Note that to get an inspectable value back we use `fetch_value()`, which pulls the cloudpickle blob from the cluster filesystem and loads it: 293 | 294 | ```python 295 | %%time 296 | running_flow.outputs.return_values.avg_electrostatic_charge.fetch_value() 297 | ``` 298 | 299 | We can also inspect any intermediate results by loading the appropriate data: 300 | 301 | ```python 302 | %%time 303 | running_flow.called[-2].outputs.return_values.charge.fetch_value(local_files=True)[:2, :2] 304 | ``` 305 | 306 | ## Viewing anything else 307 | 308 | 309 | We can always load any object that is stored in the database by querying for its "primary key" or "UUID". 310 | 311 | For example, if we wanted the database node corresponding to the step `make_geometry` from the above run, we could: 312 | 313 | ```python 314 | ## NB: change the "5269" to the "primary key" of the "make_geometry" step 315 | ## You can get this information from the call-graph above. 316 | executed_geometry_step = aiida.orm.load_node(5269) 317 | ``` 318 | 319 | We can get, for example, the output from `sacct` for the completed job: 320 | 321 | ```python 322 | executed_geometry_step.get_detailed_job_info() 323 | ``` 324 | 325 | Or the contents of `stdout` and `stderr` from the job: 326 | 327 | ```python 328 | executed_geometry_step.get_scheduler_stdout() 329 | ``` 330 | 331 | In a pinch we can also get the directory on the cluster where the job ran.
332 | We can use this to manually inspect input/output files for sanity. 333 | 334 | ```python 335 | executed_geometry_step.get_remote_workdir() 336 | ``` 337 | 338 | ## Inspecting "sample plans" for results 339 | 340 | 341 | Often, given the result of a simulation we will want to be able to see the parameters that produced it. 342 | 343 | For example, the above workflow produces an intermediate result `charges`, and we might want to know what values of the inputs `x`, `y`, `V_left` etc. correspond to each values in the `charges` array. 344 | 345 | We can query this using `input_samples`: 346 | 347 | ```python 348 | import pandas as pd 349 | 350 | charges = running_flow.called[-2].outputs.return_values.charge 351 | 352 | df = pd.DataFrame(flows.input_samples(charges)) 353 | 354 | df 355 | ``` 356 | 357 | We see that we can feed the output into `pd.DataFrame` to get a dataframe of samples. 358 | 359 | Even though `charges` is a 2D array: 360 | 361 | ```python 362 | charges.shape 363 | ``` 364 | 365 | The samples are still presented as a (1D) dataframe. 366 | 367 | The rows of the dataframe are ordered in the same way as a _flattened_ `charges`. 368 | 369 | We can add another column to the dataframe so that the result is reported along with the inputs: 370 | 371 | ```python 372 | df_with_results = df.assign(charge=charges.fetch_value(local_files=True).reshape(-1)) 373 | df_with_results 374 | ``` 375 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/workchains.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | 5 | from collections import defaultdict 6 | from typing import Any, Dict, Optional 7 | 8 | from aiida.engine import WorkChain, append_, if_, while_ 9 | import aiida.orm 10 | import numpy as np 11 | import toolz 12 | 13 | from . import common 14 | from .calculations import ( 15 | PyCalcJob, 16 | PyMapJob, 17 | array_job_spec_from_booleans, 18 | expected_mask, 19 | merge_remote_arrays, 20 | ) 21 | 22 | 23 | # Subclass needed for "option" getters/setters, so that a WorkChain 24 | # can transparently wrap a CalcJob. 25 | class WorkChainNode(aiida.orm.WorkChainNode): 26 | """ORM class for nodes representing the execution of a WorkChain.""" 27 | 28 | def get_option(self, name: str) -> Optional[Any]: 29 | """Return the value of an option that was set for this CalcJobNode.""" 30 | return self.get_attribute(name, None) 31 | 32 | def set_option(self, name: str, value: Any) -> None: 33 | """Set an option to the given value.""" 34 | self.set_attribute(name, value) 35 | 36 | def get_options(self) -> Dict[str, Any]: 37 | """Return the dictionary of options set for this CalcJobNode.""" 38 | options = {} 39 | for name in self.process_class.spec_options.keys(): 40 | value = self.get_option(name) 41 | if value is not None: 42 | options[name] = value 43 | 44 | return options 45 | 46 | def set_options(self, options: Dict[str, Any]) -> None: 47 | """Set the options for this CalcJobNode.""" 48 | for name, value in options.items(): 49 | self.set_option(name, value) 50 | 51 | 52 | # Hack to make this new node type use the Aiida logger. 53 | # This is important so that WorkChains that use this node type also 54 | # use the Aiida logger. 55 | WorkChainNode._logger = aiida.orm.WorkChainNode._logger 56 | 57 | 58 | class RestartedPyMapJob(WorkChain): 59 | """Workchain that resubmits a PyMapJob until all the tasks are complete. 
60 | 61 | Tasks in the PyMapJob that succeeded on previous runs will not be resubmitted. 62 | """ 63 | 64 | _node_class = WorkChainNode 65 | 66 | @classmethod 67 | def define(cls, spec): # noqa: D102 68 | super().define(spec) 69 | spec.expose_inputs(PyMapJob) 70 | spec.expose_outputs(PyMapJob, include=["return_values", "exception"]) 71 | spec.input( 72 | "metadata.options.max_restarts", 73 | valid_type=int, 74 | default=5, 75 | help=( 76 | "Maximum number of iterations the work chain will " 77 | "restart the process to finish successfully." 78 | ), 79 | ) 80 | spec.exit_code( 81 | 410, 82 | "MAXIMUM_RESTARTS_EXCEEDED", 83 | message="The maximum number of restarts was exceeded.", 84 | ) 85 | 86 | spec.outline( 87 | cls.setup, 88 | while_(cls.should_run)(cls.run_mapjob, cls.inspect_result), 89 | if_(cls.was_restarted)(cls.merge_arrays, cls.extract_merged_arrays).else_( 90 | cls.pass_through_arrays 91 | ), 92 | cls.output, 93 | ) 94 | 95 | def setup(self): # noqa: D102 96 | self.report("Setting up") 97 | 98 | mapspec = common.MapSpec.from_string(self.inputs.metadata.options.mapspec) 99 | mapped_inputs = { 100 | k: v for k, v in self.inputs.kwargs.items() if k in mapspec.parameters 101 | } 102 | 103 | self.ctx.required_mask = expected_mask(mapspec, mapped_inputs) 104 | self.ctx.total_output_mask = np.full_like(self.ctx.required_mask, True) 105 | 106 | self.ctx.job_shape = self.ctx.required_mask.shape 107 | self.ctx.total_num_tasks = np.sum(~self.ctx.required_mask) 108 | 109 | self.ctx.iteration = 0 110 | self.ctx.launched_mapjobs = [] 111 | 112 | @property 113 | def n_tasks_remaining(self) -> int: 114 | """Return the number of tasks that remain to be run.""" 115 | return self.ctx.total_num_tasks - np.sum(~self.ctx.total_output_mask) 116 | 117 | @property 118 | def remaining_task_array(self) -> np.ndarray: 119 | """Return a boolean array indicating which tasks still need to be run.""" 120 | return np.logical_xor(self.ctx.required_mask, self.ctx.total_output_mask) 121 | 122 | @property 123 | def has_all_results(self) -> bool: 124 | """Return True iff all the necessary outputs are present.""" 125 | return np.all(self.ctx.total_output_mask == self.ctx.required_mask) 126 | 127 | def should_run(self): # noqa: D102 128 | return ( 129 | not self.has_all_results 130 | and self.ctx.iteration < self.inputs.metadata.options.max_restarts 131 | ) 132 | 133 | def run_mapjob(self): # noqa: D102 134 | # Run failed elements only, using custom 135 | # Slurm parameters: -A 1,3-10,20%24 136 | self.ctx.iteration += 1 137 | 138 | self.report(f"Running MapJob for {self.n_tasks_remaining} tasks") 139 | 140 | inputs = self.exposed_inputs(PyMapJob) 141 | 142 | # Modify "metadata.options.custom_scheduler_commands" so that the 143 | # correct tasks in the Slurm Job Array are run. 
144 | # NOTE: This assumes we are running on Slurm 145 | options = inputs["metadata"]["options"] 146 | csc = options.custom_scheduler_commands 147 | # Remove the existing Array Job specification 148 | commands = [x for x in csc.split("\n") if "--array" not in x] 149 | # Add an updated Array Job specification 150 | task_spec = array_job_spec_from_booleans(self.remaining_task_array.reshape(-1)) 151 | max_concurrent_jobs = ( 152 | options.cores_per_machine * options.max_concurrent_machines 153 | ) 154 | commands.append(f"#SBATCH --array={task_spec}%{max_concurrent_jobs}") 155 | inputs = toolz.assoc_in( 156 | inputs, 157 | ("metadata", "options", "custom_scheduler_commands"), 158 | "\n".join(commands), 159 | ) 160 | 161 | # "max_restarts" does not apply to PyMapJobs 162 | del inputs["metadata"]["options"]["max_restarts"] 163 | 164 | fut = self.submit(PyMapJob, **inputs) 165 | return self.to_context(launched_mapjobs=append_(fut)) 166 | 167 | def inspect_result(self): # noqa: D102 168 | self.report("Inspecting result") 169 | 170 | job = self.ctx.launched_mapjobs[-1] 171 | 172 | m = result_mask(job, self.ctx.job_shape) 173 | self.ctx.total_output_mask[~m] = False 174 | 175 | self.report( 176 | f"{np.sum(~m)} tasks succeeded, " 177 | f"{self.n_tasks_remaining} / {self.ctx.total_num_tasks} remaining" 178 | ) 179 | 180 | def was_restarted(self): # noqa: D102 181 | return self.ctx.iteration > 1 182 | 183 | def merge_arrays(self): # noqa: D102 184 | self.report(f"Gathering arrays from {self.ctx.iteration} mapjobs.") 185 | assert self.ctx.iteration > 1 186 | 187 | exception_arrays = [] 188 | return_value_arrays = defaultdict(list) 189 | for j in self.ctx.launched_mapjobs: 190 | if "exception" in j.outputs: 191 | exception_arrays.append(j.outputs.exception) 192 | if "return_values" in j.outputs: 193 | for k, v in j.outputs.return_values.items(): 194 | return_value_arrays[k].append(v) 195 | 196 | # 'merge_remote_array' must take **kwargs (this is a limitation of Aiida), so 197 | # we convert a list of inputs into a dictionary with keys 'x0', 'x1' etc. 198 | def list_to_dict(lst): 199 | return {f"x{i}": x for i, x in enumerate(lst)} 200 | 201 | context_update = dict() 202 | 203 | # TODO: switch 'runner.run_get_node' to 'submit' once WorkChain.submit 204 | # allows CalcFunctions (it should already; this appears to be a 205 | # bug in Aiida). 
206 | 207 | if exception_arrays: 208 | r = self.runner.run_get_node( 209 | merge_remote_arrays, 210 | **list_to_dict(exception_arrays), 211 | ) 212 | context_update["exception"] = r.node 213 | 214 | for k, arrays in return_value_arrays.items(): 215 | r = self.runner.run_get_node( 216 | merge_remote_arrays, 217 | **list_to_dict(arrays), 218 | ) 219 | context_update[f"return_values.{k}"] = r.node 220 | 221 | return self.to_context(**context_update) 222 | 223 | def extract_merged_arrays(self): # noqa: D102 224 | if "exception" in self.ctx: 225 | self.ctx.exception = self.ctx.exception.outputs.result 226 | if "return_values" in self.ctx: 227 | for k, v in self.ctx.return_values.items(): 228 | self.ctx.return_values[k] = v.outputs.result 229 | 230 | def pass_through_arrays(self): # noqa: D102 231 | self.report("Passing through results from single mapjob") 232 | assert self.ctx.iteration == 1 233 | (job,) = self.ctx.launched_mapjobs 234 | if "exception" in job.outputs: 235 | self.ctx.exception = job.outputs.exception 236 | if "return_values" in job.outputs: 237 | for k, v in job.outputs.return_values.items(): 238 | self.ctx[f"return_values.{k}"] = v 239 | 240 | def output(self): # noqa: D102 241 | self.report("Setting outputs") 242 | if "exception" in self.ctx: 243 | self.out("exception", self.ctx.exception) 244 | for k, v in self.ctx.items(): 245 | if k.startswith("return_values"): 246 | self.out(k, v) 247 | 248 | max_restarts = self.inputs.metadata.options.max_restarts 249 | if not self.has_all_results and self.ctx.iteration >= max_restarts: 250 | self.report(f"Restarted the maximum number of times {max_restarts}") 251 | return self.exit_codes.MAXIMUM_RESTARTS_EXCEEDED 252 | 253 | 254 | def result_mask(job, expected_shape) -> np.ndarray: 255 | """Return the result mask for a PyMapJob that potentially has multiple outputs.""" 256 | if "return_values" not in job.outputs: 257 | return np.full(expected_shape, True) 258 | rvs = job.outputs.return_values 259 | masks = [getattr(rvs, x).mask for x in rvs] 260 | if len(masks) == 1: 261 | return masks[0] 262 | else: 263 | # If for some reason one of the outputs is missing elements (i.e. the 264 | # mask value is True) then we need to re-run the corresponding task. 265 | return np.logical_or(*masks) 266 | 267 | 268 | class RestartedPyCalcJob(WorkChain): 269 | """Workchain that resubmits a PyCalcJob until it succeeds.""" 270 | 271 | _node_class = WorkChainNode 272 | 273 | @classmethod 274 | def define(cls, spec): # noqa: D102 275 | super().define(spec) 276 | spec.expose_inputs(PyCalcJob) 277 | spec.expose_outputs(PyCalcJob, include=["return_values", "exception"]) 278 | spec.input( 279 | "metadata.options.max_restarts", 280 | valid_type=int, 281 | default=5, 282 | help=( 283 | "Maximum number of iterations the work chain will " 284 | "restart the process to finish successfully." 
285 | ), 286 | ) 287 | spec.exit_code( 288 | 410, 289 | "MAXIMUM_RESTARTS_EXCEEDED", 290 | message="The maximum number of restarts was exceeded.", 291 | ) 292 | spec.exit_code( 293 | 411, 294 | "CHILD_PROCESS_EXCEPTED", 295 | message="The child process excepted.", 296 | ) 297 | spec.outline( 298 | cls.setup, 299 | while_(cls.should_run)(cls.run_calcjob, cls.inspect_result), 300 | cls.output, 301 | ) 302 | 303 | def setup(self): # noqa: D102 304 | self.ctx.iteration = 0 305 | self.ctx.function_name = self.inputs.func.name 306 | self.ctx.children = [] 307 | self.ctx.is_finished = False 308 | 309 | def should_run(self): # noqa: D102 310 | return ( 311 | not self.ctx.is_finished 312 | and self.ctx.iteration < self.inputs.metadata.options.max_restarts 313 | ) 314 | 315 | def run_calcjob(self): # noqa: D102 316 | self.ctx.iteration += 1 317 | inputs = self.exposed_inputs(PyCalcJob) 318 | del inputs["metadata"]["options"]["max_restarts"] 319 | node = self.submit(PyCalcJob, **inputs) 320 | 321 | self.report( 322 | f"Launching {self.ctx.function_name}<{node.pk}> " 323 | f"iteration #{self.ctx.iteration}" 324 | ) 325 | 326 | return self.to_context(children=append_(node)) 327 | 328 | def inspect_result(self): # noqa: D102 329 | node = self.ctx.children[-1] 330 | 331 | if node.is_excepted: 332 | self.report(f"{self.ctx.function_name}<{node.pk}> excepted; aborting") 333 | return self.exit_codes.CHILD_PROCESS_EXCEPTED 334 | 335 | self.ctx.is_finished = node.exit_status == 0 336 | 337 | def output(self): # noqa: D102 338 | node = self.ctx.children[-1] 339 | label = f"{self.ctx.function_name}<{node.pk}>" 340 | 341 | self.out_many(self.exposed_outputs(node, PyCalcJob)) 342 | 343 | max_restarts = self.inputs.metadata.options.max_restarts 344 | if not self.ctx.is_finished and self.ctx.iteration >= max_restarts: 345 | self.report( 346 | f"Reached the maximum number of iterations {max_restarts}: " 347 | f"last ran {label}" 348 | ) 349 | return self.exit_codes.MAXIMUM_RESTARTS_EXCEEDED 350 | else: 351 | self.report( 352 | f"Succeeded after {self.ctx.iteration} submissions: " 353 | f"last ran {label}" 354 | ) 355 | -------------------------------------------------------------------------------- /examples/03-failures.md: -------------------------------------------------------------------------------- 1 | # Handling failures 2 | 3 | 4 | This notebook demonstrates how workflows can be built to handle common failure modes: 5 | 6 | 1. Persistent errors (e.g. a few samples in the sample plan are ill-defined) 7 | 2. Transient errors (e.g. meshing failed due to some random failure) 8 | 9 | To explore this we will take the workflows developed in [02-workflows.md](./02-workflows.md) and make a few modifications. 10 | 11 | 12 | First we do the usual imports and define an execution environment 13 | 14 | ```python 15 | from dataclasses import dataclass 16 | import random 17 | import time 18 | 19 | import numpy as np 20 | import toolz 21 | ``` 22 | 23 | ```python 24 | import aiida 25 | aiida.load_profile() 26 | 27 | aiida.__version__ 28 | ``` 29 | 30 | ```python 31 | import aiida_dynamic_workflows as flows 32 | from aiida_dynamic_workflows import step 33 | 34 | flows.control.ensure_daemon_restarted() 35 | flows.__version__ 36 | ``` 37 | 38 | ```python 39 | cluster_env = flows.engine.execution_environment( 40 | "py39", # conda environment 41 | "my-cluster", # computer name 42 | queue=("some-queue", 24), # queue and num. 
cores per machine 43 | ) 44 | ``` 45 | 46 | ## Defining the steps and workflows 47 | 48 | 49 | This is copied verbatim from [02-workflows.md](./02-workflows.md). 50 | 51 | In principle we could put this in a separate module, but this won't quite work until cloudpickle gets [this new feature](https://github.com/cloudpipe/cloudpickle/pull/417). 52 | 53 | ```python 54 | @dataclass(frozen=True) 55 | class Geometry: 56 | x : float 57 | y : float 58 | 59 | @dataclass(frozen=True) 60 | class Mesh: 61 | geometry : Geometry 62 | mesh_size : float 63 | 64 | @dataclass(frozen=True) 65 | class Materials: 66 | geometry: Geometry 67 | materials: list[str] 68 | 69 | @dataclass(frozen=True) 70 | class Electrostatics: 71 | mesh: Mesh 72 | materials: Materials 73 | voltages: list[float] 74 | ``` 75 | 76 | ```python 77 | @step(returns="geo") 78 | def make_geometry(x: float, y: float) -> Geometry: 79 | time.sleep(5) # do some work 80 | return Geometry(x, y) 81 | 82 | 83 | @step(returns=("mesh", "coarse_mesh")) 84 | def make_mesh( 85 | geo: Geometry, 86 | mesh_size: float, 87 | coarse_mesh_size: float, 88 | ) -> tuple[Mesh, Mesh]: 89 | time.sleep(5) # do some work 90 | return Mesh(geo, mesh_size), Mesh(geo, coarse_mesh_size) 91 | 92 | 93 | @step(returns="materials") 94 | def make_materials(geo: Geometry) -> Materials: 95 | time.sleep(5) # do some work 96 | return Materials(geo, ["a", "b", "c"]) 97 | 98 | 99 | @step(returns="electrostatics") 100 | def run_electrostatics( 101 | mesh: Mesh, materials: Materials, V_left: float, V_right: float 102 | ) -> Electrostatics: 103 | time.sleep(10) # do some work 104 | return Electrostatics(mesh, materials, [V_left, V_right]) 105 | 106 | 107 | @step(returns="charge") 108 | def get_charge(electrostatics: Electrostatics) -> float: 109 | # obviously not actually the charge; but we should return _some_ number that 110 | # is "derived" from the electrostatics. 111 | return sum(electrostatics.voltages) 112 | 113 | 114 | @step(returns="average_charge") 115 | def average_charge(charge: "FileBasedObjectArray") -> float: 116 | # .to_array() is a bit dumb; it loads in _all_ the data at once, but 117 | # this is the simplest way, and in this example the data is not so large. 118 | return np.mean(charge.to_array()) 119 | ``` 120 | 121 | ```python 122 | from aiida_dynamic_workflows.workflow import first, concurrently, map_, new_workflow 123 | 124 | model_flow = ( 125 | new_workflow(name="model_flow") 126 | .then(make_geometry) 127 | .then( 128 | # These 2 steps will be done at the same time 129 | concurrently(make_mesh, make_materials) 130 | ) 131 | ) 132 | 133 | electrostatics_flow = ( 134 | new_workflow(name="electrostatics_flow") 135 | .then( 136 | map_( 137 | run_electrostatics, 138 | "V_left[a], V_right[b] -> electrostatics[a, b]", 139 | ) 140 | ).then( 141 | map_( 142 | get_charge, 143 | "electrostatics[i, j] -> charge[i, j]" 144 | ) 145 | ).then(average_charge) 146 | ) 147 | 148 | total_flow = ( 149 | new_workflow(name="total_electrostatics") 150 | .join(model_flow) 151 | .join(electrostatics_flow) 152 | .returning("electrostatics", average_charge="avg_electrostatic_charge") 153 | ) 154 | ``` 155 | 156 | ## Modifying steps 157 | 158 | 159 | Now we make new meshing and electrostatics steps with the following modifications: 160 | 161 | + If the `mesh_error` parameter is True, then the meshing step always raises a `ValueError`. 162 | + If `V_left` or `V_right` is outside the bounds set by `V_limits` then the electrostatics step raises a `ValueError`. 
163 | + The charge-extracting step will randomly fail with probability `failure_probability`. 164 | 165 | ```python 166 | # Inside the modified steps we should only reference the raw Python function, _not_ the 167 | # object in the Aiida database (which we will not be able to resolve, given that the code 168 | # will eventually be run in a job on the cluster). 169 | original_make_mesh = make_mesh.callable 170 | original_electrostatics = run_electrostatics.callable 171 | original_get_charge = get_charge.callable 172 | 173 | 174 | @flows.step(returns=("mesh", "coarse_mesh")) 175 | def modified_make_mesh(geo, mesh_size, coarse_mesh_size, mesh_error): 176 | if mesh_error: 177 | raise ValueError("Meshing step failed") 178 | else: 179 | return original_make_mesh(geo, mesh_size, coarse_mesh_size) 180 | 181 | 182 | @flows.step(returns="electrostatics") 183 | def modified_electrostatics(geo, mesh, materials, V_left, V_right, V_limits: tuple): 184 | a, b = V_limits 185 | if not (a < V_left < b and a < V_right < b): 186 | raise ValueError(f"Voltages ({V_left}, {V_right}) out of acceptable range {V_limits}") 187 | else: 188 | return original_electrostatics(mesh, materials, V_left, V_right) 189 | 190 | @flows.step(returns="charge") 191 | def modified_get_charge(electrostatics, failure_probability): 192 | import random 193 | if random.random() < failure_probability: 194 | raise ValueError("Randomly failed!") 195 | else: 196 | return original_get_charge(electrostatics) 197 | ``` 198 | 199 | ## Modifying workflows 200 | 201 | 202 | We now use the `replace_steps` method of the `total_flow` that we defined above. 203 | 204 | This allows us to easily replace the `make_mesh`, `run_electrostatics`, and `get_charge` steps with the modified versions that we defined above: 205 | 206 | ```python 207 | new_flow = ( 208 | total_flow 209 | .rename("total_flow_with_failures") 210 | .replace_steps({ 211 | make_mesh: modified_make_mesh, 212 | run_electrostatics: modified_electrostatics, 213 | get_charge: modified_get_charge, 214 | }) 215 | ) 216 | 217 | new_flow.visualize(as_png=True) 218 | ``` 219 | 220 | ## Running the workflow 221 | 222 | 223 | Let's first run the workflow with `mesh_error=True`, and see what happens: 224 | 225 | ```python 226 | inputs = dict( 227 | mesh_size=0.015, 228 | V_left=np.linspace(0, 1, 10), 229 | V_right=np.linspace(-0.5, 0.5, 20), 230 | x=0.15, 231 | y=0.25, 232 | coarse_mesh_size=0.05, 233 | # Extra parameters; needed for the modified steps 234 | V_limits=[-0.4, 0.4], 235 | failure_probability=0.2, 236 | mesh_error=True, 237 | ) 238 | ``` 239 | 240 | ```python 241 | running_workflow = aiida.engine.submit(flows.workflow.build( 242 | new_flow.on(cluster_env), 243 | **inputs, 244 | )) 245 | ``` 246 | 247 | 248 | ```python 249 | print(flows.report.progress(running_workflow)) 250 | flows.report.graph(running_workflow, as_png=True) 251 | ``` 252 | 253 | We see that the `make_geometry` and `make_materials` steps completed successfully (Exit Code 0), but `modified_make_mesh` failed with exit code 401. 254 | 255 | We can use `flows.report.log` to figure out what happened: 256 | 257 | ```python 258 | modified_mesh_calc = running_workflow.called[1] 259 | print(flows.report.log(modified_mesh_calc)) 260 | ``` 261 | 262 | We see that `User code raised an Exception`, and that `modified_make_mesh` returned an `exception` output.
263 | 264 | We can inspect the exception to see what happened: 265 | 266 | ```python 267 | modified_mesh_calc.outputs.exception.fetch_value() 268 | ``` 269 | 270 | We can get more insight into what happened by printing the log from the _workflow_: 271 | 272 | ```python 273 | print(flows.report.log(running_workflow)) 274 | ``` 275 | 276 | We see that the workflow detected the failure of `modified_make_mesh`. 277 | 278 | It tried to carry on anyway, but `modified_electrostatics` requires `mesh`, _so the step is skipped_. 279 | The remaining steps are also skipped for similar reasons. 280 | 281 | 282 | **The default workflow behaviour is to try to execute all steps, skipping steps for which there is not sufficient input.** 283 | 284 | 285 | ## Persistent errors in Map elements 286 | 287 | 288 | Now we will flip the `mesh_error` flag so that the mesh step completes successfully. 289 | 290 | Note, however, that some elements of the `modified_electrostatics` map will raise an exception 291 | because `V_left` or `V_right` are outside of the specified limits. 292 | 293 | We will see how the workflow handles such an error condition. 294 | 295 | 296 | We can easily make a small modification to the parameters before resubmitting by using `get_builder_restart()` on the previously executed workflow: 297 | 298 | ```python 299 | no_mesh_error = running_workflow.get_builder_restart() 300 | no_mesh_error.kwargs.mesh_error = aiida.orm.to_aiida_type(False) 301 | ``` 302 | 303 | Then submit the workflow with the modified parameters: 304 | 305 | ```python 306 | running_workflow2 = aiida.engine.submit(no_mesh_error) 307 | ``` 308 | 309 | ```python 310 | print(flows.report.progress(running_workflow2)) 311 | flows.report.graph(running_workflow2, as_png=True) 312 | ``` 313 | 314 | We see that the `modified_electrostatics` step returned exit code 401, indicating that our code raised a Python exception; we also see that the `exception` output was produced. 315 | 316 | 317 | However, we also see that an `electrostatics` output was produced, despite the non-zero exit code. 318 | 319 | 320 | Let's load in the exception to see what is going on: 321 | 322 | ```python 323 | electrostatics_step = running_workflow2.called[3] 324 | ``` 325 | 326 | ```python 327 | exceptions = electrostatics_step.outputs.exception.fetch_value() 328 | ``` 329 | 330 | ```python 331 | exceptions[:4, :4] 332 | ``` 333 | 334 | As `modified_electrostatics` was run as a `PyMapJob`, `exceptions` is a _masked array_, that contains the exception raised by the given element in the map (and masked for elements that did not raise an exception). 335 | 336 | 337 | Similarly, `electrostatics` will be a masked array, with the map elements that raised an exception _masked out_: 338 | 339 | ```python 340 | electrostatics_array = electrostatics_step.outputs.return_values.electrostatics 341 | electrostatics_array.mask[:4, :4] 342 | ``` 343 | 344 | Nevertheless, the workflow can continue, even with this "partial" output. 345 | 346 | 347 | The downstream `PyMapJob` that runs `get_charge` detects that the input(s) are "masked" and only runs `get_charge` for the data that actually exists. 348 | 349 | We can "see" this by inspecting the `--array` specification that was passed to Slurm by the job: 350 | 351 | ```python 352 | get_charge_step = running_workflow2.called[4] 353 | print(get_charge_step.attributes["custom_scheduler_commands"]) 354 | ``` 355 | 356 | We see that only array elements 2-17, 22-37 etc. 
are submitted, as the other elements of `electrostatics` are missing. 357 | 358 | 359 | ## Mitigating transient errors 360 | 361 | 362 | Our modified `get_charge` step randomly fails with a certain probability: a "transient" error. 363 | 364 | We can see this because the array of charges output by the step does not have the same mask as the electrostatics that were used as input: 365 | 366 | ```python 367 | charge_array = get_charge_step.outputs.return_values.charge 368 | 369 | np.sum(charge_array.mask != electrostatics_array.mask) 370 | ``` 371 | 372 | A simple way to mitigate transient errors is to specify that steps should be restarted. 373 | 374 | For MapJobs we can do this by passing `max_restarts` to `map_`: 375 | 376 | ```python 377 | electrostatics_flow_with_restarts = ( 378 | first( 379 | map_( 380 | run_electrostatics, 381 | "V_left[i], V_right[j] -> electrostatics[i, j]", 382 | ) 383 | ).then( 384 | map_( 385 | get_charge, 386 | "electrostatics[i, j] -> charge[i, j]", 387 | max_restarts=5, # <-- specify the max number of restarts here 388 | ) 389 | ).then(average_charge) 390 | ) 391 | 392 | ``` 393 | 394 | Alternatively, we can use the `with_restarts` method of an existing workflow to add restarts to the named steps: 395 | 396 | ```python 397 | total_flow_with_restarts = ( 398 | new_flow 399 | .rename(name="flow_with_restarts") 400 | .with_restarts({modified_get_charge: 5}) 401 | ) 402 | ``` 403 | 404 | ```python 405 | running_workflow_with_restarts = aiida.engine.submit(flows.workflow.build( 406 | total_flow_with_restarts.on(cluster_env), 407 | **toolz.assoc(inputs, "mesh_error", False), 408 | )) 409 | ``` 410 | 411 | ```python 412 | print(flows.report.progress(running_workflow_with_restarts)) 413 | flows.report.graph(running_workflow_with_restarts, as_png=True) 414 | ``` 415 | 416 | We see that after `modified_electrostatics` there is a "`RestartedPyMapJob`" that sequentially launches several `PyMapJob`s that each run `modified_get_charge`. 417 | 418 | We can see what is going on by printing the log of this `RestartedPyMapJob`: 419 | 420 | ```python 421 | restarted_mapjob = running_workflow_with_restarts.called[-2] 422 | print(flows.report.log(restarted_mapjob)) 423 | ``` 424 | 425 | The first time `modified_get_charge` is run, it is run over 64 tasks, the number of unmasked `electrostatics` in the input: 426 | 427 | ```python 428 | electrostatics_mapjob = running_workflow_with_restarts.called[-3] 429 | np.sum(~electrostatics_mapjob.outputs.return_values.electrostatics.mask) 430 | ``` 431 | 432 | This run results in a few failures, so the failed tasks are submitted again, and so on, until all 64 results have been obtained (or the maximum number of restarts has been exceeded). 433 | 434 | ```python 435 | for j in restarted_mapjob.called: 436 | if 'PyMapJob' not in j.process_type: 437 | continue 438 | print(j.get_option("custom_scheduler_commands")) 439 | ``` 440 | 441 | The `RestartedPyMapJob` then merges the outputs from the different runs together into a single array.
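Conceptually, the merge combines the partial (masked) result arrays from the successive runs: each restart fills in some of the elements that were still missing. The snippet below is a minimal numpy-only sketch of that idea (it is not the actual `merge_remote_arrays` implementation used by the workchain, and the numbers are made up for illustration):

```python
import numpy as np

# Pretend the map was run twice over 5 tasks; each run produced a partial
# result, masked wherever the task failed or was not (re)submitted.
run_1 = np.ma.array([1.0, 2.0, 0.0, 4.0, 0.0], mask=[0, 0, 1, 0, 1])
run_2 = np.ma.array([0.0, 0.0, 3.0, 0.0, 5.0], mask=[1, 1, 0, 1, 0])

# Merge: keep run_1's values where present, fill the gaps from run_2.
merged = run_1.copy()
merged[~run_2.mask] = run_2[~run_2.mask]

print(merged)       # [1.0 2.0 3.0 4.0 5.0]
print(merged.mask)  # all False: every task eventually succeeded
```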
442 | 443 | 444 | Finally, we see that even the `average_charge` step completed successfully, as it was written in such a way that it transparently handles masked arrays: 445 | 446 | ```python 447 | running_workflow_with_restarts.outputs.return_values.avg_electrostatic_charge.fetch_value() 448 | ``` 449 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | """Aiida data plugins for running arbitrary Python functions.""" 5 | 6 | from concurrent.futures import ThreadPoolExecutor 7 | import functools 8 | import inspect 9 | import io 10 | from itertools import repeat 11 | import operator 12 | import os 13 | from pathlib import Path 14 | import tempfile 15 | from typing import Any, Callable, Dict, List, Optional, Tuple 16 | 17 | import aiida.orm 18 | import cloudpickle 19 | import numpy as np 20 | import toolz 21 | 22 | # To get Aiida's caching to be useful we need to have a stable way to hash Python 23 | # functions. The "default" is to hash the cloudpickle blob, but this is not 24 | # typically stable for functions defined in a Jupyter notebook. 25 | # TODO: insert something useful here. 26 | function_hasher = None 27 | 28 | 29 | class PyFunction(aiida.orm.Data): 30 | """Aiida representation of a Python function.""" 31 | 32 | def __init__(self, **kwargs): 33 | # TODO: basic typechecks on these 34 | func = kwargs.pop("func") 35 | assert callable(func) 36 | returns = kwargs.pop("returns") 37 | if isinstance(returns, str): 38 | returns = [returns] 39 | resources = kwargs.pop("resources", None) 40 | if resources is None: 41 | resources = dict() 42 | 43 | super().__init__(**kwargs) 44 | 45 | self.put_object_from_filelike( 46 | path="function.pickle", 47 | handle=io.BytesIO(cloudpickle.dumps(func)), 48 | ) 49 | self.set_attribute("resources", resources) 50 | self.set_attribute("returns", returns) 51 | self.set_attribute("parameters", _parameters(func)) 52 | 53 | # If 'function_hasher' is available then we store the 54 | # function hash directly, and _get_objects_to_hash will 55 | # _not_ use the pickle blob (which is not stable e.g. 56 | # for functions defined in a notebook). 57 | if callable(function_hasher): 58 | self.set_attribute("_function_hash", function_hasher(func)) 59 | 60 | try: 61 | source = inspect.getsource(func) 62 | except Exception: 63 | pass 64 | else: 65 | self.set_attribute("source", source) 66 | 67 | name = getattr(func, "__name__", None) 68 | if name: 69 | self.set_attribute("name", name) 70 | 71 | @property 72 | def resources(self) -> Dict[str, str]: 73 | """Resources required by this function.""" 74 | return self.get_attribute("resources") 75 | 76 | @property 77 | def source(self) -> str: 78 | """Source code of this function.""" 79 | return self.get_attribute("source") 80 | 81 | @property 82 | def name(self) -> str: 83 | """Name of this function.""" 84 | return self.get_attribute("name") 85 | 86 | @property 87 | def parameters(self) -> List[str]: 88 | """Parameters of this function.""" 89 | return self.get_attribute("parameters") 90 | 91 | @property 92 | def returns(self) -> Optional[List[str]]: 93 | """List of names returned by this function.""" 94 | return self.get_attribute("returns") 95 | 96 | # TODO: use better caching for this (maybe on the class level?) 
97 | @functools.cached_property 98 | def pickle(self) -> bytes: 99 | """Pickled function.""" 100 | return self.get_object_content("function.pickle", "rb") 101 | 102 | @functools.cached_property 103 | def callable(self) -> Callable: 104 | """Return the function stored in this object.""" 105 | return cloudpickle.loads(self.pickle) 106 | 107 | @property 108 | def __signature__(self): 109 | return inspect.signature(self.callable) 110 | 111 | def __call__(self, *args: Any, **kwargs: Any): 112 | """Call the function stored in this object.""" 113 | return self.callable(*args, **kwargs) 114 | 115 | def _get_objects_to_hash(self) -> List[Any]: 116 | objects = super()._get_objects_to_hash() 117 | 118 | # XXX: this depends on the specifics of the implementation 119 | # of super()._get_objects_to_hash(). The second-to-last 120 | # elements in 'objects' is the hash of the file repository. 121 | # For 'PyFunction' nodes this contains the cloudpickle blob, 122 | # which we _do not_ want hashed. 123 | if "_function_hash" in self.attributes: 124 | *a, _, x = objects 125 | return [*a, x] 126 | else: 127 | return objects 128 | 129 | 130 | def _parameters(f: Callable) -> List[str]: 131 | valid_kinds = [ 132 | getattr(inspect.Parameter, k) for k in ("POSITIONAL_OR_KEYWORD", "KEYWORD_ONLY") 133 | ] 134 | params = inspect.signature(f).parameters.values() 135 | if any(p.kind not in valid_kinds for p in params): 136 | raise TypeError("Invalid signature") 137 | return [p.name for p in params] 138 | 139 | 140 | class Nil(aiida.orm.Data): 141 | """Trivial representation of the None type in Aiida.""" 142 | 143 | 144 | # TODO: make this JSON serializable so it can go directly in the DB 145 | class PyOutline(aiida.orm.Data): 146 | """Naive Aiida representation of a workflow outline.""" 147 | 148 | def __init__(self, **kwargs): 149 | outline = kwargs.pop("outline") 150 | super().__init__(**kwargs) 151 | 152 | self.put_object_from_filelike( 153 | path="outline.pickle", 154 | handle=io.BytesIO(cloudpickle.dumps(outline)), 155 | ) 156 | 157 | @functools.cached_property 158 | def value(self): 159 | """Python object loaded from the stored pickle.""" 160 | return cloudpickle.loads(self.get_object_content("outline.pickle", "rb")) 161 | 162 | 163 | # TODO: Annotate these with the class name (useful for visualization) 164 | class PyData(aiida.orm.Data): 165 | """Naive Aiida representation of an arbitrary Python object.""" 166 | 167 | def __init__(self, **kwargs): 168 | pickle_path = kwargs.pop("pickle_path") 169 | 170 | super().__init__(**kwargs) 171 | self.put_object_from_file(filepath=pickle_path, path="object.pickle") 172 | 173 | # TODO: do caching more intelligently: we could attach a cache to the 174 | # _class_ instead so that if we create 2 PyData objects that 175 | # point to the _same_ database entry (pk) then we only have to 176 | # load the data once. 177 | # (does Aiida provide some tooling for this?) 
178 | @functools.cached_property 179 | def value(self): 180 | """Python object loaded from the stored pickle.""" 181 | return cloudpickle.loads(self.get_object_content("object.pickle", "rb")) 182 | 183 | 184 | class PyRemoteData(aiida.orm.RemoteData): 185 | """Naive Aiida representation of an arbitrary Python object on a remote computer.""" 186 | 187 | def __init__(self, **kwargs): 188 | pickle_path = str(kwargs.pop("pickle_path")) 189 | super().__init__(**kwargs) 190 | 191 | self.set_attribute("pickle_path", pickle_path) 192 | 193 | @property 194 | def pickle_path(self): 195 | """Return the remote path that contains the pickle.""" 196 | return os.path.join(self.get_remote_path(), self.get_attribute("pickle_path")) 197 | 198 | def fetch_value(self): 199 | """Load Python object from the remote pickle.""" 200 | with tempfile.NamedTemporaryFile(mode="rb") as f: 201 | self.getfile(self.get_attribute("pickle_path"), f.name) 202 | return cloudpickle.load(f) 203 | 204 | @classmethod 205 | def from_remote_data(cls, rd: aiida.orm.RemoteData, pickle_path: str): 206 | """Return a new PyRemoteData, given an existing RemoteData. 207 | 208 | Parameters 209 | ---------- 210 | rd 211 | RemoteData folder. 212 | pickle_path 213 | Relative path in the RemoteData that contains pickle data. 214 | """ 215 | return cls( 216 | remote_path=rd.get_remote_path(), 217 | pickle_path=pickle_path, 218 | computer=rd.computer, 219 | ) 220 | 221 | 222 | class PyRemoteArray(aiida.orm.RemoteData): 223 | """Naive Aiida representation of a remote array of arbitrary Python objects. 224 | 225 | Each object is stored in a separate file. 226 | """ 227 | 228 | def __init__(self, **kwargs): 229 | shape = kwargs.pop("shape") 230 | filename_template = kwargs.pop("filename_template") 231 | super().__init__(**kwargs) 232 | self.set_attribute("shape", tuple(shape)) 233 | self.set_attribute("filename_template", str(filename_template)) 234 | 235 | def _file(self, i: int) -> str: 236 | return self.get_attribute("filename_template").format(i) 237 | 238 | @property 239 | def pickle_path(self): 240 | """Return the remote path that contains the pickle files.""" 241 | return self.get_remote_path() 242 | 243 | def _fetch_buffer(self, local_files=False): 244 | """Return iterator over Python objects in this array.""" 245 | 246 | def _load(dir: Path, pickle_file: str): 247 | path = dir / pickle_file 248 | if not path.is_file(): 249 | return None 250 | else: 251 | with open(path, "rb") as f: 252 | return cloudpickle.load(f) 253 | 254 | def _iter_files(dir): 255 | with ThreadPoolExecutor() as ex: 256 | file_gen = map(self._file, range(self.size)) 257 | yield from ex.map(_load, repeat(dir), file_gen) 258 | 259 | if local_files: 260 | # If the array's directory does not exist then it's 261 | # not actually mounted locally. 262 | root_dir = Path(self.get_remote_path()) 263 | if not root_dir.is_dir(): 264 | raise FileNotFoundError(str(root_dir)) 265 | else: 266 | yield from _iter_files(root_dir) 267 | else: 268 | with tempfile.TemporaryDirectory() as temp_dir: 269 | dir = Path(os.path.join(temp_dir, "values")) 270 | # TODO: do this with chunks, rather than all files at once. 
271 | with self.get_authinfo().get_transport() as transport: 272 | transport.gettree(self.get_remote_path(), dir) 273 | yield from _iter_files(dir) 274 | 275 | def fetch_value(self, local_files=False) -> np.ma.core.MaskedArray: 276 | """Return a numpy array with dtype 'object' for this array.""" 277 | # Objects that have a bogus '__array__' implementation fool 278 | # 'buff[:] = xs', so we need to manually fill the array. 279 | buff = np.empty((self.size,), dtype=object) 280 | for i, x in enumerate(self._fetch_buffer(local_files)): 281 | buff[i] = x 282 | buff = buff.reshape(self.shape) 283 | return np.ma.array(buff, mask=self.mask) 284 | 285 | @property 286 | def shape(self) -> Tuple[int, ...]: 287 | """Shape of this remote array.""" 288 | return tuple(self.get_attribute("shape")) 289 | 290 | @property 291 | def is_masked(self) -> bool: 292 | """Return True if some elements of the array are 'masked' (missing).""" 293 | return np.any(self.mask) 294 | 295 | @property 296 | def mask(self) -> np.ndarray: 297 | """Return the mask for the missing elements of the array.""" 298 | existing_files = set( 299 | v["name"] for v in self.listdir_withattributes() if not v["isdir"] 300 | ) 301 | return np.array( 302 | [self._file(i) not in existing_files for i in range(self.size)], 303 | dtype=bool, 304 | ).reshape(self.shape) 305 | 306 | @property 307 | def size(self) -> int: 308 | """Size of this remote array (product of the shape).""" 309 | return toolz.reduce(operator.mul, self.shape, 1) 310 | 311 | 312 | class PyArray(PyData): 313 | """Wrapper around PyData for storing a single array.""" 314 | 315 | def __init__(self, **kwargs): 316 | array = np.asarray(kwargs.pop("array")) 317 | with tempfile.NamedTemporaryFile() as handle: 318 | cloudpickle.dump(array, handle) 319 | handle.flush() 320 | handle.seek(0) 321 | super().__init__(pickle_path=handle.name, **kwargs) 322 | self.set_attribute("shape", array.shape) 323 | self.set_attribute("dtype", str(array.dtype)) 324 | self._cached = None 325 | 326 | @property 327 | def shape(self) -> Tuple[int, ...]: 328 | """Shape of this remote array.""" 329 | return tuple(self.get_attribute("shape")) 330 | 331 | @property 332 | def dtype(self) -> Tuple[int, ...]: 333 | """Shape of this remote array.""" 334 | return np.dtype(self.get_attribute("dtype")) 335 | 336 | @property 337 | def size(self) -> int: 338 | """Size of this remote array (product of the shape).""" 339 | return toolz.reduce(operator.mul, self.shape, 1) 340 | 341 | def get_array(self) -> np.ndarray: 342 | """Return the array.""" 343 | return self.value 344 | 345 | 346 | class PyException(aiida.orm.Data): 347 | """Aiida representation of a Python exception.""" 348 | 349 | # - Exception type 350 | # - message 351 | # - traceback 352 | ... 353 | 354 | 355 | # Register automatic conversion from lists and numpy arrays 356 | # to the appropriate Aiida datatypes 357 | 358 | 359 | @aiida.orm.to_aiida_type.register(type(None)) 360 | def _(_: None): 361 | return Nil() 362 | 363 | 364 | # Aiida Lists can only handle built-in types, which is not general 365 | # enough for our purposes. We therefore convert Python lists into 366 | # 1D PyArray types with 'object' dtype. 367 | @aiida.orm.to_aiida_type.register(list) 368 | def _(xs: list): 369 | arr = np.empty((len(xs),), dtype=object) 370 | # Objects that have a bogus '__array__' implementation fool 371 | # 'arr[:] = xs', so we need to manually fill the array. 
372 | for i, x in enumerate(xs): 373 | arr[i] = x 374 | return PyArray(array=arr) 375 | 376 | 377 | @aiida.orm.to_aiida_type.register(np.ndarray) 378 | def _(x): 379 | return PyArray(array=x) 380 | 381 | 382 | def ensure_aiida_type(x: Any) -> aiida.orm.Data: 383 | """Return a new Aiida value containing 'x', if not already of an Aiida datatype. 384 | 385 | If 'x' is already an Aiida datatype, then return 'x'. 386 | """ 387 | if isinstance(x, aiida.orm.Data): 388 | return x 389 | else: 390 | r = aiida.orm.to_aiida_type(x) 391 | if not isinstance(r, aiida.orm.Data): 392 | raise RuntimeError( 393 | "Expected 'to_aiida_type' to return an Aiida data node, but " 394 | f"got an object of type '{type(r)}' instead (when passed " 395 | f"an object of type '{type(x)}')." 396 | ) 397 | return r 398 | 399 | 400 | # Register handlers for getting native Python objects from their 401 | # Aiida equivalents 402 | 403 | 404 | @functools.singledispatch 405 | def from_aiida_type(x): 406 | """Turn Aiida types into their corresponding native Python types.""" 407 | raise TypeError(f"Do not know how to convert {type(x)} to native Python type") 408 | 409 | 410 | @from_aiida_type.register(Nil) 411 | def _(_): 412 | return None 413 | 414 | 415 | @from_aiida_type.register(aiida.orm.BaseType) 416 | def _(x): 417 | return x.value 418 | 419 | 420 | @from_aiida_type.register(PyData) 421 | def _(x): 422 | return x.value 423 | 424 | 425 | @from_aiida_type.register(PyArray) 426 | def _(x): 427 | return x.get_array() 428 | 429 | 430 | # Register handlers for figuring out array shapes for different datatypes 431 | 432 | 433 | @functools.singledispatch 434 | def array_shape(x) -> Tuple[int, ...]: 435 | """Return the shape of 'x'.""" 436 | try: 437 | return tuple(map(int, x.shape)) 438 | except AttributeError: 439 | raise TypeError(f"No array shape defined for type {type(x)}") 440 | 441 | 442 | @array_shape.register(aiida.orm.List) 443 | def _(x): 444 | return (len(x),) 445 | 446 | 447 | # Register handlers for figuring out array masks for different datatypes 448 | 449 | 450 | @functools.singledispatch 451 | def array_mask(x) -> np.ndarray: 452 | """Return the mask applied to 'x'.""" 453 | try: 454 | return x.mask 455 | except AttributeError: 456 | raise TypeError(f"No array mask defined for type {type(x)}") 457 | 458 | 459 | @array_mask.register(aiida.orm.List) 460 | def _(x): 461 | return np.full((len(x),), False) 462 | 463 | 464 | @array_mask.register(PyArray) 465 | @array_mask.register(np.ndarray) 466 | def _(x): 467 | return np.full(x.shape, False) 468 | -------------------------------------------------------------------------------- /aiida_dynamic_workflows/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | 5 | from __future__ import annotations 6 | 7 | from collections.abc import Mapping 8 | import copy 9 | from dataclasses import dataclass 10 | import os 11 | import sys 12 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 13 | 14 | import aiida.engine 15 | import aiida.orm 16 | import toolz 17 | 18 | from .calculations import PyCalcJob, PyMapJob, array_job_spec 19 | from .common import MapSpec 20 | from .data import PyFunction, ensure_aiida_type 21 | from .workchains import RestartedPyCalcJob, RestartedPyMapJob 22 | 23 | __all__ = ["apply", "map_"] 24 | 25 | 26 | @dataclass(frozen=True) 27 | class ExecutionEnvironment: 28 | """An execution environment in which to run a PyFunction as a PyCalcJob.""" 29 | 30 | code_label: str 31 | computer_label: str 32 | queue: Optional[Tuple[str, int]] = None 33 | 34 | @property 35 | def code(self): 36 | return aiida.orm.load_code("@".join((self.code_label, self.computer_label))) 37 | 38 | @property 39 | def computer(self): 40 | return aiida.orm.load_computer(self.computer_label) 41 | 42 | 43 | def code_from_conda_env(conda_env: str, computer_name: str) -> aiida.orm.Code: 44 | """Create AiiDA Code for python interpreter from conda environment.""" 45 | c = aiida.orm.load_computer(computer_name) 46 | with c.get_transport() as t: 47 | username = t.whoami() 48 | try: 49 | conda_dir = c.get_property("conda_dir").format(username=username) 50 | except AttributeError: 51 | raise RuntimeError(f"'conda_dir' is not set for {computer_name}.") 52 | 53 | conda_initscript = os.path.join(conda_dir, "etc", "profile.d", "conda.sh") 54 | python_path = os.path.join(conda_dir, "envs", conda_env, "bin", "python") 55 | 56 | prepend_text = "\n".join( 57 | [f"source {conda_initscript}", f"conda activate {conda_env}"] 58 | ) 59 | 60 | r, _stdout, stderr = t.exec_command_wait(prepend_text) 61 | 62 | if r != 0: 63 | raise RuntimeError( 64 | f"Failed to find Conda environment '{conda_env}' on '{computer_name}':" 65 | f"\n{stderr}" 66 | ) 67 | 68 | code = aiida.orm.Code((c, python_path), label=conda_env) 69 | code.set_prepend_text(prepend_text) 70 | code.store() 71 | return code 72 | 73 | 74 | def current_conda_environment() -> str: 75 | """Return current conda environment name.""" 76 | # from https://stackoverflow.com/a/57716519/3447047 77 | return sys.exec_prefix.split(os.sep)[-1] 78 | 79 | 80 | def execution_environment(conda_env: Optional[str], computer: str, queue=None): 81 | if conda_env is None: 82 | conda_env = current_conda_environment() 83 | code_id = "@".join([conda_env, computer]) 84 | try: 85 | aiida.orm.load_code(code_id) 86 | except aiida.common.NotExistent: 87 | code = code_from_conda_env(conda_env, computer) 88 | code.store() 89 | 90 | if queue and (queue[0] not in get_queues(computer)): 91 | raise ValueError(f"Queue '{queue[0]}' does not exist on '{computer}'") 92 | 93 | return ExecutionEnvironment(conda_env, computer, queue) 94 | 95 | 96 | def get_queues(computer_name) -> List[str]: 97 | """Return a list of valid queue names for the named computer.""" 98 | computer = aiida.orm.load_computer(computer_name) 99 | with computer.get_transport() as t: 100 | command = "sinfo --summarize" 101 | retval, stdout, stderr = t.exec_command_wait(command) 102 | if retval != 0: 103 | raise RuntimeError( 104 | f"'{command}' failed on on '{computer_name}' " 105 | f"with exit code {retval}: {stderr}" 106 | ) 107 | _, *lines = stdout.splitlines() 108 | return [line.split(" ")[0] for line in lines] 109 | 110 | 111 | def local_current_execution_environment() -> 
ExecutionEnvironment: 112 | return execution_environment(None, "localhost") 113 | 114 | 115 | class ProcessBuilder(aiida.engine.ProcessBuilder): 116 | """ProcessBuilder that is serializable.""" 117 | 118 | def on( 119 | self, env: ExecutionEnvironment, max_concurrent_machines: Optional[int] = None 120 | ) -> ProcessBuilder: 121 | """Return a new ProcessBuilder, setting it up for execution on 'env'.""" 122 | r = copy.deepcopy(self) 123 | 124 | r.code = env.code 125 | 126 | if env.queue is not None: 127 | queue_name, cores_per_machine = env.queue 128 | r.metadata.options.queue_name = queue_name 129 | 130 | if issubclass(r.process_class, (PyMapJob, RestartedPyMapJob)): 131 | # NOTE: We are using a feature of the scheduler (Slurm in our case) to 132 | # use array jobs. We could probably figure a way to do this with 133 | # the 'direct' scheduler (GNU parallel or sth), but that is out 134 | # of scope for now. 135 | if env.computer.scheduler_type != "dynamic_workflows.slurm": 136 | raise NotImplementedError( 137 | "Mapping is currently only supported in an environment that " 138 | f"supports Slurm array jobs, but {env.computer.label} is " 139 | f" configured to use '{env.computer.scheduler_type}'." 140 | ) 141 | 142 | if env.queue is None: 143 | raise ValueError( 144 | "A queue specification (e.g. ('my-queue', 24) ) is required" 145 | ) 146 | 147 | r.metadata.options.cores_per_machine = cores_per_machine 148 | 149 | if max_concurrent_machines is not None: 150 | r.metadata.options.max_concurrent_machines = max_concurrent_machines 151 | 152 | return r 153 | 154 | def finalize(self, **kwargs) -> ProcessBuilder: 155 | """Return a new ProcessBuilder, setting its 'kwargs' to those provided.""" 156 | r = copy.deepcopy(self) 157 | r.kwargs = toolz.valmap(ensure_aiida_type, kwargs) 158 | 159 | opts = r.metadata.options 160 | 161 | custom_scheduler_commands = ["#SBATCH --requeue"] 162 | 163 | if issubclass(r.process_class, (PyMapJob, RestartedPyMapJob)): 164 | mapspec = MapSpec.from_string(opts.mapspec) 165 | mapped_kwargs = { 166 | k: v for k, v in r.kwargs.items() if k in mapspec.parameters 167 | } 168 | 169 | cores_per_job = opts.resources.get( 170 | "num_cores_per_mpiproc", 1 171 | ) * opts.resources.get("num_mpiprocs_per_machine", 1) 172 | jobs_per_machine = opts.cores_per_machine // cores_per_job 173 | max_concurrent_jobs = jobs_per_machine * opts.max_concurrent_machines 174 | 175 | task_spec = array_job_spec(mapspec, mapped_kwargs) 176 | # NOTE: This assumes that we are running on Slurm. 177 | custom_scheduler_commands.append( 178 | f"#SBATCH --array={task_spec}%{max_concurrent_jobs}" 179 | ) 180 | 181 | opts.custom_scheduler_commands = "\n".join(custom_scheduler_commands) 182 | 183 | return r 184 | 185 | def with_restarts(self, max_restarts: int) -> ProcessBuilder: 186 | """Return a new builder for a RestartedPyCalcJob or RestartedPyMapJob.""" 187 | if issubclass(self.process_class, (PyMapJob, RestartedPyMapJob)): 188 | r = ProcessBuilder(RestartedPyMapJob) 189 | elif issubclass(self.process_class, (PyCalcJob, RestartedPyCalcJob)): 190 | r = ProcessBuilder(RestartedPyCalcJob) 191 | else: 192 | raise TypeError(f"Do not know how to add restarts to {self.process_class}") 193 | _copy_builder_contents(to=r, frm=self) 194 | r.metadata.options.max_restarts = max_restarts 195 | return r 196 | 197 | # XXX: This is a complete hack to be able to serialize "Outline". 198 | # We should think this through more carefully when we come to refactor. 
199 | 200 | def __getstate__(self): 201 | def serialized_aiida_nodes(x): 202 | if isinstance(x, aiida.orm.Data): 203 | if not x.is_stored: 204 | x.store() 205 | return _AiidaData(x.uuid) 206 | else: 207 | return x 208 | 209 | serialized_data = traverse_mapping(serialized_aiida_nodes, self._data) 210 | return self._process_class, serialized_data 211 | 212 | def __setstate__(self, state): 213 | process_class, serialized_data = state 214 | self.__init__(process_class) 215 | 216 | def deserialize_aiida_nodes(x): 217 | if isinstance(x, _AiidaData): 218 | return aiida.orm.load_node(x.uuid) 219 | else: 220 | return x 221 | 222 | deserialized_data = traverse_mapping(deserialize_aiida_nodes, serialized_data) 223 | 224 | for k, v in deserialized_data.items(): 225 | if isinstance(v, Mapping): 226 | getattr(self, k)._update(v) 227 | else: 228 | setattr(self, k, v) 229 | 230 | 231 | # XXX: This is part of the __getstate__/__setstate__ hack for our custom ProcessBuilder 232 | @dataclass(frozen=True) 233 | class _AiidaData: 234 | uuid: str 235 | 236 | 237 | def _copy_builder_contents( 238 | to: aiida.engine.ProcessBuilderNamespace, 239 | frm: aiida.engine.ProcessBuilderNamespace, 240 | ): 241 | """Recursively copy the contents of 'frm' into 'to'. 242 | 243 | This mutates 'to'. 244 | """ 245 | for k, v in frm.items(): 246 | if isinstance(v, aiida.engine.ProcessBuilderNamespace): 247 | _copy_builder_contents(to[k], v) 248 | else: 249 | setattr(to, k, v) 250 | 251 | 252 | def traverse_mapping(f: Callable[[Any], Any], d: Mapping): 253 | """Traverse a nested Mapping, applying 'f' to all non-mapping values.""" 254 | return { 255 | k: traverse_mapping(f, v) if isinstance(v, Mapping) else f(v) 256 | for k, v in d.items() 257 | } 258 | 259 | 260 | def apply(f: PyFunction, *, max_restarts: int = 1, **kwargs) -> ProcessBuilder: 261 | """Apply f to **kwargs as a PyCalcJob or RestartedPyCalcJob. 262 | 263 | Parameters 264 | ---------- 265 | f 266 | The function to apply 267 | max_restarts 268 | The number of times to run 'f'. If >1 then a builder 269 | for a RestartedPyCalcJob is returned, otherwise 270 | a builder for a PyCalcJob is returned. 271 | **kwargs 272 | Keyword arguments to pass to 'f'. Will be converted 273 | to Aiida types using "aiida.orm.to_aiida_type" if 274 | not already a subtype of "aiida.orm.Data". 275 | """ 276 | # TODO: check that 'f' applies cleanly to '**kwargs' 277 | if max_restarts > 1: 278 | builder = ProcessBuilder(RestartedPyCalcJob) 279 | builder.metadata.options.max_restarts = int(max_restarts) 280 | else: 281 | builder = ProcessBuilder(PyCalcJob) 282 | 283 | builder.func = f 284 | builder.metadata.label = f.name 285 | if kwargs: 286 | builder.kwargs = toolz.valmap(ensure_aiida_type, kwargs) 287 | if f.resources: 288 | _apply_pyfunction_resources(f.resources, builder.metadata.options) 289 | return builder 290 | 291 | 292 | def apply_some(f: PyFunction, *, max_restarts: int = 1, **kwargs) -> ProcessBuilder: 293 | """Apply f to **kwargs as a PyCalcJob or RestartedPyCalcJob. 294 | 295 | 'kwargs' may contain _more_ inputs than what 'f' requires: extra 296 | inputs are ignored. 297 | 298 | Parameters 299 | ---------- 300 | f 301 | The function to apply 302 | max_restarts 303 | The number of times to run 'f'. If >1 then a builder 304 | for a RestartedPyCalcJob is returned, otherwise 305 | a builder for a PyCalcJob is returned. 306 | **kwargs 307 | Keyword arguments to pass to 'f'. 
Will be converted
308 |         to Aiida types using "aiida.orm.to_aiida_type" if
309 |         not already a subtype of "aiida.orm.Data".
310 |     """
311 |     if max_restarts > 1:
312 |         builder = ProcessBuilder(RestartedPyCalcJob)
313 |         builder.metadata.options.max_restarts = int(max_restarts)
314 |     else:
315 |         builder = ProcessBuilder(PyCalcJob)
316 | 
317 |     builder.func = f
318 |     builder.metadata.label = f.name
319 |     relevant_kwargs = toolz.keyfilter(lambda k: k in f.parameters, kwargs)
320 |     if relevant_kwargs:
321 |         builder.kwargs = toolz.valmap(ensure_aiida_type, relevant_kwargs)
322 |     if f.resources:
323 |         _apply_pyfunction_resources(f.resources, builder.metadata.options)
324 |     return builder
325 | 
326 | 
327 | def map_(
328 |     f: PyFunction,
329 |     spec: Union[str, MapSpec],
330 |     *,
331 |     max_concurrent_machines: Optional[int] = None,
332 |     max_restarts: int = 1,
333 |     **kwargs,
334 | ) -> aiida.engine.ProcessBuilder:
335 |     """Map 'f' over (a subset of) its inputs as a PyMapJob.
336 | 
337 |     Parameters
338 |     ----------
339 |     f
340 |         Function to map over
341 |     spec
342 |         Specification for which parameters to map over, and how to map them.
343 |     max_concurrent_machines
344 |         The maximum number of machines to use concurrently.
345 |     max_restarts
346 |         The maximum number of times to restart the PyMapJob before returning
347 |         a partial (masked) result and a non-zero exit code.
348 |     **kwargs
349 |         Keyword arguments to 'f'. Any arguments that are to be mapped over
350 |         must be Aiida lists.
351 | 
352 |     Examples
353 |     --------
354 |     >>> from aiida.orm import List
355 |     >>> import aiida_dynamic_workflows as flow
356 |     >>>
357 |     >>> f = flow.step(lambda x, y: x + y, returns="sum")
358 |     >>>
359 |     >>> # We can map over _all_ inputs
360 |     >>> sums = flow.engine.map_(
361 |     ...     f, "x[i], y[i] -> sum[i]", x=List([1, 2, 3]), y=List([4, 5, 6])
362 |     ... )
363 |     >>> # or we can map over a _subset_ of inputs
364 |     >>> only_one = flow.engine.map_(f, "x[i] -> sum[i]", x=List([1, 2, 3]), y=5)
365 |     >>> # or we can do an "outer product":
366 |     >>> outer = flow.engine.map_(
367 |     ...     f, "x[i], y[j] -> sum[i, j]", x=List([1, 2, 3]), y=List([4, 5, 6])
368 |     ... )
369 |     """
370 |     if max_restarts > 1:
371 |         builder = ProcessBuilder(RestartedPyMapJob)
372 |         builder.metadata.options.max_restarts = int(max_restarts)
373 |     else:
374 |         builder = ProcessBuilder(PyMapJob)
375 | 
376 |     builder.func = f
377 |     builder.metadata.label = f.name
378 | 
379 |     if isinstance(spec, str):
380 |         spec = MapSpec.from_string(spec)
381 |     elif not isinstance(spec, MapSpec):
382 |         raise TypeError(f"Expected single string or MapSpec, got {spec}")
383 |     if unknown_params := set(x.name for x in spec.inputs) - set(f.parameters):
384 |         raise ValueError(
385 |             f"{f} cannot be mapped over parameters that "
386 |             f"it does not take: {unknown_params}"
387 |         )
388 |     builder.metadata.options.mapspec = spec.to_string()
389 | 
390 |     if max_concurrent_machines is not None:
391 |         builder.metadata.options.max_concurrent_machines = max_concurrent_machines
392 | 
393 |     if f.resources:
394 |         _apply_pyfunction_resources(f.resources, builder.metadata.options)
395 | 
396 |     if not kwargs:
397 |         return builder
398 | 
399 |     return builder.finalize(**kwargs)
400 | 
401 | 
402 | def _apply_pyfunction_resources(
403 |     resources: Dict, options: aiida.engine.ProcessBuilderNamespace
404 | ) -> None:
405 |     """Apply the resource specification in 'resources' to the CalcJob options 'options'.
406 | 
407 |     This mutates 'options'.
408 |     """
409 |     memory = resources.get("memory")
410 |     if memory is not None:
411 |         # The Aiida Slurm plugin erroneously uses the multiplier "1024" when converting
412 |         # to megabytes and passing to "--mem", so we must use it here also.
413 |         multiplier = {"kB": 1, "MB": 1024, "GB": 1000 * 1024}
414 |         amount, unit = memory[:-2], memory[-2:]
415 |         options.max_memory_kb = int(amount) * multiplier[unit]
416 | 
417 |     cores = resources.get("cores")
418 |     if cores is not None:
419 |         # Re-assign the whole 'resources' input dict to avoid problems with
420 |         # serialization (also, mutating it seems to change the 'resources' for
421 |         # all other Builders, which is not good!).
422 |         options.resources = toolz.assoc(
423 |             options.resources, "num_cores_per_mpiproc", int(cores)
424 |         )
425 | 
426 | 
427 | def all_equal(seq):
428 |     """Return True iff all elements of 'seq' are equal.
429 | 
430 |     Returns 'True' if the sequence contains 0 or 1 elements.
431 |     """
432 |     seq = list(seq)
433 |     if len(seq) in (0, 1):
434 |         return True
435 |     fst, *rest = seq
436 |     return all(r == fst for r in rest)
437 | 
--------------------------------------------------------------------------------
/aiida_dynamic_workflows/workflow.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | 
4 | 
5 | from __future__ import annotations
6 | 
7 | import abc
8 | import copy
9 | from dataclasses import dataclass, replace
10 | from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
11 | 
12 | import aiida.engine
13 | import graphviz
14 | import toolz
15 | 
16 | from . import common, engine
17 | from .calculations import PyCalcJob, PyMapJob
18 | from .data import PyFunction, PyOutline, ensure_aiida_type
19 | from .utils import render_png
20 | 
21 | # TODO: this will all need to be refactored when we grok
22 | # Aiida's 'Process' and 'Port' concepts.
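# Editor's summary (not part of the original module): the step hierarchy
# defined below composes as follows.
#
#   Step
#   ├── Single
#   │   ├── Process          # wraps an AiiDA ProcessBuilder
#   │   └── Action           # operates on the running workchain itself
#   │       ├── OutputAction # exposes values from the workchain context
#   │       └── PyAction     # runs a PyFunction against the workchain
#   ├── Concurrent(steps)    # children are submitted side by side
#   └── Sequential(steps)    # children run one after another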
23 | 24 | 25 | class Step(metaclass=abc.ABCMeta): 26 | """Abstract base class for steps.""" 27 | 28 | pass 29 | 30 | 31 | class Single(Step): 32 | """A single workflow step.""" 33 | 34 | pass 35 | 36 | 37 | class Action(Single): 38 | """Step that will be run with the current workchain passed as argument.""" 39 | 40 | def do(self, workchain): 41 | """Do the action on the workchain.""" 42 | pass 43 | 44 | 45 | @dataclass(frozen=True) 46 | class Concurrent(Step): 47 | """Step consisting of several concurrent steps.""" 48 | 49 | steps: List[Step] 50 | 51 | 52 | @dataclass(frozen=True) 53 | class Sequential(Step): 54 | """Step consisting of several sequential steps.""" 55 | 56 | steps: List[Step] 57 | 58 | 59 | @dataclass(frozen=True) 60 | class Process(Single): 61 | """Step consisting of a single Aiida Process.""" 62 | 63 | builder: aiida.engine.ProcessBuilder 64 | parameters: Tuple[str] 65 | returns: Tuple[str] 66 | 67 | def __str__(self): 68 | kind = self.builder.process_class 69 | if issubclass(kind, PyCalcJob): 70 | func = self.builder.func 71 | return f"{kind.__name__}[{func.name}(pk: {func.pk})]" 72 | else: 73 | return kind.__name__ 74 | 75 | 76 | @dataclass(frozen=True) 77 | class OutputAction(Action): 78 | """Action step that outputs values from the workflow context.""" 79 | 80 | outputs: Dict[str, str] 81 | 82 | def do(self, workchain): 83 | """Return the named outputs from this workflow.""" 84 | for from_name, to_name in self.outputs.items(): 85 | if from_name in workchain.ctx: 86 | workchain.out(f"return_values.{to_name}", workchain.ctx[from_name]) 87 | else: 88 | workchain.report( 89 | f"Failed to set output '{to_name}': '{from_name}' " 90 | "does not exist on the workchain context (did " 91 | "the step that produces this output fail?)" 92 | ) 93 | 94 | 95 | class PyAction(Action): 96 | """Action step defined by a PyFunction.""" 97 | 98 | action: PyFunction 99 | 100 | def do(self, workchain): 101 | """Do the action on the workchain.""" 102 | self.action(workchain) 103 | 104 | 105 | def single_steps(step: Step) -> Iterator[Single]: 106 | """Yield all Single steps in a given step.""" 107 | if isinstance(step, Single): 108 | yield step 109 | elif isinstance(step, (Concurrent, Sequential)): 110 | yield from toolz.mapcat(single_steps, step.steps) 111 | else: 112 | assert False, f"Unknown step type {type(step)}" 113 | 114 | 115 | def single_processes(step: Step) -> Iterator[Process]: 116 | """Yield all Process steps in a given step.""" 117 | return filter(lambda s: isinstance(s, Process), single_steps(step)) 118 | 119 | 120 | def _check_valid_pyfunction(f: PyFunction): 121 | """Check that the provided PyFunction may be used as part of a workflow.""" 122 | if not isinstance(f, PyFunction): 123 | raise TypeError() 124 | if any(r.startswith("_") for r in f.returns): 125 | raise ValueError( 126 | "Cannot use functions with return names containing underscores " 127 | "in workflows." 128 | ) 129 | if set(f.parameters).intersection(f.returns): 130 | raise ValueError( 131 | "Function has outputs that are named identically to its input(s)." 
132 | ) 133 | 134 | 135 | def _check_pyfunctions_compatible(a: PyFunction, b: PyFunction): 136 | """Check that Pyfunction 'b' has enough inputs/outputs to be compatible with 'a'.""" 137 | _check_valid_pyfunction(a) 138 | _check_valid_pyfunction(b) 139 | if missing_parameters := set(a.parameters) - set(b.parameters): 140 | raise ValueError(f"'{b.name}' is missing parameters: {missing_parameters}") 141 | if missing_returns := set(a.returns) - set(b.returns): 142 | raise ValueError(f"'{b.name}' is missing return values: {missing_returns}") 143 | 144 | 145 | def from_pyfunction(f: PyFunction) -> Step: 146 | """Construct a Step corresponding to applying a PyFunction.""" 147 | _check_valid_pyfunction(f) 148 | return Process( 149 | builder=engine.apply(f), 150 | parameters=f.parameters, 151 | returns=f.returns, 152 | ) 153 | 154 | 155 | def map_(f: PyFunction, *args, **kwargs) -> Step: 156 | """Construct a Step corresponding to mapping a PyFunction. 157 | 158 | Parameters 159 | ---------- 160 | *args, **kwargs 161 | Positional/keyword arguments to pass to 'aiida_dynamic_workflows.engine.map_'. 162 | 163 | See Also 164 | -------- 165 | aiida_dynamic_workflows.engine.map_ 166 | """ 167 | _check_valid_pyfunction(f) 168 | return Process( 169 | builder=engine.map_(f, *args, **kwargs), 170 | parameters=f.parameters, 171 | returns=f.returns, 172 | ) 173 | 174 | 175 | def concurrently(*fs: Union[PyFunction, Step]) -> Step: 176 | """Construct a Step for several tasks executing concurrently.""" 177 | if len(fs) < 2: 178 | raise ValueError("Expected at least 2 steps") 179 | 180 | for i, f in enumerate(fs): 181 | for g in fs[i + 1 :]: 182 | if set(f.returns).intersection(g.returns): 183 | raise ValueError("Steps return values that are named the same") 184 | 185 | returns = [set(f.returns) for f in fs] 186 | 187 | parameters = [set(f.parameters) for f in fs] 188 | if any(a.intersection(b) for a in parameters for b in returns): 189 | raise ValueError("Steps cannot be run concurrently") 190 | 191 | def ensure_single(f): 192 | if isinstance(f, PyFunction): 193 | return from_pyfunction(f) 194 | elif isinstance(f, Single): 195 | return f 196 | else: 197 | raise TypeError(f"Expected PyFunction or Single, got {type(f)}") 198 | 199 | return Concurrent([ensure_single(f) for f in fs]) 200 | 201 | 202 | def new_workflow(name: str) -> Outline: 203 | """Return an Outline with no steps , and the given name.""" 204 | return Outline(steps=(), label=name) 205 | 206 | 207 | def first(s: Union[PyFunction, Step]) -> Outline: 208 | """Return an Outline consisting of a single Step.""" 209 | return Outline(steps=(ensure_step(s),)) 210 | 211 | 212 | def ensure_step(s: Union[Step, PyFunction]) -> Step: 213 | """Return a Step, given a Step or a PyFunction.""" 214 | if isinstance(s, Step): 215 | return s 216 | elif isinstance(s, PyFunction): 217 | return from_pyfunction(s) 218 | elif isinstance(s, Outline): 219 | return Sequential(s.steps) 220 | else: 221 | raise TypeError(f"Expected PyFunction, Step, or Outline, got {type(s)}") 222 | 223 | 224 | def output(*names: str, **mappings: str) -> OutputAction: 225 | """Return an OutputAction that can be used in an outline.""" 226 | outputs = {name: name for name in names} 227 | outputs.update({from_: to_ for from_, to_ in mappings.items()}) 228 | 229 | return OutputAction(outputs) 230 | 231 | 232 | @dataclass(frozen=True) 233 | class Outline: 234 | """Outline of the steps to be executed. 235 | 236 | Each step kicks off either a _single_ process, or several processes 237 | concurrently. 
238 | """ 239 | 240 | steps: Tuple[Step] 241 | #: Sequence of steps constituting the workflow 242 | label: Optional[str] = None 243 | #: Optional label identifying the workflow 244 | 245 | def rename(self, name: str) -> Outline: 246 | """Return a new outline with a new name.""" 247 | return replace(self, label=name) 248 | 249 | def then(self, step: Union[PyFunction, Step, Outline]) -> Outline: 250 | """Add the provided Step to the outline. 251 | 252 | If a PyFunction is provided it is added as a single step. 253 | """ 254 | return replace(self, steps=self.steps + (ensure_step(step),)) 255 | 256 | def join(self, other: Outline) -> Outline: 257 | """Return a new outline consisting of this and 'other' joined together.""" 258 | return replace(self, steps=self.steps + other.steps) 259 | 260 | def returning(self, *names, **mappings) -> Outline: 261 | """Return the named values from this workflow.""" 262 | possible_names = self.parameters.union(self.all_outputs) 263 | existing_names = self.returns 264 | requested_names = set(names).union(mappings.keys()) 265 | 266 | if invalid_names := requested_names - possible_names: 267 | raise ValueError( 268 | f"Cannot return any of {invalid_names}; " 269 | "they do not appear in this outline." 270 | ) 271 | 272 | if already_returned := requested_names.intersection(existing_names): 273 | raise ValueError( 274 | "The following names are already returned " 275 | f"by this outline: {already_returned}." 276 | ) 277 | 278 | return replace(self, steps=self.steps + (output(*names, **mappings),)) 279 | 280 | @property 281 | def _single_processes(self) -> Iterator[Process]: 282 | for step in self.steps: 283 | yield from single_processes(step) 284 | 285 | @property 286 | def _single_steps(self) -> Iterator[Single]: 287 | for step in self.steps: 288 | yield from single_steps(step) 289 | 290 | @property 291 | def parameters(self) -> Set[str]: 292 | """Parameters of the Outline.""" 293 | raw_parameters = toolz.reduce( 294 | set.union, 295 | (s.parameters for s in self._single_processes), 296 | set(), 297 | ) 298 | return raw_parameters - self.all_outputs 299 | 300 | @property 301 | def returns(self) -> Set[str]: 302 | """Values returned by this Outline.""" 303 | ret = set() 304 | for step in self._single_steps: 305 | if isinstance(step, OutputAction): 306 | ret.update(step.outputs.values()) 307 | return ret 308 | 309 | @property 310 | def all_outputs(self) -> Set[str]: 311 | """All outputs of this outline.""" 312 | return toolz.reduce( 313 | set.union, 314 | (s.returns for s in self._single_processes), 315 | set(), 316 | ) 317 | 318 | def visualize(self, as_png=False) -> Union[graphviz.Digraph]: 319 | """Return a Graphviz visualization of this outline.""" 320 | g = graphviz.Digraph(graph_attr=dict(rankdir="LR")) 321 | 322 | mapped_inputs = set() 323 | 324 | for proc in self._single_processes: 325 | proc_id = str(id(proc)) 326 | is_mapjob = issubclass(proc.builder.process_class, PyMapJob) 327 | 328 | opts = dict(shape="rectangle") 329 | output_opts = dict() 330 | if is_mapjob: 331 | for d in (opts, output_opts): 332 | d["style"] = "filled" 333 | d["fillcolor"] = "#ffaaaaaa" 334 | 335 | g.node(proc_id, label=proc.builder.func.name, **opts) 336 | 337 | if is_mapjob: 338 | spec = common.MapSpec.from_string(proc.builder.metadata.options.mapspec) 339 | for p in spec.parameters: 340 | mapped_inputs.add(p) 341 | g.node(p, **output_opts) 342 | 343 | for r in proc.returns: 344 | g.node(r, **output_opts) 345 | g.edge(proc_id, r) 346 | 347 | for p in self.parameters - mapped_inputs: 
348 |             g.node(p, style="filled", fillcolor="#aaaaaa")
349 | 
350 |         for proc in self._single_processes:
351 |             proc_id = str(id(proc))
352 |             for p in proc.parameters:
353 |                 g.edge(p, proc_id)
354 |         if as_png:
355 |             return render_png(g)
356 |         return g
357 | 
358 |     def traverse(self, f: Callable[[Single], Single]) -> Outline:
359 |         """Return a copy of this Outline, with 'f' applied to all Single steps."""
360 | 
361 |         def transform(x: Step) -> Step:
362 |             if isinstance(x, Single):
363 |                 return f(x)
364 |             elif isinstance(x, (Concurrent, Sequential)):
365 |                 return type(x)(steps=tuple(map(transform, x.steps)))
366 |             else:
367 |                 raise TypeError(f"Unknown step type {type(x)}")
368 | 
369 |         return replace(self, steps=tuple(map(transform, self.steps)))
370 | 
371 |     def with_restarts(self, step_restarts: Dict[PyFunction, int]) -> Outline:
372 |         """Return a copy of this Outline with restarts added to all specified steps.
373 | 
374 |         Examples
375 |         --------
376 |         >>> # Set up the original flow
377 |         >>> import aiida_dynamic_workflows as flows
378 |         >>> a = flows.step(lambda x, y: x + y, returning="z")
379 |         >>> b = flows.step(lambda z: 2 * z)
380 |         >>> flow = flows.workflow.first(a).then(b)
381 |         >>> # Apply restarts: a restarted up to 2 times, b up to 3.
382 |         >>> new_flow = flow.with_restarts({a: 2, b: 3})
383 |         """
384 | 
385 |         def mapper(step):
386 |             try:
387 |                 max_restarts = step_restarts[step.builder.func]
388 |             except (AttributeError, KeyError):
389 |                 return step
390 |             else:
391 |                 return replace(step, builder=step.builder.with_restarts(max_restarts))
392 | 
393 |         return self.traverse(mapper)
394 | 
395 |     def replace_steps(self, step_map: Dict[PyFunction, PyFunction]) -> Outline:
396 |         """Return a copy of this Outline, replacing the step functions specified.
397 | 
398 |         Any steps that are PyCalcJobs or PyMapJobs executing a PyFunction specified
399 |         in 'step_map' will have the function executed replaced by the corresponding
400 |         value in 'step_map'.
401 | 
402 |         See Also
403 |         --------
404 |         traverse
405 | 
406 |         Examples
407 |         --------
408 |         >>> # Set up the original flow
409 |         >>> import aiida_dynamic_workflows as flows
410 |         >>> a = flows.step(lambda x, y: x + y, returning="z")
411 |         >>> b = flows.step(lambda z: 2 * z)
412 |         >>> flow = flows.workflow.first(a).then(b)
413 |         >>> # Create the new steps
414 |         >>> new_a = flows.step(lambda x, y: x * y, returning="z")
415 |         >>> new_b = flows.step(lambda z: 5 * z)
416 |         >>> # Replace the old steps with new ones!
417 |         >>> new_flow = flow.replace_steps({a: new_a, b: new_b})
418 |         """
419 |         for a, b in step_map.items():
420 |             _check_pyfunctions_compatible(a, b)
421 | 
422 |         def mapper(step):
423 |             try:
424 |                 new_func = step_map[step.builder.func]
425 |             except (AttributeError, KeyError):
426 |                 return step
427 |             else:
428 |                 b = copy.deepcopy(step.builder)
429 |                 b.func = new_func
430 |                 return Process(
431 |                     builder=b, parameters=new_func.parameters, returns=new_func.returns
432 |                 )
433 | 
434 |         return self.traverse(mapper)
435 | 
436 |     def on(
437 |         self,
438 |         env: engine.ExecutionEnvironment,
439 |         max_concurrent_machines: Optional[int] = None,
440 |     ) -> Outline:
441 |         """Return a new Outline with the execution environment set for all steps."""
442 | 
443 |         def transform(s: Single):
444 |             if not isinstance(s, Process):
445 |                 return s
446 |             return replace(s, builder=s.builder.on(env, max_concurrent_machines))
447 | 
448 |         return self.traverse(transform)
449 | 
450 | 
451 | # TODO: See if we can come up with a cleaner separation of "logical data flow"
452 | # and "error handling flow".
453 | 
454 | # TODO: see if we can do this more "directly" with the Aiida/Plumpy
455 | # "process" interface. As-is we are running our own "virtual machine"
456 | # on top of Aiida's!
457 | class PyWorkChain(aiida.engine.WorkChain):
458 |     """WorkChain for executing Outlines."""
459 | 
460 |     @classmethod
461 |     def define(cls, spec):  # noqa: D102
462 |         super().define(spec)
463 |         spec.input("outline", valid_type=PyOutline)
464 |         spec.input_namespace("kwargs", dynamic=True)
465 |         spec.output_namespace("return_values", dynamic=True)
466 |         spec.outline(
467 |             cls.setup,
468 |             aiida.engine.while_(cls.is_not_done)(cls.do_step, cls.check_output),
469 |             cls.finalize,
470 |         )
471 | 
472 |         spec.exit_code(401, "INVALID_STEP", message="Invalid step definition")
473 |         spec.exit_code(
474 |             450, "STEP_RETURNED_ERROR_CODE", message="A step returned an error code"
475 |         )
476 | 
477 |     @classmethod
478 |     def get_builder(cls):  # noqa: D102
479 |         return engine.ProcessBuilder(cls)
480 | 
481 |     # TODO: have the outline persisted into "self.ctx"; this way
482 |     # we don't need to reload it from the DB on every step.
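    # Editor's note: a brief usage sketch, assuming an existing Outline named
    # 'my_outline' and a loaded AiiDA profile (both hypothetical here). The
    # 'build' helper at the end of this module produces a builder for this
    # WorkChain:
    #
    #     builder = build(my_outline, x=1, y=2)
    #     node = aiida.engine.submit(builder)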
483 | 484 | def setup(self): # noqa: D102 485 | """Set up the state for the workchain.""" 486 | outline = self.inputs.outline.value 487 | self.ctx._this_step = 0 488 | self.ctx._num_steps = len(outline.steps) 489 | self.ctx._had_errors = False 490 | 491 | if "kwargs" in self.inputs: 492 | self.ctx.update(self.inputs.kwargs) 493 | 494 | def finalize(self): 495 | """Finalize the workchain.""" 496 | if self.ctx._had_errors: 497 | return self.exit_codes.STEP_RETURNED_ERROR_CODE 498 | 499 | def is_not_done(self) -> bool: 500 | """Return True when there are no more steps in the workchain.""" 501 | return self.ctx._this_step < self.ctx._num_steps 502 | 503 | def do_step(self): 504 | """Execute the current step in the workchain.""" 505 | this_step = self.ctx._this_step 506 | self.report(f"doing step {this_step} of {self.ctx._num_steps}") 507 | step = self.inputs.outline.value.steps[this_step] 508 | 509 | if isinstance(step, (Single, Sequential)): 510 | concurrent_steps = [step] 511 | elif isinstance(step, Concurrent): 512 | concurrent_steps = list(step.steps) 513 | else: 514 | self.report(f"Unknown step type {type(step)}") 515 | return self.exit_codes.INVALID_STEP 516 | 517 | for s in concurrent_steps: 518 | self._base_step(s) 519 | 520 | self.ctx._this_step += 1 521 | 522 | def _base_step(self, s: Step): 523 | if isinstance(s, Process): 524 | try: 525 | inputs = get_keys(self.ctx, s.parameters) 526 | except KeyError as err: 527 | self.report(f"Skipping step {s} due to missing inputs: {err.args}") 528 | self.ctx._had_errors = True 529 | return 530 | 531 | finalized_builder = s.builder.finalize(**inputs) 532 | 533 | fut = self.submit(finalized_builder) 534 | self.report(f"Submitted {s} (pk: {fut.pk})") 535 | self.to_context(_futures=aiida.engine.append_(fut)) 536 | elif isinstance(s, Sequential): 537 | ol = Outline(steps=tuple(s.steps)) 538 | try: 539 | inputs = get_keys(self.ctx, ol.parameters) 540 | except KeyError as err: 541 | self.report(f"Skipping step {s} due to missing inputs: {err.args}") 542 | self.ctx._had_errors = True 543 | return 544 | 545 | builder = PyWorkChain.get_builder() 546 | builder.outline = PyOutline(outline=ol) 547 | builder.kwargs = inputs 548 | fut = self.submit(builder) 549 | self.report(f"Submitted sub-workchain: {fut.pk}") 550 | self.to_context(_futures=aiida.engine.append_(fut)) 551 | elif isinstance(s, Action): 552 | return s.do(self) 553 | 554 | def check_output(self): 555 | """Check the output of the current step in the workchain.""" 556 | if "_futures" not in self.ctx: 557 | return 558 | 559 | for step in self.ctx._futures: 560 | if step.exit_status != 0: 561 | self.report(f"Step {step} reported a problem: {step.exit_message}") 562 | self.ctx._had_errors = True 563 | for name, value in return_values(step): 564 | self.ctx[name] = value 565 | 566 | del self.ctx["_futures"] 567 | 568 | 569 | def get_keys(dictionary, keys): 570 | """Select all keys in 'keys' from 'dictionary'.""" 571 | missing = [] 572 | r = dict() 573 | for k in keys: 574 | if k in dictionary: 575 | r[k] = dictionary[k] 576 | else: 577 | missing.append(k) 578 | if missing: 579 | raise KeyError(*missing) 580 | return r 581 | 582 | 583 | # XXX: This is all very tightly coupled to the definitions of "PyCalcJob" 584 | # and "PyMapJob". 585 | def return_values(calc: aiida.orm.ProcessNode): 586 | """Yield (name, node) tuples of return values of the given ProcessNode. 587 | 588 | This assumes an output port namespace called "return_values". 
589 | """ 590 | try: 591 | return calc.outputs.return_values.items() 592 | except AttributeError: 593 | return () 594 | 595 | 596 | def build(outline: Outline, **kwargs) -> PyWorkChain: 597 | """Return a ProcessBuilder for launching the given Outline.""" 598 | # TODO: validate that all ProcessBuilders in 'outline' are fully specified 599 | _check_outline(outline) 600 | builder = PyWorkChain.get_builder() 601 | builder.outline = PyOutline(outline=outline) 602 | if outline.label: 603 | builder.metadata.label = outline.label 604 | if missing := set(outline.parameters) - set(kwargs): 605 | raise ValueError(f"Missing parameters: {missing}") 606 | if superfluous := set(kwargs) - set(outline.parameters): 607 | raise ValueError(f"Too many parameters: {superfluous}") 608 | builder.kwargs = toolz.valmap(ensure_aiida_type, kwargs) 609 | return builder 610 | 611 | 612 | def _check_outline(outline: Outline): 613 | for proc in outline._single_processes: 614 | if proc.builder.code is None: 615 | raise ValueError( 616 | f"Execution environment not specified for {proc.builder.func.name}. " 617 | "Did you remember to call 'on(env)' on the workflow?" 618 | ) 619 | --------------------------------------------------------------------------------