├── lambda ├── __init__.py ├── src │ ├── __init__.py │ ├── chooser │ │ ├── __init__.py │ │ ├── requirements.txt │ │ └── multichooser.py │ ├── common │ │ ├── __init__.py │ │ └── python │ │ │ ├── __init__.py │ │ │ ├── lambda_logs.py │ │ │ ├── substitutions.py │ │ │ ├── file_select.py │ │ │ └── repo_utils.py │ ├── compiler │ │ ├── __init__.py │ │ ├── pkg │ │ │ ├── __init__.py │ │ │ ├── native_step_resources.py │ │ │ ├── chooser_resources.py │ │ │ ├── compiler.py │ │ │ ├── enhanced_parallel_resources.py │ │ │ ├── subpipe_resources.py │ │ │ ├── util.py │ │ │ └── scatter_gather_resources.py │ │ ├── requirements.txt │ │ ├── handler.py │ │ └── compiler_cli.py │ ├── gather │ │ ├── __init__.py │ │ ├── requirements.txt │ │ └── gather.py │ ├── job_def │ │ ├── __init__.py │ │ ├── requirements.txt │ │ └── register.py │ ├── qc_checker │ │ ├── __init__.py │ │ ├── requirements.txt │ │ └── qc_checker.py │ ├── router │ │ ├── __init__.py │ │ └── job_router.py │ ├── scatter │ │ ├── __init__.py │ │ └── requirements.txt │ ├── subpipes │ │ ├── __init__.py │ │ ├── requirements.txt │ │ └── subpipes.py │ ├── initializer │ │ ├── __init__.py │ │ ├── requirements.txt │ │ └── initializer.py │ ├── notifications │ │ ├── __init__.py │ │ ├── requirements.txt │ │ └── notifications.py │ └── scatter_init │ │ ├── __init__.py │ │ └── scatter_init.py └── tests │ ├── __init__.py │ ├── chooser │ └── __init__.py │ ├── common │ ├── __init__.py │ ├── test_repo_utils.py │ ├── test_substitutions.py │ └── test_file_select.py │ ├── compiler │ ├── __init__.py │ ├── conftest.py │ ├── test_chooser_resources.py │ ├── test_state_machine_resources.py │ ├── test_util.py │ ├── test_subpipe_resources.py │ └── test_enhanced_parallel_resources.py │ ├── gather │ ├── __init__.py │ └── test_gather.py │ ├── job_def │ └── __init__.py │ ├── router │ └── __init__.py │ ├── scatter │ └── __init__.py │ ├── subpipes │ └── __init__.py │ ├── initializer │ └── __init__.py │ ├── notifications │ ├── __init__.py │ └── test_notifications.py │ ├── qc_checker │ └── __init__.py │ ├── scatter_init │ ├── __init__.py │ └── test_scatter_init.py │ └── requirements.txt ├── bclaw_runner ├── __init__.py ├── src │ ├── __init__.py │ ├── runner │ │ ├── __init__.py │ │ ├── preamble.py │ │ ├── string_subs.py │ │ ├── signal_trapper.py │ │ ├── qc_check.py │ │ ├── workspace.py │ │ ├── cache.py │ │ └── runner_main.py │ └── runner_cli.py ├── tests │ ├── __init__.py │ ├── test_signal_trapper.py │ ├── test_workspace.py │ ├── conftest.py │ ├── test_qc_check.py │ ├── test_cache.py │ └── test_string_subs.py ├── .dockerignore ├── requirements.txt ├── Dockerfile └── Dockerfile.alpine ├── MAINTAINERS ├── util └── bclaw_logs │ ├── lambda │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ └── job_status.py │ └── tests │ │ ├── __init__.py │ │ └── test_job_status.py │ ├── sam_install.txt │ └── template.yaml ├── doc ├── tutorial │ ├── sf_exec_list.png │ ├── sf_exec_history.png │ ├── sf_visual_workflow.png │ └── bclaw_architecture2.png ├── resources │ ├── subpipes_step_functions_link1.png │ └── subpipes_step_functions_link2.png ├── qc.md ├── runtime_env.md ├── workflow_versions.md ├── quick-start.md ├── options_and_parameters.md ├── subpipes.md └── notifications.md ├── LICENSE ├── .gitignore ├── README.md ├── .github └── workflows │ └── installer.yaml ├── cloudformation └── bc_ecs_task_role.yaml └── CONTRIBUTING.md /lambda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
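As a quick orientation to the tree above: the workflow compiler under lambda/src/compiler can also be run locally via compiler_cli.py (reproduced later in this listing). A minimal sketch, assuming it is run from the lambda/src/compiler directory with valid AWS credentials, the compiler's dependencies plus python-dotenv installed, and a .env file (or exported variables) supplying CORE_STACK_NAME and the *_ARN/*_NAME values the compiler reads (lambda/tests/compiler/conftest.py lists the set used by the tests); the file names here are hypothetical:

    python compiler_cli.py my_workflow.yaml my_cfn_template.yaml my_state_machine.json -v

The three positional arguments default to stdin, stdout, and stderr, so the compiled CloudFormation fragment and the generated Step Functions definition can also be captured with shell redirection.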
/lambda/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bclaw_runner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bclaw_runner/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bclaw_runner/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/chooser/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/compiler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/gather/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/job_def/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/qc_checker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/router/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/scatter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/subpipes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/chooser/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/compiler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/gather/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /lambda/tests/job_def/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/router/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/scatter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/subpipes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | jetaba (Jack Tabaska) 2 | -------------------------------------------------------------------------------- /lambda/src/common/python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/gather/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/initializer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/notifications/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/qc_checker/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/scatter_init/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/subpipes/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/initializer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/notifications/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/qc_checker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/tests/scatter_init/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/util/bclaw_logs/lambda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/bclaw_logs/lambda/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/bclaw_logs/lambda/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lambda/src/job_def/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.38.0 -------------------------------------------------------------------------------- /lambda/src/initializer/requirements.txt: -------------------------------------------------------------------------------- 1 | jmespath 2 | -------------------------------------------------------------------------------- /lambda/src/notifications/requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | -------------------------------------------------------------------------------- /bclaw_runner/.dockerignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | *.pyc 3 | -------------------------------------------------------------------------------- /lambda/src/chooser/requirements.txt: -------------------------------------------------------------------------------- 1 | python-box[all]~=6.0 2 | -------------------------------------------------------------------------------- /lambda/src/scatter/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonpath 2 | pyyaml 3 | -------------------------------------------------------------------------------- /lambda/src/compiler/requirements.txt: -------------------------------------------------------------------------------- 1 | humanfriendly 2 | pyyaml 3 | voluptuous 4 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/__init__.py: -------------------------------------------------------------------------------- 1 | from .runner_main import cli 2 | 3 | __all__ = ["cli"] 4 | -------------------------------------------------------------------------------- /doc/tutorial/sf_exec_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bayer-Group/BayerCLAW/HEAD/doc/tutorial/sf_exec_list.png -------------------------------------------------------------------------------- /doc/tutorial/sf_exec_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bayer-Group/BayerCLAW/HEAD/doc/tutorial/sf_exec_history.png -------------------------------------------------------------------------------- /doc/tutorial/sf_visual_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bayer-Group/BayerCLAW/HEAD/doc/tutorial/sf_visual_workflow.png -------------------------------------------------------------------------------- /bclaw_runner/src/runner_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from runner import cli 3 | 4 | if __name__ == "__main__": 5 | 
sys.exit(cli()) 6 | -------------------------------------------------------------------------------- /doc/tutorial/bclaw_architecture2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bayer-Group/BayerCLAW/HEAD/doc/tutorial/bclaw_architecture2.png -------------------------------------------------------------------------------- /doc/resources/subpipes_step_functions_link1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bayer-Group/BayerCLAW/HEAD/doc/resources/subpipes_step_functions_link1.png -------------------------------------------------------------------------------- /doc/resources/subpipes_step_functions_link2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bayer-Group/BayerCLAW/HEAD/doc/resources/subpipes_step_functions_link2.png -------------------------------------------------------------------------------- /bclaw_runner/requirements.txt: -------------------------------------------------------------------------------- 1 | backoff 2 | boto3==1.34.38 3 | docker 4 | docopt 5 | jmespath 6 | more_itertools 7 | pytest 8 | pytest-mock 9 | requests 10 | -------------------------------------------------------------------------------- /lambda/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.38.3 2 | humanfriendly 3 | jmespath 4 | jsonpath 5 | moto[all]==5.0.1 6 | pytest 7 | pytest-mock 8 | python-box[all]~=6.0 9 | pyyaml 10 | voluptuous 11 | -------------------------------------------------------------------------------- /util/bclaw_logs/sam_install.txt: -------------------------------------------------------------------------------- 1 | sam build -b ./build -s . 
-t template.yaml 2 | 3 | sam deploy \ 4 | --template-file build/template.yaml \ 5 | --stack-name bclaw-logs \ 6 | --resolve-s3 \ 7 | --capabilities CAPABILITY_IAM \ 8 | --profile bclaw-public 9 | -------------------------------------------------------------------------------- /bclaw_runner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/docker/library/python:3.10-slim AS base 2 | 3 | LABEL maintainer="jack.tabaska@bayer.com" 4 | 5 | # https://www.cynnovative.com/simple-multi-stage-docker-builds/ 6 | 7 | WORKDIR /bclaw_runner 8 | 9 | COPY requirements.txt ./ 10 | RUN pip install --no-cache-dir --upgrade pip && \ 11 | pip install --no-cache-dir -r requirements.txt 12 | 13 | COPY src src 14 | COPY __init__.py __init__.py 15 | 16 | FROM base AS test 17 | 18 | RUN pip install --no-cache-dir pytest moto requests_mock 19 | 20 | COPY tests tests 21 | RUN pytest -s -vvv tests/ 22 | 23 | FROM base AS build 24 | 25 | ENV PYTHONBUFFERED=1 26 | ENV PATH=/bclaw:$PATH 27 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/preamble.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | def log_preamble(): 8 | logger.info(f"workflow_name={os.environ['BC_WORKFLOW_NAME']}") 9 | logger.info(f"step_name={os.environ['BC_STEP_NAME']}") 10 | logger.info(f"job_file=s3://{os.environ['BC_LAUNCH_BUCKET']}/{os.environ['BC_LAUNCH_KEY']}:{os.environ['BC_LAUNCH_VERSION']}") 11 | logger.info(f"sfn_execution_id={os.environ['BC_EXECUTION_ID']}") 12 | logger.info(f"branch={os.environ['BC_BRANCH_IDX']}") 13 | logger.info(f"batch_job_id={os.environ['AWS_BATCH_JOB_ID']}") 14 | logger.info(f"bclaw_version={os.environ['BC_VERSION']}") 15 | -------------------------------------------------------------------------------- /bclaw_runner/tests/test_signal_trapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | 5 | from ..src.runner.signal_trapper import signal_trapper 6 | 7 | 8 | def test_signal_trapper(mock_container_factory): 9 | pid = os.getpid() 10 | 11 | def trigger_signal(): 12 | time.sleep(1) 13 | os.kill(pid, 2) 14 | 15 | thread = threading.Thread(target=trigger_signal) 16 | thread.daemon = True 17 | thread.start() 18 | 19 | test_container = mock_container_factory(0, False) 20 | 21 | with signal_trapper(test_container): 22 | time.sleep(3) 23 | print("yo") 24 | 25 | assert test_container.exit_code == 99 # test_container.stop() was called 26 | -------------------------------------------------------------------------------- /bclaw_runner/Dockerfile.alpine: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/docker/library/python:3.12-alpine AS base 2 | 3 | LABEL maintainer="jack.tabaska@bayer.com" 4 | 5 | # https://www.cynnovative.com/simple-multi-stage-docker-builds/ 6 | 7 | WORKDIR /bclaw_runner 8 | 9 | COPY requirements.txt ./ 10 | RUN pip install --no-cache-dir --upgrade pip && \ 11 | pip install --no-cache-dir -r requirements.txt 12 | 13 | COPY src src 14 | COPY __init__.py __init__.py 15 | 16 | FROM base AS test 17 | 18 | # https://github.com/pachisi456/alpine-pytest-docker 19 | 20 | RUN apk add --no-cache --virtual .build-deps \ 21 | build-base openssl-dev libffi-dev && \ 22 | pip install --no-cache-dir pytest moto[all]==5.0.1 requests_mock 
23 | 24 | COPY tests tests 25 | RUN pytest -s -vvv tests/ 26 | 27 | FROM base AS build 28 | 29 | ARG BC_VERSION_ARG 30 | ENV BC_VERSION=$BC_VERSION_ARG 31 | 32 | ENV PYTHONBUFFERED=1 33 | ENV PATH=/bclaw_runner:$PATH 34 | -------------------------------------------------------------------------------- /lambda/src/common/python/lambda_logs.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import json 3 | import logging 4 | import os 5 | from textwrap import dedent 6 | 7 | 8 | def log_preamble(logger: logging.Logger, 9 | branch: str = "N/A", 10 | job_file_bucket: str = "N/A", 11 | job_file_key: str = "N/A", 12 | job_file_version: str = "N/A", 13 | sfn_execution_id: str = "N/A", 14 | step_name: str = "N/A", 15 | workflow_name: str = "N/A") -> None: 16 | logger.info(dedent(f"""---------- preamble ---------- 17 | {workflow_name=} 18 | {step_name=} 19 | job_file=s3://{job_file_bucket}/{job_file_key}:{job_file_version} 20 | {sfn_execution_id=} 21 | {branch=} 22 | bclaw_version={os.environ.get("BCLAW_VERSION", "N/A")} 23 | """)) 24 | 25 | 26 | def log_event(logger: logging.Logger, event: dict) -> None: 27 | logger.info("---------- event ----------" + json.dumps(event, indent=2)) 28 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/string_subs.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import jmespath 3 | import re 4 | from typing import Any 5 | 6 | 7 | def lookup(m: re.Match, spec: dict) -> str: 8 | ret = jmespath.search(m.group(1), spec) 9 | if ret is None: 10 | ret = m.group(0) 11 | return str(ret) 12 | 13 | 14 | SUB_FINDER = re.compile(r"\${(.+?)}") 15 | 16 | def substitute(target: Any, spec: dict) -> Any: 17 | if isinstance(target, str): 18 | _lookup = partial(lookup, spec=spec) 19 | ret = SUB_FINDER.sub(_lookup, target) 20 | elif isinstance(target, list): 21 | ret = [substitute(v, spec) for v in target] 22 | elif isinstance(target, dict): 23 | ret = {k: substitute(v, spec) for k, v in target.items()} 24 | else: 25 | ret = target 26 | 27 | return ret 28 | 29 | 30 | def substitute_image_tag(image_spec: dict, sub_spec: dict) -> dict: 31 | name = image_spec["name"] 32 | parts = name.split("/") 33 | name_ver = parts.pop(-1) 34 | _lookup = partial(lookup, spec=sub_spec) 35 | subbed = SUB_FINDER.sub(_lookup, name_ver) 36 | 37 | ret = image_spec.copy() 38 | ret["name"] = "/".join(parts + [subbed]) 39 | return ret 40 | -------------------------------------------------------------------------------- /lambda/src/compiler/handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pkg.compiler import compile_template 4 | 5 | logger = logging.getLogger() 6 | logger.setLevel(logging.INFO) 7 | 8 | 9 | def lambda_handler(event: dict, context: object) -> dict: 10 | # event = { 11 | # accountId: str 12 | # fragment: { 13 | # Repository: str 14 | # Parameters: {...} 15 | # Options: {...} 16 | # Steps: [] 17 | # } 18 | # region: str 19 | # params: {} # empty 20 | # requestId: uuid, 21 | # templateParameterValues: { 22 | # param1: value1 23 | # param2: value2 24 | # ... 
25 | # } 26 | # transformId: str 27 | # } 28 | logger.info(f"{event=}") 29 | ret = event.copy() 30 | 31 | try: 32 | ret["fragment"] = compile_template(event["fragment"], event["templateParameterValues"]) 33 | ret["status"] = "success" 34 | 35 | except Exception as e: 36 | # https://stackoverflow.com/questions/55190232/aws-cloudformation-transform-how-do-i-properly-return-an-error-message 37 | logger.exception("failed: ") 38 | ret["status"] = "failure" 39 | ret["errorMessage"] = str(e) 40 | 41 | return ret 42 | -------------------------------------------------------------------------------- /lambda/src/compiler/compiler_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | This CLI depends on having valid AWS credentials active, to query the account environment, 3 | and on the environment variable CORE_STACK_NAME, which defaults to 'bclaw-core' if not set. 4 | """ 5 | 6 | import argparse 7 | import logging 8 | import yaml 9 | import sys 10 | 11 | from dotenv import load_dotenv 12 | 13 | from pkg.compiler import compile_template 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser(description=__doc__) 17 | parser.add_argument("infile", type=argparse.FileType("r"), nargs="?", default=sys.stdin) 18 | parser.add_argument("cfn_file", type=argparse.FileType("w"), nargs="?", default=sys.stdout) 19 | parser.add_argument("sfn_file", type=argparse.FileType("w"), nargs="?", default=sys.stderr) 20 | parser.add_argument("--verbose", "-v", action="count") 21 | args = parser.parse_args() 22 | 23 | load_dotenv() 24 | 25 | logging.basicConfig(level=(logging.DEBUG if args.verbose else logging.INFO)) 26 | 27 | wf_spec = yaml.safe_load(args.infile) 28 | wf_spec.pop("Transform", None) 29 | 30 | result = compile_template(wf_spec, {}, state_machine_out=args.sfn_file) 31 | yaml.safe_dump(result, args.cfn_file) 32 | 33 | sys.exit(0) 34 | -------------------------------------------------------------------------------- /lambda/tests/compiler/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="session") 7 | def compiler_env(): 8 | os.environ.update({ 9 | "CORE_STACK_NAME": "bclaw-core", 10 | "CHOOSER_LAMBDA_ARN": "chooser_lambda_arn", 11 | "ECS_TASK_ROLE_ARN": "ecs_task_role_arn", 12 | "NOTIFICATIONS_LAMBDA_ARN": "notifications_lambda_arn", 13 | "GATHER_LAMBDA_ARN": "gather_lambda_arn", 14 | "ON_DEMAND_GPU_QUEUE_ARN": "on_demand_gpu_queue_arn", 15 | "ON_DEMAND_QUEUE_ARN": "on_demand_queue_arn", 16 | "INITIALIZER_LAMBDA_ARN": "initializer_lambda_arn", 17 | "JOB_DEF_LAMBDA_ARN": "job_def_lambda_arn", 18 | "LAUNCHER_BUCKET_NAME": "launcher_bucket_name", 19 | "LOG_RETENTION_DAYS": "99", 20 | "LOGGING_DESTINATION_ARN": "logging_destination_arn", 21 | "RESOURCE_BUCKET_NAME": "resource_bucket_name", 22 | "RUNNER_REPO_URI": "runner_repo_uri", 23 | "SCATTER_INIT_LAMBDA_ARN": "scatter_init_lambda_arn", 24 | "SCATTER_LAMBDA_ARN": "scatter_lambda_arn", 25 | "SOURCE_VERSION": "1234567", 26 | "SPOT_GPU_QUEUE_ARN": "spot_gpu_queue_arn", 27 | "SPOT_QUEUE_ARN": "spot_queue_arn", 28 | "STATES_EXECUTION_ROLE_ARN": "states_execution_role_arn", 29 | "SUBPIPES_LAMBDA_ARN": "subpipes_lambda_arn", 30 | }) 31 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/signal_trapper.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 
import logging 3 | import signal 4 | 5 | from docker.models.containers import Container 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | SKIP = { 10 | # these signal handlers cannot be overridden 11 | signal.SIGKILL, 12 | signal.SIGSTOP, 13 | # these signals are typically ignored (https://man.netbsd.org/signal.7) 14 | signal.SIGURG, 15 | signal.SIGCONT, 16 | signal.SIGCHLD, 17 | signal.SIGIO, 18 | signal.SIGWINCH, 19 | # signal.SIGINFO, 20 | # signal.SIGPWR, 21 | } 22 | 23 | 24 | # https://stackoverflow.com/questions/2148888/python-trap-all-signals 25 | @contextmanager 26 | def signal_trapper(container: Container): 27 | def _handler(signal_number: int, _): 28 | logger.warning(f"received signal {signal.strsignal(signal_number)}") 29 | logger.warning("stopping subprocess") 30 | container.stop(timeout=5) 31 | 32 | original_handlers = {} 33 | try: 34 | logger.debug("setting new signal handlers") 35 | for sig in signal.valid_signals() - SKIP: 36 | if signal.getsignal(sig) is not signal.SIG_IGN: 37 | original_handlers[sig] = signal.signal(sig, _handler) 38 | yield 39 | finally: 40 | logger.debug("restoring signal handlers") 41 | for k, v in original_handlers.items(): 42 | signal.signal(k, v) 43 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/native_step_resources.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Generator, List 3 | 4 | from . import state_machine_resources as sm 5 | from .util import Step, Resource, State 6 | 7 | 8 | def handle_native_step(step: Step, 9 | options: dict, 10 | map_depth: int) -> Generator[Resource, None, List[State]]: 11 | logger = logging.getLogger(__name__) 12 | logger.info(f"making native step {step.name}") 13 | 14 | step_type = step.spec["Type"] 15 | 16 | ret = step.spec.copy() 17 | 18 | if step_type == "Parallel": 19 | sub_branches = [] 20 | 21 | for branch in step.spec["Branches"]: 22 | sub_branch = yield from sm.make_branch(branch["steps"], options, depth=map_depth) 23 | sub_branches.append(sub_branch) 24 | 25 | ret.update({"Branches": sub_branches}) 26 | 27 | try: 28 | # if this native step was generated by the compiler, don't modify ResultPath or OutputPath 29 | ret.pop("_stet") 30 | 31 | except KeyError: 32 | if step_type not in {"Wait", "Succeed", "Fail"}: 33 | ret.update({"ResultPath": None}) 34 | 35 | if step_type != "Fail": 36 | ret.update({"OutputPath": "$"}) 37 | 38 | ret.pop("Next", None) 39 | ret.pop("End", None) 40 | 41 | if step_type not in {"Succeed", "Fail"}: 42 | ret.update(**step.next_or_end) 43 | 44 | return [State(step.name, ret)] 45 | -------------------------------------------------------------------------------- /lambda/src/qc_checker/qc_checker.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | import json 3 | import logging 4 | 5 | import boto3 6 | 7 | # from lambda_logs import JSONFormatter, custom_lambda_logs 8 | 9 | logger = logging.getLogger() 10 | logger.setLevel(logging.INFO) 11 | # logger.handlers[0].setFormatter(JSONFormatter()) 12 | 13 | 14 | class QCFailed(Exception): 15 | def __init__(self, message: str): 16 | self.message = message 17 | 18 | 19 | def lambda_handler(event: dict, context: object): 20 | # with custom_lambda_logs(**event["logging"]): 21 | logger.info(f"event: {str(event)}") 22 | 23 | s3_path = f"{event['repo']}/{event['qc_result_file']}" 24 | bucket, key = s3_path.split("/", 3)[2:] 25 | 26 | s3 = 
boto3.client("s3") 27 | response = s3.get_object(Bucket=bucket, Key=key) 28 | with closing(response["Body"]) as fp: 29 | qc_object = json.load(fp) 30 | 31 | logger.info(f"input: {str(qc_object)}") 32 | 33 | result = eval(event["qc_expression"], globals(), qc_object) 34 | 35 | if result: 36 | logger.warning("failed QC check") 37 | sfn = boto3.client("stepfunctions") 38 | sfn.stop_execution( 39 | executionArn=event["execution_id"], 40 | error=f"Job {event['logging']['job_file_key']} failed QC check at step {event['logging']['step_name']}", 41 | cause=f"failed condition: {event['qc_expression']}" 42 | ) 43 | raise QCFailed(f"QC check failed ({event['qc_expression']})") 44 | else: 45 | logger.info("passed QC check") 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Bayer AG 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/chooser_resources.py: -------------------------------------------------------------------------------- 1 | import jmespath 2 | import logging 3 | import os 4 | from typing import List 5 | 6 | from voluptuous.error import Invalid 7 | 8 | from .util import Step, State, lambda_logging_block, lambda_retry 9 | 10 | 11 | def choice_spec(expr_str: str, next_step: str) -> dict: 12 | ret = { 13 | "Variable": "$.choice", 14 | "StringEquals": expr_str, 15 | "Next": next_step 16 | } 17 | return ret 18 | 19 | 20 | def handle_chooser_step(step: Step) -> List[State]: 21 | logger = logging.getLogger(__name__) 22 | logger.info(f"making chooser step {step.name}") 23 | 24 | if step.is_terminal: 25 | raise Invalid("chooser steps cannot be terminal") 26 | 27 | choice_step_name = f"{step.name}.choose" 28 | 29 | exprs = jmespath.search("choices[].if", step.spec) 30 | nexts = jmespath.search("choices[].next", step.spec) 31 | 32 | choices = [choice_spec(e, n) for e, n in zip(exprs, nexts)] 33 | 34 | task_step = { 35 | "Type": "Task", 36 | "Resource": os.environ["CHOOSER_LAMBDA_ARN"], 37 | "Parameters": { 38 | "repo.$": "$.repo.uri", 39 | **step.input_field, 40 | "expressions": exprs, 41 | **lambda_logging_block(step.name), 42 | }, 43 | **lambda_retry(), 44 | "ResultPath": "$.choice", 45 | "OutputPath": "$", 46 | "Next": choice_step_name, 47 | } 48 | 49 | choice_step = { 50 | "Type": "Choice", 51 | "Choices": choices, 52 | "Default": step.next, 53 | } 54 | 55 | ret = [ 56 | State(step.name, task_step), 57 | State(choice_step_name, choice_step), 58 | ] 59 | 60 | return ret 61 | -------------------------------------------------------------------------------- /lambda/src/common/python/substitutions.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import jmespath 3 | import json 4 | import re 5 | from string import Template 6 | from typing import Any 7 | 8 | 9 | def _lookup(target: str, data: dict): 10 | ret0 = jmespath.search(target, data) 11 | 12 | if ret0 is None: 13 | raise RuntimeError(f"{target} not found in job data") 14 | elif isinstance(ret0, (dict, list)): 15 | ret = json.dumps(json.dumps(ret0)) 16 | else: 17 | ret = str(ret0) 18 | 19 | return ret 20 | 21 | 22 | JOB_DATA_FINDER = re.compile(r"\${(.+?)}") 23 | 24 | def substitute_job_data(subject: Any, job_data: dict): 25 | lookup = partial(_lookup, data=job_data) 26 | 27 | if isinstance(subject, str): 28 | result = JOB_DATA_FINDER.sub(lambda m: lookup(m.group(1)), subject) 29 | 30 | elif isinstance(subject, list): 31 | result = [substitute_job_data(v, job_data) for v in subject] 32 | 33 | elif isinstance(subject, dict): 34 | result = {k: substitute_job_data(v, job_data) for k, v in subject.items()} 35 | 36 | else: 37 | result = subject 38 | 39 | return result 40 | 41 | 42 | def substitute_into_filenames(subject: Any, subs: dict): 43 | if isinstance(subject, str): 44 | try: 45 | result = Template(subject).safe_substitute(subs) 46 | 47 | except KeyError: 48 | raise RuntimeError(f"unrecognized substitution in {subject}") 49 | 50 | elif isinstance(subject, list): 51 | result = [substitute_into_filenames(v, subs) for v in subject] 52 | 53 | elif isinstance(subject, dict): 54 | result = {k: substitute_into_filenames(v, subs) for k, v in subject.items()} 55 | 56 | else: 57 | result = subject 58 | 59 | return result 60 | 
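# Example behavior of the two substitution helpers above (a sketch; the keys,
# placeholder names, and file names below are hypothetical):
#
#   substitute_job_data("${job.sample}_aligned.bam", {"job": {"sample": "A1"}})
#     -> "A1_aligned.bam"           (${...} expressions are resolved with jmespath against the job data)
#   substitute_job_data("${job.missing}", {"job": {"sample": "A1"}})
#     -> raises RuntimeError        (unresolved job data lookups are treated as errors)
#
#   substitute_into_filenames("${scatter}/output.txt", {"scatter": "00042"})
#     -> "00042/output.txt"         (string.Template substitution)
#   substitute_into_filenames("${unknown}/output.txt", {"scatter": "00042"})
#     -> "${unknown}/output.txt"    (safe_substitute leaves unrecognized placeholders untouched)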
-------------------------------------------------------------------------------- /lambda/tests/scatter_init/test_scatter_init.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | import json 3 | 4 | import boto3 5 | import moto 6 | import pytest 7 | 8 | from ...src.scatter_init.scatter_init import lambda_handler 9 | 10 | JOB_DATA_TEMPLATE = { 11 | "job": {"job": "data"}, 12 | "scatter": {}, 13 | "parent": {"file": "s3://test-bucket/repo/path/file.txt"} 14 | } 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def repo_bucket(): 19 | with moto.mock_aws(): 20 | yld = boto3.resource("s3", region_name="us-east-1").Bucket("test-bucket") 21 | yld.create() 22 | yld.put_object(Key="repo/path/Scatter/_JOB_DATA_", Body=json.dumps(JOB_DATA_TEMPLATE).encode("utf-8")) 23 | yield yld 24 | 25 | 26 | def test_lambda_handler(repo_bucket): 27 | event = { 28 | "index": "99", 29 | "repo": { 30 | "bucket": repo_bucket.name, 31 | "prefix": "repo/path/Scatter", 32 | "uri": "s3://this/is/not/used", 33 | }, 34 | "scatter": { 35 | "number": "88", 36 | "file": "s3://bucket/yada/yada/file.txt", 37 | }, 38 | "logging": {}, 39 | } 40 | 41 | result = lambda_handler(event, {}) 42 | expect = { 43 | "bucket": repo_bucket.name, 44 | "prefix": "repo/path/Scatter/00099", 45 | "uri": f"s3://{repo_bucket.name}/repo/path/Scatter/00099" 46 | } 47 | 48 | assert result == expect 49 | 50 | job_data_obj = boto3.resource("s3").Object(result["bucket"], f"{result['prefix']}/_JOB_DATA_") 51 | response = job_data_obj.get() 52 | with closing(response["Body"]) as fp: 53 | job_data = json.load(fp) 54 | 55 | expected_job_data = { 56 | "job": JOB_DATA_TEMPLATE["job"], 57 | "scatter": event["scatter"], 58 | "parent": JOB_DATA_TEMPLATE["parent"], 59 | } 60 | assert job_data == expected_job_data 61 | -------------------------------------------------------------------------------- /lambda/src/scatter_init/scatter_init.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | import json 3 | import logging 4 | 5 | import boto3 6 | 7 | from lambda_logs import log_preamble, log_event 8 | from repo_utils import SYSTEM_FILE_TAG, Repo 9 | 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | def lambda_handler(event: dict, context: object): 15 | # event = { 16 | # index: str, 17 | # repo: { 18 | # bucket: str 19 | # prefix: str 20 | # } 21 | # scatter: { 22 | # key: value 23 | # } 24 | # logging: { 25 | # branch: str 26 | # job_file_bucket: str 27 | # job_file_key: str 28 | # job_file_version: str 29 | # sfn_execution_id: str 30 | # step_name: str 31 | # workflow_name: str 32 | # } 33 | # ... 
34 | # } 35 | 36 | log_preamble(**event.pop("logging"), logger=logger) 37 | log_event(logger, event) 38 | 39 | s3 = boto3.resource("s3") 40 | 41 | # read job data template 42 | scatter_repo = Repo(event["repo"]) 43 | job_data_template = scatter_repo.qualify("_JOB_DATA_") 44 | obj = s3.Object(job_data_template.bucket, job_data_template.key) 45 | response = obj.get() 46 | with closing(response["Body"]) as fp: 47 | job_data = json.load(fp) 48 | 49 | # replace scatter field 50 | job_data["scatter"].update(event["scatter"]) 51 | 52 | # establish branch repo 53 | branch_repo = scatter_repo.sub_repo(f"{int(event['index']):05}") 54 | 55 | # write job data 56 | job_data_file = branch_repo.qualify("_JOB_DATA_") 57 | job_data_obj = s3.Object(job_data_file.bucket, job_data_file.key) 58 | job_data_obj.put(Body=json.dumps(job_data).encode("utf-8"), 59 | ServerSideEncryption="AES256", 60 | Tagging=SYSTEM_FILE_TAG) 61 | 62 | # return repo uri 63 | return dict(branch_repo) 64 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/qc_check.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from typing import Generator 5 | 6 | import boto3 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class QCFailure(Exception): 12 | def __init__(self, message: str, failures: list): 13 | super().__init__(message) 14 | self.failures = failures 15 | 16 | 17 | def abort_execution(failed_expressions: list) -> None: 18 | logger.warning("aborting workflow execution") 19 | 20 | region = os.environ["AWS_DEFAULT_REGION"] 21 | acct = os.environ["AWS_ACCOUNT_ID"] 22 | wf_name = os.environ["BC_WORKFLOW_NAME"] 23 | exec_id = os.environ["BC_EXECUTION_ID"] 24 | step_name = os.environ["BC_STEP_NAME"] 25 | execution_arn = f"arn:aws:states:{region}:{acct}:execution:{wf_name}:{exec_id}" 26 | 27 | cause = "failed QC conditions: " + "; ".join(failed_expressions) 28 | 29 | sfn = boto3.client("stepfunctions") 30 | sfn.stop_execution( 31 | executionArn=execution_arn, 32 | error=f"Job {exec_id} failed QC check at step {step_name}", 33 | cause=cause 34 | ) 35 | 36 | def run_one_qc_check(qc_data: dict, qc_expression: str) -> bool: 37 | if result := eval(qc_expression, globals(), qc_data): 38 | logger.warning(f"failed QC check: {qc_expression}") 39 | else: 40 | logger.info(f"passed QC check: {qc_expression}") 41 | return result 42 | 43 | 44 | def run_all_qc_checks(checks: list) -> Generator[str, None, None]: 45 | for item in checks: 46 | qc_file = item["qc_result_file"] 47 | logger.info(f"{qc_file=}") 48 | 49 | with open(qc_file) as fp: 50 | qc_data = json.load(fp) 51 | 52 | for qc_expression in item["stop_early_if"]: 53 | if run_one_qc_check(qc_data, qc_expression): 54 | yld = f"{os.path.basename(qc_file)}: {qc_expression}" 55 | yield yld 56 | 57 | 58 | def do_checks(checks: list) -> None: 59 | if checks: 60 | logger.info("starting QC checks") 61 | if failures := list(run_all_qc_checks(checks)): 62 | logger.error(f"{len(failures)} QC checks failed") 63 | raise QCFailure("QC checks failed", failures) 64 | logger.info("QC checks finished") 65 | else: 66 | logger.info("no QC checks requested") 67 | -------------------------------------------------------------------------------- /lambda/src/common/python/file_select.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | import csv 3 | import json 4 | import re 5 | 6 | import boto3 7 | 
from jsonpath import jsonpath 8 | import yaml 9 | 10 | # matches: 11 | # s3://(bucket)/(key/key/key.ext):(jsonpath) 12 | # s3://(bucket)/(key/key/key.ext) 13 | PARSER = re.compile(r"^s3://(.+?)/([^:]+)(?::(.+))?$") 14 | 15 | 16 | def read_json(body): 17 | ret = json.load(body) 18 | return ret 19 | 20 | 21 | def read_json_lines(body): 22 | ret = [json.loads(l) for l in body.iter_lines()] 23 | return ret 24 | 25 | 26 | def read_yaml(body): 27 | ret = yaml.load(body, Loader=yaml.SafeLoader) 28 | return ret 29 | 30 | 31 | def read_csv(body, delim=","): 32 | text = (l.decode("utf-8") for l in body.iter_lines()) 33 | ret = list(csv.DictReader(text, delimiter=delim)) 34 | return ret 35 | 36 | 37 | def slurp(body): 38 | ret = [l.decode("utf-8") for l in body.iter_lines()] 39 | return ret 40 | 41 | 42 | def stringify(item) -> str: 43 | if isinstance(item, (dict, list)): 44 | return json.dumps(item) 45 | else: 46 | return str(item) 47 | 48 | 49 | def select_file_contents(s3_path: str) -> list: 50 | bucket, key, selector = PARSER.fullmatch(s3_path).groups() 51 | 52 | s3 = boto3.client("s3") 53 | response = s3.get_object(Bucket=bucket, Key=key) 54 | with closing(response["Body"]) as fp: 55 | if selector is None: 56 | ret0 = slurp(fp) 57 | else: 58 | if key.endswith(".json"): 59 | contents = read_json(fp) 60 | elif key.endswith(".jsonl") or key.endswith(".ndjson"): 61 | contents = read_json_lines(fp) 62 | elif key.endswith(".yaml") or key.endswith(".yml"): 63 | contents = read_yaml(fp) 64 | elif key.endswith(".csv"): 65 | contents = read_csv(fp) 66 | elif key.endswith(".tsv") or key.endswith(".tab"): 67 | contents = read_csv(fp, delim="\t") 68 | else: 69 | contents = slurp(fp) 70 | 71 | ret0 = jsonpath(contents, selector) 72 | 73 | if not isinstance(ret0, list): 74 | raise AssertionError("selector did not create a list") 75 | 76 | ret = [stringify(i) for i in ret0] 77 | 78 | return ret 79 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/workspace.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import json 3 | import logging 4 | import os 5 | import shutil 6 | from tempfile import mkdtemp, NamedTemporaryFile 7 | from typing import Generator 8 | 9 | from .dind import run_child_container 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class UserCommandsFailed(Exception): 15 | def __init__(self, message: str, exit_code:int): 16 | super().__init__(message) 17 | self.exit_code = exit_code 18 | 19 | 20 | @contextmanager 21 | def workspace() -> Generator[str, None, None]: 22 | orig_path = os.getcwd() 23 | work_path = mkdtemp(dir=os.environ["BC_SCRATCH_PATH"]) 24 | 25 | logger.debug(f"workspace={work_path}") 26 | 27 | try: 28 | os.chdir(work_path) 29 | yield work_path 30 | 31 | finally: 32 | logger.debug("cleaning up workspace") 33 | os.chdir(orig_path) 34 | shutil.rmtree(work_path, ignore_errors=True) 35 | logger.debug("cleanup finished") 36 | 37 | 38 | def write_job_data_file(job_data: dict, dest_dir: str) -> str: 39 | with NamedTemporaryFile(prefix="job_data_", suffix=".json", dir=dest_dir, mode="w", delete=False) as fp: 40 | json.dump(job_data, fp) 41 | return fp.name 42 | 43 | 44 | def run_commands(image_spec: dict, commands: list, work_dir: str, job_data_file: str, shell_opt: str) -> None: 45 | script_file = "_commands.sh" 46 | 47 | with open(script_file, "w") as fp: 48 | for command in commands: 49 | print(command, file=fp) 50 | 51 | 
logger.info(f"shell option={shell_opt}") 52 | 53 | if shell_opt == "sh": 54 | shell_cmd = "sh -veu" 55 | elif shell_opt == "bash": 56 | shell_cmd = "bash -veuo pipefail" 57 | elif shell_opt == "sh-pipefail": 58 | shell_cmd = "sh -veuo pipefail" 59 | else: 60 | raise RuntimeError(f"unrecognized shell: {shell_opt}") 61 | 62 | os.chmod(script_file, 0o700) 63 | command = f"{shell_cmd} {script_file}" 64 | 65 | if (exit_code := run_child_container(image_spec, command, work_dir, job_data_file)) == 0: 66 | logger.info("command block succeeded") 67 | else: 68 | logger.error("command block failed") 69 | raise UserCommandsFailed(f"command block failed with exit code {exit_code}", exit_code) 70 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/compiler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from . import state_machine_resources as sm 5 | from .util import Resource, substitute_params 6 | from .validation import workflow_schema 7 | 8 | logger = logging.getLogger() 9 | 10 | 11 | # remove this after everybody gets used to capitalized top level keys 12 | def _capitalize_top_level_keys(frag: dict) -> dict: 13 | ret = {k.capitalize(): v for k, v in frag.items()} 14 | return ret 15 | 16 | 17 | def compile_template(fragment: dict, param_values: dict, state_machine_out=None) -> dict: 18 | # normalize workflow spec 19 | capitalized_fragment = _capitalize_top_level_keys(fragment) 20 | subbed_fragment = substitute_params(param_values, capitalized_fragment) 21 | normalized_wf = workflow_schema(subbed_fragment) 22 | 23 | options = normalized_wf["Options"] 24 | repository = normalized_wf["Repository"].rstrip("/") 25 | steps = normalized_wf["Steps"] 26 | 27 | # create state machine and associated resources 28 | resources = {} 29 | curr_resource = Resource("fake", {}) 30 | for curr_resource in sm.handle_state_machine(steps, options, repository, state_machine_out): 31 | resources.update([curr_resource]) 32 | 33 | # the main state machine Resource should be the last thing yielded by sm.handle_state_machine 34 | state_machine = curr_resource 35 | sm.add_definition_substitutions(state_machine, resources) 36 | 37 | state_machine_version = sm.state_machine_version_rc(state_machine) 38 | state_machine_alias = sm.state_machine_alias_rc(state_machine_version) 39 | resources.update([state_machine_alias, state_machine_version]) 40 | 41 | # create cloudformation template fragment to return 42 | ret = { 43 | "AWSTemplateFormatVersion": "2010-09-09", 44 | "Resources": resources, 45 | "Outputs": { 46 | "LauncherBucketName": { 47 | "Value": os.environ["LAUNCHER_BUCKET_NAME"], 48 | }, 49 | "LauncherURI": { 50 | "Value": {"Fn::Sub": f"s3://{os.environ['LAUNCHER_BUCKET_NAME']}/${{AWS::StackName}}/"}, 51 | }, 52 | "StepFunctionsStateMachineArn": { 53 | "Value": {"Ref": state_machine.name}, 54 | }, 55 | }, 56 | } 57 | 58 | if "Parameters" in normalized_wf: 59 | ret["Parameters"] = normalized_wf["Parameters"] 60 | 61 | return ret 62 | -------------------------------------------------------------------------------- /doc/qc.md: -------------------------------------------------------------------------------- 1 | # The BayerCLAW language: Quality Control (QC) checks 2 | 3 | The BayerCLAW language provides a way to define quality control (QC) checks that can be applied to 4 | analysis results. 
These checks can be used to ensure that the results are consistent with the 5 | expectations of the user or the requirements of downstream processes. 6 | 7 | ## The qc_check block 8 | 9 | The `qc_check` block an optional batch step element used to define a QC check. It has the following 10 | structure: 11 | 12 | ```yaml 13 | qc_check: 14 | qc_result_file: 15 | stop_early_if: 16 | ``` 17 | 18 | The `qc_result_file` field specifies the path to the file containing the QC results. The file must 19 | be in the JSON format and contain a dictionary with the QC results. The dictionary keys are the 20 | names of the QC checks and the values are the results of the checks. 21 | 22 | The `stop_early_if` field specifies a condition that, if met, will cause workflow execution to be 23 | aborted. The conditions are Python expressions that yield a Boolean value. The expression can refer 24 | to the QC results using the dictionary keys as variables. 25 | 26 | You may provide multiple qc_check blocks in each batch step, and multiple conditions per qc_check 27 | block: 28 | 29 | ```yaml 30 | qc_check: 31 | - 32 | qc_result_file: qc_results1.json 33 | stop_early_if: 34 | - "mean_coverage < 0.30" 35 | - "total_length < 100" 36 | - 37 | qc_result_file: qc_results2.json 38 | stop_early_if: 39 | - "mean_coverage < 0.30" 40 | - "total_length < 100" 41 | ``` 42 | 43 | If any `stop_early_if` condition is met, the workflow execution will be aborted. 44 | 45 | Note that in the second example above, it is assumed that the `qc_results*` files are of the 46 | format: 47 | 48 | ```json5 49 | { 50 | "mean_coverage": 0.25, 51 | "total_length": 50, 52 | // other fields... 53 | } 54 | ``` 55 | 56 | so that the keys `mean_coverage` and `total_length` become variables in the `stop_early_if` conditions. 57 | 58 | ## Notifications 59 | 60 | To receive notifications of failed QC checks, you must subscribe to BayerCLAW's SNS topic. Workflow 61 | executions that fail due to a QC check will terminate in an ABORTED state, therefore to receive only 62 | notifications for failed QC checks, your subscription must include a filter policy like the following: 63 | 64 | ```json 65 | { 66 | "workflow_name": ["my_workflow"], 67 | "status": ["ABORTED"] 68 | } 69 | ``` 70 | 71 | See the [notifications document](notifications.md) for more information. 72 | -------------------------------------------------------------------------------- /util/bclaw_logs/lambda/src/job_status.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | # from datetime import datetime, timedelta 3 | import json 4 | import os 5 | 6 | import boto3 7 | from boto3.dynamodb.conditions import Attr 8 | from botocore.exceptions import ClientError 9 | 10 | 11 | def lambda_handler(event: dict, context: object) -> None: 12 | print(f"{event=}") 13 | 14 | dynamodb = boto3.resource("dynamodb") 15 | table = dynamodb.Table(os.environ["JOB_STATUS_TABLE"]) 16 | 17 | # todo: 18 | # should launcher put wf name in input object? 19 | # should eventbridge rule put something bclaw-specific in input object for recognition? 20 | 21 | try: 22 | wf_name = event["detail"]["stateMachineArn"].rsplit(":", 1)[-1] 23 | exec_id = event["detail"]["name"] 24 | job_status = event["detail"]["status"] 25 | 26 | input_obj = json.loads(event["detail"]["input"]) 27 | 28 | # HEY! 
this causes subpipe executions to look like superpipe executions 29 | # wf_name, job_file_name = input_obj["job_file"]["key"].split("/", 1) 30 | job_file_name = input_obj["job_file"]["key"].split("/", 1)[-1] 31 | job_file_version = input_obj["job_file"]["version"] 32 | 33 | time_str = event["time"] 34 | timestamp = dt.datetime.strptime(time_str, "%Y-%m-%dT%H:%M:%S%z") 35 | expiration = timestamp + dt.timedelta(days=int(os.environ["EXPIRATION_DAYS"])) 36 | 37 | item = { 38 | "Item": { 39 | "workflowName": wf_name, 40 | "executionId": exec_id, 41 | "jobFile": f"{job_file_name}#{job_file_version}", 42 | "status": job_status, 43 | "timestamp": int(timestamp.timestamp()), 44 | "expiration": int(expiration.timestamp()), 45 | } 46 | } 47 | 48 | # events might arrive out of order: this condition prevents 49 | # existing SUCCEEDED, FAILED, or ABORTED records in the table 50 | # from being overwritten by incoming RUNNING records 51 | if job_status == "RUNNING": 52 | item["ConditionExpression"] = ( 53 | Attr("status").not_exists() | 54 | Attr("status").eq("RUNNING") 55 | ) 56 | 57 | try: 58 | result = table.put_item(**item) 59 | print(str(result)) 60 | 61 | except ClientError as e: 62 | if e.response["Error"]["Code"] == "ConditionalCheckFailedException": 63 | pass 64 | 65 | except (KeyError, ValueError): 66 | print("not a bayerclaw execution") 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # custom 2 | *~ 3 | .idea/ 4 | .DS_Store 5 | defunct/ 6 | notes/ 7 | scratch/ 8 | cheez/ 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | -------------------------------------------------------------------------------- /lambda/src/gather/gather.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | from itertools import groupby 3 | import json 4 | import logging 5 | from os.path import basename 6 | 7 | import boto3 8 | 9 | from lambda_logs import log_preamble, log_event 10 | from repo_utils import SYSTEM_FILE_TAG 11 | from substitutions import substitute_job_data 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | 16 | 17 | def lambda_handler(event: dict, context: object): 18 | # event = { 19 | # outputs: str 20 | # repo: str 21 | # step_name: str 22 | # logging: { 23 | # branch: str 24 | # job_file_bucket: str 25 | # job_file_key: str 26 | # job_file_version: str 27 | # sfn_execution_id: str 28 | # step_name: str 29 | # workflow_name: str 30 | # } 31 | # } 32 | 33 | log_preamble(**event["logging"], logger=logger) 34 | log_event(logger, event) 35 | 36 | parent_outputs = json.loads(event["outputs"]) 37 | if parent_outputs: 38 | step_name = event["step_name"] 39 | 40 | parent_repo = event["repo"] 41 | parent_repo_bucket, parent_repo_prefix = parent_repo.split("/", 3)[2:] 42 | parent_job_data_key = f"{parent_repo_prefix}/_JOB_DATA_" 43 | 44 | response = boto3.resource("s3").Object(parent_repo_bucket, parent_job_data_key).get() 45 | with closing(response["Body"]) as fp: 46 | parent_job_data = json.load(fp) 47 | 48 | jobby_outputs = substitute_job_data(parent_outputs, parent_job_data) 49 | 50 | bucket = boto3.resource("s3").Bucket(parent_repo_bucket) 51 | prefix = f"{parent_repo_prefix}/{step_name}" 52 | scatter_output_objs = bucket.objects.filter(Prefix=prefix) 53 | scatter_output_uris = [f"s3://{o.bucket_name}/{o.key}" for o in scatter_output_objs] 54 | scatter_output_uris.sort(key=basename) 55 | 56 | filename2group = {k: list(g) for k, g in groupby(scatter_output_uris, key=basename)} 57 | manifest = {} 58 | for key, filename in jobby_outputs.items(): 59 | if filename in filename2group: 60 | manifest[key] = filename2group[filename] 61 | else: 62 | logger.warning(f"no files named {filename} found") 63 | manifest[key] = [] 64 | 65 | manifest_filename = f"{step_name}_manifest.json" 66 | manifest_path = f"{parent_repo}/{manifest_filename}" 67 | manifest_bucket, manifest_key = manifest_path.split("/", 3)[2:] 68 | manifest_obj = boto3.resource("s3").Object(manifest_bucket, manifest_key) 69 | manifest_obj.put(Body=json.dumps(manifest).encode("utf-8"), 70 | Tagging=SYSTEM_FILE_TAG) 71 | 72 | ret = {"manifest": manifest_filename} 73 | else: 74 | ret = {} 75 | 76 | return ret 77 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/enhanced_parallel_resources.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import 
Generator, List 4 | 5 | from . import state_machine_resources as sm 6 | from .util import Step, Resource, State, lambda_logging_block, lambda_retry 7 | 8 | 9 | def handle_parallel_step(step: Step, 10 | options: dict, 11 | map_depth: int) -> Generator[Resource, None, List[State]]: 12 | logger = logging.getLogger(__name__) 13 | logger.info(f"making enhanced parallel step {step.name}") 14 | 15 | sfn_branches = [] 16 | 17 | for idx, branch in enumerate(step.spec["branches"], start=1): 18 | steps = branch["steps"] 19 | try: 20 | expression = branch["if"] 21 | next_step_name = next(iter(steps[0])) 22 | skip_step_name = f"{step.name}: skip_{idx}" 23 | 24 | # note: this creates two native-type steps in the BayerCLAW spec language. 25 | # They will be processed into Amazon States Language in the sm.make_branch() 26 | # call below. 27 | preamble = [ 28 | { 29 | f"{step.name}: {expression}?": { 30 | "Type": "Task", 31 | "Resource": os.environ["CHOOSER_LAMBDA_ARN"], 32 | "Parameters": { 33 | "repo.$": "$.repo.uri", 34 | **step.input_field, 35 | "expression": expression, 36 | **lambda_logging_block(step.name) 37 | }, 38 | **lambda_retry(), 39 | "Catch": [ 40 | { 41 | "ErrorEquals": ["ConditionFailed"], 42 | "Next": skip_step_name 43 | }, 44 | ], 45 | "ResultPath": None, 46 | "OutputPath": "$", 47 | "_stet": True, 48 | 49 | # don't have to do the next_or_end thing, per validation there 50 | # has to be a next step 51 | "Next": next_step_name, 52 | }, 53 | }, 54 | { 55 | skip_step_name: { 56 | "Type": "Succeed", 57 | "_stet": True, 58 | }, 59 | }, 60 | ] 61 | 62 | steps = preamble + steps 63 | 64 | except KeyError: 65 | pass 66 | 67 | sfn_branch = yield from sm.make_branch(steps, options, depth=map_depth) 68 | sfn_branches.append(sfn_branch) 69 | 70 | ret = { 71 | "Type": "Parallel", 72 | "Branches": sfn_branches, 73 | "ResultPath": None, 74 | "OutputPath": "$", 75 | **step.next_or_end, 76 | } 77 | 78 | return [State(step.name, ret)] 79 | -------------------------------------------------------------------------------- /lambda/tests/common/test_repo_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from ...src.common.python.repo_utils import S3File, Repo 6 | 7 | 8 | def test_s3file(): 9 | result = S3File("bucket", "path/to/file.txt") 10 | assert result.bucket == "bucket" 11 | assert result.key == "path/to/file.txt" 12 | assert result == "s3://bucket/path/to/file.txt" 13 | 14 | 15 | def test_s3file_json_serialize(): 16 | s3file = S3File("bucket", "path/to/file.txt") 17 | result = json.dumps({"file": s3file}) 18 | expect = '{"file": "s3://bucket/path/to/file.txt"}' 19 | assert result == expect 20 | 21 | 22 | @pytest.mark.parametrize("repo_spec, expected_uri", [ 23 | ({"bucket": "repo-bucket", "prefix": "path/to/repo"}, "s3://repo-bucket/path/to/repo"), 24 | ({"bucket": "repo-bucket", "prefix": "path/to/repo", "uri": "s3://just/for/testing"}, "s3://just/for/testing") 25 | ]) 26 | def test_repo_init(repo_spec, expected_uri): 27 | result = Repo(repo_spec) 28 | assert result.bucket == repo_spec["bucket"] 29 | assert result.prefix == repo_spec["prefix"] 30 | assert result.uri == expected_uri 31 | 32 | 33 | def test_repo_from_uri(): 34 | uri = "s3://repo-bucket/path/to/repo" 35 | result = Repo.from_uri(uri) 36 | assert result.bucket == "repo-bucket" 37 | assert result.prefix == "path/to/repo" 38 | 39 | 40 | @pytest.mark.parametrize("spec, expected_bucket, expected_key", [ 41 | ("plain_filename.txt", "repo-bucket", 
"repo/prefix/plain_filename.txt"), 42 | ("s3://other-bucket/other/dir/filename.txt", "other-bucket", "other/dir/filename.txt") 43 | ]) 44 | def test_repo_qualify(spec, expected_bucket, expected_key): 45 | repo = Repo(bucket="repo-bucket", prefix="repo/prefix") 46 | result = repo.qualify(spec) 47 | assert isinstance(result, S3File) 48 | assert result.bucket == expected_bucket 49 | assert result.key == expected_key 50 | 51 | 52 | def test_repo_sub_repo(): 53 | repo = Repo(bucket="repo-bucket", prefix="repo/prefix") 54 | result = repo.sub_repo("sub-repo") 55 | assert isinstance(result, Repo) 56 | assert result.bucket == "repo-bucket" 57 | assert result.prefix == "repo/prefix/sub-repo" 58 | 59 | 60 | def test_repo_repr(): 61 | result = Repo(bucket="repo-bucket", prefix="repo/prefix") 62 | expect = "s3://repo-bucket/repo/prefix" 63 | assert str(result) == expect 64 | 65 | 66 | def test_repo_json_serialize(): 67 | repo = Repo(bucket="repo-bucket", prefix="repo/prefix") 68 | result = json.dumps({"repo": repo}, sort_keys=True) 69 | expect0 = { 70 | "repo": { 71 | "bucket": "repo-bucket", 72 | "prefix": "repo/prefix", 73 | "uri": "s3://repo-bucket/repo/prefix" 74 | } 75 | } 76 | expect = json.dumps(expect0, sort_keys=True) 77 | assert result == expect 78 | 79 | 80 | def test_repo_to_dict(): 81 | repo = Repo(bucket="repo-bucket", prefix="repo/prefix") 82 | result = dict(repo) 83 | expect = { 84 | "bucket": "repo-bucket", 85 | "prefix": "repo/prefix", 86 | "uri": "s3://repo-bucket/repo/prefix" 87 | } 88 | assert result == expect -------------------------------------------------------------------------------- /lambda/src/chooser/multichooser.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | import json 3 | import logging 4 | import math 5 | import re 6 | from typing import Generator, Tuple, Any 7 | 8 | import boto3 9 | from box import Box, BoxList 10 | 11 | from lambda_logs import log_preamble, log_event 12 | from substitutions import substitute_job_data 13 | 14 | logger = logging.getLogger() 15 | logger.setLevel(logging.INFO) 16 | 17 | 18 | class ConditionFailed(Exception): 19 | pass 20 | 21 | 22 | def load_s3_object(repo: str, input_file: str) -> Any: 23 | if input_file.startswith("s3://"): 24 | s3_path = input_file 25 | else: 26 | s3_path = f"{repo}/{input_file}" 27 | 28 | logger.info(f"loading {s3_path}") 29 | s3 = boto3.client("s3") 30 | 31 | bucket, key = s3_path.split("/", 3)[2:] 32 | response = s3.get_object(Bucket=bucket, Key=key) 33 | with closing(response["Body"]) as fp: 34 | ret = json.load(fp) 35 | 36 | return ret 37 | 38 | 39 | def load_vals(inputs_json: str, repo: str) -> Generator[Tuple, None, None]: 40 | job_data = load_s3_object(repo, "_JOB_DATA_") 41 | yield "job", job_data["job"] 42 | 43 | inputs = json.loads(inputs_json) 44 | jobby_inputs = substitute_job_data(inputs, job_data) 45 | for name, input_file in jobby_inputs.items(): 46 | vals = load_s3_object(repo, input_file) 47 | yield name, vals 48 | 49 | if len(inputs) == 1 and isinstance(vals, dict): 50 | vals.pop(name, None) 51 | for name2, val2 in vals.items(): 52 | yield name2, val2 53 | 54 | 55 | def eval_this(expr: str, vals: dict): 56 | result = eval(expr, globals(), vals) 57 | return result 58 | 59 | 60 | def run_exprs(exprs: list, vals: dict): 61 | for expr in exprs: 62 | result = eval_this(expr, vals) 63 | logger.info(f"evaluating '{expr}': {result}") 64 | if result: 65 | logger.info(f"returning '{expr}'") 66 | return expr 67 | logger.info("no 
conditions evaluated True, returning null") 68 | return None 69 | 70 | 71 | def lambda_handler(event: dict, context: object): 72 | # event = { 73 | # repo: str 74 | # inputs: str # needs to be a json string for auto inputs compatibility 75 | # expressions: [str] | expression: str 76 | # logging: { 77 | # branch: str 78 | # job_file_bucket: str 79 | # job_file_key: str 80 | # job_file_version: str 81 | # sfn_execution_id: str 82 | # step_name: str 83 | # workflow_name: str 84 | # } 85 | # } 86 | 87 | log_preamble(**event.pop("logging"), logger=logger) 88 | log_event(logger, event) 89 | 90 | vals = Box(load_vals(event["inputs"], event["repo"])) 91 | 92 | if "expressions" in event: 93 | ret = run_exprs(event["expressions"], vals) 94 | return ret 95 | 96 | elif "expression" in event: 97 | result = eval_this(event["expression"], vals) 98 | if not result: 99 | raise ConditionFailed 100 | return event["expression"] 101 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/cache.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor 2 | import fcntl 3 | import logging 4 | import os 5 | from typing import Dict, Tuple 6 | 7 | import backoff 8 | import boto3 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | logging.getLogger("backoff").setLevel(logging.ERROR) 13 | 14 | 15 | def _backoff_handler(details): 16 | name = details["kwargs"]["name_for_logging"] 17 | wait = details["wait"] 18 | logger.debug(f"failed to lock {name}, retrying in {wait} seconds") 19 | 20 | 21 | def _blocking_download(s3_object, dest_path: str, name_for_logging: str) -> None: 22 | if os.path.isfile(dest_path): 23 | logger.info(f"found {name_for_logging} in cache") 24 | else: 25 | logger.debug(f"acquiring a lock on {name_for_logging}") 26 | lock_path = f"{os.path.dirname(dest_path)}.lock" 27 | with open(lock_path, "w") as lfp: 28 | fcntl.flock(lfp, fcntl.LOCK_EX | fcntl.LOCK_NB) 29 | logger.debug(f"lock acquired for {name_for_logging}") 30 | s3_size = s3_object.content_length 31 | logger.info(f"downloading {name_for_logging} ({s3_size} bytes) to cache") 32 | os.makedirs(os.path.dirname(dest_path), exist_ok=True) 33 | s3_object.download_file(dest_path) 34 | local_size = os.path.getsize(dest_path) 35 | logger.info(f"{name_for_logging} ({s3_size} bytes) downloaded to cache ({local_size} bytes)") 36 | logger.debug(f"releasing lock on {name_for_logging}") 37 | fcntl.flock(lfp, fcntl.LOCK_UN) 38 | os.remove(lock_path) 39 | 40 | 41 | @backoff.on_exception(backoff.constant, 42 | BlockingIOError, 43 | interval=5, 44 | jitter=None, 45 | on_backoff=_backoff_handler) 46 | def _download_loop(s3_object, dest_path: str, *, name_for_logging: str) -> None: 47 | _blocking_download(s3_object, dest_path, name_for_logging) 48 | 49 | 50 | def _download_to_cache(item: Tuple[str, str]) -> Tuple[str, str]: 51 | session = boto3.Session() 52 | 53 | key, s3_path = item 54 | s3_bucket, s3_key = s3_path.split("/", 3)[2:] 55 | src = session.resource("s3").Object(s3_bucket, s3_key) 56 | 57 | cache_path = os.environ["BC_SCRATCH_PATH"] 58 | src_etag = src.e_tag.strip('"') # ETag comes wrapped in double quotes for some reason 59 | file_name = os.path.basename(s3_key) 60 | 61 | cached_file = f"{cache_path}/{src_etag}/{file_name}" 62 | 63 | _download_loop(src, cached_file, name_for_logging=file_name) 64 | 65 | return key, cached_file 66 | 67 | 68 | def get_reference_inputs(ref_spec: Dict[str, str]) -> Dict[str, str]: 69 | ret 
= {} 70 | 71 | if len(ref_spec) > 0: 72 | logger.info(f"caching references: {list(ref_spec.values())}") 73 | 74 | with ThreadPoolExecutor(max_workers=len(ref_spec)) as executor: 75 | result = list(executor.map(_download_to_cache, ref_spec.items())) 76 | 77 | for key, src in result: 78 | dst = ret[key] = os.path.basename(src) 79 | logger.info(f"linking cached {dst} to workspace") 80 | os.link(src, dst) 81 | 82 | return ret 83 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/subpipe_resources.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from typing import List 5 | 6 | from .util import Step, State, lambda_logging_block, lambda_retry 7 | 8 | 9 | def file_submit_step(step: Step, run_subpipe_step_name: str) -> dict: 10 | ret = { 11 | "Type": "Task", 12 | "Resource": os.environ["SUBPIPES_LAMBDA_ARN"], 13 | "Parameters": { 14 | "repo.$": "$.repo.uri", 15 | "job_data": step.spec["job_data"], 16 | "submit": json.dumps(step.spec["submit"]), 17 | "step_name": step.name, 18 | **lambda_logging_block(step.name), 19 | }, 20 | **lambda_retry(), 21 | "ResultPath": "$.subpipe", 22 | "OutputPath": "$", 23 | "Next": run_subpipe_step_name, 24 | } 25 | 26 | return ret 27 | 28 | 29 | def run_subpipe_step(step: Step, retrieve_step_name: str) -> dict: 30 | state_machine_arn = step.spec["subpipe"] 31 | 32 | if not state_machine_arn.startswith("arn:"): 33 | state_machine_arn = "arn:aws:states:${AWSRegion}:${AWSAccountId}:stateMachine:" + state_machine_arn 34 | 35 | ret = { 36 | "Type": "Task", 37 | "Resource": "arn:aws:states:::states:startExecution.sync", 38 | "Parameters": { 39 | "Input": { 40 | "index": "main", 41 | "job_file.$": "$.job_file", 42 | "prev_outputs": {}, 43 | "repo.$": "$.subpipe.sub_repo", 44 | "share_id.$": "$.share_id", 45 | "AWS_STEP_FUNCTIONS_STARTED_BY_EXECUTION_ID.$": "$$.Execution.Id", 46 | }, 47 | # todo: this could get to be too long if you have nested subpipes 48 | # might be better to compute it in subpipe lambda 49 | "Name.$": f"States.Format('{{}}_{step.name}', $$.Execution.Name)", 50 | "StateMachineArn": state_machine_arn, 51 | }, 52 | "ResultPath": None, 53 | "OutputPath": "$", 54 | "Next": retrieve_step_name 55 | } 56 | 57 | return ret 58 | 59 | 60 | def file_retrieve_step(step: Step) -> dict: 61 | ret = { 62 | "Type": "Task", 63 | "Resource": os.environ["SUBPIPES_LAMBDA_ARN"], 64 | "Parameters": { 65 | "repo.$": "$.repo.uri", 66 | "retrieve": json.dumps(step.spec["retrieve"]), 67 | "subpipe": { 68 | "sub_repo.$": "$.subpipe.sub_repo.uri", 69 | }, 70 | **lambda_logging_block(step.name) 71 | }, 72 | **lambda_retry(), 73 | "ResultSelector": {}, 74 | "ResultPath": "$.prev_outputs", 75 | "OutputPath": "$", 76 | **step.next_or_end, 77 | } 78 | 79 | return ret 80 | 81 | 82 | def handle_subpipe(step: Step) -> List[State]: 83 | logger = logging.getLogger(__name__) 84 | logger.info(f"making subpipe step {step.name}") 85 | 86 | submit_step_name = step.name 87 | subpipe_step_name = f"{step.name}.subpipe" 88 | retrieve_step_name = f"{step.name}.retrieve" 89 | 90 | ret = [ 91 | State(submit_step_name, file_submit_step(step, subpipe_step_name)), 92 | State(subpipe_step_name, run_subpipe_step(step, retrieve_step_name)), 93 | State(retrieve_step_name, file_retrieve_step(step)), 94 | ] 95 | 96 | return ret 97 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Bayer CLoud Automated Workflows (BayerCLAW) 2 | 3 | BayerCLAW is a workflow orchestration system targeted at bioinformatics pipelines. 4 | A workflow consists of a sequence of computational steps, each of which is captured in a Docker container. 5 | Some steps may parallelize work across many executions of the same container (scatter/gather pattern). 6 | 7 | A workflow is described in a YAML file. 8 | The BayerCLAW compiler uses AWS CloudFormation to transform the workflow description into AWS resources used by the workflow. 9 | This includes an AWS StepFunctions state machine that represents the sequence of steps in the workflow. 10 | 11 | A workflow typically takes several parameters, such as sample IDs or paths to input files. 12 | Once the workflow definition has been deployed, the workflow can be executed by copying a JSON file with the 13 | execution parameters to a "launcher" S3 bucket, which is constructed by BayerCLAW. 14 | The workflow state machine uses AWS Batch to actually run the Docker containers, in the proper order. 15 | 16 | ## Documentation 17 | 18 | - [Quick start -- deploying a BayerCLAW workflow](doc/quick-start.md) 19 | - [Tutorial -- detailed example of writing, deploying, and debugging](doc/tutorial.md) 20 | 21 | - [Installing BayerCLAW into a new AWS account](doc/installation.md) 22 | - [The BayerCLAW language reference](doc/language.md) 23 | - [The BayerCLAW language -- scatter/gather](doc/scatter.md) 24 | - [The BayerCLAW language -- QC checks](doc/qc.md) 25 | - [The BayerCLAW language -- subpipes](doc/subpipes.md) 26 | - [Runtime environment and Docker guidelines](doc/runtime_env.md) for steps 27 | - [BayerCLAW notifications](doc/notifications.md) 28 | 29 | The [doc/](doc/) directory of this repo contains all the pages linked above. 30 | 31 | ## Key components of BayerCLAW 32 | 33 | ### The workflow definition 34 | 35 | The BayerCLAW workflow template is a JSON- or YAML-formatted file describing the processing steps of the pipeline. 36 | Here is an example of a very simple, one-step workflow: 37 | 38 | ```YAML 39 | Transform: BC2_Compiler 40 | 41 | Repository: s3://example-bucket/hello-world/${job.SAMPLE_ID} 42 | 43 | Steps: 44 | - hello: 45 | image: docker.io/library/ubuntu 46 | commands: 47 | - echo "Hello world! This is job ${job.SAMPLE_ID}!" 48 | ``` 49 | 50 | ### The repository 51 | 52 | The repository is a path within an S3 bucket where a given workflow stores its output files, such as `s3://generic-workflow-bucket/my-workflow-repo/`. 53 | The repo is typically parameterized with some job-specific unique ID, so that each execution of the workflow is kept separate. 54 | For example, `s3://generic-workflow-bucket/my-workflow-repo/job12345/` 55 | 56 | ### Job data file 57 | The job data file contains data needed for a single pipeline execution. 58 | This data must be encoded as a flat JSON object with string keys and string values. 59 | Even integer or float values should be quoted as strings. 60 | 61 | Copying the job data file into the launcher bucket will trigger an execution of the pipeline. 62 | Overwriting the job data file, even with the same contents, will trigger another execution. 
63 | 64 | #### Sample job data file 65 | ```json5 66 | { 67 | "SAMPLE_ID": "ABC123", 68 | "READS1": "s3://workflow-bucket/inputs/reads1.fq", 69 | "READS2": "s3://workflow-bucket/inputs/reads2.fq" 70 | } 71 | ``` 72 | -------------------------------------------------------------------------------- /doc/runtime_env.md: -------------------------------------------------------------------------------- 1 | # BayerCLAW runtime environment 2 | 3 | Each BayerCLAW step runs on AWS Batch, using the specified Docker image. 4 | 5 | The entry point is a program called `bclaw_runner`, which is hosted in its own Docker 6 | container and does not need to be baked into the user's Docker image. 7 | The runner is responsible for downloading inputs from S3, 8 | running the user-specified commands, and uploading the output to S3. 9 | BayerCLAW manages this; the runner should be basically invisible to users. 10 | 11 | If one user command exits with an error (non-zero exit code), 12 | the following commands in that step will not be run. 13 | However, any outputs will still be uploaded to S3. 14 | If the container exceeds its maximum allotted memory, 15 | all processes in the container will be killed immediately so no upload is possible. 16 | (Batch will typically report an error code 137 for out of memory.) 17 | 18 | Each Batch EC2 instance has a temporary EBS volume attached as scratch space. 19 | By default, each *instance* has a 1 TB scratch volume. 20 | However, multiple jobs may share a single instance, in which case they have to share the scratch space. 21 | AWS Batch controls how jobs are packed onto instances, and we are not aware of a way for users to control this. 22 | 23 | For each job, `bclaw_runner` will create a temporary directory on the scratch volume. 24 | User commands will be started in this directory. Inputs are downloaded to, and outputs are uploaded from, this directory. 25 | Before exiting, `bclaw_runner` will remove the directory, to free up space for future jobs that may run on this machine. 26 | 27 | # Environment variables 28 | 29 | The following environment variables are available in BayerCLAW Batch jobs: 30 | 31 | - `BC_BRANCH_IDX`: For jobs running inside of a Scatter step, this will be a number corresponding to the map index 32 | assigned by Step Functions. Outside of a Scatter step, this will always be `main`. 33 | - `BC_EXECUTION_ID`: The ID of the Step Functions execution that triggered this Batch job. You can use this to find 34 | the execution in the Step Functions console. 35 | - `BC_JOB_DATA_FILE`: This is a fully-qualified path to a JSON-formatted file containing the input job data. 36 | - `BC_STEP_NAME`: The name of the current workflow step. 37 | - `BC_WORKFLOW_NAME`: The workflow name. This is the same as the name of the workflow's CloudFormation stack. 38 | - `BC_WORKSPACE`: This is the fully-qualified path to the job's working directory. 39 | - `AWS_ACCOUNT_ID`: The ID of the AWS account the job is running in. 40 | - `AWS_DEFAULT_REGION`: The AWS region the job is running in. 41 | 42 | These can be incorporated into commands just as one would normally use environment variables, e.g.: 43 | 44 | ```bash 45 | do_something --cfg ${BC_JOB_DATA_FILE} ${input1} ${input2} 46 | ``` 47 | 48 | # Docker guidelines 49 | Docker limits the number of anonymous pull requests that a single IP address can perform against Docker Hub. 
Therefore, 50 | while you can use Docker Hub images for low-throughput workflows or for workflows in development, it is 51 | recommended that you store all of your Docker images in Amazon ECR for high-throughput production workflows. 52 | 53 | Docker images must not specify an ENTRYPOINT -- this prevents `bclaw_runner` from executing correctly. 54 | 55 | If the Docker image specifies a WORKDIR, it will be ignored when run under BayerCLAW. 56 | -------------------------------------------------------------------------------- /.github/workflows/installer.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - installer 5 | workflow_dispatch: 6 | 7 | jobs: 8 | doIt: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | contents: "read" 12 | id-token: "write" 13 | steps: 14 | - name: checkout 15 | uses: actions/checkout@v4 16 | with: 17 | fetch-depth: 0 18 | fetch-tags: true 19 | 20 | - name: setupPython 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: "3.10" 24 | 25 | - name: setupSam 26 | uses: aws-actions/setup-sam@v2 27 | with: 28 | use-installer: true 29 | token: ${{ secrets.GITHUB_TOKEN }} 30 | 31 | - name: getCreds 32 | uses: aws-actions/configure-aws-credentials@v4 33 | with: 34 | # role-to-assume: "arn:aws:iam::934778205429:role/jaxGithubActionsRole" 35 | role-to-assume: "arn:aws:iam::934778205429:role/jax-github-role-2025" 36 | aws-region: us-east-2 37 | 38 | - name: runTests 39 | run: | 40 | pip install -r lambda/tests/requirements.txt 41 | PYTHONPATH=${GITHUB_WORKSPACE}/lambda/src/common/python:$PYTHONPATH pytest -s -vvv lambda/tests/ 42 | 43 | - name: installCore 44 | id: install-core 45 | env: 46 | SUBNETS: "subnet-3ffe7854,subnet-b3b296ff,subnet-e1c63a9c" 47 | VPC_ID: "vpc-00cb556b" 48 | run: | 49 | export SOURCE_VERSION=$(git describe --tags) 50 | export UNIQIFIER=$(date | md5sum | head -c 16) 51 | 52 | sam build -b ./build -s . -t cloudformation/bc_core.yaml 53 | sam deploy \ 54 | --template-file build/template.yaml \ 55 | --stack-name bayerclaw2-core \ 56 | --resolve-s3 \ 57 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" \ 58 | --no-fail-on-empty-changeset \ 59 | --tags "bclaw:version=${SOURCE_VERSION}" \ 60 | --parameter-overrides \ 61 | AmiId=auto \ 62 | CompilerMacroName=BC2_Compiler \ 63 | ExistingBatchSubscriptionFilter=none \ 64 | GpuAmiId=auto \ 65 | InstallationName=bayerclaw2 \ 66 | LauncherBucketName=default \ 67 | LogRetentionDays=30 \ 68 | MaxvCpus=256 \ 69 | MinvCpus=0 \ 70 | RootVolumeSize=50 \ 71 | ScratchVolumeSize=100 \ 72 | SecurityGroups=auto \ 73 | SourceVersion=${SOURCE_VERSION} \ 74 | Subnets=${SUBNETS} \ 75 | Uniqifier=${UNIQIFIER} \ 76 | VpcId=${VPC_ID} 77 | 78 | echo "runner_image_tag=$(aws cloudformation describe-stacks --query "Stacks[?StackName=='bayerclaw2-core'][].Outputs[?OutputKey=='RunnerImageUri'].OutputValue" --output text)" >> $GITHUB_OUTPUT 79 | 80 | - name: loginToEcr 81 | id: login-to-ecr 82 | uses: aws-actions/amazon-ecr-login@v2 83 | 84 | - name: buildRunner 85 | working-directory: bclaw_runner 86 | env: 87 | DOCKER_BUILDKIT: 1 88 | RUNNER_IMAGE_TAG: ${{ steps.install-core.outputs.runner_image_tag }} 89 | run: | 90 | docker build --target test -f Dockerfile.alpine "." 91 | docker build --target build -t ${RUNNER_IMAGE_TAG} -f Dockerfile.alpine "." 
92 | docker push ${RUNNER_IMAGE_TAG} || true 93 | -------------------------------------------------------------------------------- /lambda/tests/compiler/test_chooser_resources.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import pytest 4 | 5 | from voluptuous.error import Invalid 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | 9 | from ...src.compiler.pkg.chooser_resources import choice_spec, handle_chooser_step 10 | from ...src.compiler.pkg.util import Step, State, lambda_logging_block, lambda_retry 11 | 12 | 13 | def test_choice_spec(): 14 | condition = "x == 1" 15 | next_step = "step99" 16 | 17 | result = choice_spec(condition, next_step) 18 | expect = { 19 | "Variable": "$.choice", 20 | "StringEquals": condition, 21 | "Next": next_step, 22 | } 23 | assert result == expect 24 | 25 | 26 | def test_make_chooser_steps(compiler_env): 27 | spec = { 28 | "inputs": { 29 | "infile1": "file1.json", 30 | "infile2": "file2.json", 31 | }, 32 | "choices": [ 33 | { 34 | "if": "infile1.var1 == 1", 35 | "next": "step99", 36 | }, 37 | { 38 | "if": "infile2.var2 == 2", 39 | "next": "step98", 40 | }, 41 | { 42 | "if": "job.var3 == 3", 43 | "next": "step97", 44 | }, 45 | ] 46 | } 47 | 48 | test_step = Step("step_name", spec, "next_step") 49 | 50 | expected_task_spec = { 51 | "Type": "Task", 52 | "Resource": "chooser_lambda_arn", 53 | "Parameters": { 54 | "repo.$": "$.repo.uri", 55 | "inputs": json.dumps(spec["inputs"], separators=(",", ":")), 56 | "expressions": [ 57 | "infile1.var1 == 1", 58 | "infile2.var2 == 2", 59 | "job.var3 == 3", 60 | ], 61 | **lambda_logging_block("step_name") 62 | }, 63 | **lambda_retry(), 64 | "ResultPath": "$.choice", 65 | "OutputPath": "$", 66 | "Next": "step_name.choose", 67 | } 68 | 69 | expected_choice_spec = { 70 | "Type": "Choice", 71 | "Choices": [ 72 | { 73 | "Variable": "$.choice", 74 | "StringEquals": "infile1.var1 == 1", 75 | "Next": "step99", 76 | }, 77 | { 78 | "Variable": "$.choice", 79 | "StringEquals": "infile2.var2 == 2", 80 | "Next": "step98", 81 | }, 82 | { 83 | "Variable": "$.choice", 84 | "StringEquals": "job.var3 == 3", 85 | "Next": "step97", 86 | }, 87 | ], 88 | "Default": "next_step" 89 | } 90 | 91 | result = handle_chooser_step(test_step) 92 | assert len(result) == 2 93 | assert all(isinstance(s, State) for s in result) 94 | 95 | task_state = result[0] 96 | assert task_state.name == "step_name" 97 | assert task_state.spec == expected_task_spec 98 | 99 | choice_state = result[1] 100 | assert choice_state.name == "step_name.choose" 101 | assert choice_state.spec == expected_choice_spec 102 | 103 | 104 | def test_make_chooser_steps_terminal_state_fail(): 105 | test_step = Step("step_name", {"not": "used"}, "") 106 | with pytest.raises(Invalid, match="chooser steps cannot be terminal"): 107 | _ = handle_chooser_step(test_step) 108 | -------------------------------------------------------------------------------- /cloudformation/bc_ecs_task_role.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | 3 | Parameters: 4 | RoleName: 5 | Type: String 6 | Default: "default" 7 | 8 | PolicyName: 9 | Type: String 10 | Default: "default" 11 | 12 | Conditions: 13 | UseDefaultRoleName: !Or [ !Equals [!Ref RoleName, "Default"], 14 | !Equals [!Ref RoleName, "default"] ] 15 | 16 | UseDefaultPolicyName: !Or [ !Equals [!Ref PolicyName, "Default"], 17 | !Equals [!Ref PolicyName, "default"] ] 18 | 19 | Resources: 20 | 
EcsTaskRole: 21 | Type: AWS::IAM::Role 22 | DeletionPolicy: Retain 23 | Properties: 24 | RoleName: 25 | !If [UseDefaultRoleName, !Ref AWS::NoValue, !Ref RoleName] 26 | AssumeRolePolicyDocument: 27 | Version: "2012-10-17" 28 | Statement: 29 | - Effect: "Allow" 30 | Principal: 31 | Service: 32 | - "ecs-tasks.amazonaws.com" 33 | Action: 34 | - "sts:AssumeRole" 35 | 36 | EcsTaskPolicy: 37 | Type: AWS::IAM::ManagedPolicy 38 | DeletionPolicy: Retain 39 | Properties: 40 | ManagedPolicyName: 41 | !If [UseDefaultPolicyName, !Ref AWS::NoValue, !Ref PolicyName] 42 | Roles: 43 | - !Ref EcsTaskRole 44 | PolicyDocument: 45 | Version: "2012-10-17" 46 | Statement: 47 | - Sid: "writeLogs" 48 | Effect: "Allow" 49 | Action: 50 | - "logs:CreateLogStream" 51 | - "logs:PutLogEvents" 52 | Resource: !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:log-group:*" 53 | - Sid: "s3BucketAccess" 54 | Effect: "Allow" 55 | Action: 56 | - "s3:ListBucket" 57 | Resource: "arn:aws:s3:::*" 58 | - Sid: "s3ObjectAccess" 59 | Effect: "Allow" 60 | Action: 61 | - "s3:PutObject" 62 | - "s3:GetObject" 63 | - "s3:PutObjectAcl" 64 | - "s3:GetObjectTagging" 65 | - "s3:PutObjectTagging" 66 | - "s3:AbortMultipartUpload" 67 | Resource: "arn:aws:s3:::*/*" 68 | - Sid: "deleteControlObjects" 69 | Effect: "Allow" 70 | Action: 71 | - "s3:DeleteObject" 72 | Resource: "arn:aws:s3:::*/_control_/*" 73 | - Sid: "ecrAccess" 74 | Effect: "Allow" 75 | Action: 76 | - "ecr:GetAuthorizationToken" 77 | - "ecr:BatchCheckLayerAvailability" 78 | - "ecr:GetDownloadUrlForLayer" 79 | - "ecr:BatchGetImage" 80 | Resource: "*" 81 | - Sid: "ec2InstanceTagging" 82 | Effect: "Allow" 83 | Action: 84 | - "ec2:CreateTags" 85 | - "ec2:DeleteTags" 86 | Resource: "*" 87 | - Sid: "abortExecution" 88 | Effect: "Allow" 89 | Action: 90 | - "states:StopExecution" 91 | Resource: "*" 92 | - Sid: "getSecrets" 93 | Effect: "Allow" 94 | Action: 95 | - "secretsmanager:GetSecretValue" 96 | Resource: "*" 97 | 98 | Outputs: 99 | EcsTaskRoleArn: 100 | Value: !GetAtt EcsTaskRole.Arn 101 | 102 | EcsTaskPolicyArn: 103 | Value: !Ref EcsTaskPolicy 104 | -------------------------------------------------------------------------------- /lambda/src/common/python/repo_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import json 3 | 4 | 5 | SYSTEM_FILE_TAG = "bclaw.system=true" 6 | 7 | # @dataclass 8 | # class S3File: 9 | # bucket: str 10 | # key: str 11 | # 12 | # def __repr__(self): 13 | # return f"s3://{self.bucket}/{self.key}" 14 | # 15 | # # def __eq__(self, other): 16 | # # return self.bucket == other.bucket and self.key == other.key 17 | 18 | class S3File(str): 19 | def __new__(cls, bucket: str, key: str): 20 | return str.__new__(cls, f"s3://{bucket}/{key}") 21 | 22 | def __init__(self, bucket: str, key: str): 23 | self.bucket = bucket 24 | self.key = key 25 | 26 | 27 | # @dataclass 28 | # class Repo: 29 | # bucket: str 30 | # prefix: str 31 | # 32 | # @classmethod 33 | # def from_uri(cls, uri: str): 34 | # bucket, key = uri.split("/", 3)[2:] 35 | # return cls(bucket, key) 36 | # 37 | # def qualify(self, uri: str) -> S3File: 38 | # if uri.startswith("s3://"): 39 | # ret = S3File(*uri.split("/", 3)[2:]) 40 | # else: 41 | # wtf = f"{self.prefix}/{uri}" 42 | # ret = S3File(bucket=self.bucket, key=f"{self.prefix}/{uri}") 43 | # return ret 44 | # 45 | # def sub_repo(self, name): 46 | # ret = Repo(self.bucket, f"{self.prefix}/{name}") 47 | # return ret 48 | # 49 | # @property 50 | # def job_data_file(self) -> 
S3File: 51 | # return self.qualify("_JOB_DATA_") 52 | # 53 | # def __repr__(self): 54 | # return f"s3://{self.bucket}/{self.prefix}" 55 | # 56 | # # def __eq__(self, other): 57 | # # return self.bucket == other.bucket and self.prefix == other.prefix 58 | 59 | class Repo(dict): 60 | def __init__(self, *args, **kwargs): 61 | dict.__init__(self, *args, **kwargs) 62 | self.setdefault("uri", f"s3://{self.bucket}/{self.prefix}") 63 | 64 | @classmethod 65 | def from_uri(cls, uri: str): 66 | bucket, prefix = uri.split("/", 3)[2:] 67 | return cls(bucket=bucket, prefix=prefix) 68 | 69 | @property 70 | def bucket(self) -> str: 71 | return self["bucket"] 72 | 73 | @property 74 | def prefix(self) -> str: 75 | return self["prefix"] 76 | 77 | @property 78 | def uri(self) -> str: 79 | return self["uri"] 80 | 81 | def qualify(self, file_spec: str) -> S3File: 82 | if file_spec.startswith("s3://"): 83 | ret = S3File(*file_spec.split("/", 3)[2:]) 84 | else: 85 | ret = S3File(self.bucket, f"{self.prefix}/{file_spec}") 86 | return ret 87 | 88 | def sub_repo(self, name): 89 | ret = Repo(bucket=self.bucket, prefix=f"{self.prefix}/{name}") 90 | return ret 91 | 92 | def __repr__(self) -> str: 93 | return self.uri 94 | 95 | 96 | # https://stackoverflow.com/questions/51286748/make-the-python-json-encoder-support-pythons-new-dataclasses 97 | # class RepoEncoder(json.JSONEncoder): 98 | # def default(self, o): 99 | # if isinstance(o, (Repo, S3File)): 100 | # return str(o) 101 | # else: 102 | # return super().default(o) 103 | 104 | 105 | # class OtherEncoder(json.JSONEncoder): 106 | # def default(self, o): 107 | # if isinstance(o, Repo): 108 | # return {"bucket": o.bucket, "prefix": o.prefix, "uri": str(o)} 109 | # else: 110 | # return super().default(o) 111 | -------------------------------------------------------------------------------- /doc/workflow_versions.md: -------------------------------------------------------------------------------- 1 | # Versioned Workflows and Blue/Green Deployment 2 | 3 | BayerCLAW2 workflows are deployed using a Blue/Green method, which allows you to publish updated 4 | versions of your workflow without downtime, even when jobs are in progress. Blue/Green deployment 5 | also enables you to roll your workflows back to earlier versions if necessary. 6 | 7 | ## Step Function versions and aliases 8 | 9 | Blue/Green deployment is implemented through the use of Step Functions versions and aliases. 10 | When you compile a workflow of a given name, the resulting Step Function state machine receives 11 | a unique version number. The version number increases monotonically, is immutable, and will never 12 | be reused. Older versions of the state machine are not automatically deleted (although you may 13 | delete them manually), so any jobs running on a previous version will not be interrupted. 14 | 15 | During compilation, the newest version of a state machine also receives the alias `current`. The 16 | `current` alias points to the currently active version of the workflow state machine -- when you put 17 | a job data file into the launcher bucket, a job is triggered on the `current` state machine. 18 | 19 | See the AWS documentation for more information on Step Functions 20 | [versions](https://docs.aws.amazon.com/step-functions/latest/dg/concepts-state-machine-version.html) and 21 | [aliases](https://docs.aws.amazon.com/step-functions/latest/dg/concepts-state-machine-alias.html). 
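As a quick illustration (the region, account ID, and workflow name below are the same placeholders used in the examples that follow), you can list a workflow's state machine versions and see which version the `current` alias points to using the AWS CLI:

```bash
# list the versions that exist for a workflow's state machine
aws stepfunctions list-state-machine-versions \
    --state-machine-arn arn:aws:states:us-east-1:123456789012:stateMachine:my-workflow

# show the version that the "current" alias routes jobs to
aws stepfunctions describe-state-machine-alias \
    --state-machine-alias-arn arn:aws:states:us-east-1:123456789012:stateMachine:my-workflow:current
```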
22 | 23 | ## Rolling Back to Earlier Versions 24 | 25 | Faulty workflow deployments can be rolled back to a previous state by reassigning the `current` alias to 26 | the desired state machine version. To do so on the AWS console, navigate to the state machine's page and select 27 | the `Aliases` tab; then select the `current` alias and click `Edit`. You can select the desired version in 28 | the dropdown[1](#f1). 29 | 30 | Rollbacks may also be conducted using the AWS CLI: 31 | 32 | ```bash 33 | aws stepfunctions update-state-machine-alias \ 34 | --state-machine-alias-arn arn:aws:states:us-east-1:123456789012:stateMachine:my-workflow:current \ 35 | --routing-configuration stateMachineVersionArn=arn:aws:states:us-east-1:123456789012:stateMachine:my-workflow:2,weight=100 36 | ``` 37 | 38 | It is also possible to submit jobs directly to a previous version of a workflow. To do so, append a colon 39 | and version number to the workflow name in the launcher bucket path, for example: 40 | 41 | 's3://bclaw2-launcher-123456789012/**my-workflow:9**/job_data.json' 42 | 43 | If you assign a custom alias to a certain workflow version, you can submit jobs to that aliased version 44 | in a similar manner: 45 | 46 | 's3://bclaw2-launcher-123456789012/**my-workflow:my-alias**/job_data.json' 47 | 48 | ### *Important!* 49 | 50 | Proper workflow rollbacks depend critically on the use of versioned Docker images. If you rely on Docker's 51 | default `:latest` tag (or even on a mutable generic tag like `:prod`), BayerCLAW could roll back your 52 | workflow's structure, but continue to use buggy Docker images. Consider using a CI/CD system such 53 | as AWS' CodeBuild to build your Docker images upon each new release, and pass the fully-qualified 54 | image tag to BayerCLAW (using `aws cloudformation deploy --parameter-overrides...`) as Parameter values. 55 | 56 | 
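For example (a sketch only -- the `ImageTag` parameter name is hypothetical and must match a Parameter declared in your workflow template), such a deployment might look like:

```bash
aws cloudformation deploy \
    --template-file my-workflow.yaml \
    --stack-name my-workflow \
    --capabilities CAPABILITY_IAM \
    --parameter-overrides \
        ImageTag=123456789012.dkr.ecr.us-east-1.amazonaws.com/my-tool:1.2.3
```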
57 | 58 | 1 Note that you have the option to split incoming jobs between two 59 | state machine versions, assigning a percentage of traffic to each. This is not likely 60 | to be too useful but is still available.[↵](#a1) 61 | -------------------------------------------------------------------------------- /bclaw_runner/tests/test_workspace.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import subprocess 5 | 6 | import pytest 7 | 8 | from ..src import runner 9 | from ..src.runner.workspace import workspace, write_job_data_file, run_commands, run_commands, UserCommandsFailed 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | 14 | def test_workspace(monkeypatch, tmp_path): 15 | monkeypatch.setenv("BC_SCRATCH_PATH", str(tmp_path)) 16 | orig_dir = os.getcwd() 17 | 18 | with workspace() as wrk: 19 | assert os.path.isdir(wrk) 20 | assert os.getcwd() == wrk 21 | assert os.path.dirname(wrk) == str(tmp_path) 22 | 23 | assert os.getcwd() == orig_dir 24 | assert not os.path.isdir(wrk) 25 | 26 | 27 | def test_write_job_data_file(tmp_path): 28 | job_data = { 29 | "one": 1, 30 | "two": 2, 31 | "three": 3, 32 | } 33 | 34 | jdf = write_job_data_file(job_data, str(tmp_path)) 35 | 36 | assert os.path.exists(jdf) 37 | assert os.path.dirname(jdf) == str(tmp_path) 38 | 39 | with open(jdf) as fp: 40 | jdf_contents = json.load(fp) 41 | assert jdf_contents == job_data 42 | 43 | 44 | def fake_container(image_tag: str, command: str, work_dir: str, job_data_file) -> int: 45 | response = subprocess.run(command, shell=True) 46 | return response.returncode 47 | 48 | 49 | def test_run_commands(tmp_path, monkeypatch, caplog): 50 | caplog.set_level(logging.INFO) 51 | monkeypatch.setattr(runner.workspace, "run_child_container", fake_container) 52 | f = tmp_path / "test_success.out" 53 | 54 | commands = [ 55 | f"echo 'one' > {str(f)}", 56 | "z='two'", 57 | f"echo $z >> {str(f)}" 58 | ] 59 | 60 | os.chdir(tmp_path) 61 | response = run_commands("fake/image:tag", commands, tmp_path, "fake/job/data/file.json", "sh") 62 | 63 | assert "command block succeeded" in caplog.text 64 | assert f.exists() 65 | with f.open() as fp: 66 | lines = fp.readlines() 67 | assert lines == ["one\n", "two\n"] 68 | 69 | 70 | def test_exit_on_command_fail1(tmp_path, monkeypatch): 71 | monkeypatch.setattr(runner.workspace, "run_child_container", fake_container) 72 | f = tmp_path / "test_exit_on_command_fail.out" 73 | 74 | commands = [ 75 | f"echo 'one' > {str(f)}", 76 | "false", 77 | f"echo $z >> {str(f)}" 78 | ] 79 | 80 | os.chdir(tmp_path) 81 | with pytest.raises(UserCommandsFailed) as ucf: 82 | run_commands("fake/image:tag", commands, tmp_path, "fake/job/data/file.json", "sh") 83 | assert ucf.value.exit_code != 0 84 | 85 | assert f.exists() 86 | with f.open() as fp: 87 | lines = fp.readlines() 88 | assert lines == ["one\n"] 89 | 90 | 91 | def test_exit_on_undef_var1(tmp_path, monkeypatch): 92 | monkeypatch.setattr(runner.workspace, "run_child_container", fake_container) 93 | f = tmp_path / "test_exit_on_undef_var.out" 94 | 95 | commands = [ 96 | f"echo 'one' > {str(f)}", 97 | "echo $UNDEFINED_VAR", 98 | f"echo $z >> {str(f)}" 99 | ] 100 | 101 | os.chdir(tmp_path) 102 | with pytest.raises(UserCommandsFailed) as ucf: 103 | run_commands("fake/image:tag", commands, tmp_path, "fake/job/data/file.json", "sh") 104 | assert ucf.value.exit_code != 0 105 | 106 | assert f.exists() 107 | with f.open() as fp: 108 | lines = fp.readlines() 109 | assert lines == 
["one\n"] 110 | -------------------------------------------------------------------------------- /bclaw_runner/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Optional 3 | 4 | import boto3 5 | from docker.errors import ImageNotFound 6 | import moto 7 | import pytest 8 | 9 | 10 | class MockImage: 11 | def __init__(self, name: str, source: str, auth: Optional[dict] = None): 12 | self.tags = [name] 13 | self.source = source 14 | self.auth = auth 15 | self.attrs = { 16 | "RepoDigests": [f"{name}@sha256:1234567890abcdef"], 17 | } 18 | 19 | 20 | class MockImages: 21 | @staticmethod 22 | def get(img_uri: str) -> MockImage: 23 | if img_uri == "local/image": 24 | return MockImage(img_uri, "local repo") 25 | else: 26 | raise ImageNotFound("not found message") 27 | 28 | @staticmethod 29 | def pull(img_uri: str, auth_config: dict) -> MockImage: 30 | if "dkr.ecr" in img_uri: 31 | return MockImage(img_uri, "ecr", auth_config) 32 | else: 33 | if auth_config: 34 | return MockImage(img_uri, "private repo", auth_config) 35 | else: 36 | return MockImage(img_uri, "public repo") 37 | 38 | 39 | class MockContainer: 40 | def __init__(self, exit_code): 41 | self.args = None 42 | self.kwargs = None 43 | self.exit_code = exit_code 44 | self.removed = False 45 | self.status = "created" 46 | 47 | def logs(self, *args, **kwargs) -> io.BytesIO: 48 | ret = io.BytesIO(b"line 1\nline 2\nline 3") 49 | return ret 50 | 51 | def stop(self, *args, **kwargs) -> None: 52 | self.exit_code = 99 53 | 54 | def wait(self, *args, **kwargs) -> dict: 55 | ret = {"StatusCode": self.exit_code} 56 | return ret 57 | 58 | def remove(self, *args, **kwargs) -> None: 59 | self.removed = True 60 | 61 | def reload(self): 62 | self.status = "running" 63 | 64 | 65 | class FailingContainer(MockContainer): 66 | def __init__(self, exit_code: int): 67 | super().__init__(exit_code) 68 | 69 | def logs(self, *args, **kwargs) -> io.BytesIO: 70 | raise RuntimeError("hey") 71 | 72 | 73 | @pytest.fixture(scope="function") 74 | def mock_container_factory(): 75 | def _ret(exit_code: int, logging_crash: bool): 76 | if logging_crash: 77 | return FailingContainer(exit_code) 78 | else: 79 | return MockContainer(exit_code) 80 | return _ret 81 | 82 | 83 | class MockContainers: 84 | def __init__(self, ret: MockContainer): 85 | self.ret = ret 86 | 87 | def run(self, *args, **kwargs) -> MockContainer: 88 | self.ret.args = args 89 | self.ret.kwargs = kwargs 90 | return self.ret 91 | 92 | 93 | class MockDockerClient(): 94 | def __init__(self, container: Optional[MockContainer] = None): 95 | self.images = MockImages() 96 | self.containers = MockContainers(container) 97 | 98 | def close(self): 99 | pass 100 | 101 | 102 | @pytest.fixture(scope="function") 103 | def mock_docker_client_factory(): 104 | def _ret(container: Optional[MockContainer] = None): 105 | return MockDockerClient(container) 106 | return _ret 107 | 108 | 109 | @pytest.fixture(scope="module") 110 | def mock_ec2_instance(): 111 | with moto.mock_aws(): 112 | ec2 = boto3.resource("ec2", region_name="us-east-1") 113 | instances = ec2.create_instances(ImageId="ami-12345", MinCount=1, MaxCount=1) 114 | yield instances[0] 115 | -------------------------------------------------------------------------------- /lambda/tests/gather/test_gather.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | import json 3 | import logging 4 | 5 | import boto3 6 | import moto 
7 | import pytest 8 | 9 | from ...src.gather.gather import lambda_handler 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | TEST_BUCKET = "test-bucket" 14 | JOB_DATA = {"job": {"job": "data"}, "parent": {}, "scatter": {}} 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def repo_bucket(): 19 | with moto.mock_aws(): 20 | yld = boto3.resource("s3", region_name="us-east-1").Bucket(TEST_BUCKET) 21 | yld.create() 22 | yld.put_object(Key="repo/path/_JOB_DATA_", Body=json.dumps(JOB_DATA).encode("utf-8")) 23 | 24 | yld.put_object(Key="repo/path/test-step/00000/output1", Body=b"00000.output1") 25 | yld.put_object(Key="repo/path/test-step/00000/output2", Body=b"00000.output2") 26 | yld.put_object(Key="repo/path/test-step/00000/zoutput2", Body=b"00000.zoutput2") 27 | yld.put_object(Key="repo/path/test-step/00000/unoutput", Body=b"00000.unoutput") 28 | 29 | yld.put_object(Key="repo/path/test-step/00001/output1", Body=b"00001.output1") 30 | # no output2 in subdir 00001 31 | yld.put_object(Key="repo/path/test-step/00001/unoutput", Body=b"00001.unoutput") 32 | 33 | yld.put_object(Key="repo/path/test-step/00002/output1", Body=b"00002.output1") 34 | yld.put_object(Key="repo/path/test-step/00002/output2", Body=b"00002.output2") 35 | yld.put_object(Key="repo/path/test-step/00002/unoutput", Body=b"00002.unoutput") 36 | 37 | yield yld 38 | 39 | 40 | def test_lambda_handler(caplog, repo_bucket): 41 | event = { 42 | "repo": f"s3://{repo_bucket.name}/repo/path", 43 | "outputs": json.dumps({"out1": "output1", "out2": "output2", "out3": "output3"}), 44 | "step_name": "test-step", 45 | "items": [ 46 | {"repo": f"s3://{repo_bucket.name}/repo/path/test-step/00000"}, 47 | {"repo": f"s3://{repo_bucket.name}/repo/path/test-step/00001"}, 48 | {"repo": f"s3://{repo_bucket.name}/repo/path/test-step/00002"}, 49 | ], 50 | "logging": { 51 | "step_name": "test-step", 52 | }, 53 | } 54 | 55 | expect = {"manifest": "test-step_manifest.json"} 56 | result = lambda_handler(event, {}) 57 | assert result == expect 58 | 59 | manifest_key = f"repo/path/{result['manifest']}" 60 | manifest_s3 = repo_bucket.Object(manifest_key) 61 | response = manifest_s3.get() 62 | with closing(response["Body"]) as fp: 63 | manifest = json.load(fp) 64 | 65 | expect = { 66 | "out1": [ 67 | f"s3://{repo_bucket.name}/repo/path/test-step/00000/output1", 68 | f"s3://{repo_bucket.name}/repo/path/test-step/00001/output1", 69 | f"s3://{repo_bucket.name}/repo/path/test-step/00002/output1", 70 | ], 71 | "out2": [ 72 | f"s3://{repo_bucket.name}/repo/path/test-step/00000/output2", 73 | f"s3://{repo_bucket.name}/repo/path/test-step/00002/output2", 74 | ], 75 | "out3": [], 76 | } 77 | assert manifest == expect 78 | 79 | assert "no files named output3 found" in caplog.text 80 | 81 | 82 | def test_lambda_handler_no_manifest(caplog, repo_bucket): 83 | event = { 84 | "repo": f"s3://{repo_bucket.name}/repo/path", 85 | "outputs": "{}", 86 | "results": ["fake", "results"], 87 | "logging": { 88 | "step_name": "test-step", 89 | }, 90 | } 91 | 92 | result = lambda_handler(event, {}) 93 | assert result == {} 94 | -------------------------------------------------------------------------------- /doc/quick-start.md: -------------------------------------------------------------------------------- 1 | # Quick start - creating and running a pipeline 2 | 3 | ## 1. Containerize your tools 4 | 5 | Because BayerCLAW runs jobs on AWS Batch, all the software for your pipeline must be built into Docker containers. 
6 | These can be stored in any Docker repository, but the default is the AWS Elastic Container Registry (ECR) in your AWS 7 | account. If you just specify a simple image name, like `ubuntu`, BayerCLAW will assume it is in ECR. 8 | To reference an image in the public DockerHub repo, you should specify `docker.io/library/ubuntu` (or whatever). 9 | 10 | ## 2. Choose an S3 repository location 11 | 12 | In addition to the **Docker** repository for your images, BayerCLAW uses an S3 location as a **file** repository. 13 | This bucket is not created for you by BayerCLAW, because it is intended to be the long-term home of your data. 14 | You should create this bucket yourself, with appropriate life-cycle policies and other settings, or use an existing bucket. 15 | 16 | You must NOT use a BayerCLAW launcher bucket as a repository; that one is ONLY for triggering new workflow executions. 17 | 18 | ## 3. Create a workflow template 19 | 20 | Use the [BayerCLAW Language References](language.md) to help you author your workflow. 21 | 22 | ## 4. Deploy the workflow 23 | 24 | Deploying a workflow creates a StepFunctions state machine and associated resources. 25 | Deployment happens through AWS CloudFormation, and can be done through the console or the command line. 26 | In this example, the workflow is named `bclaw-demo`: 27 | 28 | ``` 29 | # Please edit this name before using 30 | export MYSTACK=bclaw-demo 31 | 32 | aws cloudformation deploy --template-file bclaw-demo.yaml --stack-name $MYSTACK --capabilities CAPABILITY_IAM 33 | ``` 34 | 35 | If deployment fails, check the logs for the `bclawCompilerLambda` function in the AWS web console. 36 | You can modify the workflow template and re-run the `deploy` command to update the workflow definition. 37 | If for some reason you need to remove the workflow entirely, try: 38 | 39 | ``` 40 | aws cloudformation delete-stack --stack-name $MYSTACK 41 | aws cloudformation wait stack-delete-complete --stack-name $MYSTACK 42 | ``` 43 | 44 | ## 5. Launch a job 45 | 46 | Assuming BayerCLAW was installed under default parameters, you should find an S3 bucket named something 47 | like `bclaw-main-launcher-<account number>` in your account. BayerCLAW watches this bucket for new input files. 48 | 49 | To launch a BayerCLAW job, just copy a job file into the launcher bucket. The file must be placed into a 50 | folder with the same name as the workflow you want to run, e.g.: 51 | 52 | ``` 53 | aws s3 cp job.json s3://bclaw-main-launcher-123456789012/bclaw-demo/job.json 54 | ``` 55 | 56 | If you overwrite a file, *even with the same data*, it will trigger the workflow to run again. 57 | Best practice would be to give each job file a unique name -- preferably something based on the file's 58 | contents -- rather than `job.json` (see the [About job naming](#about-job-naming) section below). 59 | 60 | To monitor the job in the AWS web console, check the pages for Batch and StepFunctions. 61 | If a task fails, you will be able to see it in either place, and there will be links to CloudWatch Logs. 62 | 63 | ### About job naming 64 | BayerCLAW's StepFunction job names are derived from the name of the job data file. Some characters 65 | may be transformed or removed due to naming constraints. Therefore, if you submit a file named 66 | `my.input.json`, you may trigger a StepFunctions job named something like `my-input_Q7Pz7WYb`. 67 | The trailing `Q7Pz7WYb` is the first 8 characters of the file's version ID, assigned by S3. 
68 | StepFunction execution names are limited to 80 characters, so BayerCLAW may truncate very 69 | long file names. -------------------------------------------------------------------------------- /util/bclaw_logs/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Transform: AWS::Serverless-2016-10-31 3 | 4 | Resources: 5 | # dynamo db 6 | JobStatusTable: 7 | Type: AWS::DynamoDB::Table 8 | Properties: 9 | TableName: "bclawLogsTable" 10 | AttributeDefinitions: 11 | - 12 | AttributeName: "workflowName" 13 | AttributeType: "S" 14 | - 15 | AttributeName: "executionId" 16 | AttributeType: "S" 17 | - 18 | AttributeName: "timestamp" 19 | AttributeType: "N" 20 | - 21 | AttributeName: "jobFile" 22 | AttributeType: "S" 23 | KeySchema: 24 | - 25 | AttributeName: "workflowName" 26 | KeyType: "HASH" 27 | - 28 | AttributeName: "executionId" 29 | KeyType: "RANGE" 30 | LocalSecondaryIndexes: 31 | - 32 | IndexName: executionsByTimestamp 33 | KeySchema: 34 | - 35 | AttributeName: "workflowName" 36 | KeyType: "HASH" 37 | - 38 | AttributeName: "timestamp" 39 | KeyType: "RANGE" 40 | Projection: 41 | ProjectionType: INCLUDE 42 | NonKeyAttributes: 43 | - "executionId" 44 | - "jobFile" 45 | - "status" 46 | - 47 | IndexName: executionsByJobFile 48 | KeySchema: 49 | - 50 | AttributeName: "workflowName" 51 | KeyType: "HASH" 52 | - 53 | AttributeName: "jobFile" 54 | KeyType: "RANGE" 55 | Projection: 56 | ProjectionType: INCLUDE 57 | NonKeyAttributes: 58 | - "executionId" 59 | - "timestamp" 60 | - "status" 61 | TimeToLiveSpecification: 62 | AttributeName: "expiration" 63 | Enabled: true 64 | BillingMode: PAY_PER_REQUEST 65 | 66 | # job status lambda 67 | JobStatusLambda: 68 | Type: AWS::Serverless::Function 69 | Properties: 70 | Handler: job_status.lambda_handler 71 | Runtime: python3.10 72 | CodeUri: lambda/src 73 | Environment: 74 | Variables: 75 | JOB_STATUS_TABLE: !Ref JobStatusTable 76 | EXPIRATION_DAYS: "90" 77 | MemorySize: 128 78 | Timeout: 60 79 | Policies: 80 | - 81 | Version: "2012-10-17" 82 | Statement: 83 | - 84 | Effect: Allow 85 | Action: 86 | - "dynamodb:DescribeTable" 87 | - "dynamodb:PutItem" 88 | - "dynamodb:UpdateItem" 89 | Resource: !GetAtt JobStatusTable.Arn 90 | DeploymentPreference: 91 | Enabled: False 92 | 93 | JobStatusLambdaLogGroup: 94 | Type: AWS::Logs::LogGroup 95 | Properties: 96 | LogGroupName: !Sub "/aws/lambda/${JobStatusLambda}" 97 | RetentionInDays: 30 98 | 99 | # event bridge rule 100 | JobStatusSFNRule: 101 | Type: AWS::Events::Rule 102 | Properties: 103 | EventPattern: 104 | source: 105 | - aws.states 106 | detail-type: 107 | - Step Functions Execution Status Change 108 | State: ENABLED 109 | Targets: 110 | - 111 | Id: job-status-lambda 112 | Arn: !GetAtt JobStatusLambda.Arn 113 | 114 | # connector: event bridge -> lambda 115 | JobStatusSFNtoLambda: 116 | Type: AWS::Serverless::Connector 117 | Properties: 118 | Source: 119 | Id: JobStatusSFNRule 120 | Destination: 121 | Id: JobStatusLambda 122 | Permissions: 123 | - Write 124 | -------------------------------------------------------------------------------- /lambda/src/notifications/notifications.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | import boto3 6 | import yaml 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | def make_state_change_message(attributes: dict) -> str: 13 | status = 
attributes["status"]["StringValue"] 14 | workflow_name = attributes["workflow_name"]["StringValue"] 15 | execution_id = attributes["execution_id"]["StringValue"] 16 | job_file_bucket = attributes["job_file_bucket"]["StringValue"] 17 | job_file_key = attributes["job_file_key"]["StringValue"] 18 | job_file_version = attributes["job_file_version"]["StringValue"] 19 | 20 | details = { 21 | "details": { 22 | "workflow_name": workflow_name, 23 | "execution_id": execution_id, 24 | "job_status": status, 25 | "job_data": f"s3://{job_file_bucket}/{job_file_key}", 26 | "job_data_version": job_file_version, 27 | } 28 | } 29 | 30 | if status == "RUNNING": 31 | action = "has started" 32 | 33 | elif status == "SUCCEEDED": 34 | action = "has finished" 35 | 36 | elif status == "FAILED": 37 | action = "has failed" 38 | 39 | elif status == "ABORTED": 40 | action = "has been aborted" 41 | 42 | elif status == "TIMED_OUT": 43 | action = "has timed out" 44 | 45 | else: 46 | raise RuntimeError(f"status {status} not recognized") 47 | 48 | job_file_name = job_file_key.rsplit("/", 1)[-1] 49 | 50 | text = f"Job {execution_id} ('{job_file_name}') on workflow {workflow_name} {action}." 51 | message = yaml.safe_dump_all([text, details]) 52 | 53 | return message 54 | 55 | 56 | def make_message_attributes(event: dict) -> dict: 57 | input_obj = json.loads(event["detail"]["input"]) 58 | 59 | ret = { 60 | "status": { 61 | "DataType": "String", 62 | "StringValue": event["detail"]["status"], 63 | }, 64 | "workflow_name": { 65 | "DataType": "String", 66 | "StringValue": event["detail"]["stateMachineArn"].rsplit(":", 1)[-1], 67 | }, 68 | "execution_id": { 69 | "DataType": "String", 70 | "StringValue": event["detail"]["name"], 71 | }, 72 | "job_file_bucket": { 73 | "DataType": "String", 74 | "StringValue": input_obj["job_file"]["bucket"], 75 | }, 76 | "job_file_key": { 77 | "DataType": "String", 78 | "StringValue": input_obj["job_file"]["key"], 79 | }, 80 | "job_file_version": { 81 | "DataType": "String", 82 | "StringValue": input_obj["job_file"]["version"], 83 | }, 84 | } 85 | 86 | return ret 87 | 88 | 89 | def make_sns_payload(message: str, attributes: dict) -> dict: 90 | status = attributes["status"]["StringValue"] 91 | workflow_name = attributes["workflow_name"]["StringValue"] 92 | 93 | ret = { 94 | "TopicArn": os.environ["TOPIC_ARN"], 95 | "Message": message, 96 | "Subject": f"{workflow_name}: job {status.lower()}", 97 | "MessageAttributes": attributes, 98 | } 99 | return ret 100 | 101 | 102 | def lambda_handler(event: dict, context: object) -> dict: 103 | print(f"{event=}") 104 | 105 | try: 106 | attributes = make_message_attributes(event) 107 | message = make_state_change_message(attributes) 108 | payload = make_sns_payload(message, attributes) 109 | 110 | client = boto3.client("sns") 111 | response = client.publish(**payload) 112 | 113 | return response 114 | 115 | except KeyError: 116 | logger.warning("unable to parse BayerCLAW information from event") 117 | -------------------------------------------------------------------------------- /lambda/src/router/job_router.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import re 5 | 6 | import boto3 7 | 8 | from lambda_logs import log_preamble, log_event 9 | 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | def get_state_machine_name(s3_key: str) -> (str, str, str): 15 | # (? 
str: 27 | # assume the filename extension is something uninformative like ".json" 28 | ret = os.path.splitext(key)[0] 29 | return ret 30 | 31 | 32 | def normalize(string: str) -> str: 33 | ret0 = re.sub(r"[^A-Za-z0-9]+", "-", string) 34 | ret = ret0.strip("-") 35 | return ret 36 | 37 | 38 | def make_execution_name(s3_path: str, version: str) -> str: 39 | # assumes the top level directory (= workflow name) has been stripped from s3_path 40 | norm_key = normalize(shorten_filename(s3_path)) 41 | norm_version = normalize(version) or "NULL" 42 | ret = f"{norm_key:.71}_{norm_version:.8}" 43 | return ret 44 | 45 | 46 | def get_state_machine_arn(state_machine_name: str, state_machine_version: str) -> str: 47 | region = os.environ["REGION"] 48 | acct_num = os.environ["ACCT_NUM"] 49 | ret = f"arn:aws:states:{region}:{acct_num}:stateMachine:{state_machine_name}:{state_machine_version}" 50 | return ret 51 | 52 | 53 | def lambda_handler(event: dict, context: object) -> None: 54 | # event = { 55 | # branch: str 56 | # job_file_bucket: str, 57 | # job_file_key: str, 58 | # job_file_version: str # empty string if launcher bucket versioning is suspended 59 | # } 60 | 61 | log_preamble(**event, logger=logger) 62 | log_event(logger, event) 63 | 64 | assert "_DIE_DIE_DIE_" not in event["job_file_key"] 65 | 66 | sfn = boto3.client("stepfunctions") 67 | 68 | try: 69 | # throws AttributeError if regex wasn't matched 70 | state_machine_name, state_machine_version, remainder = get_state_machine_name(event["job_file_key"]) 71 | 72 | exec_name = make_execution_name(remainder, event["job_file_version"]) 73 | logger.info(f"{exec_name=}") 74 | 75 | input_obj = { 76 | "job_file": { 77 | "bucket": event["job_file_bucket"], 78 | "key": event["job_file_key"], 79 | "version": event["job_file_version"], 80 | }, 81 | "index": event["branch"], 82 | } 83 | 84 | state_machine_arn = get_state_machine_arn(state_machine_name, state_machine_version) 85 | 86 | if "dry_run" not in event: 87 | response = sfn.start_execution( 88 | stateMachineArn=state_machine_arn, 89 | name=exec_name, 90 | input=json.dumps(input_obj) 91 | ) 92 | logger.info(f"{response=}") 93 | 94 | except AttributeError: 95 | logger.info("no workflow name found") 96 | 97 | except sfn.exceptions.ExecutionAlreadyExists: 98 | # duplicated s3 events are way more likely than bona fide name collisions 99 | logger.info(f"duplicate event: {exec_name}") 100 | 101 | # throws AccessDeniedException if state machine is not a bclaw workflow from this installation 102 | # throws StateMachineDoesNotExist if alias "current" does not exist on state machine -------------------------------------------------------------------------------- /lambda/tests/compiler/test_state_machine_resources.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ...src.compiler.pkg.state_machine_resources import (make_initializer_step, make_step_list, 4 | state_machine_version_rc, 5 | state_machine_alias_rc, 6 | STATE_MACHINE_VERSION_NAME, 7 | STATE_MACHINE_ALIAS_NAME) 8 | from ...src.compiler.pkg.util import Step, Resource, lambda_logging_block, lambda_retry 9 | 10 | 11 | def test_make_initializer_step(compiler_env): 12 | repository = "s3://bucket/repo/path/${template}" 13 | 14 | result = make_initializer_step(repository) 15 | expect = { 16 | "Initialize": { 17 | "Type": "Task", 18 | "Resource": "initializer_lambda_arn", 19 | "Parameters": { 20 | "workflow_name": "${WorkflowName}", 21 | "repo_template": repository, 22 | "input_obj.$": "$", 23 | 
**lambda_logging_block("Initialize"), 24 | }, 25 | **lambda_retry(), 26 | "ResultPath": "$", 27 | "OutputPath": "$", 28 | "_stet": True, 29 | }, 30 | } 31 | 32 | assert result == expect 33 | 34 | 35 | def test_make_step_list(): 36 | steps = [ 37 | {"step1": {"data": "1"}}, 38 | {"step2": {"data": "2"}}, 39 | {"step3": {"data": "3", "next": "step5"}}, 40 | {"step4": {"data": "4", "end": True}}, 41 | {"step5": {"data": "5", "Next": "step7"}}, 42 | {"step6": {"data": "6", "End": True}}, 43 | {"step7": {"data": "7"}}, 44 | ] 45 | expected_nexts = [ 46 | "step2", 47 | "step3", 48 | "step5", 49 | "", 50 | "step7", 51 | "", 52 | "", 53 | ] 54 | 55 | results = make_step_list(steps) 56 | 57 | for orig, result, exp_next in zip(steps, results, expected_nexts): 58 | assert isinstance(result, Step) 59 | k, v = next(iter(orig.items())) 60 | assert result.name == k 61 | assert result.spec == v 62 | assert result.next == exp_next 63 | 64 | 65 | def test_state_machine_version_rc(): 66 | state_machine = Resource("stateMachineLogicalName", {}) 67 | result = state_machine_version_rc(state_machine) 68 | expect = Resource(STATE_MACHINE_VERSION_NAME, 69 | { 70 | "Type": "AWS::StepFunctions::StateMachineVersion", 71 | "UpdateReplacePolicy": "Retain", 72 | "Properties": { 73 | "Description": "No description", 74 | "StateMachineArn": {"Ref": "stateMachineLogicalName"}, 75 | "StateMachineRevisionId": {"Fn::GetAtt": ["stateMachineLogicalName", "StateMachineRevisionId"]}, 76 | }, 77 | }) 78 | assert result == expect 79 | 80 | 81 | def test_state_machine_alias_rc(): 82 | state_machine_version = Resource(STATE_MACHINE_VERSION_NAME, {}) 83 | result = state_machine_alias_rc(state_machine_version) 84 | expect = Resource(STATE_MACHINE_ALIAS_NAME, 85 | { 86 | "Type": "AWS::StepFunctions::StateMachineAlias", 87 | "Properties": { 88 | "Name": "current", 89 | "Description": "Current active version", 90 | "DeploymentPreference": { 91 | "StateMachineVersionArn": {"Ref": STATE_MACHINE_VERSION_NAME}, 92 | "Type": "ALL_AT_ONCE", 93 | }, 94 | }, 95 | }) 96 | assert result == expect 97 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/util.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import json 3 | import os 4 | import re 5 | from typing import Any, NamedTuple 6 | 7 | import boto3 8 | import jmespath 9 | 10 | 11 | class Step(NamedTuple): 12 | name: str 13 | spec: dict 14 | next: str 15 | 16 | @property 17 | def is_terminal(self) -> bool: 18 | return self.next == "" 19 | 20 | @property 21 | def next_or_end(self) -> dict: 22 | if self.is_terminal: 23 | return {"End": True} 24 | else: 25 | return {"Next": self.next} 26 | 27 | @property 28 | def input_field(self) -> dict: 29 | if self.spec["inputs"] is None: 30 | ret = {"inputs.$": "States.JsonToString($.prev_outputs)"} 31 | else: 32 | ret = {"inputs": json.dumps(self.spec["inputs"], separators=(",", ":"))} 33 | return ret 34 | 35 | 36 | class Resource(NamedTuple): 37 | name: str 38 | spec: dict 39 | 40 | 41 | class State(NamedTuple): 42 | name: str 43 | spec: dict 44 | 45 | 46 | def make_logical_name(s: str) -> str: 47 | words = (w.capitalize() for w in re.split(r"[\W_]+", s)) 48 | ret = "".join(words) 49 | return ret 50 | 51 | 52 | # given "${something}": 53 | # match.group(0) == "${something}" 54 | # match.group(1) == "something" 55 | PARAM_FINDER = re.compile(r"\${([A-Za-z0-9]+)}") 56 | 57 | def substitute_params(params: dict, target: Any): 58 | if 
isinstance(target, str): 59 | ret = PARAM_FINDER.sub(lambda m: str(params.get(m.group(1), m.group(0))), target) 60 | elif isinstance(target, list): 61 | ret = [substitute_params(params, v) for v in target] 62 | elif isinstance(target, dict): 63 | ret = {k: substitute_params(params, v) for k, v in target.items()} 64 | else: 65 | ret = target 66 | return ret 67 | 68 | 69 | def lambda_logging_block(step_name: str) -> dict: 70 | ret = { 71 | "logging": { 72 | "branch.$": "$.index", 73 | "job_file_bucket.$": "$.job_file.bucket", 74 | "job_file_key.$": "$.job_file.key", 75 | "job_file_version.$": "$.job_file.version", 76 | "sfn_execution_id.$": "$$.Execution.Name", 77 | "step_name": step_name, 78 | "workflow_name": "${WorkflowName}", 79 | }, 80 | } 81 | return ret 82 | 83 | 84 | def time_string_to_seconds(time: str) -> int: 85 | units = {"s": "seconds", "m": "minutes", "h": "hours", "d": "days", "w": "weeks"} 86 | count = int(time[:-1]) 87 | unit = units[time[-1]] 88 | td = timedelta(**{unit: count}) 89 | ret = td.seconds + 60 * 60 * 24 * td.days 90 | return ret 91 | 92 | 93 | def lambda_retry( 94 | max_attempts: int = 5, 95 | interval_seconds: int = 2, 96 | backoff_rate: float = 2.0, 97 | jitter_strategy: str = "FULL") -> dict: 98 | # https://docs.aws.amazon.com/step-functions/latest/dg/bp-lambda-serviceexception.html 99 | ret = { 100 | "Retry": [ 101 | { 102 | "ErrorEquals": [ 103 | "Lambda.ClientExecutionTimeoutException", 104 | "Lambda.ServiceException", 105 | "Lambda.AWSLambdaException", 106 | "Lambda.SdkClientException", 107 | "Lambda.TooManyRequestsException", 108 | ], 109 | "MaxAttempts": max_attempts, 110 | "IntervalSeconds": interval_seconds, 111 | "BackoffRate": backoff_rate, 112 | "JitterStrategy": jitter_strategy, 113 | }, 114 | ] 115 | } 116 | return ret 117 | 118 | 119 | def merge_params_and_options(params: dict, options: dict) -> dict: 120 | ret = params | options 121 | if ret["task_role"] is None: 122 | ret["task_role"] = params["task_role"] 123 | return ret 124 | -------------------------------------------------------------------------------- /lambda/tests/common/test_substitutions.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from ...src.common.python.substitutions import _lookup, substitute_job_data, substitute_into_filenames 6 | 7 | 8 | @pytest.mark.parametrize("target, expect", [ 9 | ("string", "string_value"), 10 | ("number", "99"), 11 | ("list", [1,2,3]), 12 | ("dict", {"a": 1,"b": 2}), 13 | ("boolean_T", "True"), 14 | ("boolean_F", "False"), 15 | ("empty_string", ""), 16 | ("zero", "0"), 17 | ]) 18 | def test_lookup(target, expect): 19 | data = { 20 | "string": "string_value", 21 | "number": 99, 22 | "list": [1, 2, 3], 23 | "dict": {"a": 1, "b": 2}, 24 | "boolean_T": True, 25 | "boolean_F": False, 26 | "empty_string": "", 27 | "zero": 0, 28 | } 29 | result = _lookup(target, data) 30 | assert isinstance(result, str) 31 | 32 | if isinstance(expect, str): 33 | assert result == expect 34 | else: 35 | result2 = json.loads(json.loads(result)) 36 | assert result2 == expect 37 | 38 | 39 | def test_lookup_fail(): 40 | data = {"a": 1, "b": 2} 41 | target = "z" 42 | with pytest.raises(RuntimeError, match="not found in job data"): 43 | _lookup(target, data) 44 | 45 | 46 | def test_substitute_job_data(): 47 | subject = { 48 | "a_string": "a ${job.value} b ${scatter.value} c ${parent.value}", 49 | "a_list": [ 50 | "e ${job.value}", 51 | "f ${scatter.value}", 52 | "g ${parent.value}", 53 | ], 54 | 
"a_dict": { 55 | "eh": "h ${job.value}", 56 | "bee": "i ${scatter.value}", 57 | "sea": "j ${parent.value}", 58 | }, 59 | } 60 | 61 | job_data = { 62 | "job": { 63 | "value": "one" 64 | }, 65 | "scatter": { 66 | "value": 2 67 | }, 68 | "parent": { 69 | "value": ["three"] 70 | }, 71 | } 72 | 73 | expect = { 74 | "a_string": 'a one b 2 c "[\\"three\\"]"', 75 | "a_list": [ 76 | "e one", 77 | "f 2", 78 | 'g "[\\"three\\"]"', 79 | ], 80 | "a_dict": { 81 | "eh": "h one", 82 | "bee": "i 2", 83 | "sea": 'j "[\\"three\\"]"', 84 | }, 85 | } 86 | 87 | result = substitute_job_data(subject, job_data) 88 | assert result == expect 89 | 90 | 91 | def test_substitute_into_filenames(): 92 | subject = { 93 | "file": "s3:/${bucket}/${path}/${name}.${ext}", 94 | "files": [ 95 | "s3:/${bucket}/${path}/${name}1.${ext}", 96 | "s3:/${bucket}/${path}/${name}2.${ext}", 97 | "s3:/${bucket}/${path}/${name}3.${ext}", 98 | ], 99 | "fileses": { 100 | "file_a": "s3:/${bucket}/${path}/${name}_a.${ext}", 101 | "file_b": "s3:/${bucket}/${path}/${name}_b.${ext}", 102 | "file_c": "s3:/${bucket}/${path}/${name}_c.${ext}", 103 | } 104 | } 105 | subs = { 106 | "bucket": "bucket_name", 107 | "path": "path/to/whatever", 108 | "name": "file_name", 109 | } 110 | expect = { 111 | "file": "s3:/bucket_name/path/to/whatever/file_name.${ext}", 112 | "files": [ 113 | "s3:/bucket_name/path/to/whatever/file_name1.${ext}", 114 | "s3:/bucket_name/path/to/whatever/file_name2.${ext}", 115 | "s3:/bucket_name/path/to/whatever/file_name3.${ext}", 116 | ], 117 | "fileses": { 118 | "file_a": "s3:/bucket_name/path/to/whatever/file_name_a.${ext}", 119 | "file_b": "s3:/bucket_name/path/to/whatever/file_name_b.${ext}", 120 | "file_c": "s3:/bucket_name/path/to/whatever/file_name_c.${ext}", 121 | } 122 | } 123 | result = substitute_into_filenames(subject, subs) 124 | assert result == expect 125 | -------------------------------------------------------------------------------- /lambda/tests/compiler/test_util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | from ...src.compiler.pkg.util import Step, make_logical_name, substitute_params, time_string_to_seconds, \ 5 | merge_params_and_options 6 | 7 | 8 | @pytest.mark.parametrize("next_step, expect", [ 9 | ("", True), 10 | ("not_terminal", False) 11 | ]) 12 | def test_step_is_terminal(next_step, expect): 13 | step = Step("name", {}, next_step) 14 | result = step.is_terminal 15 | assert result == expect 16 | 17 | 18 | @pytest.mark.parametrize("step, expect", [ 19 | (Step("name1", {"Other": "stuff"}, "next_step"), {"Next": "next_step"}), 20 | (Step("name2", {"Other": "stuff"}, ""), {"End": True}), 21 | ]) 22 | def test_step_next_or_end(step, expect): 23 | result = step.next_or_end 24 | assert result == expect 25 | 26 | 27 | @pytest.mark.parametrize("step, expect", [ 28 | (Step("name1", {"Other": "stuff", "inputs": {"file1": "one", "file2": "two"}}, ""), {"inputs": json.dumps({"file1": "one", "file2": "two"}, separators=(",", ":"))}), 29 | (Step("name2", {"Other": "stuff", "inputs": {}}, ""), {"inputs": json.dumps({})}), 30 | (Step("name3", {"Other": "stuff", "inputs": None}, ""), {"inputs.$": "States.JsonToString($.prev_outputs)"}) 31 | ]) 32 | def test_step_input_field(step, expect): 33 | result = step.input_field 34 | assert result == expect 35 | 36 | 37 | def test_make_logical_name(): 38 | orig_name = "a-name with++LOTS___of%wEiRd,,\n,,characters/that~will&NEVER(work)as\ta##LOGICAL!name12345" 39 | result = 
make_logical_name(orig_name) 40 | expect = "ANameWithLotsOfWeirdCharactersThatWillNeverWorkAsALogicalName12345" 41 | assert result == expect 42 | 43 | 44 | def test_substitute_params(): 45 | target = { 46 | "one": "${value1} ${value2} ${value3} ${skip_me} ${value1} again", 47 | "two": [ 48 | "eh ${value1}", 49 | "bee ${value2}", 50 | "sea ${value3}", 51 | "dee ${skip_me}" 52 | ], 53 | "three": { 54 | "k1": "double-u ${value1}", 55 | "k2": "ecks ${value2}", 56 | "k3": "why ${value3}", 57 | "k4": "zee ${skip_me}" 58 | }, 59 | "four": 99, 60 | } 61 | params = { 62 | "value1": "string", 63 | "value2": "${reference}", 64 | "value3": 42, 65 | "value4": "not used", 66 | } 67 | expect = { 68 | "one": "string ${reference} 42 ${skip_me} string again", 69 | "two": [ 70 | "eh string", 71 | "bee ${reference}", 72 | "sea 42", 73 | "dee ${skip_me}" 74 | ], 75 | "three": { 76 | "k1": "double-u string", 77 | "k2": "ecks ${reference}", 78 | "k3": "why 42", 79 | "k4": "zee ${skip_me}" 80 | }, 81 | "four": 99, 82 | } 83 | result = substitute_params(params, target) 84 | assert result == expect 85 | 86 | 87 | def test_substitute_params_empty_params(): 88 | params = {} 89 | target = "${one} ${two} ${three}" 90 | result = substitute_params(params, target) 91 | assert result == target 92 | 93 | 94 | @pytest.mark.parametrize("timestring, seconds", [ 95 | ("70s", 70), 96 | ("20 m", 1200), 97 | ("3h", 3600*3), 98 | ("2 d", 86400*2), 99 | ("1w", 86400*7) 100 | ]) 101 | def test_time_string_to_seconds(timestring, seconds): 102 | result = time_string_to_seconds(timestring) 103 | assert result == seconds 104 | 105 | 106 | @pytest.mark.parametrize("p_role, o_role, x_role", [ 107 | (None, None, None), 108 | (None, "opt_role", "opt_role"), 109 | ("parm_role", None, "parm_role"), 110 | ("parm_role", "opt_role", "opt_role"), 111 | ]) 112 | def test_merge_params_and_options(p_role, o_role, x_role): 113 | params = {"a": 1, "b": 2, "c": 3, "task_role": p_role} 114 | options = {"z": 9, "y": 8, "task_role": o_role} 115 | expect = {"a": 1, "b": 2, "c": 3, "z": 9, "y": 8, "task_role": x_role} 116 | result = merge_params_and_options(params, options) 117 | assert result == expect 118 | -------------------------------------------------------------------------------- /lambda/tests/common/test_file_select.py: -------------------------------------------------------------------------------- 1 | import json as j 2 | import os 3 | 4 | import boto3 5 | import moto 6 | import pytest 7 | 8 | from ...src.common.python.file_select import select_file_contents, read_json, read_yaml 9 | 10 | csv = b"""\ 11 | id,one,two,three,four 12 | a,11,12,13,14 13 | b,21,22,23,24 14 | c,31,32,33,34 15 | d,41,42,43,44 16 | """ 17 | 18 | json = b"""\ 19 | [ 20 | {"a":11, "b":12, "c":13, "d":14}, 21 | {"a":21, "b":22, "c":23, "d":24}, 22 | {"a":31, "b":32, "c":33, "d":34}, 23 | {"a":41, "b":42, "c":43, "d":44} 24 | ] 25 | """ 26 | 27 | jsonl = b"""\ 28 | {"a":11, "b":12, "c":13, "d":14} 29 | {"a":21, "b":22, "c":23, "d":24} 30 | {"a":31, "b":32, "c":33, "d":34} 31 | {"a":41, "b":42, "c":43, "d":44} 32 | """ 33 | 34 | tsv = b"""\ 35 | id one two three four 36 | a 11 12 13 14 37 | b 21 22 23 24 38 | c 31 32 33 34 39 | d 41 42 43 44 40 | """ 41 | 42 | txt = b"""\ 43 | row1 44 | row2 45 | row3 46 | row4 47 | row5 48 | """ 49 | 50 | yaml = b"""\ 51 | data: 52 | - a: 11 53 | b: 12 54 | c: 13 55 | d: 14 56 | - a: 21 57 | b: 22 58 | c: 23 59 | d: 24 60 | - a: 31 61 | b: 32 62 | c: 33 63 | d: 34 64 | - a: 41 65 | b: 42 66 | c: 43 67 | d: 44 68 | """ 69 | 70 | 71 | 
@pytest.fixture(scope="module") 72 | def src_bucket(): 73 | with moto.mock_aws(): 74 | yld = boto3.resource("s3", region_name="us-east-1").Bucket("test-bucket") 75 | yld.create() 76 | yld.put_object(Key="test-data/file.csv", Body=csv) 77 | yld.put_object(Key="test-data/file.json", Body=json) 78 | yld.put_object(Key="test-data/file.jsonl", Body=jsonl) 79 | yld.put_object(Key="test-data/file.tsv", Body=tsv) 80 | yld.put_object(Key="test-data/file.txt", Body=txt) 81 | yld.put_object(Key="test-data/file.yaml", Body=yaml) 82 | yield yld 83 | 84 | 85 | @pytest.mark.parametrize("query, expect", [ 86 | # ("s3://test-bucket/test-data/file.jsonl", ["14", "24", "34", "44"]), # select all "d" elements 87 | ("s3://test-bucket/test-data/file.json:$[*].d", ["14", "24", "34", "44"]), # select all "d" elements 88 | ("s3://test-bucket/test-data/file.jsonl:$[*].c", ["13", "23", "33", "43"]), # select all "c" elements 89 | ("s3://test-bucket/test-data/file.yaml:$.data[*].a", ["11", "21", "31", "41"]), # select all "a" elements 90 | ("s3://test-bucket/test-data/file.csv:$[*].three", ["13", "23", "33", "43"]), # select column "three" 91 | ("s3://test-bucket/test-data/file.csv:$[*].two", ["12", "22", "32", "42"]), # select column "two" 92 | ("s3://test-bucket/test-data/file.txt:$[2:4]", ["row3", "row4"]), # select lines 2 and 3 (zero-based) 93 | ("s3://test-bucket/test-data/file.txt", ["row1", "row2", "row3", "row4", "row5"]), # select all lines 94 | ]) 95 | def test_select_file_contents(src_bucket, query, expect): 96 | result = select_file_contents(query) 97 | print(str(result)) 98 | assert result == expect 99 | 100 | 101 | """ 102 | Tests: file_select.read_json(body) 103 | This converts a file with a JSON-like structure into JSON format 104 | """ 105 | # @pytest.mark.skip 106 | def test_read_json0(tmp_path): 107 | json_data = {"key": "value"} 108 | json_file = tmp_path / "test.json" 109 | with json_file.open(mode="w") as fp: 110 | j.dump(json_data, fp) 111 | 112 | # read in file contents in memory 113 | with json_file.open(mode="r") as json_content: 114 | response = read_json( 115 | body=json_content 116 | ) 117 | 118 | assert(response == json_data) 119 | 120 | 121 | """ 122 | Tests: file_select.read_yaml(body) 123 | Test the Conversion of YAML into a dictionary object 124 | """ 125 | #@pytest.mark.skip() 126 | def test_read_yaml0(): 127 | 128 | # expected response 129 | response_should_be = {'a': 1, 'b': {'c': 3, 'd': 4}} 130 | 131 | # test input value to function 132 | input_body = """ 133 | a: 1 134 | b: 135 | c: 3 136 | d: 4 137 | """ 138 | 139 | response = read_yaml ( 140 | body=input_body 141 | ) 142 | 143 | assert(response == response_should_be) 144 | -------------------------------------------------------------------------------- /bclaw_runner/tests/test_qc_check.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import boto3 4 | import moto 5 | import pytest 6 | 7 | from ..src.runner.qc_check import abort_execution, run_one_qc_check, run_all_qc_checks, do_checks, QCFailure 8 | 9 | QC_DATA_1 = { 10 | "a": 1, 11 | "b": 2. 
12 | } 13 | QC_DATA_2 = { 14 | "x": 99, 15 | "y": 98, 16 | } 17 | 18 | 19 | @pytest.fixture(scope="function") 20 | def mock_state_machine(): 21 | with moto.mock_aws(): 22 | iam = boto3.resource("iam", region_name="us-east-1") 23 | role = iam.create_role( 24 | RoleName="fakeRole", 25 | AssumeRolePolicyDocument="{}" 26 | ) 27 | 28 | sfn = boto3.client("stepfunctions", region_name="us-east-1") 29 | state_machine = sfn.create_state_machine( 30 | name="fakeStateMachine", 31 | definition="{}", 32 | roleArn=role.arn 33 | ) 34 | 35 | yield state_machine["stateMachineArn"] 36 | 37 | 38 | @pytest.fixture(scope="function") 39 | def mock_qc_data_files(mocker, request): 40 | qc_file1 = mocker.mock_open(read_data=json.dumps(QC_DATA_1)) 41 | qc_file2 = mocker.mock_open(read_data=json.dumps(QC_DATA_2)) 42 | ret = mocker.patch("builtins.open", qc_file1) 43 | ret.side_effect = [qc_file1.return_value, qc_file2.return_value] 44 | 45 | 46 | def test_abort_execution(mock_state_machine, monkeypatch): 47 | sfn = boto3.client("stepfunctions", region_name="us-east-1") 48 | sfn_execution = sfn.start_execution( 49 | stateMachineArn=mock_state_machine, 50 | name="fake_execution", 51 | input='{"in": "put"}' 52 | ) 53 | 54 | monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") 55 | monkeypatch.setenv("AWS_ACCOUNT_ID", "123456789012") 56 | monkeypatch.setenv("BC_WORKFLOW_NAME", "fakeStateMachine") 57 | monkeypatch.setenv("BC_EXECUTION_ID", "fake_execution") 58 | monkeypatch.setenv("BC_STEP_NAME", "test_step") 59 | 60 | abort_execution(["failure1", "failure2"]) 61 | 62 | execution_desc = sfn.describe_execution(executionArn=sfn_execution["executionArn"]) 63 | assert execution_desc["status"] == "ABORTED" 64 | 65 | 66 | @pytest.mark.parametrize("expression, expect", [ 67 | ("x == 1", True), 68 | ("x != 1", False), 69 | ]) 70 | def test_run_one_qc_check(expression, expect): 71 | qc_data = {"x": 1} 72 | result = run_one_qc_check(qc_data, expression) 73 | assert result == expect 74 | 75 | 76 | @pytest.mark.parametrize("fake1_cond, fake2_cond, expect", [ 77 | (["a>1"], ["x<99"], []), # all pass 78 | (["a>1", "b==2"], ["y<98"], ["fake1: b==2"]), # one fail 79 | (["b==1"], ["x==99", "y==98"], ["fake2: x==99", "fake2: y==98"]), # multi fail 80 | (["a==1", "b==2"], ["x==99", "y==98"], ["fake1: a==1", "fake1: b==2", "fake2: x==99", "fake2: y==98"]), # all fail 81 | ]) 82 | def test_run_all_qc_checks(fake1_cond, fake2_cond, expect, mock_qc_data_files): 83 | spec = [ 84 | { 85 | "qc_result_file": "fake1", 86 | "stop_early_if": fake1_cond, 87 | }, 88 | { 89 | "qc_result_file": "fake2", 90 | "stop_early_if": fake2_cond, 91 | }, 92 | ] 93 | 94 | result = list(run_all_qc_checks(spec)) 95 | assert result == expect 96 | 97 | 98 | @pytest.mark.parametrize("fake1_cond, fake2_cond, expect", [ 99 | (None, None, []), # no checks 100 | (["a>1"], ["x<99"], []), # all pass 101 | (["a>1", "b==2"], ["y<98"], ["fake1: b==2"]), # one fail 102 | (["b==1"], ["x==99", "y==98"], ["fake2: x==99", "fake2: y==98"]), # multi fail 103 | (["a==1", "b==2"], ["x==99", "y==98"], 104 | ["fake1: a==1", "fake1: b==2", "fake2: x==99", "fake2: y==98"]), # all fail 105 | ]) 106 | def test_do_checks(fake1_cond, fake2_cond, expect, mock_qc_data_files, mocker): 107 | mock_abort_execution = mocker.patch("bclaw_runner.src.runner.qc_check.abort_execution") 108 | 109 | if fake1_cond is None: 110 | spec = [] 111 | else: 112 | spec = [ 113 | { 114 | "qc_result_file": "fake1", 115 | "stop_early_if": fake1_cond, 116 | }, 117 | { 118 | "qc_result_file": "fake2", 119 | 
"stop_early_if": fake2_cond, 120 | }, 121 | ] 122 | 123 | if expect: 124 | with pytest.raises(QCFailure) as qcf: 125 | do_checks(spec) 126 | assert qcf.value.failures == expect 127 | else: 128 | do_checks(spec) 129 | -------------------------------------------------------------------------------- /bclaw_runner/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | import fcntl 2 | import logging 3 | import os 4 | 5 | import boto3 6 | import moto 7 | import pytest 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | from ..src.runner.cache import _blocking_download, _download_to_cache, get_reference_inputs 12 | 13 | TEST_BUCKET = "test-bucket" 14 | FILE1_CONTENT = "file one" 15 | FILE2_CONTENT = "file two" 16 | FILE3_CONTENT = "file three" 17 | 18 | 19 | @pytest.fixture(scope="module") 20 | def s3_bucket(): 21 | with moto.mock_aws(): 22 | boto3.client("s3", region_name="us-east-1").create_bucket(Bucket=TEST_BUCKET) 23 | yld = boto3.resource("s3", region_name="us-east-1").Bucket(TEST_BUCKET) 24 | yld.put_object(Key="some/path/file1", Body=FILE1_CONTENT.encode("utf-8")) 25 | yld.put_object(Key="other/path/file2", Body=FILE2_CONTENT.encode("utf-8")) 26 | yld.put_object(Key="one/more/path/file3", Body=FILE3_CONTENT.encode("utf-8")) 27 | yield yld 28 | 29 | 30 | def test_blocking_download(tmp_path, s3_bucket): 31 | src = s3_bucket.Object("some/path/file1") 32 | dst = f"{tmp_path}/file1" 33 | _blocking_download(src, dst, "file1") 34 | assert os.path.isfile(dst) 35 | 36 | 37 | def test_blocking_download_already_there(tmp_path, caplog): 38 | caplog.set_level(logging.INFO) 39 | dst = f"{tmp_path}/file99" 40 | open(dst, "w").close() 41 | _blocking_download("s3://does/not/exist", dst, "file99") 42 | assert "found file99 in cache" in caplog.text 43 | 44 | 45 | def test_blocking_download_blocked(tmp_path, s3_bucket): 46 | src = s3_bucket.Object("some/path/file1") 47 | dst = f"{tmp_path}/file1" 48 | lock_file = f"{os.path.dirname(dst)}.lock" 49 | with open(lock_file, "w") as lfp: 50 | fcntl.flock(lfp, fcntl.LOCK_EX | fcntl.LOCK_NB) 51 | with pytest.raises(BlockingIOError): 52 | _blocking_download(src, dst, "file1") 53 | os.remove(lock_file) 54 | 55 | 56 | def test_download_to_cache(monkeypatch, tmp_path, s3_bucket): 57 | monkeypatch.setenv("BC_SCRATCH_PATH", str(tmp_path)) 58 | 59 | src_etag = s3_bucket.Object("some/path/file1").e_tag.strip('"') 60 | 61 | result = _download_to_cache(("test_file", f"s3://{TEST_BUCKET}/some/path/file1")) 62 | expected = "test_file", f"{tmp_path}/{src_etag}/file1" 63 | assert result == expected 64 | 65 | key, cached_file = result 66 | assert os.path.isfile(cached_file) 67 | with open(cached_file) as fp: 68 | cached_content = fp.readline() 69 | assert cached_content == FILE1_CONTENT 70 | 71 | with open(cached_file, "a") as fp: 72 | print("extra content", file=fp) 73 | 74 | result2 = _download_to_cache(("test_file", f"s3://{TEST_BUCKET}/some/path/file1")) 75 | _, cached_file2 = result2 76 | with open(cached_file2) as fp2: 77 | cached_content2 = fp2.readline() 78 | assert cached_content2 == FILE1_CONTENT + "extra content\n" 79 | 80 | 81 | def test_get_reference_inputs(monkeypatch, tmp_path, s3_bucket): 82 | monkeypatch.setenv("BC_SCRATCH_PATH", str(tmp_path)) 83 | 84 | ref_spec = { 85 | "file1": f"s3://{TEST_BUCKET}/some/path/file1", 86 | "file2": f"s3://{TEST_BUCKET}/other/path/file2", 87 | "file3": f"s3://{TEST_BUCKET}/one/more/path/file3", 88 | } 89 | 90 | workspace = f"{str(tmp_path)}/workdir" 91 | 
os.makedirs(workspace) 92 | os.chdir(workspace) 93 | 94 | result = get_reference_inputs(ref_spec) 95 | expect = { 96 | "file1": "file1", 97 | "file2": "file2", 98 | "file3": "file3", 99 | } 100 | assert result == expect 101 | 102 | for file, expected_content in {"file1": FILE1_CONTENT, "file2": FILE2_CONTENT, "file3": FILE3_CONTENT}.items(): 103 | assert os.path.isfile(file) 104 | assert os.stat(file).st_nlink >= 2 # make sure the file is a hard link 105 | with open(file) as fp: 106 | content = fp.readline() 107 | assert content == expected_content 108 | 109 | 110 | def test_get_reference_inputs_fail(monkeypatch, tmp_path, s3_bucket): 111 | monkeypatch.setenv("BC_SCRATCH_PATH", str(tmp_path)) 112 | 113 | ref_spec = { 114 | "file1": f"s3://{TEST_BUCKET}/some/path/file1", 115 | "file2": f"s3://{TEST_BUCKET}/other/path/file2", 116 | "file3": f"s3://{TEST_BUCKET}/does/not/exist/file99", 117 | } 118 | 119 | workspace = f"{str(tmp_path)}/workdir" 120 | os.makedirs(workspace) 121 | os.chdir(workspace) 122 | 123 | with pytest.raises(Exception): 124 | _ = get_reference_inputs(ref_spec) 125 | -------------------------------------------------------------------------------- /doc/options_and_parameters.md: -------------------------------------------------------------------------------- 1 | # Workflow options and parameters 2 | 3 | ## Options 4 | 5 | The Options block of a BayerCLAW workflow template allows you to set values that affect how 6 | BayerCLAW itself operates when building and running your workflow. 7 | 8 | ### `shell` 9 | 10 | BayerCLAW provides the ability to choose which Unix shell to run Batch job commands 11 | under. You can specify the shell to use globally, using the setting in the `Options` block 12 | or for individual steps in the `compute` block. The choices for the `shell` setting are 13 | `sh`, `bash`, and `sh-pipefail`: 14 | 15 | | Choice | Shell | Shell options | Default? | 16 | |-------------|-------|----------------|----------| 17 | | sh | sh | -veu | yes | 18 | | bash | bash | -veuo pipefail | no | 19 | | sh-pipefail | sh | -veuo pipefail | no | 20 | 21 | Bourne shell (`sh`) is for all intents and purposes supported by all Unix implementations, 22 | so it is the default. The `bash` choice is provided mostly for backward compatibility 23 | but is still supported by most popular Linuxen. 24 | 25 | The shell options are based on the so-called [Bash Strict Mode](http://redsymbol.net/articles/unofficial-bash-strict-mode/) 26 | as an aid to debugging. Since the `pipefail` option is not included in the Bourne shell 27 | specification (as of June 2022), it is not included in the default shell options. Nevertheless, 28 | some `sh` implementations do provide a `pipefail` option, 29 | hence the `sh-pipefail` choice. To check whether `pipefail` is implemented in your favorite 30 | `sh`, use the command `sh -c "set -o"` and look for a `pipefail` entry in the resulting list. 31 | 32 | The `-v` shell option is used to echo each command before execution. Some users 33 | may prefer the similar `-x` option. The difference is that `-x` prints commands after 34 | variable substitution has happened, which can cause privileged information (passwords, 35 | etc.) to be exposed in the logs. With `-v`, commands are printed before variable substitution, 36 | and thus is the safer choice. 37 | 38 | ### `task_role` 39 | 40 | The `task_role` option allows you to override the IAM role that BayerCLAW will use to run your workflow. 
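For example, a minimal sketch of how this might look in a workflow template's `Options` block (the role ARN shown is a hypothetical placeholder):

```yaml
Options:
  # hypothetical IAM role granting any extra permissions your job steps need
  task_role: arn:aws:iam::123456789012:role/my-custom-batch-task-role
```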
41 | By default, BayerCLAW batch jobs run under an IAM role that provides access to a minimal set of AWS 42 | services (S3, EC2, ECR, CloudWatch logs). If your workflow has tasks that utilize other services, you can 43 | create a custom task role and provide its ARN through the `task_role` option. 44 | 45 | The global `task_role` setting can itself be overridden using the per-step `task_role` option. 46 | 47 | ### `versioned` ‼️ **DEPRECATED** 48 | 49 | BayerCLAW workflows are now always versioned. 50 | 51 | ## Parameters 52 | 53 | The Parameters block allows you to customize workflows without editing the template file. The basic Parameter 54 | definition format is described [here](./language.md#the-parameters-block). 55 | 56 | ### Setting parameters 57 | 58 | If you compile your workflow using the AWS CloudFormation console, you will be prompted for Parameter values on 59 | the `Specify stack details` page. If you use the AWS CLI, you can provide Parameter values using the 60 | `--parameter-overrides` option: 61 | 62 | ```bash 63 | aws cloudformation deploy \ 64 | --template-file my-template.yaml \ 65 | --stack-name my-workflow \ 66 | --capabilities CAPABILITY_IAM \ 67 | --parameter-overrides theKing="elvis" status="lives" 68 | ``` 69 | 70 | You can also provide Parameter values using a JSON file: 71 | 72 | ```bash 73 | aws cloudformation deploy \ 74 | --template-file my-template.yaml \ 75 | --stack-name my-workflow \ 76 | --capabilities CAPABILITY_IAM \ 77 | --parameter-overrides file:///path/more_path/parameters.json 78 | ``` 79 | 80 | where `parameters.json` contains: 81 | 82 | ```json5 83 | [ 84 | "theKing=elvis", 85 | "status=lives" 86 | ] 87 | ``` 88 | 89 | In addition, Parameter values can be retrieved from 90 | [AWS Systems Manager Parameter Store](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/parameters-section-structure.html#aws-ssm-parameter-types). 91 | To do so, in the simplest case, declare your Parameter with the type `AWS::SSM::Parameter::Value<String>` and 92 | use the name of a parameter stored in Parameter Store as its default value. For example: 93 | 94 | ```yaml 95 | Parameters: 96 | storedParameter: 97 | Type: AWS::SSM::Parameter::Value<String> 98 | Default: myStoredParameter 99 | ``` 100 | 101 | This will retrieve the value of `myStoredParameter` from Parameter Store and use it in your workflow. Parameters 102 | used in this way must exist before compilation time. Parameter Store SecureString parameters are not supported. 103 | -------------------------------------------------------------------------------- /doc/subpipes.md: -------------------------------------------------------------------------------- 1 | # Using Subpipes in BayerCLAW 2 | 3 | ## Overview 4 | A BayerCLAW workflow can execute other BayerCLAW workflows as subpipes. 
Use cases for this include: 5 | - Creating reusable, modular workflows 6 | - Breaking large, complex workflows into manageable units 7 | - Enabling workflows to be developed and tested as separate, logical units 8 | 9 | ### How it works 10 | 11 | In general, the main workflow should create a new job data file that can be submitted to the subpipe: 12 | 13 | ```yaml 14 | - 15 | MakeNewJobData: 16 | image: docker.io/library/ubuntu 17 | commands: 18 | - "echo '{\"a\": \"eh\", \"b\": \"bee\", \"c\": \"sea\"}' > ${sub_job_data}" 19 | outputs: 20 | sub_job_data: sub_job.json 21 | - 22 | RunTheSubpipe: 23 | job_data: sub_job.json 24 | subpipe: my-subpipe 25 | ``` 26 | 27 | If the main workflow's job data file contains all of the information needed to run the subpipe, you may 28 | omit the subpipe step's `job_data` field, and the original job data file will be submitted directly to the 29 | subpipe. 30 | 31 | The subpipe step creates a repository for the subpipe (this will be in a folder inside of the main 32 | repository) where the subpipe will store its intermediate files. After the subpipe finishes, the 33 | main pipeline can optionally copy files out of the sub-repository into the main repository. 34 | 35 | There are no special requirements for the subpipe. It can be an ordinary BayerCLAW workflow -- however, 36 | the repository established by the main workflow overrides the repository designated in the subpipe's 37 | workflow definition. 38 | 39 | ## Calling a subpipe 40 | To invoke a subpipe, the parent pipeline must contain a *subpipe step*. 41 | 42 | ### Subpipe step syntax 43 | ```yaml 44 | SubpipeStepName: 45 | job_data: sub_job.json 46 | subpipe: my-subpipe-workflow 47 | retrieve: 48 | - filenameX.txt -> filenameY.txt 49 | - filenameZ.txt 50 | ``` 51 | The fields of the subpipe step are: 52 | - `job_data`: An S3 file that will be used to launch the subpipe. This may be the name of a file in the 53 | main workflow's repository, or a full S3 URI of a file that exists elsewhere. 54 | 55 | - `subpipe`: The name of the BayerCLAW workflow to be run as a subpipe. For testing purposes, you may also provide the 56 | Amazon Resource Name (ARN) of a Step Functions state machine that simulates the behavior of the real subpipe. 57 | 58 | - `retrieve`: A list of files to be copied from the subpipe's repository to the parent workflow's repository. 59 | Use the syntax `subpipe_filename -> parent_wf_filename` to rename the file, or just the name 60 | of the file if it does not need to be renamed. The `retrieve` field may be omitted if there are no files to 61 | copy into the parent workflow's repository. 62 | 63 | ### String substitution and file globs 64 | Values from the execution's job data file can be substituted into any filename in the `retrieve` 65 | field. For instance, this would be valid (though not really recommended): `${job.project_id}.txt -> ${job.sample_id}.txt`. 66 | 67 | Filename globbing is not available in subpipe steps. 68 | 69 | ### Subpipes and scatter/gather 70 | A subpipe may be invoked from inside of a scatter step. 
For instance, this is a small workflow that scatters 71 | over a set of sequence files, each branch passing a sequence file and a configuration file to a subpipe and 72 | collecting the .bam files produced: 73 | 74 | ```yaml 75 | DoScatter: 76 | scatter: 77 | contigs: contigs*.fa 78 | inputs: 79 | config: config.cfg 80 | steps: 81 | - 82 | RunSubpipe: 83 | # no "job_data" field here, were passing along the main job data file 84 | subpipe: sub-workflow 85 | retrieve: 86 | - output.bam 87 | outputs: 88 | bamfile: output.bam 89 | ``` 90 | 91 | While the `scatter` and `parent` variables from the scatter step are available to the subpipe 92 | step itself, *the workflow invoked by the subpipe will not have access to these values*. 93 | 94 | The sub-workflow, itself being an BayerCLAW workflow, may also contain its own scatter steps. 95 | 96 | ## Job tracking in the AWS console 97 | Although a subpipe call involves invoking a completely different workflow, AWS Step Functions makes it easy to track 98 | both executions through the AWS console. 99 | 100 | In the console, the parent pipeline execution will contain links to the subpipe execution under the 101 | `Execution event history` list: 102 | ![link to subpipe](resources/subpipes_step_functions_link1.png) 103 | 104 | And the subpipe execution console page will be linked back to the parent in the `Execution details` box: 105 | ![link to parent](resources/subpipes_step_functions_link2.png) 106 | 107 | Due to Step Functions execution naming restrictions, the subpipe execution will have a different name from the 108 | parent execution. 109 | -------------------------------------------------------------------------------- /util/bclaw_logs/lambda/tests/test_job_status.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from decimal import Decimal 3 | import json 4 | 5 | import boto3 6 | from boto3.dynamodb.conditions import Key 7 | from moto import mock_dynamodb2 8 | import pytest 9 | 10 | from ..src.job_status import lambda_handler 11 | 12 | 13 | @pytest.fixture(scope="function") 14 | def ddb_table(): 15 | with mock_dynamodb2(): 16 | dynamodb = boto3.resource("dynamodb", region_name="us-east-1") 17 | yld = dynamodb.create_table( 18 | AttributeDefinitions=[ 19 | { 20 | "AttributeName": "workflowName", 21 | "AttributeType": "S", 22 | }, 23 | { 24 | "AttributeName": "executionId", 25 | "AttributeType": "S", 26 | }, 27 | ], 28 | TableName="testTable", 29 | KeySchema=[ 30 | { 31 | "AttributeName": "workflowName", 32 | "KeyType": "HASH", 33 | }, 34 | { 35 | "AttributeName": "executionId", 36 | "KeyType": "RANGE", 37 | } 38 | ], 39 | BillingMode="PAY_PER_REQUEST" 40 | ) 41 | 42 | yield yld 43 | 44 | 45 | @pytest.mark.parametrize("status", ["RUNNING", "SUCCEEDED", "FAILED", "ABORTED"]) 46 | def test_lambda_handler(status, ddb_table, monkeypatch): 47 | monkeypatch.setenv("JOB_STATUS_TABLE", "testTable") 48 | monkeypatch.setenv("EXPIRATION_DAYS", "90") 49 | monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") 50 | 51 | timestamp_str = "2023-02-24T15:18:13Z" 52 | 53 | event = { 54 | "time": timestamp_str, 55 | "detail": { 56 | "name": "12345678-1234...", 57 | "status": status, 58 | "input": json.dumps( 59 | { 60 | "job_file": { 61 | "key": "test-workflow/path/to/job.file", 62 | "version": "987654321", 63 | }, 64 | } 65 | ), 66 | }, 67 | } 68 | 69 | lambda_handler(event, {}) 70 | 71 | chek = ddb_table.query( 72 | KeyConditionExpression=Key("workflow_name").eq("test-workflow"), 73 | 
Select="ALL_ATTRIBUTES" 74 | ) 75 | 76 | timestamp_obj = dt.datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S%z") 77 | expiration_obj = timestamp_obj + dt.timedelta(days=90) 78 | expected_timestamp = Decimal(int(timestamp_obj.timestamp())) 79 | expected_expiration = Decimal(int(expiration_obj.timestamp())) 80 | 81 | expect = { 82 | "executionId": "12345678-1234...", 83 | "workflowName": "test-workflow", 84 | "jobFile": "path/to/job.file#987654321", 85 | "status": status, 86 | "timestamp": expected_timestamp, 87 | "expiration": expected_expiration, 88 | } 89 | assert chek["Items"][0] == expect 90 | 91 | 92 | @pytest.mark.parametrize("old_status, new_status, expected_file_version", [ 93 | ("RUNNING", "RUNNING", "new"), 94 | ("SUCCEEDED", "RUNNING", "old"), 95 | ("FAILED", "RUNNING", "old"), 96 | ("ABORTED", "RUNNING", "old"), 97 | ("RUNNING", "SUCCEEDED", "new"), 98 | ("RUNNING", "FAILED", "new"), 99 | ("RUNNING", "ABORTED", "new"), 100 | ]) 101 | def test_lambda_handler_overwrite(old_status, new_status, expected_file_version, ddb_table, monkeypatch): 102 | monkeypatch.setenv("JOB_STATUS_TABLE", "testTable") 103 | monkeypatch.setenv("EXPIRATION_DAYS", "90") 104 | monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") 105 | 106 | ddb_table.put_item( 107 | Item={ 108 | "workflowName": "test-workflow", 109 | "executionId": "12345678-1234...", 110 | "status": old_status, 111 | "jobFile": "path/to/job.file#old" 112 | } 113 | ) 114 | 115 | event = { 116 | "time": "2021-05-28T17:53:54Z", 117 | "detail": { 118 | "name": "12345678-1234...", 119 | "status": new_status, 120 | "input": json.dumps( 121 | { 122 | "job_file": { 123 | "key": "test-workflow/path/to/job.file", 124 | # job file versions won't actually change during a run, this is just 125 | # a hack to check whether the record was overwritten 126 | "version": "new", 127 | }, 128 | } 129 | ), 130 | }, 131 | } 132 | 133 | lambda_handler(event, {}) 134 | 135 | chek = ddb_table.query( 136 | KeyConditionExpression=Key("workflow_name").eq("test-workflow"), 137 | Select="ALL_ATTRIBUTES" 138 | ) 139 | 140 | assert chek["Items"][0]["jobFile"].endswith("#" + expected_file_version) 141 | -------------------------------------------------------------------------------- /lambda/tests/compiler/test_subpipe_resources.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | import pytest 5 | import yaml 6 | 7 | from ...src.compiler.pkg.subpipe_resources import file_submit_step, run_subpipe_step, file_retrieve_step, handle_subpipe 8 | from ...src.compiler.pkg.util import Step, lambda_retry 9 | 10 | SUBMIT_BLOCK = { 11 | "submit": [ 12 | "file1.txt -> fileA.txt", 13 | "file2.txt", 14 | ], 15 | } 16 | 17 | 18 | @pytest.fixture(scope="module") 19 | def sample_subpipe_spec() -> dict: 20 | ret = { 21 | "job_data": "test_job_data.json", 22 | **SUBMIT_BLOCK, 23 | "subpipe": "arn:aws:states:us-east-1:123456789012:StateMachine:test-machine", 24 | "retrieve": [ 25 | "fileX.txt -> file3.txt", 26 | "fileY.txt", 27 | ], 28 | } 29 | return ret 30 | 31 | 32 | def test_file_submit_step(sample_subpipe_spec, compiler_env): 33 | test_step = Step("step_name", sample_subpipe_spec, "next_step_name") 34 | result = file_submit_step(test_step, "run_subpipe_step_name") 35 | expect = { 36 | "Type": "Task", 37 | "Resource": "subpipes_lambda_arn", 38 | "Parameters": { 39 | "repo.$": "$.repo.uri", 40 | "job_data": "test_job_data.json", 41 | "submit": json.dumps(SUBMIT_BLOCK["submit"]), 42 | "step_name": "step_name", 43 
| "logging": { 44 | "branch.$": "$.index", 45 | "job_file_bucket.$": "$.job_file.bucket", 46 | "job_file_key.$": "$.job_file.key", 47 | "job_file_version.$": "$.job_file.version", 48 | "sfn_execution_id.$": "$$.Execution.Name", 49 | "step_name": "step_name", 50 | "workflow_name": "${WorkflowName}", 51 | }, 52 | }, 53 | **lambda_retry(), 54 | "ResultPath": "$.subpipe", 55 | "OutputPath": "$", 56 | "Next": "run_subpipe_step_name", 57 | } 58 | assert result == expect 59 | 60 | 61 | def test_run_subpipe_step(sample_subpipe_spec): 62 | test_step = Step("step_name", sample_subpipe_spec, "next_step_name") 63 | result = run_subpipe_step(test_step, "retrieve_step_name") 64 | expect = { 65 | "Type": "Task", 66 | "Resource": "arn:aws:states:::states:startExecution.sync", 67 | "Parameters": { 68 | "Input": { 69 | "index": "main", 70 | "job_file.$": "$.job_file", 71 | "prev_outputs": {}, 72 | "repo.$": "$.subpipe.sub_repo", 73 | "share_id.$": "$.share_id", 74 | "AWS_STEP_FUNCTIONS_STARTED_BY_EXECUTION_ID.$": "$$.Execution.Id", 75 | }, 76 | "Name.$": "States.Format('{}_step_name', $$.Execution.Name)", 77 | "StateMachineArn": sample_subpipe_spec["subpipe"], 78 | }, 79 | "ResultPath": None, 80 | "OutputPath": "$", 81 | "Next": "retrieve_step_name" 82 | } 83 | assert result == expect 84 | 85 | 86 | @pytest.mark.parametrize("next_step_name, next_or_end", [ 87 | ("next_step", {"Next": "next_step"}), 88 | ("", {"End": True}), 89 | ]) 90 | def test_file_retrieve_step(next_step_name, next_or_end, sample_subpipe_spec, compiler_env): 91 | test_step = Step("step_name", sample_subpipe_spec, next_step_name) 92 | result = file_retrieve_step(test_step) 93 | expect = { 94 | "Type": "Task", 95 | "Resource": "subpipes_lambda_arn", 96 | "Parameters": { 97 | "repo.$": "$.repo.uri", 98 | "retrieve": json.dumps(sample_subpipe_spec["retrieve"]), 99 | "subpipe": { 100 | "sub_repo.$": "$.subpipe.sub_repo.uri", 101 | }, 102 | "logging": { 103 | "branch.$": "$.index", 104 | "job_file_bucket.$": "$.job_file.bucket", 105 | "job_file_key.$": "$.job_file.key", 106 | "job_file_version.$": "$.job_file.version", 107 | "sfn_execution_id.$": "$$.Execution.Name", 108 | "step_name": "step_name", 109 | "workflow_name": "${WorkflowName}", 110 | }, 111 | }, 112 | **lambda_retry(), 113 | "ResultSelector": {}, 114 | "ResultPath": "$.prev_outputs", 115 | "OutputPath": "$", 116 | **next_or_end 117 | } 118 | assert result == expect 119 | 120 | 121 | def test_handle_subpipe(sample_subpipe_spec, compiler_env): 122 | test_step = Step("step_name", sample_subpipe_spec, "next_step_name") 123 | states = handle_subpipe(test_step) 124 | assert len(states) == 3 125 | 126 | assert states[0].name == "step_name" 127 | assert states[0].spec["Next"] == "step_name.subpipe" 128 | 129 | assert states[1].name == "step_name.subpipe" 130 | assert states[1].spec["Next"] == "step_name.retrieve" 131 | 132 | assert states[2].name == "step_name.retrieve" 133 | assert states[2].spec["Next"] == "next_step_name" 134 | -------------------------------------------------------------------------------- /lambda/src/compiler/pkg/scatter_gather_resources.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import re 5 | from typing import Generator, List 6 | 7 | from . 
import state_machine_resources as sm 8 | from .util import Step, Resource, State, lambda_logging_block, lambda_retry 9 | 10 | 11 | def scatter_step(step: Step, map_step_name: str) -> dict: 12 | ret = { 13 | "Type": "Task", 14 | "Resource": os.environ["SCATTER_LAMBDA_ARN"], 15 | "Parameters": { 16 | "repo.$": "$.repo", 17 | "scatter": json.dumps(step.spec["scatter"]), 18 | **step.input_field, 19 | "outputs": json.dumps(step.spec["outputs"]), 20 | "step_name": step.name, 21 | **lambda_logging_block(step.name), 22 | }, 23 | **lambda_retry(), 24 | "ResultPath": "$.scatter", 25 | "Next": map_step_name 26 | } 27 | 28 | return ret 29 | 30 | 31 | def error_tolerance(spec) -> dict: 32 | # spec will have passed validation, so... 33 | if isinstance(spec, str): 34 | # if it's a string, it's a percentage between 0 and 100 and the last character is % 35 | ret = {"ToleratedFailurePercentage": int(spec[:-1])} 36 | else: 37 | # otherwise it's an int >= 0 38 | ret = {"ToleratedFailureCount": spec} 39 | return ret 40 | 41 | 42 | def map_step(step: Step, sub_branch: dict, gather_step_name: str) -> dict: 43 | label = re.sub(r"\W", "", step.name) 44 | 45 | ret = { 46 | "Type": "Map", 47 | "MaxConcurrency": step.spec["max_concurrency"], 48 | **error_tolerance(step.spec["error_tolerance"]), 49 | "Label": label[:40], 50 | "ItemReader": { 51 | "Resource": "arn:aws:states:::s3:getObject", 52 | "ReaderConfig": { 53 | "InputType": "CSV", 54 | "CSVHeaderLocation": "FIRST_ROW", 55 | }, 56 | "Parameters": { 57 | "Bucket.$": "$.scatter.items.bucket", 58 | "Key.$": "$.scatter.items.key", 59 | } 60 | }, 61 | "ItemSelector": { 62 | "index.$": "States.Format('{}', $$.Map.Item.Index)", # stringify the index 63 | "job_file.$": "$.job_file", 64 | "prev_outputs": {}, 65 | "scatter.$": "$$.Map.Item.Value", 66 | "repo.$": "$.scatter.repo", 67 | "share_id.$": "$.share_id" 68 | }, 69 | "ItemProcessor": { 70 | "ProcessorConfig": { 71 | "Mode": "DISTRIBUTED", 72 | "ExecutionType": "STANDARD" 73 | }, 74 | **sub_branch 75 | }, 76 | "ResultPath": None, 77 | "Next": gather_step_name, 78 | } 79 | 80 | return ret 81 | 82 | 83 | def scatter_init_step(parent_step_name: str) -> dict: 84 | step_name = f"{parent_step_name}.initialize" 85 | ret = { 86 | step_name: { 87 | "Type": "Task", 88 | "Resource": os.environ["SCATTER_INIT_LAMBDA_ARN"], 89 | "Parameters": { 90 | "index.$": "$.index", 91 | "repo.$": "$.repo", 92 | "scatter.$": "$.scatter", 93 | **lambda_logging_block(step_name) 94 | }, 95 | **lambda_retry(max_attempts=10), 96 | "ResultPath": "$.repo", 97 | "_stet": True, 98 | }, 99 | } 100 | return ret 101 | 102 | 103 | def gather_step(step: Step) -> dict: 104 | ret = { 105 | "Type": "Task", 106 | "Resource": os.environ["GATHER_LAMBDA_ARN"], 107 | "Parameters": { 108 | "repo.$": "$.repo.uri", 109 | "outputs": json.dumps(step.spec["outputs"]), 110 | "step_name": step.name, 111 | **lambda_logging_block(step.name), 112 | }, 113 | **lambda_retry(), 114 | "ResultPath": "$.prev_outputs", 115 | "OutputPath": "$", 116 | **step.next_or_end, 117 | } 118 | 119 | return ret 120 | 121 | 122 | def handle_scatter_gather(step: Step, 123 | options: dict, 124 | map_depth: int 125 | ) -> Generator[Resource, None, List[State]]: 126 | logger = logging.getLogger(__name__) 127 | logger.info(f"making scatter gather steps for {step.name}") 128 | 129 | if map_depth > 0: 130 | raise RuntimeError("Nested Scatter steps are not supported") 131 | 132 | sub_branch = yield from sm.make_branch([scatter_init_step(step.name)] + step.spec["steps"], 133 | options, depth=map_depth 
+ 1) 134 | 135 | scatter_step_name = step.name 136 | map_step_name = f"{step.name}.map" 137 | gather_step_name = f"{step.name}.gather" 138 | 139 | ret = [ 140 | State(scatter_step_name, scatter_step(step, map_step_name)), 141 | State(map_step_name, map_step(step, sub_branch, gather_step_name)), 142 | State(gather_step_name, gather_step(step)) 143 | ] 144 | 145 | return ret 146 | -------------------------------------------------------------------------------- /lambda/src/job_def/register.py: -------------------------------------------------------------------------------- 1 | """ 2 | When CloudFormation updates a Batch job definition, it will deactivate the old version automatically. This doesn't 3 | work well with blue/green deployments, where we want to keep the old version active in case a rollback is required. 4 | This lambda function will register a new version of the job definition without deactivating the old one. It is meant 5 | to be used as a custom resource in CloudFormation. 6 | """ 7 | 8 | from contextlib import contextmanager 9 | from dataclasses import dataclass, asdict, field 10 | import http.client 11 | import json 12 | import logging 13 | import os 14 | from typing import Generator 15 | import urllib.parse 16 | 17 | import boto3 18 | 19 | logger = logging.getLogger() 20 | logger.setLevel(logging.INFO) 21 | 22 | 23 | @dataclass() 24 | class Response: 25 | PhysicalResourceId: str 26 | StackId: str 27 | RequestId: str 28 | LogicalResourceId: str 29 | Status: str = "FAILED" 30 | Reason: str = "" 31 | NoEcho: bool = False 32 | Data: dict = field(default_factory=dict) 33 | 34 | def return_this(self, **kwargs): 35 | self.Data.update(**kwargs) 36 | 37 | 38 | def respond(url: str, body: dict): 39 | url_obj = urllib.parse.urlparse(url) 40 | body_json = json.dumps(body) 41 | 42 | https = http.client.HTTPSConnection(url_obj.hostname) 43 | https.request("PUT", url_obj.path + "?" 
+ url_obj.query, body_json) 44 | 45 | 46 | @contextmanager 47 | def responder(event, context, no_echo=False) -> Generator[Response, None, None]: 48 | response = Response( 49 | PhysicalResourceId=event.get("PhysicalResourceId"), 50 | StackId=event["StackId"], 51 | RequestId=event["RequestId"], 52 | LogicalResourceId=event["LogicalResourceId"], 53 | NoEcho=no_echo 54 | ) 55 | try: 56 | yield response 57 | logger.info("succeeded") 58 | response.Status = "SUCCESS" 59 | except: 60 | logger.exception("failed: ") 61 | response.Reason = f"see log group {context.log_group_name} / log stream {context.log_stream_name}" 62 | finally: 63 | logger.info(f"{asdict(response)=}") 64 | respond(event["ResponseURL"], asdict(response)) 65 | 66 | 67 | def edit_spec(spec: dict, wf_name: str, step_name: str, image: dict) -> dict: 68 | ret = spec.copy() 69 | ret["jobDefinitionName"] = f"{wf_name}_{step_name}" 70 | ret["containerProperties"]["environment"] += [{"name": "BC_WORKFLOW_NAME", "value": wf_name}, 71 | {"name": "BC_STEP_NAME", "value": step_name}, 72 | {"name": "AWS_DEFAULT_REGION", "value": os.environ["REGION"]}, 73 | {"name": "AWS_ACCOUNT_ID", "value": os.environ["ACCT_NUM"]}] 74 | ret["parameters"]["image"] = json.dumps(image, sort_keys=True, separators=(",", ":")) 75 | ret["tags"]["bclaw:workflow"] = wf_name 76 | return ret 77 | 78 | 79 | def lambda_handler(event: dict, context: object): 80 | # event[ResourceProperties] = { 81 | # workflowName: str 82 | # stepName: str 83 | # image: dict # str 84 | # spec: "{ 85 | # type: str 86 | # parameters: {str: str} 87 | # containerProperties: { 88 | # image: str 89 | # command: [str] 90 | # jobRoleArn: str 91 | # volumes: [dict] 92 | # environment: [{name: str, value: str}] 93 | # mountPoints: [dict] 94 | # resourceRequirements: [{value: str, type: str}] 95 | # } 96 | # consumableResourceProperties: dict 97 | # schedulingPriority: int 98 | # timeout: dict 99 | # propagateTags: bool 100 | # tags: dict 101 | # }" 102 | # } 103 | 104 | logger.info(f"{event=}") 105 | 106 | batch = boto3.client("batch") 107 | 108 | with responder(event, context) as cfn_response: 109 | if event["RequestType"] in ["Create", "Update"]: 110 | spec0 = json.loads(event["ResourceProperties"]["spec"]) 111 | spec = edit_spec(spec0, 112 | event["ResourceProperties"]["workflowName"], 113 | event["ResourceProperties"]["stepName"], 114 | event["ResourceProperties"]["image"]) 115 | logger.info(f"{spec=}") 116 | 117 | result = batch.register_job_definition(**spec) 118 | cfn_response.PhysicalResourceId = result["jobDefinitionArn"] 119 | cfn_response.return_this(Arn=result["jobDefinitionArn"]) 120 | 121 | else: 122 | # handle Delete requests 123 | try: 124 | if (job_def_id := event.get("PhysicalResourceId")) is not None: 125 | batch.deregister_job_definition(jobDefinition=job_def_id) 126 | else: 127 | logger.warning("no physical resource id found") 128 | except: 129 | logger.warning("deregistration failed: ") 130 | -------------------------------------------------------------------------------- /lambda/tests/compiler/test_enhanced_parallel_resources.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | import pytest 5 | import yaml 6 | 7 | from ...src.compiler.pkg.enhanced_parallel_resources import handle_parallel_step 8 | from ...src.compiler.pkg.util import Step, State, lambda_logging_block, lambda_retry 9 | 10 | 11 | @pytest.mark.parametrize("next_step_name, next_or_end", [ 12 | ("next_step", {"Next": "next_step"}), 13 | 
("", {"End": True}), 14 | ]) 15 | def test_handle_parallel_step_enhanced(next_step_name, next_or_end, compiler_env): 16 | spec_yaml = textwrap.dedent("""\ 17 | inputs: 18 | input1: file1.json 19 | input2: file2.json 20 | branches: 21 | - 22 | if: input1.qc == 1 23 | steps: 24 | - 25 | do_this: 26 | Type: Pass 27 | - 28 | do_that: 29 | Type: Pass 30 | - 31 | if: input2.qc == 2 32 | steps: 33 | - 34 | do_the_other: 35 | Type: Pass 36 | - 37 | steps: 38 | - 39 | always_do_this: 40 | Type: Pass 41 | - 42 | this_too: 43 | Type: Pass 44 | """) 45 | 46 | spec = yaml.safe_load(spec_yaml) 47 | options = {"wf": "params"} 48 | 49 | def helper(): 50 | step = Step("step_name", spec, next_step_name) 51 | 52 | result, *more = yield from handle_parallel_step(step, options, 0) 53 | assert len(more) == 0 54 | assert isinstance(result, State) 55 | assert result.spec["Type"] == "Parallel" 56 | assert result.spec["ResultPath"] is None 57 | assert result.spec["OutputPath"] == "$" 58 | assert next_or_end.items() <= result.spec.items() 59 | assert len(result.spec["Branches"]) == 3 60 | 61 | # branch "1" 62 | branch_1 = result.spec["Branches"][0] 63 | condition_1 = step.spec["branches"][0]["if"] 64 | check_step_name_1 = f"step_name: {condition_1}?" 65 | skip_step_name_1 = "step_name: skip_1" 66 | assert branch_1["StartAt"] == check_step_name_1 67 | assert set(branch_1["States"].keys()) == {check_step_name_1, skip_step_name_1, "do_this", "do_that"} 68 | 69 | expected_inputs = json.dumps(step.spec["inputs"], separators=(",", ":")) 70 | 71 | # -- check step 1 72 | check_1 = branch_1["States"][check_step_name_1] 73 | expect_1 = { 74 | "Type": "Task", 75 | "Resource": "chooser_lambda_arn", 76 | "Parameters": { 77 | "repo.$": "$.repo.uri", 78 | "inputs": expected_inputs, 79 | "expression": condition_1, 80 | **lambda_logging_block("step_name") 81 | }, 82 | **lambda_retry(), 83 | "Catch": [ 84 | { 85 | "ErrorEquals": ["ConditionFailed"], 86 | "Next": skip_step_name_1, 87 | }, 88 | ], 89 | "ResultPath": None, 90 | "OutputPath": "$", 91 | "Next": "do_this", 92 | } 93 | assert check_1 == expect_1 94 | 95 | # -- skip step 1 96 | skip_branch_1 = branch_1["States"][skip_step_name_1] 97 | assert skip_branch_1["Type"] == "Succeed" 98 | 99 | # branch "2" 100 | branch_2 = result.spec["Branches"][1] 101 | condition_2 = step.spec["branches"][1]["if"] 102 | check_step_name_2 = f"step_name: {condition_2}?" 
103 | skip_step_name_2 = "step_name: skip_2" 104 | assert branch_2["StartAt"] == check_step_name_2 105 | assert set(branch_2["States"].keys()) == {check_step_name_2, skip_step_name_2, "do_the_other"} 106 | 107 | # -- step check_2 108 | check_2 = branch_2["States"][check_step_name_2] 109 | expect_2 = { 110 | "Type": "Task", 111 | "Resource": "chooser_lambda_arn", 112 | "Parameters": { 113 | "repo.$": "$.repo.uri", 114 | "inputs": expected_inputs, 115 | "expression": condition_2, 116 | **lambda_logging_block("step_name") 117 | }, 118 | **lambda_retry(), 119 | "Catch": [ 120 | { 121 | "ErrorEquals": ["ConditionFailed"], 122 | "Next": skip_step_name_2, 123 | }, 124 | ], 125 | "ResultPath": None, 126 | "OutputPath": "$", 127 | "Next": "do_the_other", 128 | } 129 | assert check_2 == expect_2 130 | 131 | # -- step skip_branch_2 132 | skip_branch_2 = branch_2["States"][skip_step_name_2] 133 | assert skip_branch_2["Type"] == "Succeed" 134 | 135 | # branch "3" 136 | branch_3 = result.spec["Branches"][2] 137 | assert branch_3["StartAt"] == "always_do_this" 138 | assert set(branch_3["States"].keys()) == {"always_do_this", "this_too"} 139 | 140 | _ = list(helper()) 141 | -------------------------------------------------------------------------------- /lambda/src/initializer/initializer.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | from functools import partial 3 | import json 4 | import logging 5 | import re 6 | 7 | import boto3 8 | import jmespath 9 | 10 | from lambda_logs import log_preamble, log_event 11 | from repo_utils import SYSTEM_FILE_TAG 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | 16 | EXTENDED_JOB_DATA_FILE_NAME = "_JOB_DATA_" 17 | 18 | 19 | def read_s3_object(bucket: str, key: str, version: str) -> dict: 20 | s3 = boto3.client("s3") 21 | 22 | # raises "ClientError: An error occurred (NoSuchVersion)...The specified version does not exist." 
if file doesn't exist 23 | # raises "ClientError: An error occurred (InvalidArgument)...Invalid version id specified" if version doesn't exist 24 | response = s3.get_object(Bucket=bucket, Key=key, VersionId=version) 25 | 26 | with closing(response["Body"]) as fp: 27 | # this will raise JSONDecodeError for folder creation events (also empty 28 | # files & malformed JSON) 29 | ret = json.load(fp) 30 | return ret 31 | 32 | 33 | JOB_FINDER = re.compile(r"\${!?job.(.+?)}") 34 | 35 | def lookup(m: re.Match, job_data: dict) -> str: 36 | ret = jmespath.search(m.group(1), job_data) 37 | if ret is None: 38 | raise KeyError(f"'{m.group(1)}' not found in job data") 39 | return str(ret) 40 | 41 | 42 | def substitute_job_data(target: str, job_data: dict) -> str: 43 | _lookup = partial(lookup, job_data=job_data) 44 | ret = JOB_FINDER.sub(_lookup, target) 45 | return ret 46 | 47 | 48 | def check_recursive_launch(src_bucket: str, src_path: str, repo_bucket: str, repo_prefix: str) -> None: 49 | if src_bucket == repo_bucket: 50 | src_path_top_dir = src_path.split("/", 1)[0] 51 | repo_path_top_dir = repo_prefix.split("/", 1)[0] 52 | if src_path_top_dir == repo_path_top_dir: 53 | raise RuntimeError("repo cannot be in the launcher folder") 54 | 55 | 56 | def copy_job_data_to_repo(src_bucket: str, src_key: str, src_version: str, dst_bucket: str, dst_prefix: str) -> None: 57 | filename = src_key.rsplit("/", 1)[-1] 58 | dst_key = f"{dst_prefix}/{filename}" 59 | s3 = boto3.client("s3") 60 | s3.copy_object(CopySource={"Bucket": src_bucket, "Key": src_key, "VersionId": src_version}, 61 | Bucket=dst_bucket, Key=dst_key, 62 | Tagging=SYSTEM_FILE_TAG, 63 | TaggingDirective="REPLACE") 64 | 65 | 66 | def write_extended_job_data_object(raw_job_data: dict, dst_bucket: str, dst_prefix: str) -> None: 67 | job_data = { 68 | "job": raw_job_data, 69 | "scatter": {}, 70 | "parent": {}, 71 | } 72 | dst_key = f"{dst_prefix}/{EXTENDED_JOB_DATA_FILE_NAME}" 73 | s3 = boto3.client("s3") 74 | s3.put_object(Bucket=dst_bucket, Key=dst_key, 75 | Body=json.dumps(job_data).encode("utf-8"), 76 | Tagging=SYSTEM_FILE_TAG) 77 | 78 | 79 | def handle_s3_launch(event: dict) -> dict: 80 | src_bucket = event["input_obj"]["job_file"]["bucket"] 81 | src_key = event["input_obj"]["job_file"]["key"] 82 | src_version = event["input_obj"]["job_file"]["version"] 83 | 84 | # if bucket versioning is suspended,version will be an empty string 85 | job_data = read_s3_object(src_bucket, src_key, src_version) 86 | 87 | repo = substitute_job_data(event["repo_template"], job_data) 88 | repo_bucket, repo_prefix = repo.split("/", 3)[2:] 89 | 90 | check_recursive_launch(src_bucket, src_key, repo_bucket, repo_prefix) 91 | 92 | copy_job_data_to_repo(src_bucket, src_key, src_version, repo_bucket, repo_prefix) 93 | write_extended_job_data_object(job_data, repo_bucket, repo_prefix) 94 | 95 | share_id = re.sub(r"[\W_]+", "", event["workflow_name"]) 96 | 97 | ret = { 98 | "index": event["input_obj"]["index"], 99 | "job_file": { 100 | "bucket": src_bucket, 101 | "key": src_key, 102 | "version": src_version, 103 | }, 104 | "repo": { 105 | "bucket": repo_bucket, 106 | "prefix": repo_prefix, 107 | "uri": repo, 108 | }, 109 | "prev_outputs": {}, 110 | "share_id": share_id, 111 | } 112 | 113 | return ret 114 | 115 | 116 | def lambda_handler(event: dict, context: object) -> dict: 117 | # event = { 118 | # workflow_name: str 119 | # repo_template: str 120 | # input_obj: {} 121 | # logging: { 122 | # branch: str 123 | # job_file_bucket: str 124 | # job_file_key: str 125 | # 
job_file_version: str 126 | # sfn_execution_id: str 127 | # step_name: str 128 | # workflow_name: str 129 | # } 130 | # } 131 | 132 | log_preamble(**event.pop("logging"), logger=logger) 133 | log_event(logger, event) 134 | 135 | if "AWS_STEP_FUNCTIONS_STARTED_BY_EXECUTION_ID" in event["input_obj"]: 136 | # this is a subpipe execution...nothing to do but pass along the input object 137 | logger.info("subpipe launch detected") 138 | ret = event["input_obj"] 139 | 140 | else: 141 | logger.info(f"s3 launch detected") 142 | ret = handle_s3_launch(event) 143 | 144 | logger.info(f"returning {str(ret)}") 145 | return ret 146 | -------------------------------------------------------------------------------- /bclaw_runner/src/runner/runner_main.py: -------------------------------------------------------------------------------- 1 | """ 2 | run stuff 3 | 4 | Usage: 5 | bclaw_runner.py [options] 6 | 7 | Options: 8 | -c COMMANDS command 9 | -f JSON_STRING reference files 10 | -i JSON_STRING input files 11 | -k STRING step skip condition: output, rerun, none [default: none] 12 | -m JSON_STRING Docker image spec 13 | -o JSON_STRING output files 14 | -q JSON_STRING QC check spec 15 | -r S3_PATH repository path 16 | -s SHELL unix shell to run commands in (bash | sh | sh-pipefail) [default: sh] 17 | -t JSON_STRING global s3 tags 18 | -h show help 19 | --version show version 20 | """ 21 | 22 | from functools import partial, partialmethod 23 | import json 24 | import logging.config 25 | import os 26 | from typing import Dict, List 27 | 28 | from docopt import docopt 29 | 30 | from .cache import get_reference_inputs 31 | from .string_subs import substitute, substitute_image_tag 32 | from .preamble import log_preamble 33 | from .qc_check import do_checks, abort_execution, QCFailure 34 | from .repo import Repository, SkipExecution 35 | from .instance import get_imdsv2_token, tag_this_instance, spot_termination_checker 36 | from .workspace import workspace, write_job_data_file, run_commands, UserCommandsFailed 37 | 38 | logging.basicConfig(level=logging.INFO) 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | def main(commands: List[str], 43 | image_spec: dict, 44 | inputs: Dict[str, str], 45 | outputs: Dict[str, str | Dict], 46 | qc: List[dict], 47 | references: Dict[str, str], 48 | repo_path: str, 49 | shell: str, 50 | skip: str, 51 | tags: Dict[str, str]) -> int: 52 | exit_code = 0 53 | try: 54 | repo = Repository(repo_path) 55 | 56 | if skip == "rerun": 57 | repo.check_for_previous_run() 58 | elif skip == "output": 59 | repo.check_files_exist(list(outputs.values())) 60 | 61 | repo.clear_run_status() 62 | 63 | job_data_obj = repo.read_job_data() 64 | 65 | jobby_commands = substitute(commands, job_data_obj) 66 | jobby_inputs = substitute(inputs, job_data_obj) 67 | jobby_outputs = substitute(outputs, job_data_obj) # this will recurse down to s3_tags 68 | jobby_references = substitute(references, job_data_obj) 69 | jobby_tags = substitute(tags, job_data_obj) 70 | 71 | jobby_image_spec = substitute_image_tag(image_spec, job_data_obj) 72 | 73 | with workspace() as wrk: 74 | # download references, link to workspace 75 | local_references = get_reference_inputs(jobby_references) 76 | 77 | # download inputs -> returns local filenames 78 | local_inputs = repo.download_inputs(jobby_inputs) 79 | local_outputs = {k.rstrip("!"): v["name"] for k, v in jobby_outputs.items()} 80 | 81 | subbed_commands = substitute(jobby_commands, 82 | local_inputs | 83 | local_outputs | 84 | local_references) 85 | 86 | local_job_data = 
write_job_data_file(job_data_obj, wrk) 87 | 88 | try: 89 | run_commands(jobby_image_spec, subbed_commands, wrk, local_job_data, shell) 90 | do_checks(qc) 91 | 92 | finally: 93 | repo.upload_outputs(jobby_outputs, jobby_tags) 94 | 95 | except UserCommandsFailed as uce: 96 | logger.error(str(uce)) 97 | exit_code = uce.exit_code 98 | 99 | except QCFailure as qcf: 100 | logger.error(str(qcf)) 101 | abort_execution(qcf.failures) 102 | 103 | except SkipExecution as se: 104 | logger.info(str(se)) 105 | pass 106 | 107 | except Exception as e: 108 | logger.exception("bclaw_runner error: ") 109 | exit_code = 199 110 | 111 | else: 112 | repo.put_run_status() 113 | logger.info("runner finished") 114 | 115 | return exit_code 116 | 117 | 118 | def cli() -> int: 119 | log_preamble() 120 | logger.info("---------- bclaw_runner starting ----------") 121 | get_imdsv2_token() 122 | tag_this_instance() 123 | 124 | # create custom log level for user commands 125 | # https://stackoverflow.com/a/55276759 126 | logging.USER_CMD = logging.INFO + 5 # between INFO and WARNING 127 | logging.addLevelName(logging.USER_CMD, "USER_CMD") 128 | logging.Logger.user_cmd = partialmethod(logging.Logger.log, logging.USER_CMD) 129 | logging.user_cmd = partial(logging.log, logging.USER_CMD) 130 | 131 | with spot_termination_checker(): 132 | args = docopt(__doc__, version=os.environ["BC_VERSION"]) 133 | 134 | commands = json.loads(args["-c"]) 135 | image = json.loads(args["-m"]) 136 | inputs = json.loads(args["-i"]) 137 | outputs = json.loads(args["-o"]) 138 | qc = json.loads(args["-q"]) 139 | refs = json.loads(args["-f"]) 140 | repo = args["-r"] 141 | shell = args["-s"] 142 | skip = args["-k"] 143 | tags = json.loads(args["-t"]) 144 | 145 | ret = main(commands, image, inputs, outputs, qc, refs, repo, shell, skip, tags) 146 | return ret 147 | -------------------------------------------------------------------------------- /bclaw_runner/tests/test_string_subs.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from ..src.runner.string_subs import lookup, substitute, substitute_image_tag 6 | 7 | 8 | @pytest.mark.parametrize("pattern, string, expect", [ 9 | (r"(one)", "one", "wun"), 10 | (r"(two)", "two", "2"), 11 | (r"(three)", "three", ""), 12 | (r"(four)", "four", "False"), 13 | (r"(not_found)", "not_found", "not_found") 14 | ]) 15 | def test_lookup(pattern, string, expect): 16 | spec = { 17 | "one": "wun", 18 | "two": 2, 19 | "three": "", 20 | "four": False, 21 | } 22 | match = re.match(pattern, string) 23 | result = lookup(match, spec) 24 | assert isinstance(result, str) 25 | assert result == expect 26 | 27 | 28 | def test_substitute_string(): 29 | subs = { 30 | "w": "was", 31 | "x": "am", 32 | "y": { 33 | "z": "very", 34 | "t": "singular", 35 | }, 36 | "p": ["exemplar", {"what": "model"}, "blueprint"], 37 | "q": ["modern", "major"], 38 | } 39 | target = "I ${x} the ${y.z} ${p[1].what} of a ${q} ${general}" 40 | result = substitute(target, subs) 41 | expect = "I am the very model of a ['modern', 'major'] ${general}" 42 | assert result == expect 43 | 44 | 45 | def test_substitute_nested(): 46 | subs = { 47 | "metadata": { 48 | "received": "2022-06-01", 49 | }, 50 | } 51 | target = "received on ${metadata.received}" 52 | result = substitute(target, subs) 53 | expect = "received on 2022-06-01" 54 | assert result == expect 55 | 56 | 57 | def test_substitute_falsy_values(): 58 | subs = { 59 | "job": { 60 | "boolean_T": True, 61 | "boolean_F": False, 62 | 
"null": None, 63 | "zero": 0, 64 | "empty_string": "", 65 | } 66 | } 67 | target = "command ${job.boolean_T} ${job.boolean_F} ${job.null} ${job.zero} <${job.empty_string}> ${job.not_found}" 68 | result = substitute(target, subs) 69 | expect = "command True False ${job.null} 0 <> ${job.not_found}" 70 | assert result == expect 71 | 72 | 73 | def test_substitute_recursion(): 74 | subs = { 75 | "a": 99, 76 | "b": "two", 77 | } 78 | target = { 79 | "one": [ 80 | { 81 | "three": "${a}", 82 | "four": "${b}", 83 | }, 84 | [ 85 | "${a}", 86 | "${b}", 87 | ], 88 | "${a} ${b}", 89 | ], 90 | "two": { 91 | "seven": { 92 | "five": "${a}", 93 | "six": "${b}", 94 | }, 95 | "eight": [ 96 | "${a}", 97 | "${b}", 98 | ], 99 | "nine": "${a} ${b}" 100 | }, 101 | } 102 | result = substitute(target, subs) 103 | expect = { 104 | "one": [ 105 | { 106 | "three": "99", 107 | "four": "two", 108 | }, 109 | [ 110 | "99", 111 | "two", 112 | ], 113 | "99 two", 114 | ], 115 | "two": { 116 | "seven": { 117 | "five": "99", 118 | "six": "two", 119 | }, 120 | "eight": [ 121 | "99", 122 | "two", 123 | ], 124 | "nine": "99 two" 125 | }, 126 | } 127 | assert result == expect 128 | 129 | 130 | @pytest.mark.parametrize("original, expect", [ 131 | ("docker.io/library/single:${sub}", "docker.io/library/single:tag"), 132 | ("no_${a}_registry:${sub}", "no_eh_registry:tag"), 133 | ("no_registry:no_subs", "no_registry:no_subs"), 134 | ("public.ecr.aws/docker/library/multi:${a}_${b}_${c}", "public.ecr.aws/docker/library/multi:eh_bee_sea"), 135 | ("123456789012.dkr.ecr.us-east-1.amazonaws.com/no:subs", "123456789012.dkr.ecr.us-east-1.amazonaws.com/no:subs"), 136 | ("123456789012.dkr.ecr.us-east-1.amazonaws.com/no_tags", "123456789012.dkr.ecr.us-east-1.amazonaws.com/no_tags"), 137 | ("myregistryhost:5000/fedora/httpd:${sub}", "myregistryhost:5000/fedora/httpd:tag"), # https://docs.docker.com/engine/reference/commandline/tag/#tag-an-image-for-a-private-repository 138 | ("probably:${a}/highly/${b}/illegal/${c}:${sub}", "probably:${a}/highly/${b}/illegal/sea:tag"), 139 | ]) 140 | def test_substitute_image_tag(original, expect): 141 | spec = { 142 | "sub": "tag", 143 | "a": "eh", 144 | "b": "bee", 145 | "c": "sea", 146 | } 147 | image_spec = {"name": original, "auth": "doesnt_change"} 148 | result = substitute_image_tag(image_spec, spec) 149 | assert result["name"] == expect 150 | assert result["auth"] == "doesnt_change" 151 | 152 | 153 | def test_substitute_tagged_output(): 154 | output_spec = { 155 | "name": "fake_${a}_filename", 156 | "s3_tags": { 157 | "tag1": "value_${a}", 158 | "tag2": "value_${b}", 159 | } 160 | } 161 | 162 | subs = { 163 | "a": 99, 164 | "b": "two", 165 | } 166 | 167 | expect = { 168 | "name": "fake_99_filename", 169 | "s3_tags": { 170 | "tag1": "value_99", 171 | "tag2": "value_two", 172 | } 173 | } 174 | 175 | result = substitute(output_spec, subs) 176 | assert result == expect 177 | -------------------------------------------------------------------------------- /lambda/src/subpipes/subpipes.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor 2 | from contextlib import closing 3 | from functools import partial 4 | import json 5 | import logging 6 | import re 7 | 8 | import boto3 9 | 10 | from lambda_logs import log_preamble, log_event 11 | from repo_utils import SYSTEM_FILE_TAG 12 | from substitutions import substitute_job_data 13 | 14 | logger = logging.getLogger() 15 | logger.setLevel(logging.INFO) 16 | 17 | 18 | def 
get_s3_object(s3_uri: str) -> dict: 19 | logger.info(f"reading {s3_uri}") 20 | bucket, key = s3_uri.split("/", 3)[2:] 21 | obj = boto3.resource("s3").Object(bucket, key) 22 | response = obj.get() 23 | with closing(response["Body"]) as fp: 24 | ret = json.load(fp) 25 | return ret 26 | 27 | 28 | # this is only used to write the _JOB_DATA_ object in the subpipe repo 29 | def put_s3_object(s3_uri: str, body: bytes) -> None: 30 | logger.info(f"writing {s3_uri}") 31 | bucket, key = s3_uri.split("/", 3)[2:] 32 | obj = boto3.resource("s3").Object(bucket, key) 33 | obj.put(Body=body, Tagging=SYSTEM_FILE_TAG) 34 | 35 | 36 | def copy_file_impl(spec: str, src_repo_uri: str, dst_repo_uri: str) -> None: 37 | try: 38 | src_file, dst_file = re.split(r"\s*->\s*", spec) 39 | except ValueError: 40 | src_file = dst_file = spec 41 | 42 | src_uri = f"{src_repo_uri}/{src_file}" 43 | dst_uri = f"{dst_repo_uri}/{dst_file}" 44 | 45 | src_bucket, src_key = src_uri.split("/", 3)[2:] 46 | dst_bucket, dst_key = dst_uri.split("/", 3)[2:] 47 | 48 | logger.info(f"copying s3://{src_bucket}/{src_key} to s3://{dst_bucket}/{dst_key}") 49 | 50 | copy_src = { 51 | "Bucket": src_bucket, 52 | "Key": src_key 53 | } 54 | dst_obj = boto3.resource("s3").Object(dst_bucket, dst_key) 55 | dst_obj.copy(copy_src) 56 | 57 | 58 | def lambda_handler(event: dict, context: object) -> dict: 59 | # submit event = { 60 | # repo: str 61 | # job_data: str | None 62 | # submit: str 63 | # step_name: str 64 | # logging: { 65 | # branch: str 66 | # job_file_bucket: str 67 | # job_file_key: str 68 | # job_file_version: str 69 | # sfn_execution_id: str 70 | # step_name: str 71 | # workflow_name: str 72 | # } 73 | # } 74 | # retrieve event = { 75 | # repo: str 76 | # retrieve: str 77 | # subpipe: { 78 | # sub_repo: str 79 | # } 80 | # logging: { 81 | # branch: str 82 | # job_file_bucket: str 83 | # job_file_key: str 84 | # job_file_version: str 85 | # sfn_execution_id: str 86 | # step_name: str 87 | # workflow_name: str 88 | # } 89 | # } 90 | 91 | log_preamble(**event.pop("logging"), logger=logger) 92 | log_event(logger, event) 93 | 94 | parent_repo = event["repo"] 95 | parent_job_data = get_s3_object(f"{parent_repo}/_JOB_DATA_") 96 | 97 | if "submit" in event: 98 | # establish subpipe repo 99 | sub_repo = f"{parent_repo}/{event['step_name']}" 100 | logger.info(f"{sub_repo=}") 101 | 102 | if (sub_job_data_uri := event.get("job_data")) is not None: 103 | if not sub_job_data_uri.startswith("s3://"): 104 | sub_job_data_uri = f"{parent_repo}/{sub_job_data_uri}" 105 | logger.info(f"{sub_job_data_uri=}") 106 | sub_job_data = get_s3_object(sub_job_data_uri) 107 | 108 | else: 109 | logger.info("using parent job data for subpipe") 110 | sub_job_data = parent_job_data["job"] 111 | 112 | # create job data for subpipe 113 | sub_job_data = { 114 | "job": sub_job_data, 115 | "parent": {}, 116 | "scatter": {}, 117 | } 118 | 119 | # write job data to subpipe repo 120 | sub_job_data_dst = f"{sub_repo}/_JOB_DATA_" 121 | logger.info(f"writing job data to {sub_job_data_dst}") 122 | put_s3_object(sub_job_data_dst, json.dumps(sub_job_data).encode("utf-8")) 123 | 124 | # get submit strings -> spec strings 125 | spec_strings = json.loads(event["submit"]) 126 | 127 | src_repo_uri = parent_repo 128 | dst_repo_uri = sub_repo 129 | 130 | elif "retrieve" in event: 131 | # get retrieve strings -> spec strings 132 | spec_strings = json.loads(event["retrieve"]) 133 | 134 | # get subpipe repo 135 | src_repo_uri = sub_repo = event["subpipe"]["sub_repo"] 136 | dst_repo_uri = parent_repo 
137 | 138 | else: 139 | raise RuntimeError("unknown input type") 140 | 141 | if spec_strings: 142 | # substitute job data into spec strings 143 | subbed_specs = substitute_job_data(spec_strings, parent_job_data) 144 | 145 | # copy files from src repo to dest 146 | with ThreadPoolExecutor(max_workers=len(subbed_specs)) as executor: 147 | copy_file = partial(copy_file_impl, 148 | src_repo_uri=src_repo_uri, 149 | dst_repo_uri=dst_repo_uri) 150 | _ = list(executor.map(copy_file, subbed_specs)) 151 | 152 | # return sub repo 153 | sub_repo_bucket, sub_repo_prefix = sub_repo.split("/", 3)[2:] 154 | ret = { 155 | "sub_repo": { 156 | "bucket": sub_repo_bucket, 157 | "prefix": sub_repo_prefix, 158 | "uri": sub_repo, 159 | } 160 | } 161 | 162 | return ret 163 | -------------------------------------------------------------------------------- /doc/notifications.md: -------------------------------------------------------------------------------- 1 | # BayerCLAW notifications 2 | 3 | BayerCLAW is able to send notifications about job status to users through Amazon's Simple 4 | Notification Service (SNS). Through SNS, users may receive email or SMS text messages when 5 | a job is received for processing or when processing starts, finishes successfully, or 6 | fails. 7 | 8 | ### Subscribing to BayerCLAW notifications 9 | 10 | The default BayerCLAW notification topic is named `bayerclaw2-core-notifications`[1](#f1). 11 | All workflows send notifications to this topic. To receive notifications from a workflow, users must 12 | subscribe to this topic. Using the AWS console: 13 | 14 | 1. Navigate to Amazon SNS → Subscriptions → Create subscription. 15 | 2. Find the `bayerclaw2-core-notifications` topic in the Topic ARN search box. 16 | 3. Choose a protocol, such as Email or SMS. Other protocols, such as AWS Lambda or Amazon SQS, 17 | are also available to use for automation (see [Automation](#automation)). 18 | 4. Enter an endpoint: your email address or mobile number for SMS. 19 | 5. (Optional) To subscribe to a subset of messages, enter a filter policy. 20 | See [Filtering notifications](#filtering-notifications) for details. 21 | 6. Click `Create subscription`. 22 | 23 | AWS will send you an email or text message requesting confirmation of your subscription. You must 24 | accept to start receiving notifications. 25 | 26 | Notification messages will have a format similar to this: 27 | ```yaml 28 | Job input_file_09876543 ('input_file.json') on workflow sample-workflow has finished. 29 | --- 30 | details: 31 | workflow_name: sample-workflow 32 | execution_id: input_file_09876543 33 | job_status: SUCCEEDED 34 | job_data: s3://bclaw-main-launcher-123456789012/sample-workflow/input_file.json 35 | job_data_version: 09876543211234567890 36 | ``` 37 | 38 | See the [SNS documentation](https://docs.aws.amazon.com/sns/latest/dg/sns-create-subscribe-endpoint-to-topic.html) 39 | for more information on SNS subscriptions. 40 | 41 | ### Filtering notifications 42 | 43 | It is very unlikely that you will want to be notified of every event in every workflow that you run. 44 | SNS messages can be filtered based on attributes attached to the message. Filters are 45 | expressed as filter policies that are added to your subscription. 46 | 47 | BayerCLAW provides the following attributes for filtering messages: 48 | 49 | - `workflow_name`: The name of the workflow that sent the notification. 50 | - `status`: The value of the `job_status` detail as shown in the sample 51 | message above.
The possible values of `status` are: 52 | - RECEIVED: your job data file has been received by the workflow. 53 | - RUNNING: execution of your job has started. 54 | - SUCCEEDED: execution finished successfully. 55 | - FAILED: execution finished unsuccessfully. 56 | - ABORTED: the job was aborted, possibly on user request. 57 | - TIMED_OUT: if, somehow, your job manages to run for more than a year, you'll see this one... 58 | - `execution_id`: The ID of the Step Functions execution that sent the notification. 59 | - `job_file_bucket`, `job_file_key` and `job_file_version`: Together, these specify the job data file that 60 | launched the execution in question. 61 | 62 | Filter policies are JSON-formatted documents. 63 | As an example, a filter policy that only allows messages from jobs that failed or were aborted 64 | on workflow `sample-workflow` would look like this: 65 | 66 | ```json5 67 | { 68 | "workflow_name": ["sample-workflow"], 69 | "status": ["FAILED", "ABORTED"] 70 | } 71 | ``` 72 | 73 | For more information on SNS filter policies, see the AWS documentation 74 | [here](https://docs.aws.amazon.com/sns/latest/dg/sns-subscription-filter-policies.html) and 75 | [here](https://docs.aws.amazon.com/sns/latest/dg/message-filtering-apply.html). 76 | 77 | ### Automation 78 | 79 | Besides sending messages to users, SNS can be used to trigger AWS Lambda functions which can in turn 80 | launch follow-on processes or send the notifications on to services like Slack. To facilitate this, 81 | BayerCLAW notification messages are actually YAML-formatted data structures[2](#f2). 82 | 83 | In Python, a BayerCLAW message can be parsed using the [PyYAML package](https://pypi.org/project/PyYAML/) as follows: 84 | 85 | ```python 86 | import yaml 87 | ... 88 | result = list(yaml.safe_load_all(message)) 89 | ``` 90 | 91 | Using this command, a message that looks like this: 92 | 93 | ```yaml 94 | Job input_file_09876543 ('input_file.json') on workflow sample-workflow has finished. 95 | --- 96 | details: 97 | workflow_name: sample-workflow 98 | execution_id: input_file_09876543 99 | job_status: SUCCEEDED 100 | job_data: s3://bclaw-main-launcher-123456789012/sample-workflow/input_file.json 101 | job_data_version: 09876543211234567890 102 | ``` 103 | 104 | will become a data structure that looks like this: 105 | 106 | ```python 107 | [ 108 | "Job input_file_09876543 ('input_file.json') on workflow sample-workflow has finished.", 109 | { 110 | "details": { 111 | "workflow_name": "sample-workflow", 112 | "job_status": "SUCCEEDED", 113 | "execution_id": "input_file_09876543", 114 | "job_data": "s3://bclaw-main-launcher-123456789012/sample-workflow/input_file.json", 115 | "job_data_version": "09876543211234567890", 116 | } 117 | } 118 | ] 119 | ``` 120 | 121 |
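As a sketch of how this can be used for automation, the Lambda function below parses a BayerCLAW notification delivered through SNS and reacts to failed or aborted jobs. This is a minimal illustration rather than part of BayerCLAW itself: it assumes the standard SNS-to-Lambda event format and that PyYAML is packaged with the function, and what you do with the parsed details (here, just logging) is up to you.

```python
import logging

import yaml

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def lambda_handler(event: dict, context: object) -> None:
    # an SNS-triggered Lambda receives a list of records, each wrapping one published message
    for record in event["Records"]:
        message = record["Sns"]["Message"]

        # a BayerCLAW notification is a pair of YAML documents: a summary string and a details mapping
        summary, body = list(yaml.safe_load_all(message))
        details = body["details"]

        if details["job_status"] in ("FAILED", "ABORTED"):
            # a real handler might page someone or post to Slack here
            logger.error(f"{details['workflow_name']} execution {details['execution_id']} "
                         f"ended with status {details['job_status']} (job data: {details['job_data']})")
        else:
            logger.info(summary)
```

Note that a function like this still needs its own subscription to the notification topic (with a filter policy, if desired) and permission for SNS to invoke it.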
122 | 123 | 1 If you have multiple BayerCLAW installations in your account, each installation will have 124 | a topic with a corresponding name[↵](#a1) 125 | 126 | 2 Technically, a pair of YAML documents: a bare string and a mapping. [↵](#a2) 127 | 128 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guidelines 2 | 3 | ## Pull requests are always welcome 4 | 5 | We're trying very hard to keep our systems simple, lean and focused. We don't want them to be everything for everybody. This means that we might decide 6 | against incorporating a new request. 7 | 8 | 9 | ## Create issues... 10 | 11 | Any significant change should be documented as a GitHub issue before anybody starts working on it. 12 | 13 | 14 | ### ...but check for existing issues first! 15 | 16 | Please take a moment to check that an issue doesn't already exist documenting your request. If it does, it never hurts to add a quick "+1" or "I need this too". This will help prioritize the most common requests. 17 | 18 | 19 | ## Conventions 20 | 21 | Fork the repository and make changes on your fork on a branch: 22 | 23 | 1. Create the right type of issue (defect, enhancement, test, etc) 24 | 2. Name the branch N-something where N is the number of the issue. 25 | 26 | Note that the maintainers work on branches in this repository. 27 | 28 | Work hard to ensure your pull request is valid. This includes code quality, clear naming, and including unit tests. Please read the Code Of Conduct at the bottom of this file. 29 | 30 | Pull request descriptions should be as clear as possible and include a reference to all the issues that they address. In GitHub, you can reference an 31 | issue by adding a line to your commit description that follows the format: 32 | 33 | `Fixes #N` 34 | 35 | where N is the issue number. 36 | 37 | 38 | ## Merge approval 39 | 40 | Repository maintainers will review the pull request and make sure it provides the correct level of code quality & correctness. 41 | 42 | 43 | 44 | ## How are decisions made? 45 | 46 | Short answer: with pull requests to this repository. 47 | 48 | All decisions, big and small, follow the same 3 steps: 49 | 50 | 1. Open a pull request. Anyone can do this. 51 | 52 | 2. Discuss the pull request. Anyone can do this. 53 | 54 | 3. Accept or refuse a pull request. The relevant maintainers do this (see below "Who decides what?") 55 | 56 | 1. Accepting pull requests 57 | 58 | 1. If the pull request appears to be ready to merge, approve it. 59 | 60 | 2. If the pull request has some small problems that need to be changed, make a comment addressing the issues. 61 | 62 | 3. If the changes needed to a PR are small, you can add a "LGTM once the following comments are addressed..." this will reduce needless back and forth. 63 | 64 | 4. If the PR only needs a few changes before being merged, any MAINTAINER can make a replacement PR that incorporates the existing commits and fixes the problems before a fast track merge. 65 | 66 | 2. Closing pull requests 67 | 68 | 1. If a PR appears to be abandoned, after having attempted to contact the original contributor, then a replacement PR may be made. Once the replacement PR is made, any contributor may close the original one. 69 | 70 | 2. If you are not sure if the pull request implements a good feature or you do not understand the purpose of the PR, ask the contributor to provide more documentation. 
If the contributor is not able to adequately explain the purpose of the PR, the PR may be closed by any MAINTAINER. 71 | 72 | 3. If a MAINTAINER feels that the pull request is sufficiently architecturally flawed, or if the pull request needs significantly more design discussion before being considered, the MAINTAINER should close the pull request with a short explanation of what discussion still needs to be had. It is important not to leave such pull requests open, as this will waste both the MAINTAINER's time and the contributor's time. It is not good to string a contributor along for weeks or months, having them make many changes to a PR that will eventually be rejected. 73 | 74 | 75 | ## Who decides what? 76 | 77 | All decisions are pull requests, and the relevant maintainers make decisions by accepting or refusing pull requests. Review and acceptance by anyone is 78 | denoted by adding a comment in the pull request: `LGTM`. However, only currently listed `MAINTAINERS` are counted towards the required majority. 79 | 80 | The maintainers will be listed in the MAINTAINERS file; all of these people will be employees of Bayer. 81 | 82 | 83 | ## I'm a maintainer, should I make pull requests too? 84 | 85 | Yes. Nobody should ever push to master directly. All changes should be made through a pull request. 86 | 87 | ## Code Of Conduct 88 | 89 | As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. 90 | 91 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 92 | 93 | Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct. 94 | 95 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team. 96 | 97 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.
98 | 99 | This Code of Conduct is adapted from the Contributor Covenant, version 1.0.0, available at https://www.contributor-covenant.org/version/1/0/0/code-of-conduct.html 100 | -------------------------------------------------------------------------------- /lambda/tests/notifications/test_notifications.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import boto3 4 | import moto 5 | import pytest 6 | import yaml 7 | 8 | from ...src.notifications.notifications import (make_state_change_message, make_message_attributes, 9 | make_sns_payload, lambda_handler) 10 | 11 | WORKFLOW_NAME = "test_workflow" 12 | 13 | REGION = "us-east-1" 14 | EXECUTION_NAME = "12345678-etc-etc" 15 | STATE_MACHINE_NAME = "testStateMachine" 16 | STATE_MACHINE_ARN = f"arn:aws:states:{REGION}:123456789012:stateMachine:{STATE_MACHINE_NAME}" 17 | EXECUTION_ARN = f"arn:aws:states:{REGION}:123456789012:execution:{STATE_MACHINE_NAME}:{EXECUTION_NAME}" 18 | 19 | LAUNCHER_BUCKET = "test-bucket" 20 | JOB_DATA_KEY = "path/to/job.json" 21 | JOB_DATA_VERSION = "1234567890" 22 | JOB_DATA_URI = f"s3://{LAUNCHER_BUCKET}/{JOB_DATA_KEY}" 23 | 24 | 25 | @pytest.fixture(scope="module") 26 | def state_change_event_factory(): 27 | input_obj = { 28 | "job_file": { 29 | "bucket": LAUNCHER_BUCKET, 30 | "key": JOB_DATA_KEY, 31 | "version": JOB_DATA_VERSION, 32 | }, 33 | "index": "main", 34 | } 35 | 36 | def _event_impl(status: str = "UNKNOWN") -> dict: 37 | ret = { 38 | "detail": { 39 | "executionArn": EXECUTION_ARN, 40 | "stateMachineArn": STATE_MACHINE_ARN, 41 | "name": EXECUTION_NAME, 42 | "status": status, 43 | "input": json.dumps(input_obj), 44 | "inputDetails": { 45 | "included": True, 46 | }, 47 | }, 48 | } 49 | return ret 50 | 51 | return _event_impl 52 | 53 | 54 | @pytest.mark.parametrize("status, action", [ 55 | ("RUNNING", "has started."), 56 | ("SUCCEEDED", "has finished."), 57 | ("FAILED", "has failed."), 58 | ("ABORTED", "has been aborted."), 59 | ("TIMED_OUT", "has timed out."), 60 | ]) 61 | def test_make_state_change_message(status, action): 62 | attributes = { 63 | "status": { 64 | "DataType": "String", 65 | "StringValue": status, 66 | }, 67 | "workflow_name": { 68 | "DataType": "String", 69 | "StringValue": WORKFLOW_NAME, 70 | }, 71 | "execution_id": { 72 | "DataType": "String", 73 | "StringValue": EXECUTION_NAME, 74 | }, 75 | "job_file_bucket": { 76 | "DataType": "String", 77 | "StringValue": LAUNCHER_BUCKET, 78 | }, 79 | "job_file_key": { 80 | "DataType": "String", 81 | "StringValue": JOB_DATA_KEY, 82 | }, 83 | "job_file_version": { 84 | "DataType": "String", 85 | "StringValue": JOB_DATA_VERSION, 86 | }, 87 | } 88 | 89 | expected_details = { 90 | "details": { 91 | "workflow_name": WORKFLOW_NAME, 92 | "execution_id": EXECUTION_NAME, 93 | "job_status": status, 94 | "job_data": JOB_DATA_URI, 95 | "job_data_version": JOB_DATA_VERSION, 96 | }, 97 | } 98 | 99 | message = make_state_change_message(attributes) 100 | text, details = yaml.safe_load_all(message) 101 | 102 | assert WORKFLOW_NAME in text 103 | assert EXECUTION_NAME in text 104 | assert "job.json" in text 105 | assert text.endswith(action) 106 | 107 | assert details == expected_details 108 | 109 | 110 | def test_make_message_attributes(state_change_event_factory): 111 | event = state_change_event_factory(status="FAKE_STATUS") 112 | result = make_message_attributes(event) 113 | expect = { 114 | "status": { 115 | "DataType": "String", 116 | "StringValue": "FAKE_STATUS", 117 | }, 118 | "workflow_name": { 119 | 
"DataType": "String", 120 | "StringValue": STATE_MACHINE_NAME, 121 | }, 122 | "execution_id": { 123 | "DataType": "String", 124 | "StringValue": EXECUTION_NAME, 125 | }, 126 | "job_file_bucket": { 127 | "DataType": "String", 128 | "StringValue": LAUNCHER_BUCKET, 129 | }, 130 | "job_file_key": { 131 | "DataType": "String", 132 | "StringValue": JOB_DATA_KEY, 133 | }, 134 | "job_file_version": { 135 | "DataType": "String", 136 | "StringValue": JOB_DATA_VERSION, 137 | }, 138 | } 139 | assert result == expect 140 | 141 | 142 | def test_make_sns_payload(state_change_event_factory, monkeypatch): 143 | monkeypatch.setenv("TOPIC_ARN", "arn:of:fake:topic") 144 | attributes = { 145 | "status": { 146 | "DataType": "String", 147 | "StringValue": "FAKE_STATUS" 148 | }, 149 | "workflow_name": { 150 | "DataType": "String", 151 | "StringValue": WORKFLOW_NAME 152 | } 153 | } 154 | result = make_sns_payload("test message", attributes) 155 | expect = { 156 | "TopicArn": "arn:of:fake:topic", 157 | "Message": "test message", 158 | "Subject": f"{WORKFLOW_NAME}: job fake_status", 159 | "MessageAttributes": attributes, 160 | } 161 | assert result == expect 162 | 163 | 164 | @moto.mock_aws 165 | def test_lambda_handler(monkeypatch, state_change_event_factory): 166 | monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") 167 | sns = boto3.client("sns") 168 | response0 = sns.create_topic(Name="test_topic") 169 | 170 | monkeypatch.setenv("TOPIC_ARN", response0["TopicArn"]) 171 | event = state_change_event_factory(status="SUCCEEDED") 172 | 173 | response = lambda_handler(event, {}) 174 | assert response["ResponseMetadata"]["HTTPStatusCode"] == 200 175 | --------------------------------------------------------------------------------