├── .dockerignore
├── .github
│   └── workflows
│       ├── ghcr_image.yml
│       └── lint.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── miniwdl_aws.cfg
├── miniwdl_aws
│   ├── __init__.py
│   ├── __main__.py
│   ├── _util.py
│   ├── batch_job.py
│   ├── cli_run_s3upload.py
│   └── cli_submit.py
├── plugin_log_task_usage
│   ├── StressTest.wdl
│   ├── miniwdl_log_task_usage.py
│   └── setup.py
├── release.sh
├── setup.py
├── test
│   ├── assets
│   │   ├── count_lines.wdl
│   │   ├── test_call_cache.wdl
│   │   ├── test_directory.wdl
│   │   ├── test_nonexistent_docker.wdl
│   │   ├── test_retry_streams.wdl
│   │   └── test_termination.wdl
│   ├── build_test_image.sh
│   ├── requirements.txt
│   ├── run_tests.sh
│   └── test.py
└── version.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | __pycache__
3 | .eggs/
4 | dist/
5 | *.egg-info/
6 | build/
7 | .venv/
8 | venv/
9 |
--------------------------------------------------------------------------------
/.github/workflows/ghcr_image.yml:
--------------------------------------------------------------------------------
1 | name: ghcr_image
2 | on: [push]
3 |
4 | jobs:
5 |
6 | ghcr_image:
7 | if: github.repository == 'miniwdl-ext/miniwdl-aws' # don't run from forks
8 | runs-on: ubuntu-20.04
9 | steps:
10 | - uses: actions/checkout@v2
11 | with:
12 | fetch-depth: 0
13 | - name: docker login ghcr.io
14 | uses: docker/login-action@v1
15 | with:
16 | registry: ghcr.io
17 | username: ${{ github.repository_owner }}
18 | password: ${{ secrets.GITHUB_TOKEN }}
19 | - name: docker_build
20 | run: |
21 | python3 setup.py --version # generate RELEASE-VERSION
22 |
23 | REPO="ghcr.io/miniwdl-ext/miniwdl-aws"
24 | TAG="$(git describe --tags --always --dirty)"
25 |
26 | docker pull public.ecr.aws/amazonlinux/amazonlinux:2023
27 | docker build --no-cache -t "${REPO}:${TAG}" .
28 | IMAGE_ID="$(docker inspect ${REPO}:${TAG} | jq -r .[0].Id)"
29 |
30 | docker push "${REPO}:${TAG}"
31 | REPO_DIGEST="$(docker inspect ${REPO}:${TAG} | jq -r '.[0].RepoDigests[0]')"
32 |
33 | echo "REPO=${REPO}" >> $GITHUB_ENV
34 | echo "TAG=${TAG}" >> $GITHUB_ENV
35 | echo "IMAGE_ID=${IMAGE_ID}" >> $GITHUB_ENV
36 | echo "REPO_DIGEST=${REPO_DIGEST}" >> $GITHUB_ENV
37 | - name: display
  id: display
38 | run: |
39 | >&2 echo "Id: ${IMAGE_ID}"
40 | echo "::set-output name=Id::${REPO}:${IMAGE_ID}"
41 | >&2 echo "Tag: ${REPO}:${TAG}"
42 | echo "::set-output name=Tag::${REPO}:${TAG}"
43 | >&2 echo "RepoDigest: ${REPO_DIGEST}"
44 | echo "::set-output name=RepoDigest::${REPO_DIGEST}"
45 | outputs:
46 | Id: ${{steps.display.outputs.Id}}
47 | Tag: ${{steps.display.outputs.Tag}}
48 | RepoDigest: ${{steps.display.outputs.RepoDigest}}
49 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: lint
2 | on: [push, pull_request]
3 |
4 | jobs:
5 |
6 | lint:
7 | runs-on: ubuntu-20.04
8 | steps:
9 | - uses: actions/checkout@v2
10 | with:
11 | fetch-depth: 0
12 | - name: deps
13 | run: sudo pip3 install --system pre-commit black flake8 pylint
14 | - name: pre-commit
15 | run: pre-commit run --all-files
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | __pycache__
3 | .eggs/
4 | RELEASE-VERSION
5 | dist/
6 | *.egg-info/
7 | build/
8 | .venv/
9 | venv/
10 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: local
3 | hooks:
4 | - id: black
5 | name: black
6 | language: system
7 | files: \.py$
8 | verbose: true
9 | entry: black
10 | args: [-l,'100']
11 | - id: flake8
12 | name: flake8
13 | language: system
14 | files: \.py$
15 | verbose: true
16 | entry: flake8
17 | args: [--max-line-length, "100", "--ignore=E501,W503,E722,E203"]
18 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Docker image with miniwdl & the AWS plugin baked in. Suitable for submission to Batch as the
2 | # "workflow job" launching & monitoring other jobs (WDL tasks).
3 |
4 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023
5 |
6 | # rpm dependencies
7 | RUN yum check-update; yum install -y \
8 | python3-pip \
9 | python3-setuptools \
10 | unzip
11 |
12 | # AWS CLI v2 (the awscli package available via `yum install` is a much older version)
13 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip"
14 | RUN sh -c 'cd /tmp && unzip awscliv2.zip' && sh /tmp/aws/install
15 |
16 | # miniwdl-aws (and PyPI dependencies listed in setup.py)
17 | COPY ./ /tmp/miniwdl-aws/
18 | RUN bash -c 'cd /tmp/miniwdl-aws && pip3 install . && pip3 install ./plugin_log_task_usage'
19 |
20 | # cleanup (for squashed image)
21 | RUN yum clean all && rm -rf /tmp/miniwdl* /tmp/aws*
22 |
23 | # boilerplate configuration file & test assets
24 | COPY miniwdl_aws.cfg /etc/xdg/miniwdl.cfg
25 | COPY test/assets/ /var/miniwdl_aws_test_assets/
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2021 Wid L. Hacker
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include RELEASE-VERSION version.py
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # miniwdl AWS plugin
2 |
3 | **Extends [miniwdl](https://github.com/chanzuckerberg/miniwdl) to run workflows on [AWS Batch](https://aws.amazon.com/batch/) and [EFS](https://aws.amazon.com/efs/)**
4 |
5 | This plugin enables miniwdl to execute WDL tasks as AWS Batch jobs. It uses EFS for work-in-progress file I/O, optionally uploading final workflow outputs to S3.
6 |
7 | **Before diving into this, first consider [AWS HealthOmics](https://aws.amazon.com/healthomics/)**, which includes a [managed service for WDL workflows](https://docs.aws.amazon.com/omics/latest/dev/creating-workflows.html) that doesn't need you to provision all the infrastructure. Our companion project **[miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run)** provides a convenient CLI for launching HealthOmics runs with local WDL source code files.
8 |
9 | There are a few ways to deploy this miniwdl-aws plugin:
10 |
11 | ## Amazon Genomics CLI
12 |
13 | [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) can deploy a [miniwdl-aws context](https://aws.github.io/amazon-genomics-cli/docs/workflow-engines/miniwdl/) into your AWS account with all the necessary infrastructure.
14 |
15 | ## Amazon SageMaker Studio
16 |
17 | Or, try the [**miniwdl-aws-studio**](https://github.com/miniwdl-ext/miniwdl-aws-studio) recipe to install miniwdl for interactive use within [Amazon SageMaker Studio](https://aws.amazon.com/sagemaker/studio/), a web IDE with a terminal and filesystem browser. You can use the terminal to operate `miniwdl run` against AWS Batch, the filesystem browser to manage the inputs and outputs on EFS, and the Jupyter notebooks to further analyze the outputs.
18 |
19 | [SageMaker Studio screenshot](https://github.com/miniwdl-ext/miniwdl-aws-studio)
20 |
21 | ## `miniwdl-aws-submit` with custom infrastructure
22 |
23 | Lastly, advanced operators can use [**miniwdl-aws-terraform**](https://github.com/miniwdl-ext/miniwdl-aws-terraform) to deploy/customize the necessary AWS infrastructure, including a VPC, EFS file system, Batch queues, and IAM roles.
24 |
25 | In this scheme, a local command-line wrapper `miniwdl-aws-submit` *launches miniwdl in its own small Batch job* to orchestrate a workflow. This **workflow job** then spawns WDL **task jobs** as needed, without needing the submitting laptop to remain connected for the duration. The workflow jobs run on lightweight [Fargate](https://docs.aws.amazon.com/batch/latest/userguide/fargate.html) resources, while task jobs run on EC2 spot instances.
26 |
27 | ### Submitting workflow jobs
28 |
29 | After deploying [miniwdl-aws-terraform](https://github.com/miniwdl-ext/miniwdl-aws-terraform), `pip3 install miniwdl-aws` locally to make the `miniwdl-aws-submit` program available. Try the self-test:
30 |
31 | ```
32 | miniwdl-aws-submit --self-test --follow --workflow-queue miniwdl-workflow
33 | ```
34 |
35 | Then launch a [viral genome assembly](https://github.com/broadinstitute/viral-pipelines/) that should run in 10-15 minutes:
36 |
37 | ```
38 | miniwdl-aws-submit \
39 | https://github.com/broadinstitute/viral-pipelines/raw/v2.1.28.0/pipes/WDL/workflows/assemble_refbased.wdl \
40 | reads_unmapped_bams=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/G5012.3.testreads.bam \
41 | reference_fasta=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/ebov-makona.fasta \
42 | sample_name=G5012.3 \
43 | --workflow-queue miniwdl-workflow \
44 | --s3upload s3://MY-BUCKET/assemblies \
45 | --verbose --follow
46 | ```
47 |
48 | The command line resembles `miniwdl run`'s with extra AWS-related arguments:
49 |
50 | * `--workflow-queue` Batch job queue on which to schedule the workflow job; output from miniwdl-aws-terraform, default `miniwdl-workflow`. (Also set by environment variable `MINIWDL__AWS__WORKFLOW_QUEUE`)
51 | * `--follow` live-streams the workflow log instead of exiting immediately upon submission. (`--wait` blocks on the workflow without streaming the log.)
52 | * `--s3upload` (optional) S3 folder URI under which to upload the workflow products, including the log and output files (if successful). The bucket must be allow-listed in the miniwdl-aws-terraform deployment.
53 | * Unless `--s3upload` ends with /, one more subfolder is added to the uploaded URI prefix, equal to miniwdl's automatic timestamp-prefixed run name. If it does end in /, then the uploads go directly into/under that folder (and a repeat invocation would be expected to overwrite them).
54 |
55 | `miniwdl-aws-submit` detects other infrastructure details (task queue, EFS access point, IAM role) based on tags set on the workflow queue; see `miniwdl-aws-submit --help` for additional options to override those defaults.
56 |
57 | Arguments not consumed by `miniwdl-aws-submit` are *passed through* to `miniwdl run` inside the workflow job, as are environment variables whose names begin with `MINIWDL__`, allowing override of any [miniwdl configuration option](https://miniwdl.readthedocs.io/en/latest/runner_reference.html#configuration) (disable with `--no-env`). See [miniwdl_aws.cfg](miniwdl_aws.cfg) for various options preconfigured in the workflow job container, some of which can be adjusted to benefit specific workloads. For example, to halve the maximum rate at which miniwdl invokes the AWS Batch SubmitJob API, set `MINIWDL__AWS__SUBMIT_PERIOD=2` in the `miniwdl-aws-submit` environment.
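
For example, one way to combine a configuration override with pass-through arguments is sketched below (the workflow file and input names are illustrative):

```
# halve the SubmitJob rate and pass --no-cache through to `miniwdl run`
MINIWDL__AWS__SUBMIT_PERIOD=2 \
  miniwdl-aws-submit workflow.wdl in=s3://MY-BUCKET/input.txt \
    --workflow-queue miniwdl-workflow --no-cache --verbose --follow
```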
58 |
59 | If the specified WDL source code is an existing local .wdl or .zip file, `miniwdl-aws-submit` automatically ships it with the workflow job as the WDL to execute. Given a .wdl file, it runs [`miniwdl zip`](https://miniwdl.readthedocs.io/en/latest/zip.html) to detect & include any imported WDL files, while .zip files are assumed to have been generated by `miniwdl zip`. If the source code is too large to fit in the AWS Batch request payload (~50KB), then you'll instead need to pass it by reference as a URL or EFS path.
60 |
61 | The workflow and task jobs all mount EFS at `/mnt/efs`. Although workflow input files are usually specified using HTTPS or S3 URIs, files already resident on EFS can be used with their `/mnt/efs` paths (which probably don't exist locally on the submitting machine). Unlike the WDL source code, `miniwdl-aws-submit` will not attempt to ship/upload local input files.
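
For instance, a submission might combine a local WDL file (shipped automatically) with an input already resident on EFS, along the lines of this sketch (the file paths are hypothetical):

```
miniwdl-aws-submit align.wdl \
    reads=/mnt/efs/uploads/sample1.fastq.gz \
    reference=https://example.org/refs/grch38.fasta \
    --workflow-queue miniwdl-workflow --verbose --follow
```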
62 |
63 | ## Run directories on EFS
64 |
65 | Miniwdl runs the workflow in a directory beneath `/mnt/efs/miniwdl_run` (override with `--dir`). The outputs also remain cached there for potential reuse in future runs (to avoid, submit with `--no-cache` or wipe `/mnt/efs/miniwdl_run/_CACHE`).
66 |
67 | Given the EFS-centric I/O model, you'll need a way to browse and manage the filesystem contents remotely. The companion recipe [lambdash-efs](https://github.com/miniwdl-ext/lambdash-efs) is one option; miniwdl-aws-terraform outputs the infrastructure details needed to deploy it (pick any subnet). Or, set up an instance or container that mounts your EFS, and access it via SSH or a web app (e.g. [JupyterHub](https://jupyter.org/hub), [Cloud Commander](http://cloudcmd.io/), [VS Code server](https://github.com/cdr/code-server)).
68 |
69 | You can also automate cleanup of EFS run directories by setting `miniwdl-aws-submit --s3upload` and:
70 |
71 | * `--delete-after success` to delete the run directory immediately after successful output upload
72 | * `--delete-after failure` to delete the directory after failure
73 | * `--delete-after always` to delete it in either case
74 | * (or set environment variable `MINIWDL__AWS__DELETE_AFTER_S3_UPLOAD`)
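
For example, the following sketch uploads outputs under an S3 prefix and then removes the EFS run directory if the run succeeds (the bucket and workflow names are illustrative):

```
miniwdl-aws-submit workflow.wdl in=s3://MY-BUCKET/input.txt \
    --workflow-queue miniwdl-workflow \
    --s3upload s3://MY-BUCKET/runs \
    --delete-after success --follow
```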
75 |
76 | Deleting a run directory after success prevents the outputs from being reused in future runs. Deleting it after failures can make debugging more difficult (although logs are retained, see below).
77 |
78 | ### Security note on file system isolation
79 |
80 | Going through AWS Batch & EFS, miniwdl can't enforce the strict file system isolation between WDL task containers that it does locally. All the AWS Batch containers have read/write access to the entire EFS file system (as viewed through the access point), not only their initial working directory.
81 |
82 | This is usually benign, because WDL tasks should only read their declared inputs and write into their respective working/temporary directories. But poorly- or maliciously-written tasks could read & write files elsewhere on EFS, even changing their own input files or those of other tasks. This risks unintentional side-effects or worse security hazards from untrusted code.
83 |
84 | To mitigate this, test workflows thoroughly using the local backend, which strictly isolates task containers' file systems. If WDL tasks insist on modifying their input files in place, then `--copy-input-files` can unblock them (at a cost in time, space, and IOPS). Lastly, avoid using untrusted WDL code or container images; but if they're necessary, then use a separate EFS access point and restrict the IAM and network configuration for the AWS Batch containers appropriately.
85 |
86 | ### EFS performance considerations
87 |
88 | To scale up to larger workloads, it's important to study AWS documentation on EFS [performance](https://docs.aws.amazon.com/efs/latest/ug/performance.html) and [monitoring](https://docs.aws.amazon.com/efs/latest/ug/monitoring-cloudwatch.html). Like any network file system, EFS limits on throughput and IOPS can cause bottlenecks, in the worst case effectively freezing a workflow.
89 |
90 | Management tips:
91 |
92 | * Monitor file system throughput limits, IOPS, and burst credits (if applicable) in the EFS area of the AWS Console.
93 | * Retain the default *Elastic* throughput mode (though it may cost more than other modes).
94 | * Code WDL tasks to write any purely-temporary files under `/tmp`, which may use local scratch space, instead of the EFS working directory.
95 | * Configure miniwdl and AWS Batch to limit the number of concurrent jobs and/or the rate at which they turn over (see [miniwdl_aws.cfg](https://github.com/miniwdl-ext/miniwdl-aws/blob/main/miniwdl_aws.cfg) for relevant details).
96 | * Spread out separate workflow runs over time or across multiple EFS file systems.
97 |
98 | ### FSx for Lustre and other shared filesystems
99 |
100 | If EFS performance remains insufficient, you can configure your Batch compute environments to automatically mount some other shared filesystem upon instance startup. Then use `miniwdl-aws-submit --no-efs` to make miniwdl assume the filesystem will already be mounted at a given location (default `--mount /mnt/net`) across all instances. In this case, the compute environment for workflow jobs should use EC2 rather than Fargate resources, since EC2 instances are usually necessary to mount such filesystems.
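
Under that arrangement, a submission might look like the following sketch (the mount path is an assumption and must match what the compute environment actually mounts):

```
miniwdl-aws-submit workflow.wdl in=s3://MY-BUCKET/input.txt \
    --workflow-queue miniwdl-workflow \
    --no-efs --mount /mnt/fsx --follow
```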
101 |
102 | The miniwdl-aws-terraform repo [includes a variant](https://github.com/miniwdl-ext/miniwdl-aws-terraform/tree/main/fsx) setting this up with [FSx for Lustre](https://aws.amazon.com/fsx/lustre/). FSx offers higher throughput scalability, but has other downsides compared to EFS (higher upfront costs, manual capacity scaling, single-AZ deployment, fewer AWS service integrations).
103 |
104 | ## Logs & troubleshooting
105 |
106 | If the terminal log isn't available (through Studio or `miniwdl-aws-submit --follow`) to trace a workflow failure, look for miniwdl's usual log files written in the run directory on EFS or copied to S3.
107 |
108 | Each task job's log is also forwarded to [CloudWatch Logs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/WhatIsCloudWatchLogs.html) under the `/aws/batch/job` group and a log stream name reported in miniwdl's log. Using `miniwdl-aws-submit`, the workflow job's log is also forwarded. CloudWatch Logs indexes the logs for structured search through the AWS Console & API.
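
With the AWS CLI, one way to pull such a log stream is sketched below (the `/aws/batch/job` group name comes from the text above; the stream name is a placeholder to be copied from miniwdl's log):

```
aws logs get-log-events \
    --log-group-name /aws/batch/job \
    --log-stream-name "EXAMPLE_JOB_DEF/default/0123456789abcdef" \
    --query 'events[].message' --output text
```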
109 |
110 | Misconfigured infrastructure might prevent logs from being written to EFS or CloudWatch at all. In that case, use the AWS Batch console/API to find status messages for the workflow or task jobs.
111 |
112 | Tasks can self-report their CPU & memory usage in their standard error logs, by setting `MINIWDL__LOG_TASK_USAGE__PERIOD=60` to report every 60 seconds (or as desired). Submit with `--verbose --follow`, or look in any task's CloudWatch Logs stream or `stderr.txt` file, to see the "container usage" log messages.
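
For example, enabling this for a single submission (the 60-second period is just an example value):

```
MINIWDL__LOG_TASK_USAGE__PERIOD=60 \
  miniwdl-aws-submit workflow.wdl in=s3://MY-BUCKET/input.txt \
    --workflow-queue miniwdl-workflow --verbose --follow
```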
113 |
114 | ## GPU jobs
115 |
116 | Miniwdl-aws recognizes the `gpu: true` setting in a task `runtime{}` section, and translates that to a [GPU resource requirement](https://docs.aws.amazon.com/batch/latest/userguide/gpu-jobs.html) for AWS Batch. For the job to be scheduled, the Batch compute environment must of course make GPU instance types available.
117 |
118 | By default, `gpu: true` translates to a requirement for a single GPU. The WDL spec defines this as a boolean value, so there is no clear way to request multiple GPUs for a given task. The configuration `MINIWDL__AWS__GPU_VALUE` can be set to an integer *N* to make *all* tasks with `gpu: true` require *N* GPUs.
119 |
120 | Alternatively, miniwdl-aws also recognizes the `acceleratorType` and `acceleratorCount` attributes used by [AWS HealthOmics](https://docs.aws.amazon.com/omics/latest/dev/parameters-and-input-wdl.html). Any `acceleratorType` starting with "nvidia" translates to a Batch GPU requirement; the actual GPU type will depend on the instance type(s) made available by the compute environment.
121 |
122 | Multi-GPU operations may need more shared memory than Batch typically makes available in each task container. To increase the available shared memory, set e.g. `MINIWDL__AWS__CONTAINER_PROPERTIES='{"linuxParameters":{"sharedMemorySize":4096}}'`.
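
As a sketch, a submission for tasks that need two GPUs apiece plus extra shared memory might set both options together (the values and workflow name are illustrative):

```
MINIWDL__AWS__GPU_VALUE=2 \
MINIWDL__AWS__CONTAINER_PROPERTIES='{"linuxParameters":{"sharedMemorySize":4096}}' \
  miniwdl-aws-submit train.wdl data=s3://MY-BUCKET/training_data.tar \
    --workflow-queue miniwdl-workflow --verbose --follow
```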
123 |
124 | ## Contributing
125 |
126 | Pull requests are welcome! For help, open an issue here or drop in on [#miniwdl in the OpenWDL Slack](https://openwdl.slack.com/archives/C02JCRJU79T).
127 |
128 | **Code formatting and linting.** To prepare your code to pass the CI checks,
129 |
130 | ```
131 | pip3 install --upgrade -r test/requirements.txt
132 | pre-commit run --all-files
133 | ```
134 |
135 | **Running tests.** In an AWS-credentialed terminal session,
136 |
137 | ```
138 | MINIWDL__AWS__WORKFLOW_QUEUE=miniwdl-workflow test/run_tests.sh
139 | ```
140 |
141 | This builds the requisite Docker image from the current code revision and pushes it to an ECR repository (which must be prepared once with `aws ecr create-repository --repository-name miniwdl-aws`). To test an image from the [GitHub public registry](https://github.com/miniwdl-ext/miniwdl-aws/pkgs/container/miniwdl-aws) or some other version, set `MINIWDL__AWS__WORKFLOW_IMAGE` to the desired tag.
142 |
--------------------------------------------------------------------------------
/miniwdl_aws.cfg:
--------------------------------------------------------------------------------
1 | # miniwdl configuration file built into the miniwdl-aws Docker image for use with
2 | # miniwdl-aws-submit
3 | #
4 | # The easiest way to override these options is usually to set environment variables with the
5 | # convention MINIWDL__{SECTION}__{KEY}={VALUE}. Full info on the miniwdl configuration loader:
6 | # https://miniwdl.readthedocs.io/en/latest/runner_reference.html#configuration
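# For example, setting MINIWDL__SCHEDULER__CALL_CONCURRENCY=40 in the miniwdl-aws-submit
# environment overrides [scheduler] call_concurrency below (the value 40 is illustrative).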
7 | #
8 | # Additionally, the following are usually set via environment variables:
9 | # * MINIWDL__AWS__TASK_QUEUE: the desired AWS Batch queue
10 | # * MINIWDL__AWS__FSAP: EFS Access Point ID (fsap-xxxx)
11 | # * MINIWDL__AWS__FS: EFS file system ID (fs-xxxx) matching the access point; can be detected if
12 | # omitted, but doing so requires IAM permission to DescribeAccessPoints.
13 |
14 | [scheduler]
15 | container_backend = aws_batch_job
16 | # One `miniwdl run` process will be able to orchestrate this many concurrent AWS Batch jobs. (This
17 | # controls the size of a thread pool, so setting it too high tends to be counterproductive.)
18 | call_concurrency = 80
19 | # Reduced concurrency limit for URI download jobs; since these are typically S3 downloads that are
20 | # very fast, running many concurrently is likely to overstress EFS.
21 | download_concurrency = 5
22 |
23 | [file_io]
24 | # This must be set to the host's mount point for the EFS Access Point. The plugin will also
25 | # configure AWS Batch jobs to mount the filesystem at this same location.
26 | root = /mnt/efs
27 |
28 | [task_runtime]
29 | # Default policy to retry spot-terminated jobs (up to three total attempts)
30 | defaults = {
31 | "docker": "ubuntu:20.04",
32 | "preemptible": 2
33 | }
34 | # Default retry policy for URI download tasks, to overcome transient `aws s3 cp` errors
35 | download_defaults = {
36 | "cpu": 2,
37 | "memory": "1G",
38 | "maxRetries": 2
39 | }
40 |
41 | [call_cache]
42 | # Cache call outputs in EFS folder, valid so long as all referenced input & output files remain
43 | # unmodified on EFS. (Relative to [file_io] root)
44 | dir = miniwdl_run/_CACHE/call
45 | get = true
46 | put = true
47 |
48 | [download_cache]
49 | dir = miniwdl_run/_CACHE/download
50 | get = true
51 | # Disabling S3 download cache by default to prevent confusing coherence problems (as the cache
52 | # logic does not check for modification of the original S3 object). Recommend enabling, if that can
53 | # be managed adequately.
54 | put = false
55 | # disable flock on files used from download cache due to EFS' low limits on flocks
56 | flock = false
57 |
58 | [aws]
59 | # Last-resort job timeout for AWS Batch to enforce (attemptDurationSeconds)
60 | job_timeout = 864000
61 | # Internal rate-limiting periods (seconds) for AWS Batch API requests
62 | # (may need to be increased if many concurrent workflow runs are planned)
63 | describe_period = 1
64 | submit_period = 1
65 | # Boto3 Config retries policy for miniwdl's AWS Batch API requests.
66 | # see: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html
67 | boto3_retries = {
68 | "max_attempts": 5,
69 | "mode": "standard"
70 | }
71 | # Wait this many seconds before retrying a job after a spot instance interruption or other
72 | # retry-able failure. Provides a time window for convergence of any "eventually consistent"
73 | # activities from the first attempt (involving e.g. EFS, CloudWatch Logs, etc.).
74 | retry_wait = 20
75 | # Explicitly `sync` files in the task working directory before exiting task container. Requires
76 | # `find`, `xargs`, and `sync` commands available in the container image.
77 | container_sync = false
78 | # When task runtime includes "gpu: true", request this many GPUs from AWS Batch. (The WDL spec
79 | # defines runtime.gpu as a Boolean, as of this writing.)
80 | gpu_value = 1
81 | # ContainerProperties fields to set on AWS Batch jobs for tasks, OTHER than the following which are
82 | # set by miniwdl-aws, task runtime{}, or other available config options:
83 | # image command environment resourceRequirements mountPoints privileged
84 | # see: https://docs.aws.amazon.com/batch/latest/APIReference/API_ContainerProperties.html
85 | container_properties = {
86 | }
87 | # Add this many mebibytes (MiB) to each task's runtime.memory setting when filling out the
88 | # memory requirement for each AWS Batch job. The default is meant to offset the memory that AWS
89 | # Batch itself reserves on each worker instance; without this, if runtime.memory is e.g. "8 GiB"
90 | # then AWS Batch might use larger-than-necessary worker instances and pack them inefficiently.
91 | # see: https://docs.aws.amazon.com/batch/latest/userguide/memory-management.html
92 | memory_delta = -33
93 |
--------------------------------------------------------------------------------
/miniwdl_aws/__init__.py:
--------------------------------------------------------------------------------
1 | from .batch_job import BatchJob, BatchJobNoEFS # noqa: F401
2 | from .cli_run_s3upload import miniwdl_run_s3upload # noqa: F401
3 | from .cli_submit import miniwdl_submit_awsbatch # noqa: F401
4 |
--------------------------------------------------------------------------------
/miniwdl_aws/__main__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from .cli_submit import miniwdl_submit_awsbatch
3 |
4 |
5 | def main(args=sys.argv):
6 | miniwdl_submit_awsbatch(args)
7 |
8 |
9 | if __name__ == "__main__":
10 | sys.exit(main())
11 |
--------------------------------------------------------------------------------
/miniwdl_aws/_util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import boto3
3 | import base64
4 | import json
5 | import uuid
6 | import requests
7 | import subprocess
8 | from WDL._util import StructuredLogMessage as _
9 |
10 |
11 | def detect_aws_region(cfg):
12 | if cfg and cfg.has_option("aws", "region") and cfg.get("aws", "region"):
13 | return cfg.get("aws", "region")
14 |
15 | # check environment variables
16 | for ev in ("AWS_REGION", "AWS_DEFAULT_REGION"):
17 | if os.environ.get(ev):
18 | return os.environ[ev]
19 |
20 | # check boto3, which will load ~/.aws
21 | if boto3.DEFAULT_SESSION and boto3.DEFAULT_SESSION.region_name:
22 | return boto3.DEFAULT_SESSION.region_name
23 | session = boto3.Session()
24 | if session.region_name:
25 | return session.region_name
26 |
27 | # query EC2 metadata
28 | try:
29 | return requests.get(
30 | "http://169.254.169.254/latest/meta-data/placement/region", timeout=2.0
31 | ).text
32 | except:
33 | pass
34 |
35 | return None
36 |
37 |
38 | def randomize_job_name(job_name):
39 | # Append entropy to the Batch job name to avoid race condition using identical names in
40 | # concurrent RegisterJobDefinition requests
41 | return (
42 | job_name[:103] # 103 + 1 + 8 = 112, within AWS Batch's 128-character job name limit
43 | + "-"
44 | + base64.b32encode(uuid.uuid4().bytes[:5]).lower().decode()
45 | )
46 |
47 |
48 | def efs_id_from_access_point(region_name, fsap_id):
49 | # Resolve the EFS access point id (fsap-xxxx) to the associated file system id (fs-xxxx). Saves
50 | # user from having to specify both.
51 | aws_efs = boto3.Session().client("efs", region_name=region_name)
52 | desc = aws_efs.describe_access_points(AccessPointId=fsap_id)
53 | assert len(desc.get("AccessPoints", [])) == 1
54 | desc = desc["AccessPoints"][0]
55 | fs_id = desc["FileSystemId"]
56 | assert isinstance(fs_id, str) and fs_id.startswith("fs-")
57 | return fs_id
58 |
59 |
60 | def detect_sagemaker_studio_efs(logger, **kwargs):
61 | # Detect if we're operating inside SageMaker Studio and if so, record EFS mount details
62 | METADATA_FILE = "/opt/ml/metadata/resource-metadata.json"
63 | metadata = None
64 | try:
65 | with open(METADATA_FILE) as infile:
66 | metadata = json.load(infile)
67 | assert metadata["DomainId"] and metadata["UserProfileName"]
68 | except:
69 | return None
70 | try:
71 | api = boto3.client("sagemaker", **kwargs)
72 | domain = api.describe_domain(DomainId=metadata["DomainId"])
73 | efs_id = domain["HomeEfsFileSystemId"]
74 | profile = api.describe_user_profile(
75 | DomainId=metadata["DomainId"], UserProfileName=metadata["UserProfileName"]
76 | )
77 | efs_uid = profile["HomeEfsFileSystemUid"]
78 | assert efs_id and efs_uid
79 | efs_home = f"/{efs_uid}" # home directory on EFS
80 | efs_mount = os.getenv("HOME") # where the EFS home directory is mounted inside Studio
81 | logger.notice(
82 | _(
83 | "detected SageMaker Studio",
84 | domain=metadata["DomainId"],
85 | user=metadata["UserProfileName"],
86 | efs_id=efs_id,
87 | efs_home=efs_home,
88 | efs_mount=efs_mount,
89 | )
90 | )
91 | return (efs_id, efs_uid, efs_home, efs_mount)
92 | except Exception as exn:
93 | logger.warning(
94 | _(
95 | "detected local AWS SageMaker Studio metadata, but failed to query domain",
96 | error=str(exn),
97 | metadata=METADATA_FILE,
98 | domain=metadata["DomainId"],
99 | user=metadata["UserProfileName"],
100 | )
101 | )
102 | return None
103 |
104 |
105 | def detect_studio_fsap(logger, efs_id, efs_uid, efs_home, **kwargs):
106 | # Look for an Access Point with the appropriate configuration to mount the SageMaker Studio EFS
107 | # (in the same way it's presented through Studio)
108 | try:
109 | efs = boto3.client("efs", **kwargs)
110 | access_points = efs.describe_access_points(FileSystemId=efs_id, MaxResults=100).get(
111 | "AccessPoints", []
112 | )
113 | if len(access_points) >= 100:
114 | logger.warn(
115 | _(
116 | "EFS has >=100 Access Points; set configuration [aws] fsap or environment MINIWDL__AWS__FSAP to avoid searching through them",
117 | efs_id=efs_id,
118 | )
119 | )
120 | for ap in access_points:
121 | assert ap["FileSystemId"] == efs_id
122 | if (
123 | ap["LifeCycleState"] == "available"
124 | and ap.get("RootDirectory", {}).get("Path", "") == efs_home
125 | and str(ap.get("PosixUser", {}).get("Uid", "")) == efs_uid
126 | ):
127 | logger.notice(
128 | _(
129 | "detected suitable EFS Access Point; to override, set configuration [aws] fsap or environment MINIWDL__AWS__FSAP",
130 | arn=ap["AccessPointArn"],
131 | )
132 | )
133 | return ap["AccessPointId"]
134 | return None
135 | except Exception as exn:
136 | logger.warning(
137 | _(
138 | "error detecting EFS Access Point",
139 | error=str(exn),
140 | efs_id=efs_id,
141 | efs_uid=efs_uid,
142 | )
143 | )
144 | return None
145 |
146 |
147 | def detect_gwfcore_batch_queue(logger, efs_id, **kwargs):
148 | # Look for a Batch job queue tagged with the Studio EFS id (indicating it's our default)
149 | try:
150 | batch = boto3.client("batch", **kwargs)
151 | queues = batch.describe_job_queues(maxResults=100).get("jobQueues", [])
152 | if len(queues) >= 100:
153 | logger.warn(
154 | "AWS Batch has >=100 job queues; set configuration [aws] task_queue or environment MINIWDL__AWS__TASK_QUEUE to avoid searching through them"
155 | )
156 | queues = [
157 | q
158 | for q in queues
159 | if q.get("state", "") == "ENABLED"
160 | and q.get("status", "") == "VALID"
161 | and q.get("tags", {}).get("MiniwdlStudioEfsId", "") == efs_id
162 | ]
163 | if not queues:
164 | return None
165 | if len(queues) > 1:
166 | default_queues = [q for q in queues if q.get("jobQueueName", "").startswith("default-")]
167 | if default_queues:
168 | queues = default_queues
169 | logger.notice(
170 | _(
171 | "detected suitable AWS Batch job queue; to override, set configuration [aws] task_queue or environment MINIWDL__AWS__TASK_QUEUE",
172 | arn=queues[0]["jobQueueArn"],
173 | )
174 | )
175 | return queues[0]["jobQueueName"]
176 | except Exception as exn:
177 | logger.warning(
178 | _(
179 | "error detecting AWS Batch job queue",
180 | error=str(exn),
181 | efs_id=efs_id,
182 | )
183 | )
184 | return None
185 |
186 |
187 | def subprocess_run_with_clean_exit(*args, check=False, **kwargs):
188 | """
189 | As subprocess.run(*args, **kwargs), but in the event of a SystemExit, KeyboardInterrupt, or
190 | BrokenPipe exception, sends SIGTERM to the subprocess and waits for it to exit before
191 | re-raising. Typically paired with signal handlers for SIGTERM/SIGINT/etc. to raise SystemExit.
192 | """
193 |
194 | assert "timeout" not in kwargs
195 | with subprocess.Popen(*args, **kwargs) as subproc:
196 | while True:
197 | try:
198 | stdout, stderr = subproc.communicate(timeout=0.1)
199 | assert isinstance(subproc.returncode, int)
200 | completed = subprocess.CompletedProcess(
201 | subproc.args, subproc.returncode, stdout, stderr
202 | )
203 | if check:
204 | completed.check_returncode()
205 | return completed
206 | except (SystemExit, KeyboardInterrupt, BrokenPipeError):
207 | subproc.terminate()
208 | subproc.communicate()
209 | raise
210 | except subprocess.TimeoutExpired:
211 | pass
212 |
213 |
214 | END_OF_LOG = "[miniwdl_run_s3upload] -- END OF LOG --"
215 |
--------------------------------------------------------------------------------
/miniwdl_aws/batch_job.py:
--------------------------------------------------------------------------------
1 | """
2 | BatchJob: implements miniwdl TaskContainer by submitting jobs to an AWS Batch queue and polling
3 | their status. Assumes a shared filesystem (typically EFS) between the miniwdl host and the Batch
4 | workers.
5 | """
6 |
7 | import os
8 | import math
9 | import time
10 | import json
11 | import threading
12 | import heapq
13 | from contextlib import ExitStack, suppress
14 | import boto3
15 | import botocore
16 | import WDL
17 | import WDL.runtime.task_container
18 | import WDL.runtime._statusbar
19 | from WDL._util import rmtree_atomic, symlink_force, write_atomic
20 | from WDL._util import StructuredLogMessage as _
21 | from ._util import (
22 | detect_aws_region,
23 | randomize_job_name,
24 | efs_id_from_access_point,
25 | detect_sagemaker_studio_efs,
26 | detect_studio_fsap,
27 | detect_gwfcore_batch_queue,
28 | )
29 |
30 |
31 | class BatchJobBase(WDL.runtime.task_container.TaskContainer):
32 | """
33 | Abstract base class implementing the AWS Batch backend for miniwdl TaskContainer. Concrete
34 | subclasses add configuration specific to the shared filesystem in use.
35 | """
36 |
37 | @classmethod
38 | def global_init(cls, cfg, logger):
39 | cls._submit_lock = threading.Lock()
40 | cls._last_submit_time = [0.0]
41 | cls._init_time = time.time()
42 | cls._describer = BatchJobDescriber()
43 |
44 | cls._region_name = detect_aws_region(cfg)
45 | assert (
46 | cls._region_name
47 | ), "Failed to detect AWS region; configure AWS CLI or set environment AWS_DEFAULT_REGION"
48 |
49 | # set AWS Batch job queue
50 | cls._job_queue = cfg.get("aws", "task_queue", "")
51 | cls._job_queue_fallback = cfg.get("aws", "task_queue_fallback", "")
52 |
53 | # TODO: query Batch compute environment for resource limits
54 | cls._resource_limits = {"cpu": 9999, "mem_bytes": 999999999999999}
55 |
56 | cls._fs_mount = cfg.get("file_io", "root")
57 | assert (
58 | cls._fs_mount.startswith("/") and cls._fs_mount != "/"
59 | ), "misconfiguration, set [file_io] root / MINIWDL__FILE_IO__ROOT to EFS mount point"
60 |
61 | @classmethod
62 | def detect_resource_limits(cls, cfg, logger):
63 | return cls._resource_limits
64 |
65 | def __init__(self, cfg, run_id, host_dir):
66 | super().__init__(cfg, run_id, host_dir)
67 | self._logStreamName = None
68 | self._inputs_copied = False
69 | # We expect the Batch job containers to have the shared filesystem mounted at the same
70 | # location we, the workflow job, have it mounted ourselves. Therefore container_dir will be
71 | # the same as host_dir (unlike the default Swarm backend, which mounts it at a different
72 | # virtualized location)
73 | self.container_dir = self.host_dir
74 | self._aws_interrupts = 0
75 |
76 | def copy_input_files(self, logger):
77 | self._inputs_copied = True
78 | return super().copy_input_files(logger)
79 |
80 | def host_work_dir(self):
81 | # Since we aren't virtualizing the in-container paths as noted above, always use the same
82 | # working directory on task retries, instead of the base class behavior of appending the
83 | # try counter (on the host side). This loses some robustness to a split-brain condition
84 | # where the previous try is actually still running when we start the retry.
85 | # (see also retry_wait)
86 | return os.path.join(self.host_dir, "work")
87 |
88 | def host_stdout_txt(self):
89 | return os.path.join(self.host_dir, "stdout.txt")
90 |
91 | def host_stderr_txt(self):
92 | return os.path.join(self.host_dir, "stderr.txt")
93 |
94 | def reset(self, logger) -> None:
95 | cooldown = self.cfg.get_float("aws", "retry_wait", 20.0)
96 | if cooldown > 0.0:
97 | logger.info(
98 | _(
99 | "waiting to retry per configuration [aws] retry_wait",
100 | seconds=cooldown,
101 | )
102 | )
103 | time.sleep(cooldown)
104 |
105 | rmtree_atomic(self.host_work_dir())
106 | with suppress(FileNotFoundError):
107 | os.unlink(self.host_stderr_txt() + ".offset") # PygtailLogger state file
108 | super().reset(logger)
109 |
110 | def process_runtime(self, logger, runtime_eval):
111 | super().process_runtime(logger, runtime_eval) # handles cpu, memory, docker, gpu
112 | if "acceleratorType" in runtime_eval:
113 | if not isinstance(runtime_eval["acceleratorType"], WDL.Value.String):
114 | raise WDL.Error.RuntimeError("invalid setting of runtime.acceleratorType")
115 | accty = runtime_eval["acceleratorType"].value
116 | if accty.startswith("nvidia"):
117 | self.runtime_values["gpu"] = True
118 | else:
119 | logger.warning(_("ignored unrecognized runtime.acceleratorType", value=accty))
120 | if "acceleratorCount" in runtime_eval:
121 | if not isinstance(runtime_eval["acceleratorCount"], WDL.Value.Int):
122 | raise WDL.Error.RuntimeError("invalid setting of runtime.acceleratorCount")
123 | self.runtime_values["acceleratorCount"] = runtime_eval["acceleratorCount"].value
124 |
125 | def _run(self, logger, terminating, command):
126 | """
127 | Run task
128 | """
129 | self._observed_states = set()
130 | boto3_retries = self.cfg.get_dict(
131 | "aws", "boto3_retries", {"max_attempts": 5, "mode": "standard"}
132 | )
133 | try:
134 | aws_batch = boto3.Session().client( # Session() needed for thread safety
135 | "batch",
136 | region_name=self._region_name,
137 | config=botocore.config.Config(retries=boto3_retries),
138 | )
139 | with ExitStack() as cleanup:
140 | # prepare the task working directory
141 | self._prepare_dir(logger, cleanup, command)
142 | # submit Batch job (with request throttling)
143 | job_id = None
144 | submit_period = self.cfg.get_float("aws", "submit_period", 1.0)
145 | while True:
146 | with self._submit_lock:
147 | if terminating():
148 | raise WDL.runtime.Terminated(quiet=True)
149 | if (
150 | time.time() - self._last_submit_time[0]
151 | >= submit_period * self._submit_period_multiplier()
152 | ):
153 | job_id = self._submit_batch_job(logger, cleanup, aws_batch)
154 | self._last_submit_time[0] = time.time()
155 | break
156 | time.sleep(submit_period / 4)
157 | # poll Batch job status
158 | return self._await_batch_job(logger, cleanup, aws_batch, job_id, terminating)
159 | except botocore.exceptions.ClientError as exn:
160 | wrapper = AWSError(exn)
161 | logger.error(wrapper)
162 | raise wrapper
163 |
164 | def _prepare_dir(self, logger, cleanup, command):
165 | # Prepare control files. We do NOT use super().touch_mount_point(...) because it fails if
166 | # the desired mount point already exists; which it may in our case after a retry (see
167 | # self.host_work_dir() override above.)
168 | with open(os.path.join(self.host_dir, "command"), "w") as outfile:
169 | outfile.write(command)
170 | with open(self.host_stdout_txt(), "w"):
171 | pass
172 | with open(self.host_stderr_txt(), "w"):
173 | pass
174 |
175 | if not self._inputs_copied:
176 | # Prepare symlinks to the input Files & Directories
177 | container_prefix = os.path.join(self.container_dir, "work/_miniwdl_inputs/")
178 | link_dirs_made = set()
179 | for host_fn, container_fn in self.input_path_map.items():
180 | assert container_fn.startswith(container_prefix) and len(container_fn) > len(
181 | container_prefix
182 | )
183 | if host_fn.endswith("/"):
184 | assert container_fn.endswith("/")
185 | host_fn = host_fn[:-1]
186 | container_fn = container_fn[:-1]
187 | else:
188 | assert not container_fn.endswith("/")
189 | link_dn = os.path.dirname(container_fn)
190 | if link_dn not in link_dirs_made:
191 | os.makedirs(link_dn)
192 | link_dirs_made.add(link_dn)
193 | symlink_force(host_fn, container_fn)
194 |
195 | def _submit_batch_job(self, logger, cleanup, aws_batch):
196 | """
197 | Register & submit AWS batch job, leaving a cleanup callback to deregister the transient
198 | job definition.
199 | """
200 |
201 | job_name = self.run_id
202 | if job_name.startswith("call-"):
203 | job_name = job_name[5:]
204 | if self.try_counter > 1:
205 | job_name += f"-try{self.try_counter}"
206 | # Append entropy to the job name to avoid race condition using identical job names in
207 | # concurrent RegisterJobDefinition requests
208 | job_name = randomize_job_name(job_name)
209 |
210 | container_properties = self._prepare_container_properties(logger)
211 | job_def = aws_batch.register_job_definition(
212 | jobDefinitionName=job_name,
213 | type="container",
214 | containerProperties=container_properties,
215 | )
216 | job_def_handle = f"{job_def['jobDefinitionName']}:{job_def['revision']}"
217 | logger.debug(
218 | _(
219 | "registered Batch job definition",
220 | jobDefinition=job_def_handle,
221 | **container_properties,
222 | )
223 | )
224 |
225 | self._cleanup_job_definition(logger, cleanup, aws_batch, job_def_handle)
226 |
227 | job_queue = self._select_job_queue()
228 | job_tags = self.cfg.get_dict("aws", "job_tags", {})
229 | if "AWS_BATCH_JOB_ID" in os.environ:
230 | # If we find ourselves running inside an AWS Batch job, tag the new job identifying
231 | # ourself as the "parent" job.
232 | job_tags["AWS_BATCH_PARENT_JOB_ID"] = os.environ["AWS_BATCH_JOB_ID"]
233 | # TODO: set a tag to indicate that this job is a retry of another
234 | job = aws_batch.submit_job(
235 | jobName=job_name,
236 | jobQueue=job_queue,
237 | jobDefinition=job_def_handle,
238 | timeout={"attemptDurationSeconds": self.cfg.get_int("aws", "job_timeout", 86400)},
239 | tags=job_tags,
240 | )
241 | logger.info(
242 | _(
243 | "AWS Batch job submitted",
244 | jobQueue=job_queue,
245 | jobId=job["jobId"],
246 | tags=job_tags,
247 | )
248 | )
249 | return job["jobId"]
250 |
251 | def _select_job_queue(self):
252 | if self._job_queue_fallback:
253 | preemptible = self.runtime_values.get("preemptible", 0)
254 | if self._aws_interrupts >= preemptible and preemptible > 0:
255 | return self._job_queue_fallback
256 | return self._job_queue
257 |
258 | def _prepare_container_properties(self, logger):
259 | image_tag = self.runtime_values.get("docker", "ubuntu:20.04")
260 | vcpu = self.runtime_values.get("cpu", 1)
261 | memory_mbytes = max(
262 | (
263 | math.ceil(self.runtime_values.get("memory_reservation", 0) / 1048576)
264 | + self.cfg.get_int("aws", "memory_delta", -33)
265 | ),
266 | 991,
267 | )
268 | commands = [
269 | f"cd {self.container_dir}/work",
270 | "exit_code=0",
271 | self.cfg.get("task_runtime", "command_shell")
272 | + " ../command >> ../stdout.txt 2> >(tee -a ../stderr.txt >&2) || exit_code=$?",
273 | ]
274 | if self.cfg.get_bool("aws", "container_sync", False):
275 | commands.append("find . -type f | xargs sync")
276 | commands.append("sync ../stdout.txt ../stderr.txt")
277 | commands.append("exit $exit_code")
278 |
279 | resource_requirements = [
280 | {"type": "VCPU", "value": str(vcpu)},
281 | {"type": "MEMORY", "value": str(memory_mbytes)},
282 | ]
283 |
284 | if self.runtime_values.get("gpu", False):
285 | gpu_value = self.cfg.get_int("aws", "gpu_value", 1)
286 | if "acceleratorCount" in self.runtime_values:
287 | gpu_value = self.runtime_values["acceleratorCount"]
288 | elif gpu_value > 1:
289 | logger.info(
290 | _("requesting multiple GPUs (per config [aws] gpu_value)", gpu_value=gpu_value)
291 | )
292 | if gpu_value > 0:
293 | resource_requirements += [{"type": "GPU", "value": str(gpu_value)}]
294 |
295 | container_properties = {
296 | "image": image_tag,
297 | "command": ["/bin/bash", "-ec", "\n".join(commands)],
298 | "environment": [
299 | {"name": ev_name, "value": ev_value}
300 | for ev_name, ev_value in self.runtime_values.get("env", dict()).items()
301 | ],
302 | "resourceRequirements": resource_requirements,
303 | "privileged": self.runtime_values.get("privileged", False),
304 | "mountPoints": [{"containerPath": self._fs_mount, "sourceVolume": "file_io_root"}],
305 | }
306 |
307 | for k, v in self.cfg.get_dict("aws", "container_properties", {}).items():
308 | if k in container_properties:
309 | raise WDL.Error.RuntimeError(
310 | f"Config [aws] container_properties may not override '{k}'"
311 | )
312 | container_properties[k] = v
313 |
314 | if self.cfg["task_runtime"].get_bool("as_user"):
315 | user = f"{os.geteuid()}:{os.getegid()}"
316 | if user.startswith("0:"):
317 | logger.warning(
318 | "container command will run explicitly as root, since you are root and set --as-me"
319 | )
320 | container_properties["user"] = user
321 |
322 | return container_properties
323 |
324 | def _cleanup_job_definition(self, logger, cleanup, aws_batch, job_def_handle):
325 | def deregister(logger, aws_batch, job_def_handle):
326 | try:
327 | aws_batch.deregister_job_definition(jobDefinition=job_def_handle)
328 | logger.debug(_("deregistered Batch job definition", jobDefinition=job_def_handle))
329 | except botocore.exceptions.ClientError as exn:
330 | # AWS expires job definitions after 6mo, so failing to delete them isn't fatal
331 | logger.warning(
332 | _(
333 | "failed to deregister Batch job definition",
334 | jobDefinition=job_def_handle,
335 | error=str(AWSError(exn)),
336 | )
337 | )
338 |
339 | cleanup.callback(deregister, logger, aws_batch, job_def_handle)
340 |
341 | def _await_batch_job(self, logger, cleanup, aws_batch, job_id, terminating):
342 | """
343 | Poll for Batch job success or failure & return exit code
344 | """
345 | describe_period = self.cfg.get_float("aws", "describe_period", 1.0)
346 | cleanup.callback((lambda job_id: self._describer.unsubscribe(job_id)), job_id)
347 | poll_stderr = cleanup.enter_context(self.poll_stderr_context(logger))
348 | last_job_desc_json = None
349 | exit_code = None
350 | while exit_code is None:
351 | time.sleep(describe_period)
352 | job_desc = self._describer.describe(aws_batch, job_id, describe_period)
353 | job_desc_json = json.dumps(job_desc, indent=2, sort_keys=True)
354 | if job_desc_json != last_job_desc_json:
355 | last_job_desc_json = job_desc_json
356 | write_atomic(
357 | job_desc_json,
358 | os.path.join(self.host_dir, f"awsBatchJobDetail.{job_id}.json"),
359 | )
360 | job_status = job_desc["status"]
361 | if "container" in job_desc and "logStreamName" in job_desc["container"]:
362 | self._logStreamName = job_desc["container"]["logStreamName"]
363 | if job_status not in self._observed_states:
364 | self._observed_states.add(job_status)
365 | logfn = (
366 | logger.notice
367 | if job_status in ("RUNNING", "SUCCEEDED", "FAILED")
368 | else logger.info
369 | )
370 | logdetails = {"status": job_status, "jobId": job_id}
371 | if self._logStreamName:
372 | logdetails["logStreamName"] = self._logStreamName
373 | logfn(_("AWS Batch job change", **logdetails))
374 | if job_status == "STARTING" or (
375 | job_status == "RUNNING" and "STARTING" not in self._observed_states
376 | ):
377 | cleanup.enter_context(self.task_running_context())
378 | if job_status not in (
379 | "SUBMITTED",
380 | "PENDING",
381 | "RUNNABLE",
382 | "STARTING",
383 | "RUNNING",
384 | "SUCCEEDED",
385 | "FAILED",
386 | ):
387 | logger.warning(_("unknown job status from AWS Batch", status=job_status))
388 | if job_status == "SUCCEEDED":
389 | exit_code = 0
390 | elif job_status == "FAILED":
391 | reason = job_desc.get("container", {}).get("reason", None)
392 | status_reason = job_desc.get("statusReason", None)
393 | self.failure_info = {"jobId": job_id}
394 | if reason:
395 | self.failure_info["reason"] = reason
396 | if status_reason:
397 | self.failure_info["statusReason"] = status_reason
398 | if self._logStreamName:
399 | self.failure_info["logStreamName"] = self._logStreamName
400 | if status_reason and "Host EC2" in status_reason and "terminated" in status_reason:
401 | self._aws_interrupts += 1
402 | raise WDL.runtime.Interrupted(
403 | "AWS Batch job interrupted (likely spot instance termination)",
404 | more_info=self.failure_info,
405 | )
406 | if "exitCode" not in job_desc.get("container", {}):
407 | raise WDL.Error.RuntimeError(
408 | "AWS Batch job failed", more_info=self.failure_info
409 | )
410 | exit_code = job_desc["container"]["exitCode"]
411 | assert isinstance(exit_code, int) and exit_code != 0
412 | if "RUNNING" in self._observed_states:
413 | poll_stderr()
414 | if terminating():
415 | aws_batch.terminate_job(jobId=job_id, reason="terminated by miniwdl")
416 | raise WDL.runtime.Terminated(
417 | quiet=not self._observed_states.difference({"SUBMITTED", "PENDING", "RUNNABLE"})
418 | )
419 | for _root, _dirs, _files in os.walk(self.host_dir, followlinks=False):
420 | # no-op traversal of working directory to refresh NFS metadata cache (speculative)
421 | pass
422 | poll_stderr()
423 | return exit_code
424 |
425 | def _submit_period_multiplier(self):
426 | if self._describer.jobs:
427 | b = self.cfg.get_float("aws", "submit_period_b", 0.0)
428 | if b > 0.0:
429 | t = time.time() - self._init_time
430 | c = self.cfg.get_float("aws", "submit_period_c", 0.0)
431 | return max(1.0, c - t / b)
432 | return 1.0
433 |
434 |
435 | class BatchJob(BatchJobBase):
436 | """
437 | EFS-based implementation, including the case of SageMaker Studio's built-in EFS. Assumes we're
438 | running on an EC2 instance or Fargate container mounting an EFS Access Point at [file_io] root,
439 | and configures each Batch job with the same mount.
440 | """
441 |
442 | @classmethod
443 | def global_init(cls, cfg, logger):
444 | super().global_init(cfg, logger)
445 |
446 | # EFS configuration based on:
447 | # - [aws] fsap / MINIWDL__AWS__FSAP
448 | # - [aws] fs / MINIWDL__AWS__FS
449 | # - SageMaker Studio metadata, if applicable
450 | cls._fs_id = None
451 | cls._fsap_id = None
452 | if cfg.has_option("aws", "fs"):
453 | cls._fs_id = cfg.get("aws", "fs")
454 | if cfg.has_option("aws", "fsap"):
455 | cls._fsap_id = cfg.get("aws", "fsap")
456 | if not cls._fs_id:
457 | cls._fs_id = efs_id_from_access_point(cls._region_name, cls._fsap_id)
458 | cls._studio_efs_uid = None
459 | sagemaker_studio_efs = detect_sagemaker_studio_efs(logger, region_name=cls._region_name)
460 | if sagemaker_studio_efs:
461 | (
462 | studio_efs_id,
463 | studio_efs_uid,
464 | studio_efs_home,
465 | studio_efs_mount,
466 | ) = sagemaker_studio_efs
467 | assert (
468 | not cls._fs_id or cls._fs_id == studio_efs_id
469 | ), "Configured EFS ([aws] fs / MINIWDL__AWS__FS, [aws] fsap / MINIWDL__AWS__FSAP) isn't associated with current SageMaker Studio domain EFS"
470 | cls._fs_id = studio_efs_id
471 | assert (
472 | cls._fs_mount.rstrip("/") == studio_efs_mount.rstrip("/")
473 | ) or cls._fs_mount.startswith(studio_efs_mount.rstrip("/") + "/"), (
474 | "misconfiguration, set [file_io] root / MINIWDL__FILE_IO__ROOT to "
475 | + studio_efs_mount.rstrip("/")
476 | )
477 | cls._studio_efs_uid = studio_efs_uid
478 | if not cls._fsap_id:
479 | cls._fsap_id = detect_studio_fsap(
480 | logger,
481 | studio_efs_id,
482 | studio_efs_uid,
483 | studio_efs_home,
484 | region_name=cls._region_name,
485 | )
486 | assert (
487 | cls._fsap_id
488 | ), "Unable to detect suitable EFS Access Point for use with SageMaker Studio; set [aws] fsap / MINIWDL__AWS__FSAP"
489 | # TODO: else sanity-check that FSAP's root directory equals studio_efs_home
490 | assert (
491 | cls._fs_id
492 | ), "Missing EFS configuration ([aws] fs / MINIWDL__AWS__FS or [aws] fsap / MINIWDL__AWS__FSAP)"
493 | if not cls._fsap_id:
494 | logger.warning(
495 | "AWS BatchJob plugin recommends using EFS Access Point to simplify permissions between containers (configure [aws] fsap / MINIWDL__AWS__FSAP to fsap-xxxx)"
496 | )
497 |
498 | # if no task queue in config file, try detecting miniwdl-aws-studio
499 | if not cls._job_queue and sagemaker_studio_efs:
500 | cls._job_queue = detect_gwfcore_batch_queue(
501 | logger, sagemaker_studio_efs[0], region_name=cls._region_name
502 | )
503 | assert (
504 | cls._job_queue
505 | ), "Missing AWS Batch job queue configuration ([aws] task_queue / MINIWDL__AWS__TASK_QUEUE)"
506 |
507 | logger.info(
508 | _(
509 | "initialized AWS BatchJob (EFS) plugin",
510 | region_name=cls._region_name,
511 | job_queue=cls._job_queue,
512 | resource_limits=cls._resource_limits,
513 | file_io_root=cls._fs_mount,
514 | efs_id=cls._fs_id,
515 | efsap_id=cls._fsap_id,
516 | )
517 | )
518 |
519 | def _prepare_container_properties(self, logger):
520 | container_properties = super()._prepare_container_properties(logger)
521 |
522 | # add EFS volume & mount point
523 | volumes = [
524 | {
525 | "name": "file_io_root",
526 | "efsVolumeConfiguration": {
527 | "fileSystemId": self._fs_id,
528 | "transitEncryption": "ENABLED",
529 | },
530 | }
531 | ]
532 | if self._fsap_id:
533 | volumes[0]["efsVolumeConfiguration"]["authorizationConfig"] = {
534 | "accessPointId": self._fsap_id
535 | }
536 | container_properties["volumes"] = volumes
537 |
538 | # set Studio UID if appropriate
539 | if self.cfg["task_runtime"].get_bool("as_user") and self._studio_efs_uid:
540 | container_properties["user"] = f"{self._studio_efs_uid}:{self._studio_efs_uid}"
541 |
542 | return container_properties
543 |
544 |
545 | class BatchJobNoEFS(BatchJobBase):
546 | """
547 | Implementation assuming the Batch compute environment is configured to mount the shared
548 | filesystem without further specification by us; e.g. FSxL mounted by cloud-init user data
549 | script.
550 | """
551 |
552 | @classmethod
553 | def global_init(cls, cfg, logger):
554 | super().global_init(cfg, logger)
555 |
556 | assert (
557 | cls._job_queue
558 | ), "Missing AWS Batch job queue configuration ([aws] task_queue / MINIWDL__AWS__TASK_QUEUE)"
559 |
560 | logger.info(
561 | _(
562 | "initialized AWS BatchJob plugin",
563 | region_name=cls._region_name,
564 | job_queue=cls._job_queue,
565 | resource_limits=cls._resource_limits,
566 | file_io_root=cls._fs_mount,
567 | )
568 | )
569 |
570 | def _prepare_container_properties(self, logger):
571 | container_properties = super()._prepare_container_properties(logger)
572 |
573 | container_properties["volumes"] = [
574 | {
575 | "name": "file_io_root",
576 | "host": {"sourcePath": self._fs_mount},
577 | }
578 | ]
579 |
580 | return container_properties
581 |
582 |
583 | class BatchJobDescriber:
584 | """
585 | This singleton object handles calling the AWS Batch DescribeJobs API with up to 100 job IDs
586 | per request, then dispensing each job description to the thread interested in it. This helps
587 | avoid AWS API request rate limits when we're tracking many concurrent jobs.
588 | """
589 |
590 | JOBS_PER_REQUEST = 100 # maximum jobs per DescribeJob request
591 |
592 | def __init__(self):
593 | self.lock = threading.Lock()
594 | self.last_request_time = 0
595 | self.job_queue = []
596 | self.jobs = {}
597 |
598 | def describe(self, aws_batch, job_id, period):
599 | """
600 | Get the latest Batch job description
601 | """
602 | while True:
603 | with self.lock:
604 | if job_id not in self.jobs:
605 | # register new job to be described ASAP
606 | heapq.heappush(self.job_queue, (0.0, job_id))
607 | self.jobs[job_id] = None
608 | # update as many job descriptions as possible
609 | self._update(aws_batch, period)
610 | # return the desired job description if we have it
611 | desc = self.jobs[job_id]
612 | if desc:
613 | return desc
614 | # otherwise wait (outside the lock) and try again
615 | time.sleep(period / 4)
616 |
617 | def unsubscribe(self, job_id):
618 | """
619 | Unsubscribe from a job_id once we're no longer interested in it
620 | """
621 | with self.lock:
622 | if job_id in self.jobs:
623 | del self.jobs[job_id]
624 |
625 | def _update(self, aws_batch, period):
626 | # if enough time has passed since our last DescribeJobs request
627 | if time.time() - self.last_request_time >= period:
628 | # take the N least-recently described jobs
629 | job_ids = set()
630 | assert self.job_queue
631 | while self.job_queue and len(job_ids) < self.JOBS_PER_REQUEST:
632 | job_id = heapq.heappop(self.job_queue)[1]
633 | assert job_id not in job_ids
634 | if job_id in self.jobs:
635 | job_ids.add(job_id)
636 | if not job_ids:
637 | return
638 | # describe them
639 | try:
640 | job_descs = aws_batch.describe_jobs(jobs=list(job_ids))
641 | finally:
642 | # always: bump last_request_time and re-enqueue these jobs
643 | self.last_request_time = time.time()
644 | for job_id in job_ids:
645 | heapq.heappush(self.job_queue, (self.last_request_time, job_id))
646 | # update self.jobs with the new descriptions
647 | for job_desc in job_descs["jobs"]:
648 | job_ids.remove(job_desc["jobId"])
649 | self.jobs[job_desc["jobId"]] = job_desc
650 | assert not job_ids, "AWS Batch DescribeJobs didn't return all expected results"
651 |
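# Illustrative usage sketch (hypothetical; `aws_batch` is a boto3 Batch client and `job_id` an
# existing Batch job ID). Worker threads would share one BatchJobDescriber so that many
# concurrently tracked jobs are batched into few DescribeJobs requests:
#
#   describer = BatchJobDescriber()
#
#   def await_job(aws_batch, job_id, period=1.0):
#       try:
#           while True:
#               desc = describer.describe(aws_batch, job_id, period)
#               if desc["status"] in ("SUCCEEDED", "FAILED"):
#                   return desc
#               time.sleep(period)
#       finally:
#           describer.unsubscribe(job_id)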
652 |
653 | class AWSError(WDL.Error.RuntimeError):
654 | """
655 | Repackage botocore.exceptions.ClientError to surface it more informatively in the miniwdl task log
656 | """
657 |
658 | def __init__(self, client_error: botocore.exceptions.ClientError):
659 | assert isinstance(client_error, botocore.exceptions.ClientError)
660 | msg = (
661 | f"{client_error.response['Error']['Code']}, {client_error.response['Error']['Message']}"
662 | )
663 | super().__init__(
664 | msg, more_info={"ResponseMetadata": client_error.response["ResponseMetadata"]}
665 | )
666 |
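# Illustrative sketch (hypothetical call; `aws_batch` is a boto3 Batch client): wrapping a boto3
# call so that a ClientError surfaces as AWSError in the miniwdl task log.
#
#   try:
#       aws_batch.terminate_job(jobId=job_id, reason="abort")
#   except botocore.exceptions.ClientError as exn:
#       raise AWSError(exn)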
--------------------------------------------------------------------------------
/miniwdl_aws/cli_run_s3upload.py:
--------------------------------------------------------------------------------
1 | """
2 | miniwdl_run_s3upload CLI entry point (console script), which passes through its arguments to
3 | `miniwdl run`, then uploads run artifacts to $S3_UPLOAD_FOLDER. These include the log file and,
4 | if the run succeeded, the output files and outputs.json (rewritten with the uploaded S3 URIs
5 | instead of local filenames).
6 |
7 | With the BatchJob plugin also enabled, this may be used from an SSH session on an EC2 instance or
8 | container with EFS suitably mounted at /mnt/efs; or within a Batch "workflow job."
9 | """
10 |
11 | import sys
12 | import os
13 | import json
14 | import subprocess
15 | import shutil
16 | import argparse
17 | import tempfile
18 | import signal
19 | from ._util import END_OF_LOG, subprocess_run_with_clean_exit
20 |
21 |
22 | def miniwdl_run_s3upload():
23 | # Set signal handlers that raise SystemExit (the empty-generator .throw() trick lets a lambda raise). SystemExit may be handled below and/or by subprocess_run_with_clean_exit.
24 | for s in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP, signal.SIGQUIT):
25 | signal.signal(s, lambda sig, _: (_ for _ in ()).throw(SystemExit(sig)))
26 |
27 | # run main logic with handlers
28 | try:
29 | end_log_and_exit(miniwdl_run_s3upload_inner())
30 | except SystemExit as exc:
31 | end_log_and_exit(exc.code)
32 | except KeyboardInterrupt:
33 | end_log_and_exit(int(signal.SIGINT))
34 | except BrokenPipeError:
35 | end_log_and_exit(int(signal.SIGPIPE))
36 |
37 |
38 | def end_log_and_exit(code):
39 | print(
40 | "\n" + END_OF_LOG,
41 | file=sys.stderr,
42 | )
43 | sys.exit(code)
44 |
45 |
46 | def miniwdl_run_s3upload_inner():
47 | parser = argparse.ArgumentParser(
48 | prog="miniwdl-run-s3upload",
49 | description="Pass through arguments to `miniwdl run` and afterwards, upload outputs to S3 and optionally delete local run directory.",
50 | usage="miniwdl-run-s3upload [miniwdl_run_arg ...]",
51 | allow_abbrev=False,
52 | )
53 | parser.add_argument(
54 | "--s3upload",
55 | help="s3://bucket/folder/ at which to upload run outputs [env MINIWDL__AWS__S3_UPLOAD_FOLDER]",
56 | )
57 | parser.add_argument(
58 | "--delete-after",
59 | choices=("always", "success", "failure"),
60 | help="with --s3upload, delete EFS run directory afterwards [env MINIWDL__AWS__S3_UPLOAD_DELETE_AFTER]",
61 | )
62 | parser.add_argument(
63 | "--task-queue", help="AWS Batch job queue for task jobs [env MINIWDL__AWS__TASK_QUEUE]"
64 | )
65 |
66 | args, unused_args = parser.parse_known_args(sys.argv[1:])
67 | args.s3upload = (
68 | args.s3upload if args.s3upload else os.environ.get("MINIWDL__AWS__S3_UPLOAD_FOLDER", None)
69 | )
70 | args.delete_after = (
71 | args.delete_after.strip().lower()
72 | if args.delete_after
73 | else os.environ.get("MINIWDL__AWS__S3_UPLOAD_DELETE_AFTER", None)
74 | )
75 | if args.delete_after and not args.s3upload:
76 | print("--delete-after requires --s3upload", file=sys.stderr)
77 | sys.exit(1)
78 |
79 | if args.s3upload:
80 | with tempfile.TemporaryDirectory() as tmpdir:
81 | testfile = os.path.join(tmpdir, ".test.miniwdl-run-s3upload")
82 | with open(testfile, "w") as outfile:
83 | print(
84 | "miniwdl-run-s3upload created this object to test bucket permissions.",
85 | file=outfile,
86 | )
87 | upload1(testfile, args.s3upload + ("/" if not args.s3upload.endswith("/") else ""))
88 |
89 | zip_arg = next((i for i, arg in enumerate(unused_args) if arg == "--WDL--ZIP--"), -1)
90 | if zip_arg >= 0:
91 | # get `miniwdl zip`ped WDL source code shipped to us by miniwdl-aws-submit
92 | unused_args[zip_arg] = get_wdl_zip()
93 |
94 | cmd = ["miniwdl", "run"] + unused_args
95 | if "--error-json" not in unused_args:
96 | cmd.append("--error-json")
97 | miniwdl_env = dict(os.environ)
98 | if args.task_queue: # pass through to BatchJob plugin via env var
99 | miniwdl_env["MINIWDL__AWS__TASK_QUEUE"] = args.task_queue
100 |
101 | # run miniwdl & tee its standard output
102 | miniwdl = subprocess_run_with_clean_exit(
103 | cmd, stdout=subprocess.PIPE, env=miniwdl_env, check=False
104 | )
105 | sys.stdout.buffer.write(miniwdl.stdout)
106 |
107 | if not args.s3upload:
108 | # nothing to do
109 | print(
110 | f"[miniwdl_run_s3upload] no setting for --s3upload / MINIWDL__AWS__S3_UPLOAD_FOLDER; exiting (code = {miniwdl.returncode})",
111 | file=sys.stderr,
112 | )
113 | return miniwdl.returncode
114 |
115 | # read miniwdl standard output JSON
116 | try:
117 | miniwdl_json = json.loads(miniwdl.stdout)
118 | run_dir = miniwdl_json["dir"]
119 | assert os.path.isdir(run_dir)
120 | except:
121 | print(
122 | f"[miniwdl_run_s3upload] no run directory in miniwdl standard output; exiting (code = {miniwdl.returncode})",
123 | file=sys.stderr,
124 | )
125 | return miniwdl.returncode
126 |
127 | # append miniwdl's run name to S3_UPLOAD_FOLDER (unless the latter ends in '/')
128 | s3_upload_folder = args.s3upload
129 | if not s3_upload_folder.endswith("/"):
130 | s3_upload_folder += "/" + os.path.basename(run_dir.rstrip("/")) + "/"
131 |
132 | # upload logs
133 | print(
134 | f"[miniwdl_run_s3upload] miniwdl exit code = {miniwdl.returncode}; uploading logs & outputs to {s3_upload_folder}",
135 | file=sys.stderr,
136 | )
137 | for p in (os.path.join(run_dir, fn) for fn in ("workflow.log", "task.log")):
138 | if os.path.isfile(p):
139 | upload1(p, s3_upload_folder)
140 |
141 | # upload error.json, and the std{out,err}_file it points to, if any
142 | error_json_file = os.path.join(run_dir, "error.json")
143 | if os.path.isfile(error_json_file):
144 | upload1(error_json_file, s3_upload_folder)
145 | reupload = False
146 | with open(error_json_file) as infile:
147 | error_json = json.load(infile)
148 | for std_key in ("stderr", "stdout"):
149 | std_file = error_json.get("cause", {}).get(std_key + "_file", None)
150 | if std_file and os.path.isfile(std_file):
151 | std_s3file = f"{s3_upload_folder}CommandFailed_{std_key}.txt"
152 | upload1(std_file, std_s3file)
153 | error_json["cause"][std_key + "_s3file"] = std_s3file
154 | reupload = True
155 | if reupload:
156 | with tempfile.NamedTemporaryFile() as tmp:
157 | tmp.write(json.dumps(error_json, indent=2).encode())
158 | tmp.flush()
159 | upload1(tmp.name, s3_upload_folder + "error.json")
160 |
161 | # upload output files, if any
162 | if os.path.isdir(os.path.join(run_dir, "out")):
163 | subprocess_run_with_clean_exit(
164 | [
165 | "aws",
166 | "s3",
167 | "sync",
168 | "--no-progress",
169 | "--follow-symlinks",
170 | os.path.join(run_dir, "out"),
171 | s3_upload_folder,
172 | ],
173 | check=True,
174 | )
175 |
176 | if "outputs" not in miniwdl_json:
177 | if args.delete_after in ("always", "failure"):
178 | shutil.rmtree(run_dir)
179 | print(
180 | f"[miniwdl_run_s3upload] deleted {run_dir}",
181 | file=sys.stderr,
182 | )
183 | return miniwdl.returncode
184 |
185 | # recursively rewrite outputs JSON
186 | def rewrite(v):
187 | if v and isinstance(v, str) and v[0] == "/" and os.path.exists(v):
188 | # miniwdl writes File/Directory outputs with absolute paths
189 | return rebase_output_path(v, run_dir, s3_upload_folder)
190 | if isinstance(v, list):
191 | return [rewrite(u) for u in v]
192 | if isinstance(v, dict):
193 | return dict((k, rewrite(u)) for (k, u) in v.items())
194 | return v
195 |
196 | rewritten_outputs = rewrite(miniwdl_json["outputs"])
197 | outputs_s3_json = os.path.join(run_dir, "outputs.s3.json")
198 | with open(outputs_s3_json + ".tmp", "w") as outfile:
199 | print(json.dumps(rewritten_outputs, indent=2), file=outfile)
200 | os.rename(outputs_s3_json + ".tmp", outputs_s3_json)
201 | upload1(outputs_s3_json, s3_upload_folder + "outputs.json")
202 | print(
203 | f"[miniwdl_run_s3upload] uploaded {s3_upload_folder}outputs.json",
204 | file=sys.stderr,
205 | )
206 | print(json.dumps({"s3upload": s3_upload_folder, "outputs": rewritten_outputs}, indent=2))
207 | if args.delete_after in ("always", "success"):
208 | shutil.rmtree(run_dir)
209 | print(
210 | f"[miniwdl_run_s3upload] deleted {run_dir}",
211 | file=sys.stderr,
212 | )
213 |
214 | return miniwdl.returncode
215 |
216 |
217 | def upload1(fn, dest):
218 | subprocess_run_with_clean_exit(["aws", "s3", "cp", "--no-progress", fn, dest], check=True)
219 |
220 |
221 | def rebase_output_path(fn, run_dir, s3_upload_folder):
222 | """
223 | Given an extant filename `fn` from the JSON outputs and the current run directory, figure the
224 | uploaded S3 URI under s3_upload_folder, where the file should have been uploaded by our
225 | `aws s3 sync` operation on the "run out" directory. Or return fn unmodified if it merely looks
226 | like an output path but isn't really one.
227 |
228 | Subtlety: if the output fn originated from the call cache, it will be from some other run
229 | directory, not the current one. In that case we need to see that there's a corresponding link
230 | under the current run out directory.
231 |
232 | There should be no danger of inadvertently uploading non-output files (e.g. if the workflow
233 | outputs the string "/home/root/.ssh/id_rsa") because we're not actually performing the upload,
234 | just figuring the path where `aws s3 sync` ought to have uploaded it.
235 | """
236 | fn_parts = fn.strip("/").split("/")
237 | while fn_parts:
238 | fn_rel = "/".join(fn_parts)
239 | fn_rebased = os.path.join(run_dir, "out", fn_rel)
240 | if os.path.exists(fn_rebased) and os.path.isdir(fn) == os.path.isdir(fn_rebased):
241 | return s3_upload_folder + fn_rel
242 | fn_parts = fn_parts[1:]
243 | return fn
244 |
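# Worked example (hypothetical paths): the function tries progressively shorter path suffixes of
# `fn` until one exists under <run_dir>/out, then mirrors that relative path under the S3 folder.
#
#   rebase_output_path(
#       "/mnt/efs/miniwdl_run/20240101_000000_hello/out/message/0/hello.txt",
#       "/mnt/efs/miniwdl_run/20240101_000000_hello",
#       "s3://my-bucket/hello/",
#   )
#   # -> "s3://my-bucket/hello/message/0/hello.txt" if that file is linked under the run's out/
#   #    directory; otherwise the original path is returned unchanged.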
245 |
246 | def get_wdl_zip():
247 | """
248 | Load `miniwdl zip`ped WDL source code shipped to us by miniwdl-aws-submit, encoded in the
249 | environment variable WDL_ZIP
250 | """
251 |
252 | encoded_zip = os.environ["WDL_ZIP"]
253 | if len(encoded_zip) >= 4096:
254 | # Look for spillover in job & job def tags
255 | job_desc = json.loads(
256 | subprocess_run_with_clean_exit(
257 | ["aws", "batch", "describe-jobs", "--jobs", os.environ["AWS_BATCH_JOB_ID"]],
258 | stdout=subprocess.PIPE,
259 | check=True,
260 | ).stdout
261 | )["jobs"][0]
262 | job_tags = job_desc["tags"]
263 | job_def_tags = json.loads(
264 | subprocess_run_with_clean_exit(
265 | [
266 | "aws",
267 | "batch",
268 | "describe-job-definitions",
269 | "--job-definitions",
270 | job_desc["jobDefinition"],
271 | ],
272 | stdout=subprocess.PIPE,
273 | check=True,
274 | ).stdout
275 | )["jobDefinitions"][0]["tags"]
276 | # if no job_def_tags, then there shouldn't be job_tags either
277 | assert job_def_tags or not job_tags
278 | for tags in (job_def_tags, job_tags):
279 | for key in sorted(tags.keys()):
280 | if key.startswith("WZ") and len(key) > 3:
281 | encoded_zip += key[3:] + tags[key]
282 |
283 | import base64
284 | import lzma
285 |
286 | zip_bytes = lzma.decompress(base64.urlsafe_b64decode(encoded_zip), format=lzma.FORMAT_ALONE)
287 | fd, fn = tempfile.mkstemp(suffix=".zip", prefix="wdl_")
288 | os.write(fd, zip_bytes)
289 | os.close(fd)
290 | return fn
291 |
--------------------------------------------------------------------------------
/miniwdl_aws/cli_submit.py:
--------------------------------------------------------------------------------
1 | """
2 | miniwdl-aws-submit CLI entry point (console script) to submit a miniwdl "workflow job" to an AWS
3 | Batch queue, which will invoke miniwdl-run-s3upload to run the workflow (spawning additional Batch
4 | jobs as needed to execute tasks). This is typically used on-laptop to kick off workflows, without
5 | the laptop needing to stay on/connected. It can also wait for the workflow job to complete and
6 | stream its logs.
7 | """
8 |
9 | import sys
10 | import os
11 | import time
12 | import argparse
13 | import shlex
14 | from datetime import datetime
15 | from collections import defaultdict
16 | import boto3
17 | from ._util import detect_aws_region, randomize_job_name, END_OF_LOG, efs_id_from_access_point
18 |
19 |
20 | def miniwdl_submit_awsbatch(argv):
21 | # Configure from arguments/environment/tags
22 | args, unused_args = parse_args(argv)
23 | verbose = (
24 | args.follow or args.self_test or "--verbose" in unused_args or "--debug" in unused_args
25 | )
26 | detect_env_args(args)
27 | if verbose:
28 | print("Workflow job queue: " + args.workflow_queue, file=sys.stderr)
29 |
30 | aws_region_name = detect_aws_region(None)
31 | if not aws_region_name:
32 | print(
33 | "Failed to detect AWS region; configure AWS CLI or set environment AWS_DEFAULT_REGION",
34 | file=sys.stderr,
35 | )
36 | sys.exit(1)
37 | aws_batch = boto3.client("batch", region_name=aws_region_name)
38 | detect_tags_args(aws_batch, args)
39 |
40 | if verbose:
41 | print("Task job queue: " + args.task_queue, file=sys.stderr)
42 | if args.efs:
43 | print("Workflow IAM role ARN: " + args.workflow_role, file=sys.stderr)
44 | print("EFS Access Point: " + args.fsap, file=sys.stderr)
45 |
46 | fs_id = None
47 | if args.efs:
48 | fs_id = efs_id_from_access_point(aws_region_name, args.fsap)
49 | if verbose:
50 | print("EFS: " + fs_id, file=sys.stderr)
51 |
52 | # Prepare workflow job: command, environment, and container properties
53 | job_name, miniwdl_run_cmd, wdl_zip = form_miniwdl_run_cmd(args, unused_args, verbose)
54 | job_name = randomize_job_name(job_name)
55 | if verbose:
56 | print("Workflow job image: " + args.image, file=sys.stderr)
57 | print("Invocation: " + " ".join(shlex.quote(s) for s in miniwdl_run_cmd), file=sys.stderr)
58 | (
59 | workflow_container_props,
60 | workflow_container_overrides,
61 | job_def_tags,
62 | job_tags,
63 | ) = form_workflow_container_props(args, miniwdl_run_cmd, fs_id, wdl_zip, verbose)
64 |
65 | # Register & submit workflow job
66 | try:
67 | workflow_job_def = aws_batch.register_job_definition(
68 | jobDefinitionName=job_name,
69 | platformCapabilities=["FARGATE" if args.efs else "EC2"],
70 | type="container",
71 | containerProperties=workflow_container_props,
72 | tags=job_def_tags,
73 | )
74 | except BaseException as exc:
75 | if wdl_zip and "JobDefinition size must be less than" in str(exc):
76 | print(_WDL_ZIP_SIZE_MSG, file=sys.stderr)
77 | sys.exit(123)
78 | raise
79 | workflow_job_def_handle = (
80 | f"{workflow_job_def['jobDefinitionName']}:{workflow_job_def['revision']}"
81 | )
82 | try:
83 | workflow_job_id = aws_batch.submit_job(
84 | jobName=job_name,
85 | jobQueue=args.workflow_queue,
86 | jobDefinition=workflow_job_def_handle,
87 | containerOverrides=workflow_container_overrides,
88 | tags=job_tags,
89 | )["jobId"]
90 | if verbose:
91 | print(f"Submitted {job_name} to {args.workflow_queue}:", file=sys.stderr)
92 | sys.stderr.flush()
93 | print(workflow_job_id)
94 | if not sys.stdout.isatty():
95 | print(workflow_job_id, file=sys.stderr)
96 | finally:
97 | aws_batch.deregister_job_definition(jobDefinition=workflow_job_def_handle)
98 |
99 | # Wait for workflow job, if requested
100 | exit_code = 0
101 | if args.wait or args.follow:
102 | exit_code = wait(
103 | aws_region_name,
104 | aws_batch,
105 | workflow_job_id,
106 | args.follow,
107 | expect_log_eof=not args.self_test,
108 | )
109 | sys.exit(exit_code)
110 |
111 |
112 | def parse_args(argv):
113 | if "COLUMNS" not in os.environ:
114 | os.environ["COLUMNS"] = "100"
115 | parser = argparse.ArgumentParser(
116 | prog="miniwdl-aws-submit",
117 | description="Launch `miniwdl run` on AWS Batch (+ EFS at /mnt/efs), itself launching additional"
118 | " Batch jobs to execute WDL tasks. Passed-through arguments to `miniwdl run` should refer to"
119 | " s3:// or /mnt/efs/ input paths, rather than the local filesystem.",
120 | usage="miniwdl-aws-submit [miniwdl_run_arg ...] --workflow-queue WORKFLOW_QUEUE",
121 | allow_abbrev=False,
122 | )
123 | group = parser.add_argument_group("AWS Batch")
124 | group.add_argument(
125 | "--workflow-queue",
126 | help="job queue for workflow job [env MINIWDL__AWS__WORKFLOW_QUEUE]",
127 | )
128 | group.add_argument(
129 | "--task-queue",
130 | help="job queue for task jobs [env MINIWDL__AWS__TASK_QUEUE"
131 | " or detect from DefaultTaskQueue tag on workflow job queue]",
132 | )
133 | group.add_argument(
134 | "--task-queue-fallback",
135 | help="job queue for task jobs following runtime.preemptible spot interruptions [env"
136 | " MINIWDL__AWS__TASK_QUEUE_FALLBACK or detect from DefaultTaskQueueFallback tag on workflow job queue]",
137 | )
138 | group.add_argument(
139 | "--fsap",
140 | help="EFS Access Point ID (fsap-xxxx) for mounting [env MINIWDL__AWS__FSAP"
141 | " or detect from DefaultFsap tag on workflow job queue]",
142 | )
143 | group.add_argument(
144 | "--no-efs",
145 | "--no-EFS",
146 | action="store_false",
147 | dest="efs",
148 | help="instead of EFS, expect EC2 compute environments to automatically mount some other shared filesystem [env MINIWDL__AWS__FS=0]",
149 | )
150 | group.add_argument(
151 | "--mount",
152 | default=None,
153 | help="shared filesystem mount point in all containers [/mnt/efs or /mnt/net]",
154 | )
155 | group = parser.add_argument_group("Workflow job provisioning")
156 | group.add_argument(
157 | "--workflow-role",
158 | help="ARN of execution+job role for workflow job [env MINIWDL__AWS__WORKFLOW_ROLE"
159 | " or detect from WorkflowEngineRoleArn tag on workflow job queue]",
160 | )
161 | group.add_argument("--name", help="workflow job name [WDL filename]")
162 | group.add_argument(
163 | "--cpu", metavar="N", type=str, default="1", help="vCPUs for workflow job [1]"
164 | )
165 | group.add_argument(
166 | "--memory-GiB", metavar="N", type=int, default=4, help="memory for workflow job [4]"
167 | )
168 | group.add_argument(
169 | "--image",
170 | help="override miniwdl-aws Docker image tag for workflow job [env MINIWDL__AWS__WORKFLOW_IMAGE]",
171 | )
172 | group.add_argument(
173 | "--no-env", action="store_true", help="don't pass through MINIWDL__* environment variables"
174 | )
175 | group.add_argument(
176 | "--no-public-ip",
177 | action="store_true",
178 | help="don't assign public IP (workflow compute env has private subnet & NAT)",
179 | )
180 | group = parser.add_argument_group("miniwdl I/O")
181 | group.add_argument(
182 | "--dir",
183 | default=None,
184 | help="run directory prefix [{mount}/miniwdl_run or {mount}/miniwdl_run]",
185 | )
186 | group.add_argument(
187 | "--s3upload",
188 | help="s3://bucket/folder/ at which to upload run outputs (otherwise left on shared filesystem)",
189 | )
190 | group.add_argument(
191 | "--delete-after",
192 | choices=("always", "success", "failure"),
193 | help="with --s3upload, delete run directory afterwards",
194 | )
195 | parser.add_argument(
196 | "--wait", "-w", action="store_true", help="wait for workflow job to complete"
197 | )
198 | parser.add_argument(
199 | "--follow",
200 | "-f",
201 | action="store_true",
202 | help="live-stream workflow log to standard error (implies --wait)",
203 | )
204 | parser.add_argument("--self-test", action="store_true", help="perform `miniwdl run_self_test`")
205 |
206 | args, unused_args = parser.parse_known_args(argv[1:])
207 |
208 | if os.environ.get("MINIWDL__AWS__FS", "").strip().lower() in ("false", "f", "0", "no", "n"):
209 | args.efs = False
210 | if not args.mount:
211 | args.mount = "/mnt/efs" if args.efs else "/mnt/net"
212 | if args.mount.endswith("/"):
213 | args.mount = args.mount[:-1]
214 | assert args.mount
215 | if not args.dir:
216 | args.dir = os.path.join(args.mount, "miniwdl_run")
217 | if not args.dir.startswith(args.mount):
218 | print(f"--dir must begin with {args.mount}", file=sys.stderr)
219 | sys.exit(1)
220 |
221 | return (args, unused_args)
222 |
223 |
224 | def detect_env_args(args):
225 | """
226 | Detect configuration set through environment variables (that weren't set by command-line args)
227 | """
228 | args.fsap = args.fsap if args.fsap else os.environ.get("MINIWDL__AWS__FSAP", "")
229 | args.workflow_queue = (
230 | args.workflow_queue
231 | if args.workflow_queue
232 | else os.environ.get("MINIWDL__AWS__WORKFLOW_QUEUE", None)
233 | )
234 | if not args.workflow_queue:
235 | print(
236 | "--workflow-queue is required (or environment variable MINIWDL__AWS__WORKFLOW_QUEUE)",
237 | file=sys.stderr,
238 | )
239 | sys.exit(1)
240 | args.fsap = args.fsap if args.fsap else os.environ.get("MINIWDL__AWS__FSAP", "")
241 | args.task_queue = (
242 | args.task_queue if args.task_queue else os.environ.get("MINIWDL__AWS__TASK_QUEUE", None)
243 | )
244 | if not args.task_queue_fallback:
245 | args.task_queue_fallback = os.environ.get("MINIWDL__AWS__TASK_QUEUE_FALLBACK", None)
246 | args.workflow_role = (
247 | args.workflow_role
248 | if args.workflow_role
249 | else os.environ.get("MINIWDL__AWS__WORKFLOW_ROLE", None)
250 | )
251 | args.image = args.image if args.image else os.environ.get("MINIWDL__AWS__WORKFLOW_IMAGE", None)
252 | if not args.image:
253 | # version-matched default image from our GitHub build
254 | import importlib_metadata
255 |
256 | try:
257 | args.image = "ghcr.io/miniwdl-ext/miniwdl-aws:v" + importlib_metadata.version(
258 | "miniwdl-aws"
259 | )
260 | except importlib_metadata.PackageNotFoundError:
261 | print(
262 | "Failed to detect miniwdl Docker image version tag; set explicitly with --image or MINIWDL__AWS__WORKFLOW_IMAGE",
263 | file=sys.stderr,
264 | )
265 | sys.exit(1)
266 |
267 | if args.delete_after and not args.s3upload:
268 | print("--delete-after requires --s3upload", file=sys.stderr)
269 | sys.exit(1)
270 | args.s3upload = (
271 | args.s3upload if args.s3upload else os.environ.get("MINIWDL__AWS__S3_UPLOAD_FOLDER", None)
272 | )
273 | args.delete_after = (
274 | args.delete_after.strip().lower()
275 | if args.delete_after
276 | else os.environ.get("MINIWDL__AWS__DELETE_AFTER_S3_UPLOAD", None)
277 | )
278 |
279 |
280 | def detect_tags_args(aws_batch, args):
281 | """
282 | If not otherwise set by command line arguments or environment, inspect tags of the workflow job
283 | queue to detect default task job queue and (if applicable) EFS Access Point ID and workflow
284 | role ARN. Infra provisioning (CloudFormation, Terraform, etc.) may have set the expected tags.
285 | """
286 | if not args.task_queue or (args.efs and not (args.fsap or args.workflow_role)):
287 | workflow_queue_tags = aws_batch.describe_job_queues(jobQueues=[args.workflow_queue])[
288 | "jobQueues"
289 | ][0]["tags"]
290 | if not args.task_queue:
291 | args.task_queue = workflow_queue_tags.get("DefaultTaskQueue", None)
292 | if not args.task_queue:
293 | print(
294 | "Unable to detect default task job queue name from DefaultTaskQueue tag of workflow job queue."
295 | " Set --task-queue or environment variable MINIWDL__AWS__TASK_QUEUE.",
296 | file=sys.stderr,
297 | )
298 | sys.exit(1)
299 | if not args.task_queue_fallback:
300 | args.task_queue_fallback = workflow_queue_tags.get("DefaultTaskQueueFallback", None)
301 | if args.efs and not args.fsap:
302 | try:
303 | args.fsap = workflow_queue_tags["DefaultFsap"]
304 | assert args.fsap.startswith("fsap-")
305 | except:
306 | if not args.fsap:
307 | print(
308 | "Unable to detect default EFS Access Point (fsap-xxxx) from DefaultFsap tag of workflow job queue."
309 | " Set --fsap or environment variable MINIWDL__AWS__FSAP.",
310 | file=sys.stderr,
311 | )
312 | sys.exit(1)
313 | if args.efs and not args.workflow_role:
314 | # Workflow role ARN is needed for Fargate Batch (unlike EC2 Batch, where a role is
315 | # associated with the EC2 instance profile in the compute environment).
316 | try:
317 | args.workflow_role = aws_batch.describe_job_queues(jobQueues=[args.workflow_queue])[
318 | "jobQueues"
319 | ][0]["tags"]["WorkflowEngineRoleArn"]
320 | assert args.workflow_role.startswith("arn:aws:iam::")
321 | except:
322 | if not args.workflow_role:
323 | print(
324 | "Unable to detect ARN of workflow engine IAM role from WorkflowEngineRoleArn tag of workflow job queue."
325 | " Double-check --workflow-queue, or set --workflow-role or environment MINIWDL__AWS__WORKFLOW_ROLE.",
326 | file=sys.stderr,
327 | )
328 | sys.exit(1)
329 |
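# Illustrative example (hypothetical ARN and IDs) of provisioning the tags this function reads,
# using the AWS CLI:
#
#   aws batch tag-resource --resource-arn <workflow-job-queue-arn> \
#       --tags DefaultTaskQueue=my-task-queue,DefaultFsap=fsap-0123456789abcdef0,WorkflowEngineRoleArn=arn:aws:iam::123456789012:role/my-workflow-role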
330 |
331 | def form_miniwdl_run_cmd(args, unused_args, verbose=False):
332 | """
333 | Formulate the `miniwdl run` command line to be invoked in the workflow job container
334 | """
335 | wdl_zip = None
336 | if args.self_test:
337 | self_test_dir = os.path.join(
338 | args.mount, "miniwdl_run_self_test", datetime.today().strftime("%Y%m%d_%H%M%S")
339 | )
340 | miniwdl_run_cmd = ["miniwdl", "run_self_test", "--dir", self_test_dir]
341 | job_name = args.name if args.name else "miniwdl_run_self_test"
342 | else:
343 | wdl_filename_pos = next(
344 | (i for i, arg in enumerate(unused_args) if not arg.startswith("-")), -1
345 | )
346 | if wdl_filename_pos < 0:
347 | print("Command line appears to be missing WDL filename", file=sys.stderr)
348 | sys.exit(1)
349 | wdl_filename = unused_args[wdl_filename_pos]
350 | wdl_zip = zip_wdl(wdl_filename, args.mount, verbose)
351 | if wdl_zip:
352 | # this sentinel argument will be recognized by miniwdl-run-s3upload
353 | unused_args[wdl_filename_pos] = "--WDL--ZIP--"
354 | job_name = args.name
355 | if not job_name:
356 | job_name = os.path.basename(wdl_filename).lstrip(".")
357 | try:
358 | for punct in (".", "?"):
359 | if job_name.index(punct) > 0:
360 | job_name = job_name[: job_name.index(punct)]
361 | except ValueError:
362 | pass
363 | job_name = ("miniwdl_run_" + job_name)[:128]
364 | # pass most arguments through to miniwdl-run-s3upload inside workflow job
365 | miniwdl_run_cmd = ["miniwdl-run-s3upload"] + unused_args
366 | miniwdl_run_cmd.extend(["--dir", args.dir])
367 | miniwdl_run_cmd.extend(["--s3upload", args.s3upload] if args.s3upload else [])
368 | miniwdl_run_cmd.extend(["--delete-after", args.delete_after] if args.delete_after else [])
369 | return (job_name, miniwdl_run_cmd, wdl_zip)
370 |
371 |
372 | def zip_wdl(wdl_filename, mount, verbose=False):
373 | """
374 | If wdl_filename is an existing local .wdl or .zip file, prepare to ship it as the WDL source
375 | code for the workflow job to execute. (Otherwise, it'll be passed through assuming it's some
376 | path or URI the workflow job will be able to open directly.)
377 |
378 | If it's a .zip file, assume it's generated by `miniwdl zip`.
379 |
380 | If it's a .wdl file, run `miniwdl zip` on it.
381 | """
382 | if not os.path.isfile(wdl_filename) or not (
383 | wdl_filename.endswith(".wdl") or wdl_filename.endswith(".zip")
384 | ):
385 | if verbose:
386 | print(
387 | f"WDL: {wdl_filename} (not a local WDL file; assuming accessible inside workflow job)"
388 | )
389 | return None
390 | if os.path.normpath(os.path.abspath(wdl_filename)).startswith(mount + "/"):
391 | if verbose:
392 | print(f"WDL: {wdl_filename} (assuming {mount} accessible inside workflow job)")
393 | return None
394 |
395 | # load zip bytes
396 | if wdl_filename.endswith(".wdl"):
397 | import subprocess
398 | import tempfile
399 |
400 | try:
401 | with tempfile.TemporaryDirectory() as tmpdir:
402 | zip_fn = os.path.join(tmpdir, os.path.basename(wdl_filename)) + ".zip"
403 | subprocess.check_call(["miniwdl", "zip", "-o", zip_fn, wdl_filename])
404 | with open(zip_fn, "rb") as zip_file:
405 | zip_bytes = zip_file.read()
406 | # TODO: detect -i file.json in unused_args and provide it to miniwdl zip too
407 | except subprocess.CalledProcessError as exn:
408 | sys.exit(exn.returncode)
409 | else:
410 | assert wdl_filename.endswith(".zip")
411 | with open(wdl_filename, "rb") as zip_file:
412 | zip_bytes = zip_file.read()
413 | assert zip_bytes, "empty WDL zip"
414 |
415 | # aggressively compress, to maximize chance of fitting within the 30KiB limit on Batch
416 | # SubmitJob request: https://docs.aws.amazon.com/batch/latest/userguide/service_limits.html
417 | import base64
418 | import lzma
419 |
420 | zip_str = base64.urlsafe_b64encode(
421 | lzma.compress(zip_bytes, format=lzma.FORMAT_ALONE, preset=(9 | lzma.PRESET_EXTREME))
422 | ).decode("ascii")
423 | if verbose:
424 | print(
425 | f"WDL/ZIP: {wdl_filename} (encoded as {len(zip_str)} bytes to submit with workflow job)",
426 | file=sys.stderr,
427 | )
428 | return zip_str
429 |
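# Round-trip sketch (hypothetical file name, for illustration): the encoding above is reversed by
# get_wdl_zip() in cli_run_s3upload.py inside the workflow job.
#
#   import base64, lzma
#   payload = open("workflow.zip", "rb").read()
#   encoded = base64.urlsafe_b64encode(
#       lzma.compress(payload, format=lzma.FORMAT_ALONE, preset=(9 | lzma.PRESET_EXTREME))
#   ).decode("ascii")
#   decoded = lzma.decompress(base64.urlsafe_b64decode(encoded), format=lzma.FORMAT_ALONE)
#   assert decoded == payload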
430 |
431 | def form_workflow_container_props(args, miniwdl_run_cmd, fs_id, wdl_zip=None, verbose=False):
432 | environment = [
433 | {"name": "MINIWDL__AWS__TASK_QUEUE", "value": args.task_queue},
434 | {"name": "MINIWDL__FILE_IO__ROOT", "value": args.mount},
435 | ]
436 | if args.task_queue_fallback:
437 | environment.append(
438 | {"name": "MINIWDL__AWS__TASK_QUEUE_FALLBACK", "value": args.task_queue_fallback}
439 | )
440 | if args.efs:
441 | environment.append({"name": "MINIWDL__AWS__FS", "value": fs_id})
442 | environment.append({"name": "MINIWDL__AWS__FSAP", "value": args.fsap})
443 | else:
444 | environment.append(
445 | {"name": "MINIWDL__SCHEDULER__CONTAINER_BACKEND", "value": "aws_batch_job_no_efs"}
446 | )
447 | extra_env = set()
448 | if not args.no_env:
449 | # pass through environment variables starting with MINIWDL__ (except those specific to
450 | # workflow job launch, or passed through via command line)
451 | for k in os.environ:
452 | if k.startswith("MINIWDL__") and k not in (
453 | "MINIWDL__AWS__FS",
454 | "MINIWDL__AWS__FSAP",
455 | "MINIWDL__AWS__TASK_QUEUE",
456 | "MINIWDL__AWS__TASK_QUEUE_FALLBACK",
457 | "MINIWDL__AWS__WORKFLOW_QUEUE",
458 | "MINIWDL__AWS__WORKFLOW_ROLE",
459 | "MINIWDL__AWS__WORKFLOW_IMAGE",
460 | "MINIWDL__AWS__S3_UPLOAD_FOLDER",
461 | "MINIWDL__AWS__S3_UPLOAD_DELETE_AFTER",
462 | "MINIWDL__FILE_IO__ROOT",
463 | ):
464 | environment.append({"name": k, "value": os.environ[k]})
465 | extra_env.add(k)
466 |
467 | if verbose and extra_env:
468 | print(
469 | "Passing through environment variables (--no-env to disable): "
470 | + " ".join(list(extra_env)),
471 | file=sys.stderr,
472 | )
473 |
474 | workflow_container_props = {
475 | "image": args.image,
476 | "resourceRequirements": [
477 | {"type": "VCPU", "value": str(args.cpu)},
478 | {"type": "MEMORY", "value": str(args.memory_GiB * 1024)},
479 | ],
480 | "environment": [],
481 | }
482 | job_def_tags = {}
483 | job_tags = {}
484 | if wdl_zip:
485 | # If the command line provided a local WDL (or WDL zipped by `miniwdl zip`), ship it in the
486 | # workflow job environment, to be picked up by miniwdl-run-s3upload. If the encoded zip is
487 | # over 4096 characters, then spray the remainder across tags on the workflow job definition
488 | # and workflow job itself. The 4KiB keeps our container properties (+overrides) within AWS'
489 | # 8KiB limit. Then we use up to 42 tags on the job def & job, each with 381 usable bytes
490 | # (within the AWS limits of 50 tags per resource with key length 128 and value length 256).
491 | # Total capacity = 4096 + 2*42*381 = 36100 characters.
492 | workflow_container_props["environment"].append({"name": "WDL_ZIP", "value": wdl_zip[:4096]})
493 | wdl_zip = wdl_zip[4096:]
494 | tag_num = 0
495 | while wdl_zip:
496 | if tag_num >= 84:
497 | print(_WDL_ZIP_SIZE_MSG, file=sys.stderr)
498 | sys.exit(123)
499 | tag_key = (
500 | "WZ"
501 | + chr((ord("A") if tag_num % 42 < 26 else (ord("a") - 26)) + tag_num % 42)
502 | + wdl_zip[:125]
503 | )
504 | tag_value = wdl_zip[125:381]
505 | wdl_zip = wdl_zip[381:]
506 | if tag_num < 42:
507 | job_def_tags[tag_key] = tag_value
508 | else:
509 | job_tags[tag_key] = tag_value
510 | tag_num += 1
511 | workflow_container_overrides = {
512 | "command": miniwdl_run_cmd,
513 | "environment": environment,
514 | }
515 | if args.efs:
516 | # EFS: set EFS volume/mountPoint and Fargate execution role
517 | assert args.workflow_role and fs_id and args.fsap
518 | workflow_container_props.update(
519 | {
520 | "fargatePlatformConfiguration": {"platformVersion": "1.4.0"},
521 | "executionRoleArn": args.workflow_role,
522 | "jobRoleArn": args.workflow_role,
523 | "volumes": [
524 | {
525 | "name": "efs",
526 | "efsVolumeConfiguration": {
527 | "fileSystemId": fs_id,
528 | "transitEncryption": "ENABLED",
529 | "authorizationConfig": {"accessPointId": args.fsap},
530 | },
531 | }
532 | ],
533 | "mountPoints": [{"containerPath": args.mount, "sourceVolume": "efs"}],
534 | }
535 | )
536 | if not args.no_public_ip:
537 | workflow_container_props["networkConfiguration"] = {"assignPublicIp": "ENABLED"}
538 | else:
539 | # non-EFS: set volume/mountPoint assuming compute environments mount automatically
540 | workflow_container_props.update(
541 | {
542 | "volumes": [
543 | {
544 | "name": "file_io_root",
545 | "host": {"sourcePath": args.mount},
546 | }
547 | ],
548 | "mountPoints": [{"containerPath": args.mount, "sourceVolume": "file_io_root"}],
549 | }
550 | )
551 |
552 | return (workflow_container_props, workflow_container_overrides, job_def_tags, job_tags)
553 |
554 |
555 | def wait(aws_region_name, aws_batch, workflow_job_id, follow, expect_log_eof=True):
556 | """
557 | Wait for workflow job to complete & return its exit code; optionally tail its log to stderr
558 | """
559 | try:
560 | log_follower = None
561 | exit_code = None
562 | saw_end = False
563 | while exit_code is None:
564 | time.sleep(1.0)
565 | job_descs = aws_batch.describe_jobs(jobs=[workflow_job_id])
566 | job_desc = job_descs["jobs"][0]
567 | if (
568 | not log_follower
569 | and "container" in job_desc
570 | and "logStreamName" in job_desc["container"]
571 | ):
572 | log_stream_name = job_desc["container"]["logStreamName"]
573 | print("Log stream: " + log_stream_name, file=sys.stderr)
574 | sys.stderr.flush()
575 | log_follower = CloudWatchLogsFollower(
576 | boto3.DEFAULT_SESSION, aws_region_name, "/aws/batch/job", log_stream_name
577 | )
578 | if follow and log_follower:
579 | for event in log_follower.new_events():
580 | if END_OF_LOG not in event["message"]:
581 | print(event["message"], file=sys.stderr)
582 | else:
583 | saw_end = True
584 | sys.stderr.flush()
585 | if job_desc["status"] == "SUCCEEDED":
586 | exit_code = 0
587 | elif job_desc["status"] == "FAILED":
588 | exit_code = -1
589 | if "container" in job_desc and "exitCode" in job_desc["container"]:
590 | exit_code = job_desc["container"]["exitCode"]
591 | assert exit_code != 0
592 | if expect_log_eof and follow and log_follower and not saw_end:
593 | # give straggler log messages a few seconds to appear
594 | time.sleep(3.0)
595 | for event in log_follower.new_events():
596 | if END_OF_LOG not in event["message"]:
597 | print(event["message"], file=sys.stderr)
598 | else:
599 | saw_end = True
600 | if not saw_end:
601 | print(
602 | f"[miniwdl-aws-submit] WARNING: end-of-log marker not seen; more information may appear in log stream {log_stream_name}",
603 | file=sys.stderr,
604 | )
605 | sys.stderr.flush()
606 | status = job_desc["status"]
607 | reason = job_desc.get("statusReason", "")
608 | if reason:
609 | reason = (
610 | f"\t{reason}" if reason and reason != "Essential container in task exited" else ""
611 | )
612 | print(status + "\t" + workflow_job_id + reason, file=sys.stderr)
613 | if status == "FAILED" and "Container Overrides length must be at most" in reason:
614 | print(_WDL_ZIP_SIZE_MSG, file=sys.stderr)
615 | exit_code = 123
616 | assert isinstance(exit_code, int) and (exit_code != 0 or status == "SUCCEEDED")
617 | return exit_code
618 | except KeyboardInterrupt:
619 | print(
620 | "[miniwdl-aws-submit] interrupted by Ctrl-C; workflow job probably remains active. To terminate:\n"
621 | f" aws batch terminate-job --reason abort --job-id {workflow_job_id}",
622 | file=sys.stderr,
623 | )
624 | return -1
625 |
626 |
627 | class CloudWatchLogsFollower:
628 | # Based loosely on:
629 | # https://github.com/aws/aws-cli/blob/v2/awscli/customizations/logs/tail.py
630 | # which wasn't suitable to use directly at the time of this writing, because of
631 | # https://github.com/aws/aws-cli/issues/5560
632 | def __init__(self, boto_session, region_name, group_name, stream_name=None):
633 | self.group_name = group_name
634 | self.stream_name = stream_name
635 | self._newest_timestamp = None
636 | self._newest_event_ids = set()
637 | self._client = boto_session.client("logs", region_name=region_name)
638 |
639 | def new_events(self):
640 | event_ids_per_timestamp = defaultdict(set)
641 |
642 | filter_args = {"logGroupName": self.group_name}
643 | if self.stream_name:
644 | filter_args["logStreamNames"] = [self.stream_name]
645 | if self._newest_timestamp:
646 | filter_args["startTime"] = self._newest_timestamp
647 | while True:
648 | try:
649 | response = self._client.filter_log_events(**filter_args)
650 | except self._client.exceptions.ResourceNotFoundException:
651 | return # we may learn the Batch job's log stream name before it actually exists
652 | for event in response["events"]:
653 | # For the case where we've hit the last page, we will be
654 | # reusing the newest timestamp of the received events to keep polling.
655 | # This means duplicate log events with the same timestamp may be returned,
656 | # which we do not want to yield again.
657 | # We only want to yield log events that we have not seen before.
658 | if event["eventId"] not in self._newest_event_ids:
659 | event_ids_per_timestamp[event["timestamp"]].add(event["eventId"])
660 | yield event
661 | if "nextToken" in response:
662 | filter_args["nextToken"] = response["nextToken"]
663 | else:
664 | break
665 |
666 | if event_ids_per_timestamp:
667 | self._newest_timestamp = max(event_ids_per_timestamp.keys())
668 | self._newest_event_ids = event_ids_per_timestamp[self._newest_timestamp]
669 |
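# Illustrative usage sketch (hypothetical region & stream names; boto3.DEFAULT_SESSION is set once
# any boto3 client has been created, as in miniwdl_submit_awsbatch above):
#
#   follower = CloudWatchLogsFollower(
#       boto3.DEFAULT_SESSION, "us-east-1", "/aws/batch/job", "my-queue/default/0123456789"
#   )
#   while True:
#       for event in follower.new_events():
#           print(event["message"], file=sys.stderr)
#       time.sleep(1.0)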
670 |
671 | _WDL_ZIP_SIZE_MSG = (
672 | "\nExceeded AWS Batch request payload size limit; make the WDL source code and/or inputs"
673 | " available by URL or remote filesystem path, to pass by reference."
674 | )
675 |
--------------------------------------------------------------------------------
/plugin_log_task_usage/StressTest.wdl:
--------------------------------------------------------------------------------
1 | version 1.1
2 | # MINIWDL__LOG_TASK_USAGE__PERIOD=2 miniwdl run examples/plugin_log_task_usage/StressTest.wdl --dir /tmp --verbose
3 | # MINIWDL__LOG_TASK_USAGE__PERIOD=2 miniwdl-aws-submit plugin_log_task_usage/StressTest.wdl --verbose --follow
4 |
5 | task StressTest {
6 | input {
7 | Int cpu = 4
8 | Int memory_G = 2
9 | Int cpu_memory_duration_s = 10
10 | Int disk_load_G = 2
11 |
12 | String docker = "polinux/stress" # Docker image with stress tool
13 | }
14 |
15 | command <<<
16 | set -euxo pipefail
17 |
18 | >&2 ls -l /sys/fs/cgroup
19 |
20 | stress --cpu ~{cpu} --vm 1 --vm-bytes ~{memory_G}G --vm-hang 0 --timeout ~{cpu_memory_duration_s}s || true
21 | dd if=/dev/zero of=testfile bs=1G count=~{disk_load_G}
22 | sync
23 | cat testfile > /dev/null &
24 | sleep 5
25 | >>>
26 |
27 | runtime {
28 | docker: docker
29 | memory: "${memory_G*2}G"
30 | cpu: cpu
31 | }
32 |
33 | output {
34 | File stderr_txt = stderr()
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/plugin_log_task_usage/miniwdl_log_task_usage.py:
--------------------------------------------------------------------------------
1 | """
2 | miniwdl plugin instrumenting each task container to log its own CPU & memory resource usage
3 | periodically. The logs are written to the task's standard error stream, so they'll appear on the
4 | console only with --verbose logging (but are always recorded in each task's stderr.txt).
5 |
6 | To enable, install this plugin (`pip3 install .` & confirm listed by `miniwdl --version`) and
7 | set configuration [log_task_usage] period (or the environment variable
8 | MINIWDL__LOG_TASK_USAGE__PERIOD) to the desired logging period in seconds.
9 |
10 | YMMV: host OS version & configuration may affect the availability of the cgroup counters read
11 | from the pseudo-files under /sys/fs/cgroup.
12 | """
13 |
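# Example configuration (values illustrative) -- either a miniwdl cfg file section:
#
#   [log_task_usage]
#   period = 10
#
# or the equivalent environment variable: MINIWDL__LOG_TASK_USAGE__PERIOD=10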
14 |
15 | def main(cfg, logger, run_id, run_dir, task, **recv):
16 | # do nothing with inputs
17 | recv = yield recv
18 |
19 | # inject logger into command script
20 | if cfg.has_option("log_task_usage", "period"):
21 | period = cfg["log_task_usage"].get_int("period")
22 | recv["command"] = _logger_sh + f"_miniwdl_log_task_usage {period} &\n\n" + recv["command"]
23 | recv = yield recv
24 |
25 | # do nothing with outputs
26 | yield recv
27 |
28 |
29 | _logger_sh = r"""
30 | _miniwdl_log_task_usage() {
31 | set +ex
32 | local PERIOD_SECS=${1:-10} # logging period (default 10s)
33 |
34 | # detect whether host provides cgroup v2 or v1, and helper functions to read CPU & memory usage
35 | # counters from the appropriate pseudo-files
36 | local cgroup_version=""
37 | if [ -f /sys/fs/cgroup/cpu.stat ]; then
38 | cgroup_version=2
39 | elif [ -f /sys/fs/cgroup/cpuacct/cpuacct.stat ]; then
40 | cgroup_version=1
41 | else
42 | >&2 echo "miniwdl_log_task_usage unable to report: cgroup CPU usage counters not found"
43 | exit 1
44 | fi
45 |
46 | cpu_secs() {
47 | local ans
48 | if [ $cgroup_version -eq 2 ]; then
49 | ans=$(awk '/^usage_usec/ {print $2}' /sys/fs/cgroup/cpu.stat)
50 | echo $(( ans / 1000000 ))
51 | else
52 | ans=$(cut -f2 -d ' ' /sys/fs/cgroup/cpuacct/cpuacct.stat | head -n 1)
53 | echo $(( ans / 100 )) # 100 "jiffies" per second
54 | fi
55 | }
56 |
57 | mem_bytes() {
58 | if [ $cgroup_version -eq 2 ]; then
59 | awk '$1 == "anon" { print $2 }' /sys/fs/cgroup/memory.stat
60 | else
61 | awk -F ' ' '$1 == "total_rss" { print $2 }' /sys/fs/cgroup/memory/memory.stat
62 | fi
63 | }
64 |
65 | local T_0=$(date +%s)
66 | local t_last=$T_0
67 | local cpu_secs_0=$(cpu_secs)
68 | local cpu_secs_last=$cpu_secs_0
69 |
70 | while true; do
71 | sleep "$PERIOD_SECS"
72 | local t=$(date +%s)
73 | local wall_secs=$(( t - T_0 ))
74 |
75 | local cpu_secs_current=$(cpu_secs)
76 | local cpu_total_secs=$(( cpu_secs_current - cpu_secs_0 ))
77 | local cpu_period_secs=$(( cpu_secs_current - cpu_secs_last ))
78 |
79 | local mem_bytes_current=$(mem_bytes)
80 |
81 | >&2 echo "container usage :: cpu_pct: $(( 100 * cpu_period_secs / PERIOD_SECS )), mem_MiB: $(( mem_bytes_current/1048576 )), cpu_total_s: ${cpu_total_secs}, elapsed_s: ${wall_secs}"
82 |
83 | cpu_secs_last=$cpu_secs_current
84 | t_last=$t
85 | done
86 | }
87 | """
88 |
--------------------------------------------------------------------------------
/plugin_log_task_usage/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name="miniwdl_log_task_usage",
5 | version="0.1.0",
6 | description="miniwdl task plugin to log container cpu/mem usage",
7 | author="Wid L. Hacker",
8 | py_modules=["miniwdl_log_task_usage"],
9 | python_requires=">=3.6",
10 | setup_requires=["reentry"],
11 | install_requires=["miniwdl"],
12 | reentry_register=True,
13 | entry_points={
14 | "miniwdl.plugin.task": ["log_task_usage = miniwdl_log_task_usage:main"],
15 | },
16 | )
17 |
--------------------------------------------------------------------------------
/release.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -euo pipefail
4 |
5 | HERE="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
6 | cd "$HERE"
7 |
8 | if grep dirty <(git describe --always --dirty); then
9 | >&2 echo "Cannot release dirty working tree"
10 | exit 1
11 | fi
12 |
13 | rm -rf build dist *.egg-info
14 | python3 setup.py sdist
15 | echo -e "\033[0;31;5m -- Pushing $(basename `ls -1 dist/*.tar.gz` .tar.gz) to PyPI! -- \033[0m"
16 | twine upload dist/*.tar.gz
17 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from version import get_version
3 |
4 | with open("README.md") as fp:
5 | long_description = fp.read()
6 |
7 | setup(
8 | name="miniwdl-aws",
9 | version=get_version(),
10 | description="miniwdl AWS backend (Batch+EFS)",
11 | long_description=long_description,
12 | long_description_content_type="text/markdown",
13 | author="Wid L. Hacker",
14 | python_requires=">=3.6",
15 | packages=find_packages(),
16 | setup_requires=["reentry"],
17 | install_requires=["miniwdl>=1.11.1", "boto3>=1.17", "requests"],
18 | reentry_register=True,
19 | entry_points={
20 | "miniwdl.plugin.container_backend": [
21 | "aws_batch_job = miniwdl_aws:BatchJob",
22 | "aws_batch_job_no_efs = miniwdl_aws:BatchJobNoEFS",
23 | ],
24 | "console_scripts": [
25 | "miniwdl-run-s3upload = miniwdl_aws:miniwdl_run_s3upload",
26 | "miniwdl-aws-submit = miniwdl_aws.__main__:main",
27 | ],
28 | },
29 | )
30 |
--------------------------------------------------------------------------------
/test/assets/count_lines.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 | workflow count_lines {
3 | input {
4 | Array[File] files
5 | }
6 | scatter (file in files) {
7 | Array[String] file_lines = read_lines(file)
8 | }
9 | output {
10 | Int lines = length(flatten(file_lines))
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/test/assets/test_call_cache.wdl:
--------------------------------------------------------------------------------
1 | version 1.1
2 |
3 | workflow test_call_cache {
4 | input {
5 | Array[String] names
6 | Int timestamp_in # set to test case start time, preventing use of stale cache test entries
7 | Boolean fail = false
8 | }
9 | scatter (name in names) {
10 | call write_name {
11 | input:
12 | name = name,
13 | timestamp_in = timestamp_in
14 | }
15 | call t {
16 | input:
17 | who = write_name.name_file,
18 | timestamp_in = timestamp_in
19 | }
20 | }
21 | if (fail) {
22 | call failer after t
23 | }
24 | output {
25 | Array[Int] timestamps_out = t.timestamp_out
26 | Array[File] messages = t.message
27 | }
28 | }
29 |
30 | task write_name {
31 | input {
32 | String name
33 | Int timestamp_in
34 | }
35 | command {
36 | cp '~{write_lines([name])}' name.txt
37 | }
38 | output {
39 | File name_file = "name.txt"
40 | Int timestamp_out = timestamp_in
41 | }
42 | }
43 |
44 | task t {
45 | input {
46 | File who
47 | Int timestamp_in
48 | }
49 | command <<<
50 | t=$(date +%s)
51 | echo "$t" > timestamp_out
52 | echo "Hello, $(cat ~{who})! @$t" | tee message.txt
53 | >>>
54 | output {
55 | Int timestamp_out = read_int("timestamp_out")
56 | File message = "message.txt"
57 | }
58 | }
59 |
60 | task failer {
61 | command {
62 | exit 1
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/test/assets/test_directory.wdl:
--------------------------------------------------------------------------------
1 | version development
2 |
3 | workflow test_directory_workflow {
4 | input {
5 | Array[String] names = ["Alice", "Bob", "Carol"]
6 | }
7 | call make_directory {
8 | input:
9 | names
10 | }
11 | call test_directory {
12 | input:
13 | dir = make_directory.dir
14 | }
15 | output {
16 | Directory dir = make_directory.dir
17 | File report = test_directory.report
18 | Int file_count = test_directory.file_count
19 | }
20 | }
21 |
22 | task make_directory {
23 | input {
24 | Array[String] names
25 | }
26 |
27 | File names_file = write_lines(names)
28 |
29 | command <<<
30 | mkdir messages
31 | while read -r name; do
32 | echo "Hello, $name!" > "messages/$name.txt"
33 | done < '~{names_file}'
34 | >>>
35 |
36 | output {
37 | Directory dir = "messages"
38 | }
39 | }
40 |
41 | task test_directory {
42 | input {
43 | Directory dir
44 | }
45 |
46 | command <<<
47 | find '~{dir}' -type f | xargs sha256sum > report.txt
48 | find '~{dir}' -type f | wc -l > file.count
49 | >>>
50 |
51 | output {
52 | File report = "report.txt"
53 | Int file_count = read_int("file.count")
54 | }
55 |
56 | runtime {
57 | docker: "ubuntu:22.04"
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/test/assets/test_nonexistent_docker.wdl:
--------------------------------------------------------------------------------
1 | version 1.1
2 |
3 | task t {
4 | input {
5 | String docker
6 | }
7 | command {
8 | echo "Hello, world!"
9 | }
10 | runtime {
11 | docker: docker
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/test/assets/test_retry_streams.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 | # This WDL tests stdout/stderr outputs and automatic task retry. The task outputs messages via
3 | # captured standard output and error files, but fails on 3/4 attempts. We're looking for the tasks
4 | # to ultimately succeed and to produce the expected outputs (in particular, each output file should
5 | # have only one message, despite the task potentially having been tried multiple times).
6 |
7 | workflow test_retry_streams {
8 | input {}
9 |
10 | scatter (i in range(4)) {
11 | call test_retry_streams_task
12 | }
13 |
14 | output {
15 | Array[File] messages = test_retry_streams_task.message
16 | Array[File] stdouts = test_retry_streams_task.stdout
17 | Array[File] stderrs = test_retry_streams_task.stderr
18 | }
19 | }
20 |
21 | task test_retry_streams_task {
22 | input {}
23 |
24 | command <<<
25 | echo "Hello, stdout!" | tee message.txt
26 | >&2 echo "Hello, stderr!"
27 | if (( RANDOM % 4 > 0)); then
28 | exit 42
29 | fi
30 | >>>
31 |
32 | output {
33 | File message = "message.txt"
34 | File stdout = stdout()
35 | File stderr = stderr()
36 | }
37 |
38 | runtime {
39 | docker: "ubuntu:20.04"
40 | cpu: 1
41 | maxRetries: 99
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/test/assets/test_termination.wdl:
--------------------------------------------------------------------------------
1 | version 1.1
2 |
3 | workflow w {
4 | scatter (i in range(4)) {
5 | call t {
6 | input:
7 | i
8 | }
9 | }
10 | }
11 |
12 | task t {
13 | input {
14 | Int i
15 | }
16 |
17 | command <<<
18 | if (( ~{i} == 3 )); then
19 | sleep 10
20 | >&2 echo -n "This is the end, my only friend"
21 | echo "I'll never look into your eyes again"
22 | exit 42
23 | fi
24 | sleep 600
25 | >>>
26 | }
27 |
--------------------------------------------------------------------------------
/test/build_test_image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build the miniwdl-aws docker image and push to ECR (of the currently-credentialed account) for
3 | # use with live tests. Prepare in advance:
4 | # aws ecr create-repository --repository-name miniwdl-aws
5 |
6 | set -euo pipefail
7 |
8 | # build local image
9 | cd "$(dirname "$0")/.."
10 | >&2 python3 setup.py check
11 | >&2 docker pull public.ecr.aws/amazonlinux/amazonlinux:2023
12 | >&2 docker build -t miniwdl-aws .
13 |
14 | # login to ECR
15 | AWS_REGION="$(aws configure get region)"
16 | ECR_REGISTRY_ID="$(aws ecr describe-registry | jq -r .registryId)"
17 | ECR_REPO="${ECR_REGISTRY_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/miniwdl-aws"
18 | aws ecr get-login-password --region $(aws configure get region) \
19 | | >&2 docker login --username AWS --password-stdin $ECR_REPO
20 |
21 | # set ECR tag & push
22 | >&2 docker tag miniwdl-aws:latest ${ECR_REPO}:latest
23 | >&2 docker push ${ECR_REPO}
24 |
25 | # print full RepoDigest (for use with `docker pull`) to stdout
26 | >&2 echo
27 | echo "$(docker inspect ${ECR_REPO}:latest | jq -r '.[0].RepoDigests[0]')"
28 |
--------------------------------------------------------------------------------
/test/requirements.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | black
3 | flake8
4 | pylint
5 | pytest
6 | pytest-xdist
7 | boto3
8 |
--------------------------------------------------------------------------------
/test/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -euo pipefail
4 |
5 | cd "$(dirname "$0")"
6 |
7 | export AWS_DEFAULT_REGION=$(aws configure get region)
8 | if [[ -z ${MINIWDL__AWS__WORKFLOW_IMAGE:-} ]]; then
9 | export MINIWDL__AWS__WORKFLOW_IMAGE=$(./build_test_image.sh)
10 | if [[ -z $MINIWDL__AWS__WORKFLOW_IMAGE ]]; then
11 | exit 1
12 | fi
13 | fi
14 | export MINIWDL_AWS_TEST_BUCKET="miniwdl-test-$(aws sts get-caller-identity | jq -r .Account)"
15 | >&2 echo "Creating S3 bucket $MINIWDL_AWS_TEST_BUCKET (BucketAlreadyOwnedByYou error is OK):"
16 | aws s3api create-bucket --bucket "$MINIWDL_AWS_TEST_BUCKET" \
17 | --region "$AWS_DEFAULT_REGION" --create-bucket-configuration LocationConstraint="$AWS_DEFAULT_REGION" \
18 | || true
19 | # NOTE: workflow IAM role needs to be able to write to that bucket...
20 |
21 | pytest -sxv test*.py "$@"
22 |
--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import subprocess
4 | import time
5 | import pytest
6 | import boto3
7 | import random
8 | from datetime import datetime
9 | from urllib.parse import urlparse
10 |
11 | assert "AWS_DEFAULT_REGION" in os.environ
12 | assert (
13 | "MINIWDL__AWS__WORKFLOW_IMAGE" in os.environ
14 | and "miniwdl-aws" in os.environ["MINIWDL__AWS__WORKFLOW_IMAGE"]
15 | ), "set environment MINIWDL__AWS__WORKFLOW_IMAGE to repo:digest"
16 | assert (
17 | "MINIWDL__AWS__WORKFLOW_QUEUE" in os.environ
18 | ), "set MINIWDL__AWS__WORKFLOW_QUEUE to Batch queue name"
19 | assert (
20 | "MINIWDL_AWS_TEST_BUCKET" in os.environ
21 | ), "set MINIWDL_AWS_TEST_BUCKET to test S3 bucket (name only)"
22 |
23 |
24 | @pytest.fixture(scope="module")
25 | def aws_batch():
26 | return boto3.client("batch", region_name=os.environ["AWS_DEFAULT_REGION"])
27 |
28 |
29 | def batch_miniwdl(aws_batch, args, env=None, upload=None, cache=False):
30 | """
31 | Submit & await a Batch job to run cmd in the miniwdl_aws container (usually ~miniwdl run~
32 | to launch other Batch jobs in turn)
33 | """
34 | cmd = ["python3", "-m", "miniwdl_aws"]
35 | cmd.extend(args)
36 | cmd.append("--follow")
37 | if not cache:
38 | cmd.append("--no-cache")
39 | if upload:
40 | if not upload.endswith("/"):
41 | upload += "/"
42 | cmd.extend(["--s3upload", upload])
43 |
44 | exit_code = subprocess.run(
45 | cmd, cwd=os.path.dirname(os.path.dirname(__file__)), check=False, env=env
46 | ).returncode
47 |
48 | if exit_code != 0:
49 | ans = {"success": False, "exit_code": exit_code}
50 | if upload:
51 | error = get_s3uri(upload + "error.json")
52 | if error:
53 | ans["error"] = json.loads(error)
54 | return ans
55 |
56 | ans = {"success": True}
57 | if upload:
58 | outputs = get_s3uri(upload + "outputs.json")
59 | if outputs:
60 | ans["outputs"] = json.loads(outputs)
61 | return ans
62 |
63 |
64 | def get_s3uri(uri):
65 | """
66 | Download bytes from s3:// URI
67 | """
68 | try:
69 | assert uri.startswith("s3://")
70 | parts = urlparse(uri)
71 | obj = boto3.resource("s3", region_name=os.environ["AWS_DEFAULT_REGION"]).Object(
72 | parts.netloc, parts.path.lstrip("/")
73 | )
74 | return obj.get()["Body"].read()
75 | except Exception as exn:
76 | if "NoSuchKey" in str(exn):
77 | return None
78 | raise
79 |
80 |
81 | def test_miniwdl_run_self_test(aws_batch):
82 | subprocess.run(
83 | [
84 | "python3",
85 | "-m",
86 | "miniwdl_aws",
87 | "--follow",
88 | "--self-test",
89 | "--no-cache",
90 | "--mount",
91 | "/mnt/shared",
92 | ],
93 | cwd=os.path.dirname(os.path.dirname(__file__)),
94 | check=True,
95 | )
96 |
97 |
98 | @pytest.fixture(scope="session")
99 | def test_s3_folder():
100 | """
101 | S3 folder for this test session
102 | """
103 | return f"s3://{os.environ['MINIWDL_AWS_TEST_BUCKET']}/{datetime.today().strftime('%Y%m%d_%H%M%S')}/"
104 |
105 |
106 | def test_retry_streams(aws_batch, test_s3_folder):
107 | env = dict(os.environ)
108 | env["MINIWDL__AWS__RETRY_WAIT"] = "1"
109 | rslt = batch_miniwdl(
110 | aws_batch,
111 | [
112 | "/var/miniwdl_aws_test_assets/test_retry_streams.wdl",
113 | "--dir",
114 | "/mnt/efs/miniwdl_aws_tests",
115 | "--verbose",
116 | ],
117 | upload=test_s3_folder + "test_retry_streams/",
118 | env=env,
119 | )
120 | assert rslt["success"]
121 | assert len(rslt["outputs"]["test_retry_streams.messages"]) == 4
122 | assert len(rslt["outputs"]["test_retry_streams.stdouts"]) == 4
123 | assert len(rslt["outputs"]["test_retry_streams.stderrs"]) == 4
124 | for i in range(4):
125 | assert (
126 | get_s3uri(rslt["outputs"]["test_retry_streams.messages"][i]).decode().strip()
127 | == "Hello, stdout!"
128 | )
129 | assert (
130 | get_s3uri(rslt["outputs"]["test_retry_streams.stdouts"][i]).decode().strip()
131 | == "Hello, stdout!"
132 | )
133 | assert (
134 | get_s3uri(rslt["outputs"]["test_retry_streams.stderrs"][i]).decode().strip()
135 | == "Hello, stderr!"
136 | )
137 |
138 |
139 | def test_assemble_refbased(aws_batch, test_s3_folder):
140 | rslt = batch_miniwdl(
141 | aws_batch,
142 | [
143 | "https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/pipes/WDL/workflows/assemble_refbased.wdl",
144 | "reads_unmapped_bams=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/G5012.3.testreads.bam",
145 | "reference_fasta=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/ebov-makona.fasta",
146 | "sample_name=G5012.3",
147 | "--dir",
148 | "/mnt/efs/miniwdl_aws_tests",
149 | "--verbose",
150 | ],
151 | upload=test_s3_folder + "test_assemble_refbased/",
152 | )
153 | assert rslt["success"]
154 | # TODO: more assertions
155 |
156 |
157 | def test_termination(aws_batch, test_s3_folder):
158 | """
159 | Upon a CommandFailed task failure, a workflow with other tasks running in parallel quickly self-terminates.
160 | """
161 | t0 = time.time()
162 | env = dict(os.environ)
163 | env["MINIWDL__AWS__CONTAINER_SYNC"] = "true"
164 | rslt = batch_miniwdl(
165 | aws_batch,
166 | [
167 | "/var/miniwdl_aws_test_assets/test_termination.wdl",
168 | "--dir",
169 | "/mnt/efs/miniwdl_aws_tests",
170 | "--verbose",
171 | ],
172 | upload=test_s3_folder + "test_termination/",
173 | env=env,
174 | )
175 | assert not rslt["success"]
176 | assert rslt["error"]["cause"]["error"] == "CommandFailed"
177 | assert rslt["error"]["cause"]["exit_status"] == 42
178 | assert (
179 | "This is the end, my only friend"
180 | in get_s3uri(rslt["error"]["cause"]["stderr_s3file"]).decode()
181 | )
182 | assert (
183 | "I'll never look into your eyes again"
184 | in get_s3uri(rslt["error"]["cause"]["stdout_s3file"]).decode()
185 | )
186 | assert time.time() - t0 < 600
187 |
188 |
189 | def test_nonexistent_docker(aws_batch, test_s3_folder):
190 | """
191 | Workflow specifies a docker image that doesn't exist; does this error bubble up from AWS Batch
192 | in a reasonable way?
193 | """
194 | rslt = batch_miniwdl(
195 | aws_batch,
196 | [
197 | "/var/miniwdl_aws_test_assets/test_nonexistent_docker.wdl",
198 | "docker=nonexistent_bogus_12345",
199 | "--dir",
200 | "/mnt/efs/miniwdl_aws_tests",
201 | "--delete-after",
202 | "failure",
203 | "--verbose",
204 | ],
205 | upload=test_s3_folder + "test_nonexistent_docker/",
206 | )
207 | assert not rslt["success"]
208 | assert "CannotPullContainerError" in str(rslt["error"])
209 |
210 |
211 | def test_call_cache(aws_batch, test_s3_folder):
212 | """
213 | Call cache works (short-term, where previous outputs remain on /mnt/shared)
214 | """
215 | t0 = int(time.time())
216 | # run once to prime cache
217 | rslt = batch_miniwdl(
218 | aws_batch,
219 | [
220 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl",
221 | "timestamp_in=",
222 | str(t0),
223 | "names=Alice",
224 | "names=Bob",
225 | "names=Carol",
226 | "fail=true",
227 | "--verbose",
228 | "--dir",
229 | "/mnt/efs/miniwdl_aws_tests",
230 | ],
231 | cache=False,
232 | )
233 | assert not rslt["success"]
234 |
235 | # run again where a subset of calls should be reused
236 | t1 = int(time.time())
237 | rslt = batch_miniwdl(
238 | aws_batch,
239 | [
240 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl",
241 | "timestamp_in=",
242 | str(t0),
243 | "names=Alice",
244 | "names=Bob",
245 | "names=Xavier",
246 | "--verbose",
247 | "--dir",
248 | "/mnt/efs/miniwdl_aws_tests",
249 | ],
250 | cache=True,
251 | upload=test_s3_folder + "test_call_cache/",
252 | )
253 | assert rslt["success"]
254 |
255 | # Alice and Bob were cached, Xavier was not:
256 | assert t0 <= rslt["outputs"]["test_call_cache.timestamps_out"][0] <= t1
257 | assert t0 <= rslt["outputs"]["test_call_cache.timestamps_out"][1] <= t1
258 | assert rslt["outputs"]["test_call_cache.timestamps_out"][2] > t1
259 | assert "Hello, Alice!" in get_s3uri(rslt["outputs"]["test_call_cache.messages"][0]).decode()
260 | assert "Hello, Bob!" in get_s3uri(rslt["outputs"]["test_call_cache.messages"][1]).decode()
261 | assert "Hello, Xavier!" in get_s3uri(rslt["outputs"]["test_call_cache.messages"][2]).decode()
262 |
263 |
264 | def test_call_cache_one_task(aws_batch, test_s3_folder):
265 | """
266 | Short-term call cache of one task (where the entire run's outputs, not just a portion thereof,
267 | are sourced from the cache).
268 | """
269 | t0 = int(time.time())
270 | rslt = batch_miniwdl(
271 | aws_batch,
272 | [
273 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl",
274 | "timestamp_in=",
275 | str(t0),
276 | "name=Alyssa",
277 | "--task",
278 | "write_name",
279 | "--verbose",
280 | "--dir",
281 | "/mnt/efs/miniwdl_aws_tests",
282 | ],
283 | cache=False,
284 | )
285 | assert rslt["success"]
286 |
287 | t1 = int(time.time())
288 | rslt = batch_miniwdl(
289 | aws_batch,
290 | [
291 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl",
292 | "timestamp_in=",
293 | str(t0),
294 | "name=Alyssa",
295 | "--task",
296 | "write_name",
297 | "--verbose",
298 | "--dir",
299 | "/mnt/efs/miniwdl_aws_tests",
300 | ],
301 | cache=True,
302 | upload=test_s3_folder + "test_call_cache_one_task/",
303 | )
304 | assert rslt["success"]
305 |
306 | assert t0 <= rslt["outputs"]["write_name.timestamp_out"] <= t1
307 | assert "Alyssa" in get_s3uri(rslt["outputs"]["write_name.name_file"]).decode()
308 |
309 |
310 | def test_download(aws_batch):
311 | """
312 | Test that a workflow can use https:// and s3:// input files. This functionality is built into
313 | miniwdl, so it ought to just work, but it's nice to cover here.
314 | """
315 | rslt = batch_miniwdl(
316 | aws_batch,
317 | [
318 | "/var/miniwdl_aws_test_assets/count_lines.wdl",
319 | "files=https://raw.githubusercontent.com/chanzuckerberg/miniwdl/main/tests/alyssa_ben.txt",
320 | "files=s3://1000genomes/CHANGELOG",
321 | "--dir",
322 | "/mnt/efs/miniwdl_aws_tests",
323 | "--verbose",
324 | ],
325 | )
326 | assert rslt["success"]
327 |
328 |
329 | def test_directory(aws_batch, test_s3_folder):
330 | """
331 | Test Directory I/O
332 | """
333 |
334 | rslt = batch_miniwdl(
335 | aws_batch,
336 | [
337 | "/var/miniwdl_aws_test_assets/test_directory.wdl",
338 | "--dir",
339 | "/mnt/efs/miniwdl_aws_tests",
340 | "--verbose",
341 | ],
342 | upload=test_s3_folder + "test_directory/",
343 | )
344 | assert rslt["success"]
345 | assert rslt["outputs"]["test_directory_workflow.dir"].startswith("s3://")
346 | assert rslt["outputs"]["test_directory_workflow.file_count"] == 3
347 |
348 | rslt = batch_miniwdl(
349 | aws_batch,
350 | [
351 | "/var/miniwdl_aws_test_assets/test_directory.wdl",
352 | "dir=s3://1000genomes/changelog_details/",
353 | "--task",
354 | "test_directory",
355 | "--dir",
356 | "/mnt/efs/miniwdl_aws_tests",
357 | "--verbose",
358 | ],
359 | upload=test_s3_folder + "test_directory/",
360 | )
361 | assert rslt["success"]
362 | assert rslt["outputs"]["test_directory.file_count"] > 100
363 |
364 |
365 | def test_shipping_local_wdl(aws_batch, tmp_path, test_s3_folder):
366 | with open(tmp_path / "outer.wdl", "w") as outfile:
367 | print(
368 | """
369 | version development
370 | import "inner.wdl"
371 |
372 | workflow outer {
373 | input {
374 | String who
375 | }
376 | call inner.hello { input: who }
377 | output {
378 | String message = hello.message
379 | }
380 | }
381 | """,
382 | file=outfile,
383 | )
384 | with open(tmp_path / "inner.wdl", "w") as outfile:
385 | print(
386 | """
387 | version development
388 |
389 | task hello {
390 | input {
391 | String who
392 | }
393 | command {
394 | echo 'Hello, ~{who}!'
395 | }
396 | output {
397 | String message = read_string(stdout())
398 | }
399 | }
400 | """,
401 | file=outfile,
402 | )
403 | rslt = batch_miniwdl(
404 | aws_batch,
405 | [
406 | str(tmp_path / "outer.wdl"),
407 | "who=world",
408 | "--dir",
409 | "/mnt/efs/miniwdl_aws_tests",
410 | ],
411 | upload=test_s3_folder + "test_shipping_local_wdl/",
412 | )
413 | assert rslt["outputs"]["outer.message"] == "Hello, world!"
414 |
415 |
416 | def test_shipping_local_wdl_error(aws_batch, tmp_path, test_s3_folder):
417 | almost_big_str = "".join(chr(random.randrange(ord("A"), ord("Z"))) for _ in range(42000))
418 | with open(tmp_path / "almost_big.wdl", "w") as outfile:
419 | print(
420 | """
421 | version development
422 |
423 | workflow outer {
424 | input {
425 | }
426 | output {
427 | String big = "XXX"
428 | }
429 | }
430 | """.replace(
431 | "XXX", almost_big_str
432 | ),
433 | file=outfile,
434 | )
435 | rslt = batch_miniwdl(
436 | aws_batch,
437 | [
438 | str(tmp_path / "almost_big.wdl"),
439 | "--dir",
440 | "/mnt/efs/miniwdl_aws_tests",
441 | ],
442 | upload=test_s3_folder + "test_shipping_local_wdl_error/",
443 | )
444 | assert rslt["success"]
445 | assert rslt["outputs"]["outer.big"] == almost_big_str
446 |
447 | # Test for reasonable error when zipped WDL is too large
448 | big_str = "".join(chr(random.randrange(ord("A"), ord("Z"))) for _ in range(50000))
449 | with open(tmp_path / "big.wdl", "w") as outfile:
450 | print(
451 | """
452 | version development
453 |
454 | workflow outer {
455 | input {
456 | }
457 | output {
458 | String big = "XXX"
459 | }
460 | }
461 | """.replace(
462 | "XXX", big_str
463 | ),
464 | file=outfile,
465 | )
466 | rslt = batch_miniwdl(
467 | aws_batch,
468 | [
469 | str(tmp_path / "big.wdl"),
470 | "--dir",
471 | "/mnt/efs/miniwdl_aws_tests",
472 | ],
473 | )
474 | assert rslt["exit_code"] == 123
475 |
476 |
477 | def test_log_task_usage(aws_batch, test_s3_folder):
478 | env = dict(os.environ)
479 | env["MINIWDL__LOG_TASK_USAGE__PERIOD"] = "2"
480 | rslt = batch_miniwdl(
481 | aws_batch,
482 | [
483 | os.path.join(os.path.dirname(__file__), "../plugin_log_task_usage/StressTest.wdl"),
484 | "--dir",
485 | "/mnt/efs/miniwdl_aws_tests",
486 | "--verbose",
487 | "--delete-after",
488 | "always",
489 | ],
490 | upload=test_s3_folder + "test_log_task_usage/",
491 | env=env,
492 | )
493 | assert rslt["success"]
494 | assert "container usage ::" in get_s3uri(rslt["outputs"]["StressTest.stderr_txt"]).decode()
495 |
--------------------------------------------------------------------------------
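The tests above locate run results by downloading error.json / outputs.json from the --s3upload destination via get_s3uri(). The same pattern can be used ad hoc to inspect a finished run. A minimal sketch, assuming AWS_DEFAULT_REGION is set in the environment and using a hypothetical s3:// URI in place of a real upload folder:

    import json
    import os
    from urllib.parse import urlparse

    import boto3

    # Hypothetical --s3upload destination; substitute a real one
    uri = "s3://miniwdl-test-123456789012/20240101_000000/test_directory/outputs.json"

    parts = urlparse(uri)
    obj = boto3.resource("s3", region_name=os.environ["AWS_DEFAULT_REGION"]).Object(
        parts.netloc, parts.path.lstrip("/")
    )
    outputs = json.loads(obj.get()["Body"].read())
    print(json.dumps(outputs, indent=2))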
/version.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """Calculates the current version number.
4 |
5 | If possible, uses output of “git describe” modified to conform to the
6 | versioning scheme that setuptools uses (see PEP 386). Releases must be
7 | labelled with annotated tags (signed tags are annotated) of the following
8 | format:
9 |
10 | v<num>(.<num>)+ [ {a|b|c|rc} <num> (.<num>)* ]
11 |
12 | If “git describe” returns an error (likely because we're in an unpacked copy
13 | of a release tarball, rather than a git working copy), or returns a tag that
14 | does not match the above format, version is read from RELEASE-VERSION file.
15 |
16 | To use this script, simply import it in your setup.py file, and use the results
17 | of get_version() as your package version:
18 |
19 | import version
20 | setup(
21 | version=version.get_version(),
22 | .
23 | .
24 | .
25 | )
26 |
27 | This will automatically update the RELEASE-VERSION file. The RELEASE-VERSION
28 | file should *not* be checked into git but it *should* be included in sdist
29 | tarballs (as should the version.py file). To do this, run:
30 |
31 | echo include RELEASE-VERSION version.py >>MANIFEST.in
32 | echo RELEASE-VERSION >>.gitignore
33 |
34 | With that setup, a new release can be labelled by simply invoking:
35 |
36 | git tag -s v1.0
37 | """
38 |
39 | __author__ = ("Douglas Creager ", "Michal Nazarewicz ")
40 | __license__ = "This file is placed into the public domain."
41 | __maintainer__ = "Michal Nazarewicz"
42 | __email__ = "mina86@mina86.com"
43 |
44 | __all__ = ("get_version",)
45 |
46 |
47 | import re
48 | import subprocess
49 | import sys
50 |
51 |
52 | RELEASE_VERSION_FILE = "RELEASE-VERSION"
53 |
54 | # http://www.python.org/dev/peps/pep-0386/
55 | _PEP386_SHORT_VERSION_RE = r"\d+(?:\.\d+)+(?:(?:[abc]|rc)\d+(?:\.\d+)*)?"
56 | _PEP386_VERSION_RE = r"^%s(?:\.post\d+)?(?:\.dev\d+)?$" % (_PEP386_SHORT_VERSION_RE)
57 | _GIT_DESCRIPTION_RE = r"^v(?P<ver>%s)-(?P<commits>\d+)-g(?P<sha>[\da-f]+)$" % (
58 | _PEP386_SHORT_VERSION_RE
59 | )
60 |
61 |
62 | def read_git_version():
63 | try:
64 | proc = subprocess.Popen( # pylint: disable=R1732
65 | ("git", "describe", "--long", "--tags", "--match", "v[0-9]*.*"),
66 | stdout=subprocess.PIPE,
67 | stderr=subprocess.PIPE,
68 | )
69 | data, _ = proc.communicate()
70 | if proc.returncode:
71 | return None
72 | ver = data.decode().splitlines()[0].strip()
73 | except:
74 | return None
75 |
76 | if not ver:
77 | return None
78 | match = re.search(_GIT_DESCRIPTION_RE, ver)
79 | if not match:
80 | sys.stderr.write("version: git description (%s) is invalid, " "ignoring\n" % ver)
81 | return None
82 |
83 | commits = int(match.group("commits"))
84 | if not commits:
85 | return match.group("ver")
86 | return "%s.post%d.dev%d" % (match.group("ver"), commits, int(match.group("sha"), 16))
87 |
88 |
89 | def read_release_version():
90 | try:
91 | with open(RELEASE_VERSION_FILE) as infile:
92 | ver = infile.readline().strip()
93 | if not re.search(_PEP386_VERSION_RE, ver):
94 | sys.stderr.write(
95 | "version: release version (%s) is invalid, " "will use it anyway\n" % ver
96 | )
97 | return ver
98 | except:
99 | return None
100 |
101 |
102 | def write_release_version(version):
103 | with open(RELEASE_VERSION_FILE, "w") as outfile:
104 | outfile.write("%s\n" % version)
105 |
106 |
107 | def get_version():
108 | release_version = read_release_version()
109 | version = read_git_version() or release_version
110 | if not version:
111 | raise ValueError("Cannot find the version number")
112 | if version != release_version:
113 | write_release_version(version)
114 | return version
115 |
116 |
117 | if __name__ == "__main__":
118 | print(get_version())
119 |
--------------------------------------------------------------------------------
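To make the tag-to-version mapping in version.py concrete: read_git_version() turns a `git describe` result such as v0.9.0-3-g1a2b3c (a hypothetical example: 3 commits past tag v0.9.0, abbreviated SHA 1a2b3c) into a PEP 386-style post/dev release string. A minimal standalone sketch using the same regular expression as above:

    import re

    _PEP386_SHORT_VERSION_RE = r"\d+(?:\.\d+)+(?:(?:[abc]|rc)\d+(?:\.\d+)*)?"
    _GIT_DESCRIPTION_RE = r"^v(?P<ver>%s)-(?P<commits>\d+)-g(?P<sha>[\da-f]+)$" % (
        _PEP386_SHORT_VERSION_RE
    )

    match = re.search(_GIT_DESCRIPTION_RE, "v0.9.0-3-g1a2b3c")
    assert match is not None
    ver, commits, sha = match.group("ver"), int(match.group("commits")), match.group("sha")
    # On an exact tag (0 commits past it) the version is just the tag; otherwise append .postN.devM
    version = ver if not commits else "%s.post%d.dev%d" % (ver, commits, int(sha, 16))
    print(version)  # 0.9.0.post3.dev1715004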