├── .dockerignore ├── .github └── workflows │ ├── ghcr_image.yml │ └── lint.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── miniwdl_aws.cfg ├── miniwdl_aws ├── __init__.py ├── __main__.py ├── _util.py ├── batch_job.py ├── cli_run_s3upload.py └── cli_submit.py ├── plugin_log_task_usage ├── StressTest.wdl ├── miniwdl_log_task_usage.py └── setup.py ├── release.sh ├── setup.py ├── test ├── assets │ ├── count_lines.wdl │ ├── test_call_cache.wdl │ ├── test_directory.wdl │ ├── test_nonexistent_docker.wdl │ ├── test_retry_streams.wdl │ └── test_termination.wdl ├── build_test_image.sh ├── requirements.txt ├── run_tests.sh └── test.py └── version.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | __pycache__ 3 | .eggs/ 4 | dist/ 5 | *.egg-info/ 6 | build/ 7 | .venv/ 8 | venv/ 9 | -------------------------------------------------------------------------------- /.github/workflows/ghcr_image.yml: -------------------------------------------------------------------------------- 1 | name: ghcr_image 2 | on: [push] 3 | 4 | jobs: 5 | 6 | ghcr_image: 7 | if: github.repository == 'miniwdl-ext/miniwdl-aws' # don't run from forks 8 | runs-on: ubuntu-20.04 9 | steps: 10 | - uses: actions/checkout@v2 11 | with: 12 | fetch-depth: 0 13 | - name: docker login ghcr.io 14 | uses: docker/login-action@v1 15 | with: 16 | registry: ghcr.io 17 | username: ${{ github.repository_owner }} 18 | password: ${{ secrets.GITHUB_TOKEN }} 19 | - name: docker_build 20 | run: | 21 | python3 setup.py --version # generate RELEASE-VERSION 22 | 23 | REPO="ghcr.io/miniwdl-ext/miniwdl-aws" 24 | TAG="$(git describe --tags --always --dirty)" 25 | 26 | docker pull public.ecr.aws/amazonlinux/amazonlinux:2 27 | docker build --no-cache -t "${REPO}:${TAG}" . 
28 | IMAGE_ID="$(docker inspect ${REPO}:${TAG} | jq -r .[0].Id)" 29 | 30 | docker push "${REPO}:${TAG}" 31 | REPO_DIGEST="$(docker inspect ${REPO}:${TAG} | jq -r '.[0].RepoDigests[0]')" 32 | 33 | echo "REPO=${REPO}" >> $GITHUB_ENV 34 | echo "TAG=${TAG}" >> $GITHUB_ENV 35 | echo "IMAGE_ID=${IMAGE_ID}" >> $GITHUB_ENV 36 | echo "REPO_DIGEST=${REPO_DIGEST}" >> $GITHUB_ENV 37 | - name: display 38 | run: | 39 | >&2 echo "Id: ${IMAGE_ID}" 40 | echo "::set-output name=Id::${REPO}:${IMAGE_ID}" 41 | >&2 echo "Tag: ${REPO}:${TAG}" 42 | echo "::set-output name=Tag::${REPO}:${TAG}" 43 | >&2 echo "RepoDigest: ${REPO_DIGEST}" 44 | echo "::set-output name=RepoDigest::${REPO_DIGEST}" 45 | outputs: 46 | Id: ${{steps.display.outputs.Id}} 47 | Tag: ${{steps.display.outputs.Tag}} 48 | RepoDigest: ${{steps.display.outputs.RepoDigest}} 49 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | 6 | lint: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | with: 11 | fetch-depth: 0 12 | - name: deps 13 | run: sudo pip3 install --system pre-commit black flake8 pylint 14 | - name: pre-commit 15 | run: pre-commit run --all-files 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | __pycache__ 3 | .eggs/ 4 | RELEASE-VERSION 5 | dist/ 6 | *.egg-info/ 7 | build/ 8 | .venv/ 9 | venv/ 10 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | language: system 7 | files: \.py$ 8 | verbose: true 9 | entry: black 10 | args: [-l,'100'] 11 | - id: flake8 12 | name: flake8 13 | language: system 14 | files: \.py$ 15 | verbose: true 16 | entry: flake8 17 | args: [--max-line-length, "100", "--ignore=E501,W503,E722,E203"] 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Docker image with miniwdl & the AWS plugin baked in. Suitable for submission to Batch as the 2 | # "workflow job" launching & monitoring other jobs (WDL tasks). 3 | 4 | FROM public.ecr.aws/amazonlinux/amazonlinux:2023 5 | 6 | # rpm dependencies 7 | RUN yum check-update; yum install -y \ 8 | python3-pip \ 9 | python3-setuptools \ 10 | unzip 11 | 12 | # AWS CLI v2 (`yum install awscli` is a really old version) 13 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" 14 | RUN sh -c 'cd /tmp && unzip awscliv2.zip' && sh /tmp/aws/install 15 | 16 | # miniwdl-aws (and PyPI dependencies listed in setup.py) 17 | COPY ./ /tmp/miniwdl-aws/ 18 | RUN bash -c 'cd /tmp/miniwdl-aws && pip3 install . 
&& pip3 install ./plugin_log_task_usage' 19 | 20 | # cleanup (for squashed image) 21 | RUN yum clean all && rm -rf /tmp/miniwdl* /tmp/aws* 22 | 23 | # boilerplate configuration file & test assets 24 | COPY miniwdl_aws.cfg /etc/xdg/miniwdl.cfg 25 | COPY test/assets/ /var/miniwdl_aws_test_assets/ 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 Wid L. Hacker 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include RELEASE-VERSION version.py 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # miniwdl AWS plugin 2 | 3 | **Extends [miniwdl](https://github.com/chanzuckerberg/miniwdl) to run workflows on [AWS Batch](https://aws.amazon.com/batch/) and [EFS](https://aws.amazon.com/efs/)** 4 | 5 | This miniwdl plugin enables it to execute WDL tasks as AWS Batch jobs. It uses EFS for work-in-progress file I/O, optionally uploading final workflow outputs to S3. 6 | 7 | **Before diving into this, first consider [AWS HealthOmics](https://aws.amazon.com/healthomics/)**, which includes a [managed service for WDL workflows](https://docs.aws.amazon.com/omics/latest/dev/creating-workflows.html) that doesn't need you to provision all the infrastructure. Our companion project **[miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run)** provides a convenient CLI for launching HealthOmics runs with local WDL source code files. 8 | 9 | There are a few ways to deploy this miniwdl-aws plugin: 10 | 11 | ## Amazon Genomics CLI 12 | 13 | [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) can deploy a [miniwdl-aws context](https://aws.github.io/amazon-genomics-cli/docs/workflow-engines/miniwdl/) into your AWS account with all the necessary infrastructure. 14 | 15 | ## Amazon SageMaker Studio 16 | 17 | Or, try the [**miniwdl-aws-studio**](https://github.com/miniwdl-ext/miniwdl-aws-studio) recipe to install miniwdl for interactive use within [Amazon SageMaker Studio](https://aws.amazon.com/sagemaker/studio/), a web IDE with a terminal and filesystem browser. 
You can use the terminal to operate `miniwdl run` against AWS Batch, the filesystem browser to manage the inputs and outputs on EFS, and the Jupyter notebooks to further analyze the outputs. 18 | 19 | [image](https://github.com/miniwdl-ext/miniwdl-aws-studio) 20 | 21 | ## `miniwdl-aws-submit` with custom infrastructure 22 | 23 | Lastly, advanced operators can use [**miniwdl-aws-terraform**](https://github.com/miniwdl-ext/miniwdl-aws-terraform) to deploy/customize the necessary AWS infrastructure, including a VPC, EFS file system, Batch queues, and IAM roles. 24 | 25 | In this scheme, a local command-line wrapper `miniwdl-aws-submit` *launches miniwdl in its own small Batch job* to orchestrate a workflow. This **workflow job** then spawns WDL **task jobs** as needed, without needing the submitting laptop to remain connected for the duration. The workflow jobs run on lightweight [Fargate](https://docs.aws.amazon.com/batch/latest/userguide/fargate.html) resources, while task jobs run on EC2 spot instances. 26 | 27 | ### Submitting workflow jobs 28 | 29 | After deploying [miniwdl-aws-terraform](https://github.com/miniwdl-ext/miniwdl-aws-terraform), `pip3 install miniwdl-aws` locally to make the `miniwdl-aws-submit` program available. Try the self-test: 30 | 31 | ``` 32 | miniwdl-aws-submit --self-test --follow --workflow-queue miniwdl-workflow 33 | ``` 34 | 35 | Then launch a [viral genome assembly](https://github.com/broadinstitute/viral-pipelines/) that should run in 10-15 minutes: 36 | 37 | ``` 38 | miniwdl-aws-submit \ 39 | https://github.com/broadinstitute/viral-pipelines/raw/v2.1.28.0/pipes/WDL/workflows/assemble_refbased.wdl \ 40 | reads_unmapped_bams=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/G5012.3.testreads.bam \ 41 | reference_fasta=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/ebov-makona.fasta \ 42 | sample_name=G5012.3 \ 43 | --workflow-queue miniwdl-workflow \ 44 | --s3upload s3://MY-BUCKET/assemblies \ 45 | --verbose --follow 46 | ``` 47 | 48 | The command line resembles `miniwdl run`'s with extra AWS-related arguments: 49 | 50 | * `--workflow-queue` Batch job queue on which to schedule the workflow job; output from miniwdl-aws-terraform, default `miniwdl-workflow`. (Also set by environment variable `MINIWDL__AWS__WORKFLOW_QUEUE`) 51 | * `--follow` live-streams the workflow log instead of exiting immediately upon submission. (`--wait` blocks on the workflow without streaming the log.) 52 | * `--s3upload` (optional) S3 folder URI under which to upload the workflow products, including the log and output files (if successful). The bucket must be allow-listed in the miniwdl-aws-terraform deployment. 53 | * Unless `--s3upload` ends with /, one more subfolder is added to the uploaded URI prefix, equal to miniwdl's automatic timestamp-prefixed run name. If it does end in /, then the uploads go directly into/under that folder (and a repeat invocation would be expected to overwrite them). 54 | 55 | `miniwdl-aws-submit` detects other infrastructure details (task queue, EFS access point, IAM role) based on tags set on the workflow queue; see `miniwdl-aws-submit --help` for additional options to override those defaults. 
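As a concrete illustration of the `--s3upload` trailing-slash rule described above (bucket and run names here are made up; the run-name timestamp is whatever miniwdl generates):

```
# --s3upload s3://MY-BUCKET/assemblies      (no trailing slash)
#   -> uploads land under s3://MY-BUCKET/assemblies/<timestamped_run_name>/...
# --s3upload s3://MY-BUCKET/assemblies/     (trailing slash)
#   -> uploads land directly under s3://MY-BUCKET/assemblies/... (a repeat run would overwrite them)
```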
56 | 
57 | Arguments not consumed by `miniwdl-aws-submit` are *passed through* to `miniwdl run` inside the workflow job; as are environment variables whose names begin with `MINIWDL__`, allowing override of any [miniwdl configuration option](https://miniwdl.readthedocs.io/en/latest/runner_reference.html#configuration) (disable with `--no-env`). See [miniwdl_aws.cfg](miniwdl_aws.cfg) for various options preconfigured in the workflow job container, some of which can be adjusted to benefit specific workloads. For example, to halve the maximum rate at which miniwdl invokes the AWS Batch SubmitJob API, set `MINIWDL__AWS__SUBMIT_PERIOD=2` in the `miniwdl-aws-submit` environment.
58 | 
59 | If the specified WDL source code is an existing local .wdl or .zip file, `miniwdl-aws-submit` automatically ships it with the workflow job as the WDL to execute. Given a .wdl file, it runs [`miniwdl zip`](https://miniwdl.readthedocs.io/en/latest/zip.html) to detect & include any imported WDL files; while it assumes .zip files were also generated by `miniwdl zip`. If the source code is too large to fit in the AWS Batch request payload (~50KB), then you'll instead need to pass it by reference to a URL or EFS path.
60 | 
61 | The workflow and task jobs all mount EFS at `/mnt/efs`. Although workflow input files are usually specified using HTTPS or S3 URIs, files already resident on EFS can be used with their `/mnt/efs` paths (which probably don't exist locally on the submitting machine). Unlike the WDL source code, `miniwdl-aws-submit` will not attempt to ship/upload local input files.
62 | 
63 | ## Run directories on EFS
64 | 
65 | Miniwdl runs the workflow in a directory beneath `/mnt/efs/miniwdl_run` (override with `--dir`). The outputs also remain cached there for potential reuse in future runs (to avoid, submit with `--no-cache` or wipe `/mnt/efs/miniwdl_run/_CACHE`).
66 | 
67 | Given the EFS-centric I/O model, you'll need a way to browse and manage the filesystem contents remotely. The companion recipe [lambdash-efs](https://github.com/miniwdl-ext/lambdash-efs) is one option; miniwdl-aws-terraform outputs the infrastructure details needed to deploy it (pick any subnet). Or, set up an instance/container mounting your EFS, to access via SSH or web app (e.g. [JupyterHub](https://jupyter.org/hub), [Cloud Commander](http://cloudcmd.io/), [VS Code server](https://github.com/cdr/code-server)).
68 | 
69 | You can also automate cleanup of EFS run directories by setting `miniwdl-aws-submit --s3upload` and:
70 | 
71 | * `--delete-after success` to delete the run directory immediately after successful output upload
72 | * `--delete-after failure` to delete the directory after failure
73 | * `--delete-after always` to delete it in either case
74 | * (or set environment variable `MINIWDL__AWS__DELETE_AFTER_S3_UPLOAD`)
75 | 
76 | Deleting a run directory after success prevents the outputs from being reused in future runs. Deleting it after failures can make debugging more difficult (although logs are retained, see below).
77 | 
78 | ### Security note on file system isolation
79 | 
80 | Going through AWS Batch & EFS, miniwdl can't enforce the strict file system isolation between WDL task containers that it does locally. All the AWS Batch containers have read/write access to the entire EFS file system (as viewed through the access point), not only their initial working directory.
81 | 
82 | This is usually benign, because WDL tasks should only read their declared inputs and write into their respective working/temporary directories. 
But poorly- or maliciously-written tasks could read & write files elsewhere on EFS, even changing their own input files or those of other tasks. This risks unintentional side-effects or worse security hazards from untrusted code. 83 | 84 | To mitigate this, test workflows thoroughly using the local backend, which strictly isolates task containers' file systems. If WDL tasks insist on modifying their input files in place, then `--copy-input-files` can unblock them (at a cost in time, space, and IOPS). Lastly, avoid using untrusted WDL code or container images; but if they're necessary, then use a separate EFS access point and restrict the IAM and network configuration for the AWS Batch containers appropriately. 85 | 86 | ### EFS performance considerations 87 | 88 | To scale up to larger workloads, it's important to study AWS documentation on EFS [performance](https://docs.aws.amazon.com/efs/latest/ug/performance.html) and [monitoring](https://docs.aws.amazon.com/efs/latest/ug/monitoring-cloudwatch.html). Like any network file system, EFS limits on throughput and IOPS can cause bottlenecks, in the worst case effectively freezing a workflow. 89 | 90 | Management tips: 91 | 92 | * Monitor file system throughput limits, IOPS, and burst credits (if applicable) in the EFS area of the AWS Console. 93 | * Retain the default *Elastic* throughput mode (though it may cost more than other modes) 94 | * Code WDL tasks to write any purely-temporary files under `/tmp`, which may use local scratch space, instead of the EFS working directory. 95 | * Configure miniwdl and AWS Batch to limit the number of concurrent jobs and/or the rate at which they turn over (see [miniwdl_aws.cfg](https://github.com/miniwdl-ext/miniwdl-aws/blob/main/miniwdl_aws.cfg) for relevant details). 96 | * Spread out separate workflow runs over time or across multiple EFS file systems. 97 | 98 | ### FSx for Lustre and other shared filesystems 99 | 100 | If EFS performance remains insufficient, then you can configure your Batch compute environments to automatically mount some other shared filesystem upon instance startup. Then use `miniwdl-aws-submit --no-efs` to make it assume the filesystem will already be mounted at a certain location (default `--mount /mnt/net`) across all instances. In this case, the compute environment for workflow jobs is expected to use EC2 instead of Fargate resources (usually necessary for mounting). 101 | 102 | The miniwdl-aws-terraform repo [includes a variant](https://github.com/miniwdl-ext/miniwdl-aws-terraform/tree/main/fsx) setting this up with [FSx for Lustre](https://aws.amazon.com/fsx/lustre/). FSx offers higher throughput scalability, but has other downsides compared to EFS (higher upfront costs, manual capacity scaling, single-AZ deployment, fewer AWS service integrations). 103 | 104 | ## Logs & troubleshooting 105 | 106 | If the terminal log isn't available (through Studio or `miniwdl-submit-awsbatch --follow`) to trace a workflow failure, look for miniwdl's usual log files written in the run directory on EFS or copied to S3. 107 | 108 | Each task job's log is also forwarded to [CloudWatch Logs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/WhatIsCloudWatchLogs.html) under the `/aws/batch/job` group and a log stream name reported in miniwdl's log. Using `miniwdl-aws-submit`, the workflow job's log is also forwarded. CloudWatch Logs indexes the logs for structured search through the AWS Console & API. 
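For example, once the log stream name is known, the forwarded events can be pulled with the AWS CLI (a sketch; the stream name below is a placeholder to be copied from miniwdl's log output):

```
aws logs get-log-events \
  --log-group-name /aws/batch/job \
  --log-stream-name "<logStreamName reported by miniwdl>" \
  --query 'events[*].message' --output text
```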
109 | 110 | Misconfigured infrastructure might prevent logs from being written to EFS or CloudWatch at all. In that case, use the AWS Batch console/API to find status messages for the workflow or task jobs. 111 | 112 | Tasks can self-report their CPU & memory usage in their standard error logs, by setting `MINIWDL__LOG_TASK_USAGE__PERIOD=60` to report every 60 seconds (or as desired). Submit with `--verbose --follow`, or look in any task's CloudWatch Logs stream or `stderr.txt` file, to see the "container usage" log messages. 113 | 114 | ## GPU jobs 115 | 116 | Miniwdl-aws recognizes the `gpu: true` setting in a task `runtime{}` section, and translates that to a [GPU resource requirement](https://docs.aws.amazon.com/batch/latest/userguide/gpu-jobs.html) for AWS Batch. For the job to be scheduled, the Batch compute environment must of course make GPU instance types available. 117 | 118 | By default, `gpu: true` translates to a requirement for a single GPU. The WDL spec defines this as a boolean value, so there is no clear way to request multiple GPUs for a given task. The configuration `MINIWDL__AWS__GPU_VALUE` can be set to an integer *N* to make *all* tasks with `gpu: true` require *N* GPUs. 119 | 120 | Alternatively, miniwdl-aws also recognizes the `acceleratorType` and `acceleratorCount` attributes used by [AWS HealthOmics](https://docs.aws.amazon.com/omics/latest/dev/parameters-and-input-wdl.html). Any `acceleratorType` starting with "nvidia" translates to a Batch GPU requirement; the actual GPU type will depend on the instance type(s) made available by the compute environment. 121 | 122 | Multi-GPU operations may need more shared memory than Batch typically makes available in each task container. To increase the available shared memory, set e.g. `MINIWDL__AWS__CONTAINER_PROPERTIES='{"linuxParameters":{"sharedMemorySize":4096}}'` 123 | 124 | ## Contributing 125 | 126 | Pull requests are welcome! For help, open an issue here or drop in on [#miniwdl in the OpenWDL Slack](https://openwdl.slack.com/archives/C02JCRJU79T). 127 | 128 | **Code formatting and linting.** To prepare your code to pass the CI checks, 129 | 130 | ``` 131 | pip3 install --upgrade -r test/requirements.txt 132 | pre-commit run --all-files 133 | ``` 134 | 135 | **Running tests.** In an AWS-credentialed terminal session, 136 | 137 | ``` 138 | MINIWDL__AWS__WORKFLOW_QUEUE=miniwdl-workflow test/run_tests.sh 139 | ``` 140 | 141 | This builds the requisite Docker image from the current code revision and pushes it to an ECR repository (which must be prepared once with `aws ecr create-repository --repository-name miniwdl-aws`). To test an image from the [GitHub public registry](https://github.com/miniwdl-ext/miniwdl-aws/pkgs/container/miniwdl-aws) or some other version, set `MINIWDL__AWS__WORKFLOW_IMAGE` to the desired tag. 142 | -------------------------------------------------------------------------------- /miniwdl_aws.cfg: -------------------------------------------------------------------------------- 1 | # miniwdl configuration file built into the miniwdl-aws Docker image for use with 2 | # miniwdl-aws-submit 3 | # 4 | # The easiest way to override these options is usually to set environment variables with the 5 | # convention MINIWDL__{SECTION}__{KEY}={VALUE}. 
Full info on the miniwdl configuration loader: 6 | # https://miniwdl.readthedocs.io/en/latest/runner_reference.html#configuration 7 | # 8 | # Additionally, the following are also usually set via environment variables: 9 | # * MINIWDL__AWS__TASK_QUEUE: the desired AWS Batch queue 10 | # * MINIWDL__AWS__FSAP: EFS Access Point ID (fsap-xxxx) 11 | # * MINIWDL__AWS__FS: EFS file system ID (fs-xxxx) matching the access point; can be detected if 12 | # omitted, but doing so requires IAM permission to DescribeAccessPoints. 13 | 14 | [scheduler] 15 | container_backend = aws_batch_job 16 | # One `miniwdl run` process will be able to orchestrate this many concurrent AWS Batch jobs. (This 17 | # controls the size of a thread pool, so setting it too high tends to be counterproductive.) 18 | call_concurrency = 80 19 | # Reduced concurrency limit for URI download jobs; since these are typically S3 downloads that are 20 | # very fast, running many concurrently is likely to overstress EFS. 21 | download_concurrency = 5 22 | 23 | [file_io] 24 | # This must be set to the host's mount point for the EFS Access Point. The plugin will also 25 | # configure AWS Batch jobs to mount the filesystem at this same location. 26 | root = /mnt/efs 27 | 28 | [task_runtime] 29 | # Default policy to retry spot-terminated jobs (up to three total attempts) 30 | defaults = { 31 | "docker": "ubuntu:20.04", 32 | "preemptible": 2 33 | } 34 | # Default retry policy for URI download tasks, to overcome transient `aws s3 cp` errors 35 | download_defaults = { 36 | "cpu": 2, 37 | "memory": "1G", 38 | "maxRetries": 2 39 | } 40 | 41 | [call_cache] 42 | # Cache call outputs in EFS folder, valid so long as all referenced input & output files remain 43 | # unmodified on EFS. (Relative to [file_io] root) 44 | dir = miniwdl_run/_CACHE/call 45 | get = true 46 | put = true 47 | 48 | [download_cache] 49 | dir = miniwdl_run/_CACHE/download 50 | get = true 51 | # Disabling S3 download cache by default to prevent confusing coherence problems (as the cache 52 | # logic does not check for modification of the original S3 object). Recommend enabling, if that can 53 | # be managed adequately. 54 | put = false 55 | # disable flock on files used from download cache due to EFS' low limits on flocks 56 | flock = false 57 | 58 | [aws] 59 | # Last-resort job timeout for AWS Batch to enforce (attemptDurationSeconds) 60 | job_timeout = 864000 61 | # Internal rate-limiting periods (seconds) for AWS Batch API requests 62 | # (may need to be increased if many concurrent workflow runs are planned) 63 | describe_period = 1 64 | submit_period = 1 65 | # Boto3 Config retries policy for miniwdl's AWS Batch API requests. 66 | # see: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html 67 | boto3_retries = { 68 | "max_attempts": 5, 69 | "mode": "standard" 70 | } 71 | # Wait this many seconds before retrying a job after a spot instance interruption or other 72 | # retry-able failure. Provides a time window for convergence of any "eventually consistent" 73 | # activities from the first attempt (involving e.g. EFS, CloudWatch Logs, etc.). 74 | retry_wait = 20 75 | # Explicitly `sync` files in the task working directory before exiting task container. Requires 76 | # `find`, `xargs`, and `sync` commands available in the container image. 77 | container_sync = false 78 | # When task runtime includes "gpu: true", request this many GPUs from AWS Batch. (The WDL spec 79 | # defines runtime.gpu as a Boolean, as of this writing.) 
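# (To change this without editing the file, set environment MINIWDL__AWS__GPU_VALUE per the
# convention described at the top of this file.)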
80 | gpu_value = 1 81 | # ContainerProperties fields to set on AWS Batch jobs for tasks, OTHER than the following which are 82 | # set by miniwdl-aws, task runtime{}, or other available config options: 83 | # image command environment resourceRequirements mountPoints privileged 84 | # see: https://docs.aws.amazon.com/batch/latest/APIReference/API_ContainerProperties.html 85 | container_properties = { 86 | } 87 | # Add this many mebibytes (MiB) to each task's runtime.memory setting when filling out the 88 | # memory requirement for each AWS Batch job. The default is meant to offset the memory that AWS 89 | # Batch itself reserves on each worker instance; without this, if runtime.memory is e.g. "8 GiB" 90 | # then AWS Batch might use larger-than-necessary worker instances and pack them inefficiently. 91 | # see: https://docs.aws.amazon.com/batch/latest/userguide/memory-management.html 92 | memory_delta = -33 93 | -------------------------------------------------------------------------------- /miniwdl_aws/__init__.py: -------------------------------------------------------------------------------- 1 | from .batch_job import BatchJob, BatchJobNoEFS # noqa: F401 2 | from .cli_run_s3upload import miniwdl_run_s3upload # noqa: F401 3 | from .cli_submit import miniwdl_submit_awsbatch # noqa: F401 4 | -------------------------------------------------------------------------------- /miniwdl_aws/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from .cli_submit import miniwdl_submit_awsbatch 3 | 4 | 5 | def main(args=sys.argv): 6 | miniwdl_submit_awsbatch(args) 7 | 8 | 9 | if __name__ == "__main__": 10 | sys.exit(main()) 11 | -------------------------------------------------------------------------------- /miniwdl_aws/_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import base64 4 | import json 5 | import uuid 6 | import requests 7 | import subprocess 8 | from WDL._util import StructuredLogMessage as _ 9 | 10 | 11 | def detect_aws_region(cfg): 12 | if cfg and cfg.has_option("aws", "region") and cfg.get("aws", "region"): 13 | return cfg.get("aws", "region") 14 | 15 | # check environment variables 16 | for ev in ("AWS_REGION", "AWS_DEFAULT_REGION"): 17 | if os.environ.get(ev): 18 | return os.environ[ev] 19 | 20 | # check boto3, which will load ~/.aws 21 | if boto3.DEFAULT_SESSION and boto3.DEFAULT_SESSION.region_name: 22 | return boto3.DEFAULT_SESSION.region_name 23 | session = boto3.Session() 24 | if session.region_name: 25 | return session.region_name 26 | 27 | # query EC2 metadata 28 | try: 29 | return requests.get( 30 | "http://169.254.169.254/latest/meta-data/placement/region", timeout=2.0 31 | ).text 32 | except: 33 | pass 34 | 35 | return None 36 | 37 | 38 | def randomize_job_name(job_name): 39 | # Append entropy to the Batch job name to avoid race condition using identical names in 40 | # concurrent RegisterJobDefinition requests 41 | return ( 42 | job_name[:103] # 119 + 1 + 8 = 128 43 | + "-" 44 | + base64.b32encode(uuid.uuid4().bytes[:5]).lower().decode() 45 | ) 46 | 47 | 48 | def efs_id_from_access_point(region_name, fsap_id): 49 | # Resolve the EFS access point id (fsap-xxxx) to the associated file system id (fs-xxxx). Saves 50 | # user from having to specify both. 
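    # (The DescribeAccessPoints call below requires the corresponding IAM permission, as noted in
    # miniwdl_aws.cfg.)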
51 | aws_efs = boto3.Session().client("efs", region_name=region_name) 52 | desc = aws_efs.describe_access_points(AccessPointId=fsap_id) 53 | assert len(desc.get("AccessPoints", [])) == 1 54 | desc = desc["AccessPoints"][0] 55 | fs_id = desc["FileSystemId"] 56 | assert isinstance(fs_id, str) and fs_id.startswith("fs-") 57 | return fs_id 58 | 59 | 60 | def detect_sagemaker_studio_efs(logger, **kwargs): 61 | # Detect if we're operating inside SageMaker Studio and if so, record EFS mount details 62 | METADATA_FILE = "/opt/ml/metadata/resource-metadata.json" 63 | metadata = None 64 | try: 65 | with open(METADATA_FILE) as infile: 66 | metadata = json.load(infile) 67 | assert metadata["DomainId"] and metadata["UserProfileName"] 68 | except: 69 | return None 70 | try: 71 | api = boto3.client("sagemaker", **kwargs) 72 | domain = api.describe_domain(DomainId=metadata["DomainId"]) 73 | efs_id = domain["HomeEfsFileSystemId"] 74 | profile = api.describe_user_profile( 75 | DomainId=metadata["DomainId"], UserProfileName=metadata["UserProfileName"] 76 | ) 77 | efs_uid = profile["HomeEfsFileSystemUid"] 78 | assert efs_id and efs_uid 79 | efs_home = f"/{efs_uid}" # home directory on EFS 80 | efs_mount = os.getenv("HOME") # where the EFS home directory is mounted inside Studio 81 | logger.notice( 82 | _( 83 | "detected SageMaker Studio", 84 | domain=metadata["DomainId"], 85 | user=metadata["UserProfileName"], 86 | efs_id=efs_id, 87 | efs_home=efs_home, 88 | efs_mount=efs_mount, 89 | ) 90 | ) 91 | return (efs_id, efs_uid, efs_home, efs_mount) 92 | except Exception as exn: 93 | logger.warning( 94 | _( 95 | "detected local AWS SageMaker Studio metadata, but failed to query domain", 96 | error=str(exn), 97 | metadata=METADATA_FILE, 98 | domain=metadata["DomainId"], 99 | user=metadata["UserProfileName"], 100 | ) 101 | ) 102 | return None 103 | 104 | 105 | def detect_studio_fsap(logger, efs_id, efs_uid, efs_home, **kwargs): 106 | # Look for an Access Point with the appropriate configuration to mount the SageMaker Studio EFS 107 | # (in the same way it's presented through Studio) 108 | try: 109 | efs = boto3.client("efs", **kwargs) 110 | access_points = efs.describe_access_points(FileSystemId=efs_id, MaxResults=100).get( 111 | "AccessPoints", [] 112 | ) 113 | if len(access_points) >= 100: 114 | logger.warn( 115 | _( 116 | "EFS has >=100 Access Points; set configuration [aws] fsap or environment MINIWDL__AWS__FSAP to avoid searching through them", 117 | efs_id=efs_id, 118 | ) 119 | ) 120 | for ap in access_points: 121 | assert ap["FileSystemId"] == efs_id 122 | if ( 123 | ap["LifeCycleState"] == "available" 124 | and ap.get("RootDirectory", {}).get("Path", "") == efs_home 125 | and str(ap.get("PosixUser", {}).get("Uid", "")) == efs_uid 126 | ): 127 | logger.notice( 128 | _( 129 | "detected suitable EFS Access Point; to override, set configuration [aws] fsap or environment MINIWDL__AWS__FSAP", 130 | arn=ap["AccessPointArn"], 131 | ) 132 | ) 133 | return ap["AccessPointId"] 134 | return None 135 | except Exception as exn: 136 | logger.warning( 137 | _( 138 | "error detecting EFS Access Point", 139 | error=str(exn), 140 | efs_id=efs_id, 141 | ufs_uid=efs_uid, 142 | ) 143 | ) 144 | return None 145 | 146 | 147 | def detect_gwfcore_batch_queue(logger, efs_id, **kwargs): 148 | # Look for a Batch job queue tagged with the Studio EFS id (indicating it's our default) 149 | try: 150 | batch = boto3.client("batch", **kwargs) 151 | queues = batch.describe_job_queues(maxResults=100).get("jobQueues", []) 152 | if len(queues) >= 
100: 153 | logger.warn( 154 | "AWS Batch has >=100 job queues; set configuration [aws] task_queue or environment MINIWDL__AWS__TASK_QUEUE to avoid searching through them" 155 | ) 156 | queues = [ 157 | q 158 | for q in queues 159 | if q.get("state", "") == "ENABLED" 160 | and q.get("status", "") == "VALID" 161 | and q.get("tags", {}).get("MiniwdlStudioEfsId", "") == efs_id 162 | ] 163 | if not queues: 164 | return None 165 | if len(queues) > 1: 166 | default_queues = [q for q in queues if q.get("jobQueueName", "").startswith("default-")] 167 | if default_queues: 168 | queues = default_queues 169 | logger.notice( 170 | _( 171 | "detected suitable AWS Batch job queue; to override, set configuration [aws] task_queue or environment MINIWDL__AWS__TASK_QUEUE", 172 | arn=queues[0]["jobQueueArn"], 173 | ) 174 | ) 175 | return queues[0]["jobQueueName"] 176 | except Exception as exn: 177 | logger.warning( 178 | _( 179 | "error detecting AWS Batch job queue", 180 | error=str(exn), 181 | efs_id=efs_id, 182 | ) 183 | ) 184 | return None 185 | 186 | 187 | def subprocess_run_with_clean_exit(*args, check=False, **kwargs): 188 | """ 189 | As subprocess.run(*args, **kwargs), but in the event of a SystemExit, KeyboardInterrupt, or 190 | BrokenPipe exception, sends SIGTERM to the subprocess and waits for it to exit before 191 | re-raising. Typically paired with signal handlers for SIGTERM/SIGINT/etc. to raise SystemExit. 192 | """ 193 | 194 | assert "timeout" not in kwargs 195 | with subprocess.Popen(*args, **kwargs) as subproc: 196 | while True: 197 | try: 198 | stdout, stderr = subproc.communicate(timeout=0.1) 199 | assert isinstance(subproc.returncode, int) 200 | completed = subprocess.CompletedProcess( 201 | subproc.args, subproc.returncode, stdout, stderr 202 | ) 203 | if check: 204 | completed.check_returncode() 205 | return completed 206 | except (SystemExit, KeyboardInterrupt, BrokenPipeError): 207 | subproc.terminate() 208 | subproc.communicate() 209 | raise 210 | except subprocess.TimeoutExpired: 211 | pass 212 | 213 | 214 | END_OF_LOG = "[miniwdl_run_s3upload] -- END OF LOG --" 215 | -------------------------------------------------------------------------------- /miniwdl_aws/batch_job.py: -------------------------------------------------------------------------------- 1 | """ 2 | BatchJob: implements miniwdl TaskContainer by submitting jobs to an AWS Batch queue and polling 3 | their status. Assumes a shared filesystem (typically EFS) between the miniwdl host and the Batch 4 | workers. 5 | """ 6 | 7 | import os 8 | import math 9 | import time 10 | import json 11 | import threading 12 | import heapq 13 | from contextlib import ExitStack, suppress 14 | import boto3 15 | import botocore 16 | import WDL 17 | import WDL.runtime.task_container 18 | import WDL.runtime._statusbar 19 | from WDL._util import rmtree_atomic, symlink_force, write_atomic 20 | from WDL._util import StructuredLogMessage as _ 21 | from ._util import ( 22 | detect_aws_region, 23 | randomize_job_name, 24 | efs_id_from_access_point, 25 | detect_sagemaker_studio_efs, 26 | detect_studio_fsap, 27 | detect_gwfcore_batch_queue, 28 | ) 29 | 30 | 31 | class BatchJobBase(WDL.runtime.task_container.TaskContainer): 32 | """ 33 | Abstract base class implementing the AWS Batch backend for miniwdl TaskContainer. Concrete 34 | subclasses add configuration specific to the the shared filesystem in use. 
35 | """ 36 | 37 | @classmethod 38 | def global_init(cls, cfg, logger): 39 | cls._submit_lock = threading.Lock() 40 | cls._last_submit_time = [0.0] 41 | cls._init_time = time.time() 42 | cls._describer = BatchJobDescriber() 43 | 44 | cls._region_name = detect_aws_region(cfg) 45 | assert ( 46 | cls._region_name 47 | ), "Failed to detect AWS region; configure AWS CLI or set environment AWS_DEFAULT_REGION" 48 | 49 | # set AWS Batch job queue 50 | cls._job_queue = cfg.get("aws", "task_queue", "") 51 | cls._job_queue_fallback = cfg.get("aws", "task_queue_fallback", "") 52 | 53 | # TODO: query Batch compute environment for resource limits 54 | cls._resource_limits = {"cpu": 9999, "mem_bytes": 999999999999999} 55 | 56 | cls._fs_mount = cfg.get("file_io", "root") 57 | assert ( 58 | cls._fs_mount.startswith("/") and cls._fs_mount != "/" 59 | ), "misconfiguration, set [file_io] root / MINIWDL__FILE_IO__ROOT to EFS mount point" 60 | 61 | @classmethod 62 | def detect_resource_limits(cls, cfg, logger): 63 | return cls._resource_limits 64 | 65 | def __init__(self, cfg, run_id, host_dir): 66 | super().__init__(cfg, run_id, host_dir) 67 | self._logStreamName = None 68 | self._inputs_copied = False 69 | # We expect the Batch job containers to have the shared filesystem mounted at the same 70 | # location we, the workflow job, have it mounted ourselves. Therefore container_dir will be 71 | # the same as host_dir (unlike the default Swarm backend, which mounts it at a different 72 | # virtualized location) 73 | self.container_dir = self.host_dir 74 | self._aws_interrupts = 0 75 | 76 | def copy_input_files(self, logger): 77 | self._inputs_copied = True 78 | return super().copy_input_files(logger) 79 | 80 | def host_work_dir(self): 81 | # Since we aren't virtualizing the in-container paths as noted above, always use the same 82 | # working directory on task retries, instead of the base class behavior of appending the 83 | # try counter (on the host side). This loses some robustness to a split-brain condition 84 | # where the previous try is actually still running when we start the retry. 
85 | # (see also retry_wait) 86 | return os.path.join(self.host_dir, "work") 87 | 88 | def host_stdout_txt(self): 89 | return os.path.join(self.host_dir, "stdout.txt") 90 | 91 | def host_stderr_txt(self): 92 | return os.path.join(self.host_dir, "stderr.txt") 93 | 94 | def reset(self, logger) -> None: 95 | cooldown = self.cfg.get_float("aws", "retry_wait", 20.0) 96 | if cooldown > 0.0: 97 | logger.info( 98 | _( 99 | "waiting to retry per configuration [aws] retry_wait", 100 | seconds=cooldown, 101 | ) 102 | ) 103 | time.sleep(cooldown) 104 | 105 | rmtree_atomic(self.host_work_dir()) 106 | with suppress(FileNotFoundError): 107 | os.unlink(self.host_stderr_txt() + ".offset") # PygtailLogger state file 108 | super().reset(logger) 109 | 110 | def process_runtime(self, logger, runtime_eval): 111 | super().process_runtime(logger, runtime_eval) # handles cpu, memory, docker, gpu 112 | if "acceleratorType" in runtime_eval: 113 | if not isinstance(runtime_eval["acceleratorType"], WDL.Value.String): 114 | raise WDL.Error.RuntimeError("invalid setting of runtime.acceleratorType") 115 | accty = runtime_eval["acceleratorType"].value 116 | if accty.startswith("nvidia"): 117 | self.runtime_values["gpu"] = True 118 | else: 119 | logger.warning(_("ignored unrecognized runtime.acceleratorType", value=accty)) 120 | if "acceleratorCount" in runtime_eval: 121 | if not isinstance(runtime_eval["acceleratorCount"], WDL.Value.Int): 122 | raise WDL.Error.RuntimeError("invalid setting of runtime.acceleratorCount") 123 | self.runtime_values["acceleratorCount"] = runtime_eval["acceleratorCount"].value 124 | 125 | def _run(self, logger, terminating, command): 126 | """ 127 | Run task 128 | """ 129 | self._observed_states = set() 130 | boto3_retries = self.cfg.get_dict( 131 | "aws", "boto3_retries", {"max_attempts": 5, "mode": "standard"} 132 | ) 133 | try: 134 | aws_batch = boto3.Session().client( # Session() needed for thread safety 135 | "batch", 136 | region_name=self._region_name, 137 | config=botocore.config.Config(retries=boto3_retries), 138 | ) 139 | with ExitStack() as cleanup: 140 | # prepare the task working directory 141 | self._prepare_dir(logger, cleanup, command) 142 | # submit Batch job (with request throttling) 143 | job_id = None 144 | submit_period = self.cfg.get_float("aws", "submit_period", 1.0) 145 | while True: 146 | with self._submit_lock: 147 | if terminating(): 148 | raise WDL.runtime.Terminated(quiet=True) 149 | if ( 150 | time.time() - self._last_submit_time[0] 151 | >= submit_period * self._submit_period_multiplier() 152 | ): 153 | job_id = self._submit_batch_job(logger, cleanup, aws_batch) 154 | self._last_submit_time[0] = time.time() 155 | break 156 | time.sleep(submit_period / 4) 157 | # poll Batch job status 158 | return self._await_batch_job(logger, cleanup, aws_batch, job_id, terminating) 159 | except botocore.exceptions.ClientError as exn: 160 | wrapper = AWSError(exn) 161 | logger.error(wrapper) 162 | raise wrapper 163 | 164 | def _prepare_dir(self, logger, cleanup, command): 165 | # Prepare control files. We do NOT use super().touch_mount_point(...) because it fails if 166 | # the desired mount point already exists; which it may in our case after a retry (see 167 | # self.host_work_dir() override above.) 
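        # Write the task command script and create empty stdout/stderr files, which the Batch
        # job's shell wrapper will append to.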
168 | with open(os.path.join(self.host_dir, "command"), "w") as outfile: 169 | outfile.write(command) 170 | with open(self.host_stdout_txt(), "w"): 171 | pass 172 | with open(self.host_stderr_txt(), "w"): 173 | pass 174 | 175 | if not self._inputs_copied: 176 | # Prepare symlinks to the input Files & Directories 177 | container_prefix = os.path.join(self.container_dir, "work/_miniwdl_inputs/") 178 | link_dirs_made = set() 179 | for host_fn, container_fn in self.input_path_map.items(): 180 | assert container_fn.startswith(container_prefix) and len(container_fn) > len( 181 | container_prefix 182 | ) 183 | if host_fn.endswith("/"): 184 | assert container_fn.endswith("/") 185 | host_fn = host_fn[:-1] 186 | container_fn = container_fn[:-1] 187 | else: 188 | assert not container_fn.endswith("/") 189 | link_dn = os.path.dirname(container_fn) 190 | if link_dn not in link_dirs_made: 191 | os.makedirs(link_dn) 192 | link_dirs_made.add(link_dn) 193 | symlink_force(host_fn, container_fn) 194 | 195 | def _submit_batch_job(self, logger, cleanup, aws_batch): 196 | """ 197 | Register & submit AWS batch job, leaving a cleanup callback to deregister the transient 198 | job definition. 199 | """ 200 | 201 | job_name = self.run_id 202 | if job_name.startswith("call-"): 203 | job_name = job_name[5:] 204 | if self.try_counter > 1: 205 | job_name += f"-try{self.try_counter}" 206 | # Append entropy to the job name to avoid race condition using identical job names in 207 | # concurrent RegisterJobDefinition requests 208 | job_name = randomize_job_name(job_name) 209 | 210 | container_properties = self._prepare_container_properties(logger) 211 | job_def = aws_batch.register_job_definition( 212 | jobDefinitionName=job_name, 213 | type="container", 214 | containerProperties=container_properties, 215 | ) 216 | job_def_handle = f"{job_def['jobDefinitionName']}:{job_def['revision']}" 217 | logger.debug( 218 | _( 219 | "registered Batch job definition", 220 | jobDefinition=job_def_handle, 221 | **container_properties, 222 | ) 223 | ) 224 | 225 | self._cleanup_job_definition(logger, cleanup, aws_batch, job_def_handle) 226 | 227 | job_queue = self._select_job_queue() 228 | job_tags = self.cfg.get_dict("aws", "job_tags", {}) 229 | if "AWS_BATCH_JOB_ID" in os.environ: 230 | # If we find ourselves running inside an AWS Batch job, tag the new job identifying 231 | # ourself as the "parent" job. 
232 | job_tags["AWS_BATCH_PARENT_JOB_ID"] = os.environ["AWS_BATCH_JOB_ID"] 233 | # TODO: set a tag to indicate that this job is a retry of another 234 | job = aws_batch.submit_job( 235 | jobName=job_name, 236 | jobQueue=job_queue, 237 | jobDefinition=job_def_handle, 238 | timeout={"attemptDurationSeconds": self.cfg.get_int("aws", "job_timeout", 86400)}, 239 | tags=job_tags, 240 | ) 241 | logger.info( 242 | _( 243 | "AWS Batch job submitted", 244 | jobQueue=job_queue, 245 | jobId=job["jobId"], 246 | tags=job_tags, 247 | ) 248 | ) 249 | return job["jobId"] 250 | 251 | def _select_job_queue(self): 252 | if self._job_queue_fallback: 253 | preemptible = self.runtime_values.get("preemptible", 0) 254 | if self._aws_interrupts >= preemptible and preemptible > 0: 255 | return self._job_queue_fallback 256 | return self._job_queue 257 | 258 | def _prepare_container_properties(self, logger): 259 | image_tag = self.runtime_values.get("docker", "ubuntu:20.04") 260 | vcpu = self.runtime_values.get("cpu", 1) 261 | memory_mbytes = max( 262 | ( 263 | math.ceil(self.runtime_values.get("memory_reservation", 0) / 1048576) 264 | + self.cfg.get_int("aws", "memory_delta", -33) 265 | ), 266 | 991, 267 | ) 268 | commands = [ 269 | f"cd {self.container_dir}/work", 270 | "exit_code=0", 271 | self.cfg.get("task_runtime", "command_shell") 272 | + " ../command >> ../stdout.txt 2> >(tee -a ../stderr.txt >&2) || exit_code=$?", 273 | ] 274 | if self.cfg.get_bool("aws", "container_sync", False): 275 | commands.append("find . -type f | xargs sync") 276 | commands.append("sync ../stdout.txt ../stderr.txt") 277 | commands.append("exit $exit_code") 278 | 279 | resource_requirements = [ 280 | {"type": "VCPU", "value": str(vcpu)}, 281 | {"type": "MEMORY", "value": str(memory_mbytes)}, 282 | ] 283 | 284 | if self.runtime_values.get("gpu", False): 285 | gpu_value = self.cfg.get_int("aws", "gpu_value", 1) 286 | if "acceleratorCount" in self.runtime_values: 287 | gpu_value = self.runtime_values["acceleratorCount"] 288 | elif gpu_value > 1: 289 | logger.info( 290 | _("requesting multiple GPUs (per config [aws] gpu_value)", gpu_value=gpu_value) 291 | ) 292 | if gpu_value > 0: 293 | resource_requirements += [{"type": "GPU", "value": str(gpu_value)}] 294 | 295 | container_properties = { 296 | "image": image_tag, 297 | "command": ["/bin/bash", "-ec", "\n".join(commands)], 298 | "environment": [ 299 | {"name": ev_name, "value": ev_value} 300 | for ev_name, ev_value in self.runtime_values.get("env", dict()).items() 301 | ], 302 | "resourceRequirements": resource_requirements, 303 | "privileged": self.runtime_values.get("privileged", False), 304 | "mountPoints": [{"containerPath": self._fs_mount, "sourceVolume": "file_io_root"}], 305 | } 306 | 307 | for k, v in self.cfg.get_dict("aws", "container_properties", {}).items(): 308 | if k in container_properties: 309 | raise WDL.Error.RuntimeError( 310 | f"Config [aws] container_properties may not override '{k}'" 311 | ) 312 | container_properties[k] = v 313 | 314 | if self.cfg["task_runtime"].get_bool("as_user"): 315 | user = f"{os.geteuid()}:{os.getegid()}" 316 | if user.startswith("0:"): 317 | logger.warning( 318 | "container command will run explicitly as root, since you are root and set --as-me" 319 | ) 320 | container_properties["user"] = user 321 | 322 | return container_properties 323 | 324 | def _cleanup_job_definition(self, logger, cleanup, aws_batch, job_def_handle): 325 | def deregister(logger, aws_batch, job_def_handle): 326 | try: 327 | 
aws_batch.deregister_job_definition(jobDefinition=job_def_handle) 328 | logger.debug(_("deregistered Batch job definition", jobDefinition=job_def_handle)) 329 | except botocore.exceptions.ClientError as exn: 330 | # AWS expires job definitions after 6mo, so failing to delete them isn't fatal 331 | logger.warning( 332 | _( 333 | "failed to deregister Batch job definition", 334 | jobDefinition=job_def_handle, 335 | error=str(AWSError(exn)), 336 | ) 337 | ) 338 | 339 | cleanup.callback(deregister, logger, aws_batch, job_def_handle) 340 | 341 | def _await_batch_job(self, logger, cleanup, aws_batch, job_id, terminating): 342 | """ 343 | Poll for Batch job success or failure & return exit code 344 | """ 345 | describe_period = self.cfg.get_float("aws", "describe_period", 1.0) 346 | cleanup.callback((lambda job_id: self._describer.unsubscribe(job_id)), job_id) 347 | poll_stderr = cleanup.enter_context(self.poll_stderr_context(logger)) 348 | last_job_desc_json = None 349 | exit_code = None 350 | while exit_code is None: 351 | time.sleep(describe_period) 352 | job_desc = self._describer.describe(aws_batch, job_id, describe_period) 353 | job_desc_json = json.dumps(job_desc, indent=2, sort_keys=True) 354 | if job_desc_json != last_job_desc_json: 355 | last_job_desc_json = job_desc_json 356 | write_atomic( 357 | job_desc_json, 358 | os.path.join(self.host_dir, f"awsBatchJobDetail.{job_id}.json"), 359 | ) 360 | job_status = job_desc["status"] 361 | if "container" in job_desc and "logStreamName" in job_desc["container"]: 362 | self._logStreamName = job_desc["container"]["logStreamName"] 363 | if job_status not in self._observed_states: 364 | self._observed_states.add(job_status) 365 | logfn = ( 366 | logger.notice 367 | if job_status in ("RUNNING", "SUCCEEDED", "FAILED") 368 | else logger.info 369 | ) 370 | logdetails = {"status": job_status, "jobId": job_id} 371 | if self._logStreamName: 372 | logdetails["logStreamName"] = self._logStreamName 373 | logfn(_("AWS Batch job change", **logdetails)) 374 | if job_status == "STARTING" or ( 375 | job_status == "RUNNING" and "STARTING" not in self._observed_states 376 | ): 377 | cleanup.enter_context(self.task_running_context()) 378 | if job_status not in ( 379 | "SUBMITTED", 380 | "PENDING", 381 | "RUNNABLE", 382 | "STARTING", 383 | "RUNNING", 384 | "SUCCEEDED", 385 | "FAILED", 386 | ): 387 | logger.warning(_("unknown job status from AWS Batch", status=job_status)) 388 | if job_status == "SUCCEEDED": 389 | exit_code = 0 390 | elif job_status == "FAILED": 391 | reason = job_desc.get("container", {}).get("reason", None) 392 | status_reason = job_desc.get("statusReason", None) 393 | self.failure_info = {"jobId": job_id} 394 | if reason: 395 | self.failure_info["reason"] = reason 396 | if status_reason: 397 | self.failure_info["statusReason"] = status_reason 398 | if self._logStreamName: 399 | self.failure_info["logStreamName"] = self._logStreamName 400 | if status_reason and "Host EC2" in status_reason and "terminated" in status_reason: 401 | self._aws_interrupts += 1 402 | raise WDL.runtime.Interrupted( 403 | "AWS Batch job interrupted (likely spot instance termination)", 404 | more_info=self.failure_info, 405 | ) 406 | if "exitCode" not in job_desc.get("container", {}): 407 | raise WDL.Error.RuntimeError( 408 | "AWS Batch job failed", more_info=self.failure_info 409 | ) 410 | exit_code = job_desc["container"]["exitCode"] 411 | assert isinstance(exit_code, int) and exit_code != 0 412 | if "RUNNING" in self._observed_states: 413 | poll_stderr() 414 | if 
terminating(): 415 | aws_batch.terminate_job(jobId=job_id, reason="terminated by miniwdl") 416 | raise WDL.runtime.Terminated( 417 | quiet=not self._observed_states.difference({"SUBMITTED", "PENDING", "RUNNABLE"}) 418 | ) 419 | for _root, _dirs, _files in os.walk(self.host_dir, followlinks=False): 420 | # no-op traversal of working directory to refresh NFS metadata cache (speculative) 421 | pass 422 | poll_stderr() 423 | return exit_code 424 | 425 | def _submit_period_multiplier(self): 426 | if self._describer.jobs: 427 | b = self.cfg.get_float("aws", "submit_period_b", 0.0) 428 | if b > 0.0: 429 | t = time.time() - self._init_time 430 | c = self.cfg.get_float("aws", "submit_period_c", 0.0) 431 | return max(1.0, c - t / b) 432 | return 1.0 433 | 434 | 435 | class BatchJob(BatchJobBase): 436 | """ 437 | EFS-based implementation, including the case of SageMaker Studio's built-in EFS. Assumes we're 438 | running on an EC2 instance or Fargate container mounting an EFS Access Point at [file_io] root, 439 | and configures each Batch job with the same mount. 440 | """ 441 | 442 | @classmethod 443 | def global_init(cls, cfg, logger): 444 | super().global_init(cfg, logger) 445 | 446 | # EFS configuration based on: 447 | # - [aws] fsap / MINIWDL__AWS__FSAP 448 | # - [aws] fs / MINIWDL__AWS__FS 449 | # - SageMaker Studio metadata, if applicable 450 | cls._fs_id = None 451 | cls._fsap_id = None 452 | if cfg.has_option("aws", "fs"): 453 | cls._fs_id = cfg.get("aws", "fs") 454 | if cfg.has_option("aws", "fsap"): 455 | cls._fsap_id = cfg.get("aws", "fsap") 456 | if not cls._fs_id: 457 | cls._fs_id = efs_id_from_access_point(cls._region_name, cls._fsap_id) 458 | cls._studio_efs_uid = None 459 | sagemaker_studio_efs = detect_sagemaker_studio_efs(logger, region_name=cls._region_name) 460 | if sagemaker_studio_efs: 461 | ( 462 | studio_efs_id, 463 | studio_efs_uid, 464 | studio_efs_home, 465 | studio_efs_mount, 466 | ) = sagemaker_studio_efs 467 | assert ( 468 | not cls._fs_id or cls._fs_id == studio_efs_id 469 | ), "Configured EFS ([aws] fs / MINIWDL__AWS__FS, [aws] fsap / MINIWDL__AWS__FSAP) isn't associated with current SageMaker Studio domain EFS" 470 | cls._fs_id = studio_efs_id 471 | assert ( 472 | cls._fs_mount.rstrip("/") == studio_efs_mount.rstrip("/") 473 | ) or cls._fs_mount.startswith(studio_efs_mount.rstrip("/") + "/"), ( 474 | "misconfiguration, set [file_io] root / MINIWDL__FILE_IO__ROOT to " 475 | + studio_efs_mount.rstrip("/") 476 | ) 477 | cls._studio_efs_uid = studio_efs_uid 478 | if not cls._fsap_id: 479 | cls._fsap_id = detect_studio_fsap( 480 | logger, 481 | studio_efs_id, 482 | studio_efs_uid, 483 | studio_efs_home, 484 | region_name=cls._region_name, 485 | ) 486 | assert ( 487 | cls._fsap_id 488 | ), "Unable to detect suitable EFS Access Point for use with SageMaker Studio; set [aws] fsap / MINIWDL__AWS__FSAP" 489 | # TODO: else sanity-check that FSAP's root directory equals studio_efs_home 490 | assert ( 491 | cls._fs_id 492 | ), "Missing EFS configuration ([aws] fs / MINIWDL__AWS__FS or [aws] fsap / MINIWDL__AWS__FSAP)" 493 | if not cls._fsap_id: 494 | logger.warning( 495 | "AWS BatchJob plugin recommends using EFS Access Point to simplify permissions between containers (configure [aws] fsap / MINIWDL__AWS__FSAP to fsap-xxxx)" 496 | ) 497 | 498 | # if no task queue in config file, try detecting miniwdl-aws-studio 499 | if not cls._job_queue and sagemaker_studio_efs: 500 | cls._job_queue = detect_gwfcore_batch_queue( 501 | logger, sagemaker_studio_efs[0], region_name=cls._region_name 
502 | ) 503 | assert ( 504 | cls._job_queue 505 | ), "Missing AWS Batch job queue configuration ([aws] task_queue / MINIWDL__AWS__TASK_QUEUE)" 506 | 507 | logger.info( 508 | _( 509 | "initialized AWS BatchJob (EFS) plugin", 510 | region_name=cls._region_name, 511 | job_queue=cls._job_queue, 512 | resource_limits=cls._resource_limits, 513 | file_io_root=cls._fs_mount, 514 | efs_id=cls._fs_id, 515 | efsap_id=cls._fsap_id, 516 | ) 517 | ) 518 | 519 | def _prepare_container_properties(self, logger): 520 | container_properties = super()._prepare_container_properties(logger) 521 | 522 | # add EFS volume & mount point 523 | volumes = [ 524 | { 525 | "name": "file_io_root", 526 | "efsVolumeConfiguration": { 527 | "fileSystemId": self._fs_id, 528 | "transitEncryption": "ENABLED", 529 | }, 530 | } 531 | ] 532 | if self._fsap_id: 533 | volumes[0]["efsVolumeConfiguration"]["authorizationConfig"] = { 534 | "accessPointId": self._fsap_id 535 | } 536 | container_properties["volumes"] = volumes 537 | 538 | # set Studio UID if appropriate 539 | if self.cfg["task_runtime"].get_bool("as_user") and self._studio_efs_uid: 540 | container_properties["user"] = f"{self._studio_efs_uid}:{self._studio_efs_uid}" 541 | 542 | return container_properties 543 | 544 | 545 | class BatchJobNoEFS(BatchJobBase): 546 | """ 547 | Implementation assuming the Batch compute environment is configured to mount the shared 548 | filesystem without further specification by us; e.g. FSxL mounted by cloud-init user data 549 | script. 550 | """ 551 | 552 | @classmethod 553 | def global_init(cls, cfg, logger): 554 | super().global_init(cfg, logger) 555 | 556 | assert ( 557 | cls._job_queue 558 | ), "Missing AWS Batch job queue configuration ([aws] task_queue / MINIWDL__AWS__TASK_QUEUE)" 559 | 560 | logger.info( 561 | _( 562 | "initialized AWS BatchJob plugin", 563 | region_name=cls._region_name, 564 | job_queue=cls._job_queue, 565 | resource_limits=cls._resource_limits, 566 | file_io_root=cls._fs_mount, 567 | ) 568 | ) 569 | 570 | def _prepare_container_properties(self, logger): 571 | container_properties = super()._prepare_container_properties(logger) 572 | 573 | container_properties["volumes"] = [ 574 | { 575 | "name": "file_io_root", 576 | "host": {"sourcePath": self._fs_mount}, 577 | } 578 | ] 579 | 580 | return container_properties 581 | 582 | 583 | class BatchJobDescriber: 584 | """ 585 | This singleton object handles calling the AWS Batch DescribeJobs API with up to 100 job IDs 586 | per request, then dispensing each job description to the thread interested in it. This helps 587 | avoid AWS API request rate limits when we're tracking many concurrent jobs. 
588 | """ 589 | 590 | JOBS_PER_REQUEST = 100 # maximum jobs per DescribeJob request 591 | 592 | def __init__(self): 593 | self.lock = threading.Lock() 594 | self.last_request_time = 0 595 | self.job_queue = [] 596 | self.jobs = {} 597 | 598 | def describe(self, aws_batch, job_id, period): 599 | """ 600 | Get the latest Batch job description 601 | """ 602 | while True: 603 | with self.lock: 604 | if job_id not in self.jobs: 605 | # register new job to be described ASAP 606 | heapq.heappush(self.job_queue, (0.0, job_id)) 607 | self.jobs[job_id] = None 608 | # update as many job descriptions as possible 609 | self._update(aws_batch, period) 610 | # return the desired job description if we have it 611 | desc = self.jobs[job_id] 612 | if desc: 613 | return desc 614 | # otherwise wait (outside the lock) and try again 615 | time.sleep(period / 4) 616 | 617 | def unsubscribe(self, job_id): 618 | """ 619 | Unsubscribe from a job_id once we'll no longer be interested in it 620 | """ 621 | with self.lock: 622 | if job_id in self.jobs: 623 | del self.jobs[job_id] 624 | 625 | def _update(self, aws_batch, period): 626 | # if enough time has passed since our last DescribeJobs request 627 | if time.time() - self.last_request_time >= period: 628 | # take the N least-recently described jobs 629 | job_ids = set() 630 | assert self.job_queue 631 | while self.job_queue and len(job_ids) < self.JOBS_PER_REQUEST: 632 | job_id = heapq.heappop(self.job_queue)[1] 633 | assert job_id not in job_ids 634 | if job_id in self.jobs: 635 | job_ids.add(job_id) 636 | if not job_ids: 637 | return 638 | # describe them 639 | try: 640 | job_descs = aws_batch.describe_jobs(jobs=list(job_ids)) 641 | finally: 642 | # always: bump last_request_time and re-enqueue these jobs 643 | self.last_request_time = time.time() 644 | for job_id in job_ids: 645 | heapq.heappush(self.job_queue, (self.last_request_time, job_id)) 646 | # update self.jobs with the new descriptions 647 | for job_desc in job_descs["jobs"]: 648 | job_ids.remove(job_desc["jobId"]) 649 | self.jobs[job_desc["jobId"]] = job_desc 650 | assert not job_ids, "AWS Batch DescribeJobs didn't return all expected results" 651 | 652 | 653 | class AWSError(WDL.Error.RuntimeError): 654 | """ 655 | Repackage botocore.exceptions.ClientError to surface it more-informatively in miniwdl task log 656 | """ 657 | 658 | def __init__(self, client_error: botocore.exceptions.ClientError): 659 | assert isinstance(client_error, botocore.exceptions.ClientError) 660 | msg = ( 661 | f"{client_error.response['Error']['Code']}, {client_error.response['Error']['Message']}" 662 | ) 663 | super().__init__( 664 | msg, more_info={"ResponseMetadata": client_error.response["ResponseMetadata"]} 665 | ) 666 | -------------------------------------------------------------------------------- /miniwdl_aws/cli_run_s3upload.py: -------------------------------------------------------------------------------- 1 | """ 2 | miniwdl_run_s3upload CLI entry point (console script) which passes through its arguments to 3 | `miniwdl run`, then uploads run artifacts to $S3_UPLOAD_FOLDER. This includes the log file and if 4 | the run succeeded, the output files and outputs.json (rewritten with the uploaded S3 URIs instead 5 | of local filenames). 6 | 7 | With the BatchJob plugin also enabled, this may be used from an SSH session on an EC2 instance or 8 | container with EFS suitably mounted at /mnt/efs; or within a Batch "workflow job." 
9 | """ 10 | 11 | import sys 12 | import os 13 | import json 14 | import subprocess 15 | import shutil 16 | import argparse 17 | import tempfile 18 | import signal 19 | from ._util import END_OF_LOG, subprocess_run_with_clean_exit 20 | 21 | 22 | def miniwdl_run_s3upload(): 23 | # Set signal handler. SystemExit may be handled below and/or by subprocess_run_with_clean_exit. 24 | for s in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP, signal.SIGQUIT): 25 | signal.signal(s, lambda sig, _: (_ for _ in ()).throw(SystemExit(sig))) 26 | 27 | # run main logic with handlers 28 | try: 29 | end_log_and_exit(miniwdl_run_s3upload_inner()) 30 | except SystemExit as exc: 31 | end_log_and_exit(exc.code) 32 | except KeyboardInterrupt: 33 | end_log_and_exit(int(signal.SIGINT)) 34 | except BrokenPipeError: 35 | end_log_and_exit(int(signal.SIGPIPE)) 36 | 37 | 38 | def end_log_and_exit(code): 39 | print( 40 | "\n" + END_OF_LOG, 41 | file=sys.stderr, 42 | ) 43 | sys.exit(code) 44 | 45 | 46 | def miniwdl_run_s3upload_inner(): 47 | parser = argparse.ArgumentParser( 48 | prog="miniwdl-run-s3upload", 49 | description="Pass through arguments to `miniwdl run` and afterwards, upload outputs to S3 and optionally delete local run directory.", 50 | usage="miniwdl-run-s3upload [miniwdl_run_arg ...]", 51 | allow_abbrev=False, 52 | ) 53 | parser.add_argument( 54 | "--s3upload", 55 | help="s3://bucket/folder/ at which to upload run outputs [env MINIWDL__AWS__S3_UPLOAD_FOLDER]", 56 | ) 57 | parser.add_argument( 58 | "--delete-after", 59 | choices=("always", "success", "failure"), 60 | help="with --s3upload, delete EFS run directory afterwards [env MINIWDL__AWS__S3_UPLOAD_DELETE_AFTER]", 61 | ) 62 | parser.add_argument( 63 | "--task-queue", help="AWS Batch job queue for task jobs [env MINIWDL__AWS__TASK_QUEUE]" 64 | ) 65 | 66 | args, unused_args = parser.parse_known_args(sys.argv[1:]) 67 | args.s3upload = ( 68 | args.s3upload if args.s3upload else os.environ.get("MINIWDL__AWS__S3_UPLOAD_FOLDER", None) 69 | ) 70 | args.delete_after = ( 71 | args.delete_after.strip().lower() 72 | if args.delete_after 73 | else os.environ.get("MINIWDL__AWS__S3_UPLOAD_DELETE_AFTER", None) 74 | ) 75 | if args.delete_after and not args.s3upload: 76 | print("--delete-after requires --s3upload", file=sys.stderr) 77 | sys.exit(1) 78 | 79 | if args.s3upload: 80 | with tempfile.TemporaryDirectory() as tmpdir: 81 | testfile = os.path.join(tmpdir, ".test.miniwdl-run-s3upload") 82 | with open(testfile, "w") as outfile: 83 | print( 84 | "miniwdl-run-s3upload created this object to test bucket permissions.", 85 | file=outfile, 86 | ) 87 | upload1(testfile, args.s3upload + ("/" if not args.s3upload.endswith("/") else "")) 88 | 89 | zip_arg = next((i for i, arg in enumerate(unused_args) if arg == "--WDL--ZIP--"), -1) 90 | if zip_arg >= 0: 91 | # get `miniwdl zip`ped WDL source code shipped to us by miniwdl-aws-submit 92 | unused_args[zip_arg] = get_wdl_zip() 93 | 94 | cmd = ["miniwdl", "run"] + unused_args 95 | if "--error-json" not in unused_args: 96 | cmd.append("--error-json") 97 | miniwdl_env = dict(os.environ) 98 | if args.task_queue: # pass through to BatchJob plugin via env var 99 | miniwdl_env["MINIWDL__AWS__TASK_QUEUE"] = args.task_queue 100 | 101 | # run miniwdl & tee its standard output 102 | miniwdl = subprocess_run_with_clean_exit( 103 | cmd, stdout=subprocess.PIPE, env=miniwdl_env, check=False 104 | ) 105 | sys.stdout.buffer.write(miniwdl.stdout) 106 | 107 | if not args.s3upload: 108 | # nothing to do 109 | print( 110 | f"[miniwdl_run_s3upload] no 
setting for --s3upload / MINIWDL__AWS__S3_UPLOAD_FOLDER; exiting (code = {miniwdl.returncode})", 111 | file=sys.stderr, 112 | ) 113 | return miniwdl.returncode 114 | 115 | # read miniwdl standard output JSON 116 | try: 117 | miniwdl_json = json.loads(miniwdl.stdout) 118 | run_dir = miniwdl_json["dir"] 119 | assert os.path.isdir(run_dir) 120 | except: 121 | print( 122 | f"[miniwdl_run_s3upload] no run directory in miniwdl standard output; exiting (code = {miniwdl.returncode})", 123 | file=sys.stderr, 124 | ) 125 | return miniwdl.returncode 126 | 127 | # append miniwdl's run name to S3_UPLOAD_FOLDER (unless the latter ends in '/') 128 | s3_upload_folder = args.s3upload 129 | if not s3_upload_folder.endswith("/"): 130 | s3_upload_folder += "/" + os.path.basename(run_dir.rstrip("/")) + "/" 131 | 132 | # upload logs 133 | print( 134 | f"[miniwdl_run_s3upload] miniwdl exit code = {miniwdl.returncode}; uploading logs & outputs to {s3_upload_folder}", 135 | file=sys.stderr, 136 | ) 137 | for p in (os.path.join(run_dir, fn) for fn in ("workflow.log", "task.log")): 138 | if os.path.isfile(p): 139 | upload1(p, s3_upload_folder) 140 | 141 | # upload error.json, and the std{out,err}_file it points to, if any 142 | error_json_file = os.path.join(run_dir, "error.json") 143 | if os.path.isfile(error_json_file): 144 | upload1(error_json_file, s3_upload_folder) 145 | reupload = False 146 | with open(error_json_file) as infile: 147 | error_json = json.load(infile) 148 | for std_key in ("stderr", "stdout"): 149 | std_file = error_json.get("cause", {}).get(std_key + "_file", None) 150 | if std_file and os.path.isfile(std_file): 151 | std_s3file = f"{s3_upload_folder}CommandFailed_{std_key}.txt" 152 | upload1(std_file, std_s3file) 153 | error_json["cause"][std_key + "_s3file"] = std_s3file 154 | reupload = True 155 | if reupload: 156 | with tempfile.NamedTemporaryFile() as tmp: 157 | tmp.write(json.dumps(error_json, indent=2).encode()) 158 | tmp.flush() 159 | upload1(tmp.name, s3_upload_folder + "error.json") 160 | 161 | # upload output files, if any 162 | if os.path.isdir(os.path.join(run_dir, "out")): 163 | subprocess_run_with_clean_exit( 164 | [ 165 | "aws", 166 | "s3", 167 | "sync", 168 | "--no-progress", 169 | "--follow-symlinks", 170 | os.path.join(run_dir, "out"), 171 | s3_upload_folder, 172 | ], 173 | check=True, 174 | ) 175 | 176 | if "outputs" not in miniwdl_json: 177 | if args.delete_after in ("always", "failure"): 178 | shutil.rmtree(run_dir) 179 | print( 180 | f"[miniwdl_run_s3upload] deleted {run_dir}", 181 | file=sys.stderr, 182 | ) 183 | return miniwdl.returncode 184 | 185 | # recursively rewrite outputs JSON 186 | def rewrite(v): 187 | if v and isinstance(v, str) and v[0] == "/" and os.path.exists(v): 188 | # miniwdl writes File/Directory outputs with absolute paths 189 | return rebase_output_path(v, run_dir, s3_upload_folder) 190 | if isinstance(v, list): 191 | return [rewrite(u) for u in v] 192 | if isinstance(v, dict): 193 | return dict((k, rewrite(u)) for (k, u) in v.items()) 194 | return v 195 | 196 | rewritten_outputs = rewrite(miniwdl_json["outputs"]) 197 | outputs_s3_json = os.path.join(run_dir, "outputs.s3.json") 198 | with open(outputs_s3_json + ".tmp", "w") as outfile: 199 | print(json.dumps(rewritten_outputs, indent=2), file=outfile) 200 | os.rename(outputs_s3_json + ".tmp", outputs_s3_json) 201 | upload1(outputs_s3_json, s3_upload_folder + "outputs.json") 202 | print( 203 | f"[miniwdl_run_s3upload] uploaded {s3_upload_folder}outputs.json", 204 | file=sys.stderr, 205 | ) 206 | 
print(json.dumps({"s3upload": s3_upload_folder, "outputs": rewritten_outputs}, indent=2)) 207 | if args.delete_after in ("always", "success"): 208 | shutil.rmtree(run_dir) 209 | print( 210 | f"[miniwdl_run_s3upload] deleted {run_dir}", 211 | file=sys.stderr, 212 | ) 213 | 214 | return miniwdl.returncode 215 | 216 | 217 | def upload1(fn, dest): 218 | subprocess_run_with_clean_exit(["aws", "s3", "cp", "--no-progress", fn, dest], check=True) 219 | 220 | 221 | def rebase_output_path(fn, run_dir, s3_upload_folder): 222 | """ 223 | Given extant filename `fn` from JSON outputs and the current run directory, figure the uploaded 224 | S3 URI under s3_upload_folder, where the file should be uploaded by our `aws s3 sync` operation 225 | on the "run out" directory. Or return fn unmodified if it seems to be something that looks like 226 | an output path, but isn't really. 227 | 228 | Subtlety: if the output fn originated from the call cache, it will be from some other run 229 | directory, not the current one. In that case we need to see that there's a corresponding link 230 | under the current run out directory. 231 | 232 | There should be no danger of inadvertently uploading non-output files (e.g. if the workflow 233 | outputs the string "/home/root/.ssh/id_rsa") because we're not actually performing the upload, 234 | just figuring the path where `aws s3 sync` ought to have uploaded it. 235 | """ 236 | fn_parts = fn.strip("/").split("/") 237 | while fn_parts: 238 | fn_rel = "/".join(fn_parts) 239 | fn_rebased = os.path.join(run_dir, "out", fn_rel) 240 | if os.path.exists(fn_rebased) and os.path.isdir(fn) == os.path.isdir(fn_rebased): 241 | return s3_upload_folder + fn_rel 242 | fn_parts = fn_parts[1:] 243 | return fn 244 | 245 | 246 | def get_wdl_zip(): 247 | """ 248 | Load `miniwdl zip`ped WDL source code shipped to us by miniwdl-aws-submit, encoded in the 249 | environment variable WDL_ZIP 250 | """ 251 | 252 | encoded_zip = os.environ["WDL_ZIP"] 253 | if len(encoded_zip) >= 4096: 254 | # Look for spillover in job & job def tags 255 | job_desc = json.loads( 256 | subprocess_run_with_clean_exit( 257 | ["aws", "batch", "describe-jobs", "--jobs", os.environ["AWS_BATCH_JOB_ID"]], 258 | stdout=subprocess.PIPE, 259 | check=True, 260 | ).stdout 261 | )["jobs"][0] 262 | job_tags = job_desc["tags"] 263 | job_def_tags = json.loads( 264 | subprocess_run_with_clean_exit( 265 | [ 266 | "aws", 267 | "batch", 268 | "describe-job-definitions", 269 | "--job-definitions", 270 | job_desc["jobDefinition"], 271 | ], 272 | stdout=subprocess.PIPE, 273 | check=True, 274 | ).stdout 275 | )["jobDefinitions"][0]["tags"] 276 | # if no job_def_tags, then there shouldn't be job_tags either 277 | assert job_def_tags or not job_tags 278 | for tags in (job_def_tags, job_tags): 279 | for key in sorted(tags.keys()): 280 | if key.startswith("WZ") and len(key) > 3: 281 | encoded_zip += key[3:] + tags[key] 282 | 283 | import base64 284 | import lzma 285 | 286 | zip_bytes = lzma.decompress(base64.urlsafe_b64decode(encoded_zip), format=lzma.FORMAT_ALONE) 287 | fd, fn = tempfile.mkstemp(suffix=".zip", prefix="wdl_") 288 | os.write(fd, zip_bytes) 289 | os.close(fd) 290 | return fn 291 | -------------------------------------------------------------------------------- /miniwdl_aws/cli_submit.py: -------------------------------------------------------------------------------- 1 | """ 2 | miniwdl-aws-submit CLI entry point (console script) to submit a miniwdl "workflow job" to an AWS 3 | Batch queue, which will invoke miniwdl-run-s3upload to 
run the workflow (spawning additional Batch 4 | jobs as needed to execute tasks). This is typically used on-laptop to kick off workflows, without 5 | the laptop needing to stay on/connected. It can also wait for the workflow job to complete and 6 | stream its logs. 7 | """ 8 | 9 | import sys 10 | import os 11 | import time 12 | import argparse 13 | import shlex 14 | from datetime import datetime 15 | from collections import defaultdict 16 | import boto3 17 | from ._util import detect_aws_region, randomize_job_name, END_OF_LOG, efs_id_from_access_point 18 | 19 | 20 | def miniwdl_submit_awsbatch(argv): 21 | # Configure from arguments/environment/tags 22 | args, unused_args = parse_args(argv) 23 | verbose = ( 24 | args.follow or args.self_test or "--verbose" in unused_args or "--debug" in unused_args 25 | ) 26 | detect_env_args(args) 27 | if verbose: 28 | print("Workflow job queue: " + args.workflow_queue, file=sys.stderr) 29 | 30 | aws_region_name = detect_aws_region(None) 31 | if not aws_region_name: 32 | print( 33 | "Failed to detect AWS region; configure AWS CLI or set environment AWS_DEFAULT_REGION", 34 | file=sys.stderr, 35 | ) 36 | sys.exit(1) 37 | aws_batch = boto3.client("batch", region_name=aws_region_name) 38 | detect_tags_args(aws_batch, args) 39 | 40 | if verbose: 41 | print("Task job queue: " + args.task_queue, file=sys.stderr) 42 | if args.efs: 43 | print("Workflow IAM role ARN: " + args.workflow_role, file=sys.stderr) 44 | print("EFS Access Point: " + args.fsap, file=sys.stderr) 45 | 46 | fs_id = None 47 | if args.efs: 48 | fs_id = efs_id_from_access_point(aws_region_name, args.fsap) 49 | if verbose: 50 | print("EFS: " + fs_id, file=sys.stderr) 51 | 52 | # Prepare workflow job: command, environment, and container properties 53 | job_name, miniwdl_run_cmd, wdl_zip = form_miniwdl_run_cmd(args, unused_args, verbose) 54 | job_name = randomize_job_name(job_name) 55 | if verbose: 56 | print("Workflow job image: " + args.image, file=sys.stderr) 57 | print("Invocation: " + " ".join(shlex.quote(s) for s in miniwdl_run_cmd), file=sys.stderr) 58 | ( 59 | workflow_container_props, 60 | workflow_container_overrides, 61 | job_def_tags, 62 | job_tags, 63 | ) = form_workflow_container_props(args, miniwdl_run_cmd, fs_id, wdl_zip, verbose) 64 | 65 | # Register & submit workflow job 66 | try: 67 | workflow_job_def = aws_batch.register_job_definition( 68 | jobDefinitionName=job_name, 69 | platformCapabilities=["FARGATE" if args.efs else "EC2"], 70 | type="container", 71 | containerProperties=workflow_container_props, 72 | tags=job_def_tags, 73 | ) 74 | except BaseException as exc: 75 | if wdl_zip and "JobDefinition size must be less than" in str(exc): 76 | print(_WDL_ZIP_SIZE_MSG, file=sys.stderr) 77 | sys.exit(123) 78 | raise 79 | workflow_job_def_handle = ( 80 | f"{workflow_job_def['jobDefinitionName']}:{workflow_job_def['revision']}" 81 | ) 82 | try: 83 | workflow_job_id = aws_batch.submit_job( 84 | jobName=job_name, 85 | jobQueue=args.workflow_queue, 86 | jobDefinition=workflow_job_def_handle, 87 | containerOverrides=workflow_container_overrides, 88 | tags=job_tags, 89 | )["jobId"] 90 | if verbose: 91 | print(f"Submitted {job_name} to {args.workflow_queue}:", file=sys.stderr) 92 | sys.stderr.flush() 93 | print(workflow_job_id) 94 | if not sys.stdout.isatty(): 95 | print(workflow_job_id, file=sys.stderr) 96 | finally: 97 | aws_batch.deregister_job_definition(jobDefinition=workflow_job_def_handle) 98 | 99 | # Wait for workflow job, if requested 100 | exit_code = 0 101 | if args.wait or 
args.follow: 102 | exit_code = wait( 103 | aws_region_name, 104 | aws_batch, 105 | workflow_job_id, 106 | args.follow, 107 | expect_log_eof=not args.self_test, 108 | ) 109 | sys.exit(exit_code) 110 | 111 | 112 | def parse_args(argv): 113 | if "COLUMNS" not in os.environ: 114 | os.environ["COLUMNS"] = "100" 115 | parser = argparse.ArgumentParser( 116 | prog="miniwdl-aws-submit", 117 | description="Launch `miniwdl run` on AWS Batch (+ EFS at /mnt/efs), itself launching additional" 118 | " Batch jobs to execute WDL tasks. Passed-through arguments to `miniwdl run` should refer to" 119 | " s3:// or /mnt/efs/ input paths, rather than the local filesystem.", 120 | usage="miniwdl-aws-submit [miniwdl_run_arg ...] --workflow-queue WORKFLOW_QUEUE", 121 | allow_abbrev=False, 122 | ) 123 | group = parser.add_argument_group("AWS Batch") 124 | group.add_argument( 125 | "--workflow-queue", 126 | help="job queue for workflow job [env MINIWDL__AWS__WORKFLOW_QUEUE]", 127 | ) 128 | group.add_argument( 129 | "--task-queue", 130 | help="job queue for task jobs [env MINIWDL__AWS__TASK_QUEUE" 131 | " or detect from DefaultTaskQueue tag on workflow job queue]", 132 | ) 133 | group.add_argument( 134 | "--task-queue-fallback", 135 | help="job queue for task jobs following runtime.preemptible spot interruptions [env" 136 | " MINIWDL__AWS__TASK_QUEUE_FALLBACK or detect from DefaultTaskQueueFallback tag on workflow job queue]", 137 | ) 138 | group.add_argument( 139 | "--fsap", 140 | help="EFS Access Point ID (fsap-xxxx) for mounting [env MINIWDL__AWS__FSAP" 141 | " or detect from DefaultFsap tag on workflow job queue]", 142 | ) 143 | group.add_argument( 144 | "--no-efs", 145 | "--no-EFS", 146 | action="store_false", 147 | dest="efs", 148 | help="instead of EFS, expect EC2 compute environments to automatically mount some other shared filesystem [env MINIWDL__AWS__FS=0]", 149 | ) 150 | group.add_argument( 151 | "--mount", 152 | default=None, 153 | help="shared filesystem mount point in all containers [/mnt/efs or /mnt/net]", 154 | ) 155 | group = parser.add_argument_group("Workflow job provisioning") 156 | group.add_argument( 157 | "--workflow-role", 158 | help="ARN of execution+job role for workflow job [env MINIWDL__AWS__WORKFLOW_ROLE" 159 | " or detect from WorkflowEngineRoleArn tag on workflow job queue]", 160 | ) 161 | group.add_argument("--name", help="workflow job name [WDL filename]") 162 | group.add_argument( 163 | "--cpu", metavar="N", type=str, default="1", help="vCPUs for workflow job [1]" 164 | ) 165 | group.add_argument( 166 | "--memory-GiB", metavar="N", type=int, default=4, help="memory for workflow job [4]" 167 | ) 168 | group.add_argument( 169 | "--image", 170 | help="override miniwdl-aws Docker image tag for workflow job [env MINIWDL__AWS__WORKFLOW_IMAGE]", 171 | ) 172 | group.add_argument( 173 | "--no-env", action="store_true", help="don't pass through MINIWDL__* environment variables" 174 | ) 175 | group.add_argument( 176 | "--no-public-ip", 177 | action="store_true", 178 | help="don't assign public IP (workflow compute env has private subnet & NAT)", 179 | ) 180 | group = parser.add_argument_group("miniwdl I/O") 181 | group.add_argument( 182 | "--dir", 183 | default=None, 184 | help="run directory prefix [{mount}/miniwdl_run or {mount}/miniwdl_run]", 185 | ) 186 | group.add_argument( 187 | "--s3upload", 188 | help="s3://bucket/folder/ at which to upload run outputs (otherwise left on shared filesystem)", 189 | ) 190 | group.add_argument( 191 | "--delete-after", 192 | choices=("always", "success", 
"failure"), 193 | help="with --s3upload, delete run directory afterwards", 194 | ) 195 | parser.add_argument( 196 | "--wait", "-w", action="store_true", help="wait for workflow job to complete" 197 | ) 198 | parser.add_argument( 199 | "--follow", 200 | "-f", 201 | action="store_true", 202 | help="live-stream workflow log to standard error (implies --wait)", 203 | ) 204 | parser.add_argument("--self-test", action="store_true", help="perform `miniwdl run_self_test`") 205 | 206 | args, unused_args = parser.parse_known_args(argv[1:]) 207 | 208 | if os.environ.get("MINIWDL__AWS__FS", "").strip().lower() in ("false", "f", "0", "no", "n"): 209 | args.efs = False 210 | if not args.mount: 211 | args.mount = "/mnt/efs" if args.efs else "/mnt/net" 212 | if args.mount.endswith("/"): 213 | args.mount = args.mount[:-1] 214 | assert args.mount 215 | if not args.dir: 216 | args.dir = os.path.join(args.mount, "miniwdl_run") 217 | if not args.dir.startswith(args.mount): 218 | print(f"--dir must begin with {args.mount}", file=sys.stderr) 219 | sys.exit(1) 220 | 221 | return (args, unused_args) 222 | 223 | 224 | def detect_env_args(args): 225 | """ 226 | Detect configuration set through environment variables (that weren't set by command-line args) 227 | """ 228 | args.fsap = args.fsap if args.fsap else os.environ.get("MINIWDL__AWS__FSAP", "") 229 | args.workflow_queue = ( 230 | args.workflow_queue 231 | if args.workflow_queue 232 | else os.environ.get("MINIWDL__AWS__WORKFLOW_QUEUE", None) 233 | ) 234 | if not args.workflow_queue: 235 | print( 236 | "--workflow-queue is required (or environment variable MINIWDL__AWS__WORKFLOW_QUEUE)", 237 | file=sys.stderr, 238 | ) 239 | sys.exit(1) 240 | args.fsap = args.fsap if args.fsap else os.environ.get("MINIWDL__AWS__FSAP", "") 241 | args.task_queue = ( 242 | args.task_queue if args.task_queue else os.environ.get("MINIWDL__AWS__TASK_QUEUE", None) 243 | ) 244 | if not args.task_queue_fallback: 245 | args.task_queue_fallback = os.environ.get("MINIWDL__AWS__TASK_QUEUE_FALLBACK", None) 246 | args.workflow_role = ( 247 | args.workflow_role 248 | if args.workflow_role 249 | else os.environ.get("MINIWDL__AWS__WORKFLOW_ROLE", None) 250 | ) 251 | args.image = args.image if args.image else os.environ.get("MINIWDL__AWS__WORKFLOW_IMAGE", None) 252 | if not args.image: 253 | # version-matched default image from our GitHub build 254 | import importlib_metadata 255 | 256 | try: 257 | args.image = "ghcr.io/miniwdl-ext/miniwdl-aws:v" + importlib_metadata.version( 258 | "miniwdl-aws" 259 | ) 260 | except importlib_metadata.PackageNotFoundError: 261 | print( 262 | "Failed to detect miniwdl Docker image version tag; set explicitly with --image or MINIWDL__AWS__WORKFLOW_IMAGE", 263 | file=sys.stderr, 264 | ) 265 | sys.exit(1) 266 | 267 | if args.delete_after and not args.s3upload: 268 | print("--delete-after requires --s3upload", file=sys.stderr) 269 | sys.exit(1) 270 | args.s3upload = ( 271 | args.s3upload if args.s3upload else os.environ.get("MINIWDL__AWS__S3_UPLOAD_FOLDER", None) 272 | ) 273 | args.delete_after = ( 274 | args.delete_after.strip().lower() 275 | if args.delete_after 276 | else os.environ.get("MINIWDL__AWS__DELETE_AFTER_S3_UPLOAD", None) 277 | ) 278 | 279 | 280 | def detect_tags_args(aws_batch, args): 281 | """ 282 | If not otherwise set by command line arguments or environment, inspect tags of the workflow job 283 | queue to detect default task job queue and (if applicable) EFS Access Point ID and workflow 284 | role ARN. 
Infra provisioning (CloudFormation, Terraform, etc.) may have set the expected tags. 285 | """ 286 | if not args.task_queue or (args.efs and not (args.fsap or args.workflow_role)): 287 | workflow_queue_tags = aws_batch.describe_job_queues(jobQueues=[args.workflow_queue])[ 288 | "jobQueues" 289 | ][0]["tags"] 290 | if not args.task_queue: 291 | args.task_queue = workflow_queue_tags.get("DefaultTaskQueue", None) 292 | if not args.task_queue: 293 | print( 294 | "Unable to detect default task job queue name from DefaultTaskQueue tag of workflow job queue." 295 | " Set --task-queue or environment variable MINIWDL__AWS__TASK_QUEUE.", 296 | file=sys.stderr, 297 | ) 298 | sys.exit(1) 299 | if not args.task_queue_fallback: 300 | args.task_queue_fallback = workflow_queue_tags.get("DefaultTaskQueueFallback", None) 301 | if args.efs and not args.fsap: 302 | try: 303 | args.fsap = workflow_queue_tags["DefaultFsap"] 304 | assert args.fsap.startswith("fsap-") 305 | except: 306 | if not args.fsap: 307 | print( 308 | "Unable to detect default EFS Access Point (fsap-xxxx) from DefaultFsap tag of workflow job queue." 309 | " Set --fsap or environment variable MINIWDL__AWS__FSAP.", 310 | file=sys.stderr, 311 | ) 312 | sys.exit(1) 313 | if args.efs and not args.workflow_role: 314 | # Workflow role ARN is needed for Fargate Batch (unlike EC2 Batch, where a role is 315 | # associated with the EC2 instance profile in the compute environment). 316 | try: 317 | args.workflow_role = aws_batch.describe_job_queues(jobQueues=[args.workflow_queue])[ 318 | "jobQueues" 319 | ][0]["tags"]["WorkflowEngineRoleArn"] 320 | assert args.workflow_role.startswith("arn:aws:iam::") 321 | except: 322 | if not args.workflow_role: 323 | print( 324 | "Unable to detect ARN of workflow engine IAM role from WorkflowEngineRoleArn tag of workflow job queue." 
325 | " Double-check --workflow-queue, or set --workflow-role or environment MINIWDL__AWS__WORKFLOW_ROLE.", 326 | file=sys.stderr, 327 | ) 328 | sys.exit(1) 329 | 330 | 331 | def form_miniwdl_run_cmd(args, unused_args, verbose=False): 332 | """ 333 | Formulate the `miniwdl run` command line to be invoked in the workflow job container 334 | """ 335 | wdl_zip = None 336 | if args.self_test: 337 | self_test_dir = os.path.join( 338 | args.mount, "miniwdl_run_self_test", datetime.today().strftime("%Y%m%d_%H%M%S") 339 | ) 340 | miniwdl_run_cmd = ["miniwdl", "run_self_test", "--dir", self_test_dir] 341 | job_name = args.name if args.name else "miniwdl_run_self_test" 342 | else: 343 | wdl_filename_pos = next( 344 | (i for i, arg in enumerate(unused_args) if not arg.startswith("-")), -1 345 | ) 346 | if wdl_filename_pos < 0: 347 | print("Command line appears to be missing WDL filename", file=sys.stderr) 348 | sys.exit(1) 349 | wdl_filename = unused_args[wdl_filename_pos] 350 | wdl_zip = zip_wdl(wdl_filename, args.mount, verbose) 351 | if wdl_zip: 352 | # this sentinel argument will be recognized by miniwdl-run-s3upload 353 | unused_args[wdl_filename_pos] = "--WDL--ZIP--" 354 | job_name = args.name 355 | if not job_name: 356 | job_name = os.path.basename(wdl_filename).lstrip(".") 357 | try: 358 | for punct in (".", "?"): 359 | if job_name.index(punct) > 0: 360 | job_name = job_name[: job_name.index(punct)] 361 | except ValueError: 362 | pass 363 | job_name = ("miniwdl_run_" + job_name)[:128] 364 | # pass most arguments through to miniwdl-run-s3upload inside workflow job 365 | miniwdl_run_cmd = ["miniwdl-run-s3upload"] + unused_args 366 | miniwdl_run_cmd.extend(["--dir", args.dir]) 367 | miniwdl_run_cmd.extend(["--s3upload", args.s3upload] if args.s3upload else []) 368 | miniwdl_run_cmd.extend(["--delete-after", args.delete_after] if args.delete_after else []) 369 | return (job_name, miniwdl_run_cmd, wdl_zip) 370 | 371 | 372 | def zip_wdl(wdl_filename, mount, verbose=False): 373 | """ 374 | If wdl_filename is an existing local .wdl or .zip file, prepare to ship it as the WDL source 375 | code for the workflow job to execute. (Otherwise, it'll be passed through assuming it's some 376 | path or URI the workflow job will be able to open directly.) 377 | 378 | If it's a .zip file, assume it's generated by `miniwdl zip`. 379 | 380 | If it's a .wdl file, run `miniwdl zip` on it. 
381 | """ 382 | if not os.path.isfile(wdl_filename) or not ( 383 | wdl_filename.endswith(".wdl") or wdl_filename.endswith(".zip") 384 | ): 385 | if verbose: 386 | print( 387 | f"WDL: {wdl_filename} (not a local WDL file; assuming accessible inside workflow job)" 388 | ) 389 | return None 390 | if os.path.normpath(os.path.abspath(wdl_filename)).startswith(mount + "/"): 391 | if verbose: 392 | print(f"WDL: {wdl_filename} (assuming {mount} accessible inside workflow job)") 393 | return None 394 | 395 | # load zip bytes 396 | if wdl_filename.endswith(".wdl"): 397 | import subprocess 398 | import tempfile 399 | 400 | try: 401 | with tempfile.TemporaryDirectory() as tmpdir: 402 | zip_fn = os.path.join(tmpdir, os.path.basename(wdl_filename)) + ".zip" 403 | subprocess.check_call(["miniwdl", "zip", "-o", zip_fn, wdl_filename]) 404 | with open(zip_fn, "rb") as zip_file: 405 | zip_bytes = zip_file.read() 406 | # TODO: detect -i file.json in unused_args and provide it to miniwdl zip too 407 | except subprocess.CalledProcessError as exn: 408 | sys.exit(exn.returncode) 409 | else: 410 | assert wdl_filename.endswith(".zip") 411 | with open(wdl_filename, "rb") as zip_file: 412 | zip_bytes = zip_file.read() 413 | assert zip_bytes, "empty WDL zip" 414 | 415 | # aggressively compress, to maximize chance of fitting within the 30KiB limit on Batch 416 | # SubmitJob request: https://docs.aws.amazon.com/batch/latest/userguide/service_limits.html 417 | import base64 418 | import lzma 419 | 420 | zip_str = base64.urlsafe_b64encode( 421 | lzma.compress(zip_bytes, format=lzma.FORMAT_ALONE, preset=(9 | lzma.PRESET_EXTREME)) 422 | ).decode("ascii") 423 | if verbose: 424 | print( 425 | f"WDL/ZIP: {wdl_filename} (encoded as {len(zip_str)} bytes to submit with workflow job)", 426 | file=sys.stderr, 427 | ) 428 | return zip_str 429 | 430 | 431 | def form_workflow_container_props(args, miniwdl_run_cmd, fs_id, wdl_zip=None, verbose=False): 432 | environment = [ 433 | {"name": "MINIWDL__AWS__TASK_QUEUE", "value": args.task_queue}, 434 | {"name": "MINIWDL__FILE_IO__ROOT", "value": args.mount}, 435 | ] 436 | if args.task_queue_fallback: 437 | environment.append( 438 | {"name": "MINIWDL__AWS__TASK_QUEUE_FALLBACK", "value": args.task_queue_fallback} 439 | ) 440 | if args.efs: 441 | environment.append({"name": "MINIWDL__AWS__FS", "value": fs_id}) 442 | environment.append({"name": "MINIWDL__AWS__FSAP", "value": args.fsap}) 443 | else: 444 | environment.append( 445 | {"name": "MINIWDL__SCHEDULER__CONTAINER_BACKEND", "value": "aws_batch_job_no_efs"} 446 | ) 447 | extra_env = set() 448 | if not args.no_env: 449 | # pass through environment variables starting with MINIWDL__ (except those specific to 450 | # workflow job launch, or passed through via command line) 451 | for k in os.environ: 452 | if k.startswith("MINIWDL__") and k not in ( 453 | "MINIWDL__AWS__FS", 454 | "MINIWDL__AWS__FSAP", 455 | "MINIWDL__AWS__TASK_QUEUE", 456 | "MINIWDL__AWS__TASK_QUEUE_FALLBACK", 457 | "MINIWDL__AWS__WORKFLOW_QUEUE", 458 | "MINIWDL__AWS__WORKFLOW_ROLE", 459 | "MINIWDL__AWS__WORKFLOW_IMAGE", 460 | "MINIWDL__AWS__S3_UPLOAD_FOLDER", 461 | "MINIWDL__AWS__S3_UPLOAD_DELETE_AFTER", 462 | "MINIWDL__FILE_IO__ROOT", 463 | ): 464 | environment.append({"name": k, "value": os.environ[k]}) 465 | extra_env.add(k) 466 | 467 | if verbose and extra_env: 468 | print( 469 | "Passing through environment variables (--no-env to disable): " 470 | + " ".join(list(extra_env)), 471 | file=sys.stderr, 472 | ) 473 | 474 | workflow_container_props = { 475 | "image": 
args.image, 476 | "resourceRequirements": [ 477 | {"type": "VCPU", "value": str(args.cpu)}, 478 | {"type": "MEMORY", "value": str(args.memory_GiB * 1024)}, 479 | ], 480 | "environment": [], 481 | } 482 | job_def_tags = {} 483 | job_tags = {} 484 | if wdl_zip: 485 | # If the command line provided a local WDL (or WDL zipped by `miniwdl zip`), ship it in the 486 | # workflow job environment, to be picked up by miniwdl-run-s3upload. If the encoded zip is 487 | # over 4096 characters, then spray the remainder across tags on the workflow job definition 488 | # and workflow job itself. The 4KiB keeps our container properties (+overrides) within AWS' 489 | # 8KiB limit. Then we use up to 42 tags on the job def & job, each with 381 usable bytes 490 | # (within the AWS limits of 50 tags per resource with key length 128 and value length 256). 491 | # Total capacity = 4096 + 2*42*381 = 36100 characters. 492 | workflow_container_props["environment"].append({"name": "WDL_ZIP", "value": wdl_zip[:4096]}) 493 | wdl_zip = wdl_zip[4096:] 494 | tag_num = 0 495 | while wdl_zip: 496 | if tag_num >= 84: 497 | print(_WDL_ZIP_SIZE_MSG, file=sys.stderr) 498 | sys.exit(123) 499 | tag_key = ( 500 | "WZ" 501 | + chr((ord("A") if tag_num % 42 < 26 else (ord("a") - 26)) + tag_num % 42) 502 | + wdl_zip[:125] 503 | ) 504 | tag_value = wdl_zip[125:381] 505 | wdl_zip = wdl_zip[381:] 506 | if tag_num < 42: 507 | job_def_tags[tag_key] = tag_value 508 | else: 509 | job_tags[tag_key] = tag_value 510 | tag_num += 1 511 | workflow_container_overrides = { 512 | "command": miniwdl_run_cmd, 513 | "environment": environment, 514 | } 515 | if args.efs: 516 | # EFS: set EFS volume/mountPoint and Fargate execution role 517 | assert args.workflow_role and fs_id and args.fsap 518 | workflow_container_props.update( 519 | { 520 | "fargatePlatformConfiguration": {"platformVersion": "1.4.0"}, 521 | "executionRoleArn": args.workflow_role, 522 | "jobRoleArn": args.workflow_role, 523 | "volumes": [ 524 | { 525 | "name": "efs", 526 | "efsVolumeConfiguration": { 527 | "fileSystemId": fs_id, 528 | "transitEncryption": "ENABLED", 529 | "authorizationConfig": {"accessPointId": args.fsap}, 530 | }, 531 | } 532 | ], 533 | "mountPoints": [{"containerPath": args.mount, "sourceVolume": "efs"}], 534 | } 535 | ) 536 | if not args.no_public_ip: 537 | workflow_container_props["networkConfiguration"] = {"assignPublicIp": "ENABLED"} 538 | else: 539 | # non-EFS: set volume/mountPoint assuming compute environments mount automatically 540 | workflow_container_props.update( 541 | { 542 | "volumes": [ 543 | { 544 | "name": "file_io_root", 545 | "host": {"sourcePath": args.mount}, 546 | } 547 | ], 548 | "mountPoints": [{"containerPath": args.mount, "sourceVolume": "file_io_root"}], 549 | } 550 | ) 551 | 552 | return (workflow_container_props, workflow_container_overrides, job_def_tags, job_tags) 553 | 554 | 555 | def wait(aws_region_name, aws_batch, workflow_job_id, follow, expect_log_eof=True): 556 | """ 557 | Wait for workflow job to complete & return its exit code; optionally tail its log to stderr 558 | """ 559 | try: 560 | log_follower = None 561 | exit_code = None 562 | saw_end = False 563 | while exit_code is None: 564 | time.sleep(1.0) 565 | job_descs = aws_batch.describe_jobs(jobs=[workflow_job_id]) 566 | job_desc = job_descs["jobs"][0] 567 | if ( 568 | not log_follower 569 | and "container" in job_desc 570 | and "logStreamName" in job_desc["container"] 571 | ): 572 | log_stream_name = job_desc["container"]["logStreamName"] 573 | print("Log stream: " + 
log_stream_name, file=sys.stderr) 574 | sys.stderr.flush() 575 | log_follower = CloudWatchLogsFollower( 576 | boto3.DEFAULT_SESSION, aws_region_name, "/aws/batch/job", log_stream_name 577 | ) 578 | if follow and log_follower: 579 | for event in log_follower.new_events(): 580 | if END_OF_LOG not in event["message"]: 581 | print(event["message"], file=sys.stderr) 582 | else: 583 | saw_end = True 584 | sys.stderr.flush() 585 | if job_desc["status"] == "SUCCEEDED": 586 | exit_code = 0 587 | elif job_desc["status"] == "FAILED": 588 | exit_code = -1 589 | if "container" in job_desc and "exitCode" in job_desc["container"]: 590 | exit_code = job_desc["container"]["exitCode"] 591 | assert exit_code != 0 592 | if expect_log_eof and follow and log_follower and not saw_end: 593 | # give straggler log messages a few seconds to appear 594 | time.sleep(3.0) 595 | for event in log_follower.new_events(): 596 | if END_OF_LOG not in event["message"]: 597 | print(event["message"], file=sys.stderr) 598 | else: 599 | saw_end = True 600 | if not saw_end: 601 | print( 602 | f"[miniwdl-aws-submit] WARNING: end-of-log marker not seen; more information may appear in log stream {log_stream_name}", 603 | file=sys.stderr, 604 | ) 605 | sys.stderr.flush() 606 | status = job_desc["status"] 607 | reason = job_desc.get("statusReason", "") 608 | if reason: 609 | reason = ( 610 | f"\t{reason}" if reason and reason != "Essential container in task exited" else "" 611 | ) 612 | print(status + "\t" + workflow_job_id + reason, file=sys.stderr) 613 | if status == "FAILED" and "Container Overrides length must be at most" in reason: 614 | print(_WDL_ZIP_SIZE_MSG, file=sys.stderr) 615 | exit_code = 123 616 | assert isinstance(exit_code, int) and (exit_code != 0 or status == "SUCCEEDED") 617 | return exit_code 618 | except KeyboardInterrupt: 619 | print( 620 | "[miniwdl-aws-submit] interrupted by Ctrl-C; workflow job probably remains active. To terminate:\n" 621 | f" aws batch terminate-job --reason abort --job-id {workflow_job_id}", 622 | file=sys.stderr, 623 | ) 624 | return -1 625 | 626 | 627 | class CloudWatchLogsFollower: 628 | # Based loosely on: 629 | # https://github.com/aws/aws-cli/blob/v2/awscli/customizations/logs/tail.py 630 | # which wasn't suitable to use directly at the time of this writing, because of 631 | # https://github.com/aws/aws-cli/issues/5560 632 | def __init__(self, boto_session, region_name, group_name, stream_name=None): 633 | self.group_name = group_name 634 | self.stream_name = stream_name 635 | self._newest_timestamp = None 636 | self._newest_event_ids = set() 637 | self._client = boto_session.client("logs", region_name=region_name) 638 | 639 | def new_events(self): 640 | event_ids_per_timestamp = defaultdict(set) 641 | 642 | filter_args = {"logGroupName": self.group_name} 643 | if self.stream_name: 644 | filter_args["logStreamNames"] = [self.stream_name] 645 | if self._newest_timestamp: 646 | filter_args["startTime"] = self._newest_timestamp 647 | while True: 648 | try: 649 | response = self._client.filter_log_events(**filter_args) 650 | except self._client.exceptions.ResourceNotFoundException: 651 | return # we may learn the Batch job's log stream name before it actually exists 652 | for event in response["events"]: 653 | # For the case where we've hit the last page, we will be 654 | # reusing the newest timestamp of the received events to keep polling. 655 | # This means it is possible that duplicate log events with same timestamp 656 | # are returned back which we do not want to yield again. 
657 | # We only want to yield log events that we have not seen. 658 | if event["eventId"] not in self._newest_event_ids: 659 | event_ids_per_timestamp[event["timestamp"]].add(event["eventId"]) 660 | yield event 661 | if "nextToken" in response: 662 | filter_args["nextToken"] = response["nextToken"] 663 | else: 664 | break 665 | 666 | if event_ids_per_timestamp: 667 | self._newest_timestamp = max(event_ids_per_timestamp.keys()) 668 | self._newest_event_ids = event_ids_per_timestamp[self._newest_timestamp] 669 | 670 | 671 | _WDL_ZIP_SIZE_MSG = ( 672 | "\nExceeded AWS Batch request payload size limit; make the WDL source code and/or inputs" 673 | " available by URL or remote filesystem path, to pass by reference." 674 | ) 675 | -------------------------------------------------------------------------------- /plugin_log_task_usage/StressTest.wdl: -------------------------------------------------------------------------------- 1 | version 1.1 2 | # MINIWDL__LOG_TASK_USAGE__PERIOD=2 miniwdl run examples/plugin_log_task_usage/StressTest.wdl --dir /tmp --verbose 3 | # MINIWDL__LOG_TASK_USAGE__PERIOD=2 miniwdl-aws-submit plugin_log_task_usage/StressTest.wdl --verbose --follow 4 | 5 | task StressTest { 6 | input { 7 | Int cpu = 4 8 | Int memory_G = 2 9 | Int cpu_memory_duration_s = 10 10 | Int disk_load_G = 2 11 | 12 | String docker = "polinux/stress" # Docker image with stress tool 13 | } 14 | 15 | command <<< 16 | set -euxo pipefail 17 | 18 | >&2 ls -l /sys/fs/cgroup 19 | 20 | stress --cpu 4 --vm 1 --vm-bytes ~{memory_G}G --vm-hang 0 --timeout ~{cpu_memory_duration_s}s || true 21 | dd if=/dev/zero of=testfile bs=1G count=~{disk_load_G} 22 | sync 23 | cat testfile > /dev/null & 24 | sleep 5 25 | >>> 26 | 27 | runtime { 28 | docker: docker 29 | memory: "${memory_G*2}G" 30 | cpu: cpu 31 | } 32 | 33 | output { 34 | File stderr_txt = stderr() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /plugin_log_task_usage/miniwdl_log_task_usage.py: -------------------------------------------------------------------------------- 1 | """ 2 | miniwdl plugin instrumenting each task container to log its own CPU & memory resource usage 3 | periodically. The logs are written to the task's standard error stream, so they'll appear on the 4 | console only with --verbose logging (but are always recorded in each task's stderr.txt). 5 | 6 | To enable, install this plugin (`pip3 install .` & confirm listed by `miniwdl --version`) and 7 | set configuration [log_task_usage] period (or the environment variable 8 | MINIWDL__LOG_TASK_USAGE__PERIOD) to the desired logging period in seconds. 
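For example: MINIWDL__LOG_TASK_USAGE__PERIOD=2 miniwdl run StressTest.wdl --verbose (see StressTest.wdl alongside this plugin for a workload that exercises the logger).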
9 | 10 | YMMV because host OS version & configuration may affect availability of the cgroup counters read 11 | from pseudo-files under /sys/fs/cgroup 12 | """ 13 | 14 | 15 | def main(cfg, logger, run_id, run_dir, task, **recv): 16 | # do nothing with inputs 17 | recv = yield recv 18 | 19 | # inject logger into command script 20 | if cfg.has_option("log_task_usage", "period"): 21 | period = cfg["log_task_usage"].get_int("period") 22 | recv["command"] = _logger_sh + f"_miniwdl_log_task_usage {period} &\n\n" + recv["command"] 23 | recv = yield recv 24 | 25 | # do nothing with outputs 26 | yield recv 27 | 28 | 29 | _logger_sh = r""" 30 | _miniwdl_log_task_usage() { 31 | set +ex 32 | local PERIOD_SECS=${1:-10} # logging period (default 10s) 33 | 34 | # detect whether host provides cgroup v2 or v1, and helper functions to read CPU & memory usage 35 | # counters from the appropriate pseudo-files 36 | local cgroup_version="" 37 | if [ -f /sys/fs/cgroup/cpu.stat ]; then 38 | cgroup_version=2 39 | elif [ -f /sys/fs/cgroup/cpuacct/cpuacct.stat ]; then 40 | cgroup_version=1 41 | else 42 | >&2 echo "miniwdl_log_task_usage unable to report: cgroup CPU usage counters not found" 43 | exit 1 44 | fi 45 | 46 | cpu_secs() { 47 | local ans 48 | if [ $cgroup_version -eq 2 ]; then 49 | ans=$(awk '/^usage_usec/ {print $2}' /sys/fs/cgroup/cpu.stat) 50 | echo $(( ans / 1000000 )) 51 | else 52 | ans=$(cut -f2 -d ' ' /sys/fs/cgroup/cpuacct/cpuacct.stat | head -n 1) 53 | echo $(( ans / 100 )) # 100 "jiffies" per second 54 | fi 55 | } 56 | 57 | mem_bytes() { 58 | if [ $cgroup_version -eq 2 ]; then 59 | awk '$1 == "anon" { print $2 }' /sys/fs/cgroup/memory.stat 60 | else 61 | awk -F ' ' '$1 == "total_rss" { print $2 }' /sys/fs/cgroup/memory/memory.stat 62 | fi 63 | } 64 | 65 | local T_0=$(date +%s) 66 | local t_last=$T_0 67 | local cpu_secs_0=$(cpu_secs) 68 | local cpu_secs_last=$cpu_secs_0 69 | 70 | while true; do 71 | sleep "$PERIOD_SECS" 72 | local t=$(date +%s) 73 | local wall_secs=$(( t - T_0 )) 74 | 75 | local cpu_secs_current=$(cpu_secs) 76 | local cpu_total_secs=$(( cpu_secs_current - cpu_secs_0 )) 77 | local cpu_period_secs=$(( cpu_secs_current - cpu_secs_last )) 78 | 79 | local mem_bytes_current=$(mem_bytes) 80 | 81 | >&2 echo "container usage :: cpu_pct: $(( 100 * cpu_period_secs / PERIOD_SECS )), mem_MiB: $(( mem_bytes_current/1048576 )), cpu_total_s: ${cpu_total_secs}, elapsed_s: ${wall_secs}" 82 | 83 | cpu_secs_last=$cpu_secs_current 84 | t_last=$t 85 | done 86 | } 87 | """ 88 | -------------------------------------------------------------------------------- /plugin_log_task_usage/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="miniwdl_log_task_usage", 5 | version="0.1.0", 6 | description="miniwdl task plugin to log container cpu/mem usage", 7 | author="Wid L. 
Hacker", 8 | py_modules=["miniwdl_log_task_usage"], 9 | python_requires=">=3.6", 10 | setup_requires=["reentry"], 11 | install_requires=["miniwdl"], 12 | reentry_register=True, 13 | entry_points={ 14 | "miniwdl.plugin.task": ["log_task_usage = miniwdl_log_task_usage:main"], 15 | }, 16 | ) 17 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | HERE="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | cd "$HERE" 7 | 8 | if grep dirty <(git describe --always --dirty); then 9 | >&2 echo "Cannot release dirty working tree" 10 | exit 1 11 | fi 12 | 13 | rm -rf build dist *.egg-info 14 | python3 setup.py sdist 15 | echo -e "\033[0;31;5m -- Pushing $(basename `ls -1 dist/*.tar.gz` .tar.gz) to PyPI! -- \033[0m" 16 | twine upload dist/*.tar.gz 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from version import get_version 3 | 4 | with open("README.md") as fp: 5 | long_description = fp.read() 6 | 7 | setup( 8 | name="miniwdl-aws", 9 | version=get_version(), 10 | description="miniwdl AWS backend (Batch+EFS)", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | author="Wid L. Hacker", 14 | python_requires=">=3.6", 15 | packages=find_packages(), 16 | setup_requires=["reentry"], 17 | install_requires=["miniwdl>=1.11.1", "boto3>=1.17", "requests"], 18 | reentry_register=True, 19 | entry_points={ 20 | "miniwdl.plugin.container_backend": [ 21 | "aws_batch_job = miniwdl_aws:BatchJob", 22 | "aws_batch_job_no_efs = miniwdl_aws:BatchJobNoEFS", 23 | ], 24 | "console_scripts": [ 25 | "miniwdl-run-s3upload = miniwdl_aws:miniwdl_run_s3upload", 26 | "miniwdl-aws-submit = miniwdl_aws.__main__:main", 27 | ], 28 | }, 29 | ) 30 | -------------------------------------------------------------------------------- /test/assets/count_lines.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | workflow count_lines { 3 | input { 4 | Array[File] files 5 | } 6 | scatter (file in files) { 7 | Array[String] file_lines = read_lines(file) 8 | } 9 | output { 10 | Int lines = length(flatten(file_lines)) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/assets/test_call_cache.wdl: -------------------------------------------------------------------------------- 1 | version 1.1 2 | 3 | workflow test_call_cache { 4 | input { 5 | Array[String] names 6 | Int timestamp_in # set to test case start time, preventing use of stale cache test entries 7 | Boolean fail = false 8 | } 9 | scatter (name in names) { 10 | call write_name { 11 | input: 12 | name = name, 13 | timestamp_in = timestamp_in 14 | } 15 | call t { 16 | input: 17 | who = write_name.name_file, 18 | timestamp_in = timestamp_in 19 | } 20 | } 21 | if (fail) { 22 | call failer after t 23 | } 24 | output { 25 | Array[Int] timestamps_out = t.timestamp_out 26 | Array[File] messages = t.message 27 | } 28 | } 29 | 30 | task write_name { 31 | input { 32 | String name 33 | Int timestamp_in 34 | } 35 | command { 36 | cp '~{write_lines([name])}' name.txt 37 | } 38 | output { 39 | File name_file = "name.txt" 40 | Int timestamp_out = timestamp_in 41 | } 42 | } 43 | 44 | task t { 
45 | input { 46 | File who 47 | Int timestamp_in 48 | } 49 | command <<< 50 | t=$(date +%s) 51 | echo "$t" > timestamp_out 52 | echo "Hello, $(cat ~{who})! @$t" | tee message.txt 53 | >>> 54 | output { 55 | Int timestamp_out = read_int("timestamp_out") 56 | File message = "message.txt" 57 | } 58 | } 59 | 60 | task failer { 61 | command { 62 | exit 1 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /test/assets/test_directory.wdl: -------------------------------------------------------------------------------- 1 | version development 2 | 3 | workflow test_directory_workflow { 4 | input { 5 | Array[String] names = ["Alice", "Bob", "Carol"] 6 | } 7 | call make_directory { 8 | input: 9 | names 10 | } 11 | call test_directory { 12 | input: 13 | dir = make_directory.dir 14 | } 15 | output { 16 | Directory dir = make_directory.dir 17 | File report = test_directory.report 18 | Int file_count = test_directory.file_count 19 | } 20 | } 21 | 22 | task make_directory { 23 | input { 24 | Array[String] names 25 | } 26 | 27 | File names_file = write_lines(names) 28 | 29 | command <<< 30 | mkdir messages 31 | while read -r name; do 32 | echo "Hello, $name!" > "messages/$name.txt" 33 | done < '~{names_file}' 34 | >>> 35 | 36 | output { 37 | Directory dir = "messages" 38 | } 39 | } 40 | 41 | task test_directory { 42 | input { 43 | Directory dir 44 | } 45 | 46 | command <<< 47 | find '~{dir}' -type f | xargs sha256sum > report.txt 48 | find '~{dir}' -type f | wc -l > file.count 49 | >>> 50 | 51 | output { 52 | File report = "report.txt" 53 | Int file_count = read_int("file.count") 54 | } 55 | 56 | runtime { 57 | docker: "ubuntu:22.04" 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /test/assets/test_nonexistent_docker.wdl: -------------------------------------------------------------------------------- 1 | version 1.1 2 | 3 | task t { 4 | input { 5 | String docker 6 | } 7 | command { 8 | echo "Hello, world!" 9 | } 10 | runtime { 11 | docker: docker 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /test/assets/test_retry_streams.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | # This WDL tests stdout/stderr outputs and automatic task retry. The task outputs messages via 3 | # captured standard output and error files, but fails on 3/4 attempts. We're looking for the tasks 4 | # to ultimately succeed and to produce the expected outputs (in particular, each output file should 5 | # have only one message, despite the task potentially having been tried multiple times). 6 | 7 | workflow test_retry_streams { 8 | input {} 9 | 10 | scatter (i in range(4)) { 11 | call test_retry_streams_task 12 | } 13 | 14 | output { 15 | Array[File] messages = test_retry_streams_task.message 16 | Array[File] stdouts = test_retry_streams_task.stdout 17 | Array[File] stderrs = test_retry_streams_task.stderr 18 | } 19 | } 20 | 21 | task test_retry_streams_task { 22 | input {} 23 | 24 | command <<< 25 | echo "Hello, stdout!" | tee message.txt 26 | >&2 echo "Hello, stderr!" 
27 | if (( RANDOM % 4 > 0)); then 28 | exit 42 29 | fi 30 | >>> 31 | 32 | output { 33 | File message = "message.txt" 34 | File stdout = stdout() 35 | File stderr = stderr() 36 | } 37 | 38 | runtime { 39 | docker: "ubuntu:20.04" 40 | cpu: 1 41 | maxRetries: 99 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /test/assets/test_termination.wdl: -------------------------------------------------------------------------------- 1 | version 1.1 2 | 3 | workflow w { 4 | scatter (i in range(4)) { 5 | call t { 6 | input: 7 | i 8 | } 9 | } 10 | } 11 | 12 | task t { 13 | input { 14 | Int i 15 | } 16 | 17 | command <<< 18 | if (( ~{i} == 3 )); then 19 | sleep 10 20 | >&2 echo -n "This is the end, my only friend" 21 | echo "I'll never look into your eyes again" 22 | exit 42 23 | fi 24 | sleep 600 25 | >>> 26 | } 27 | -------------------------------------------------------------------------------- /test/build_test_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build the miniwdl-aws docker image and push to ECR (of the currently-credentialed account) for 3 | # use with live tests. Prepare in advance: 4 | # aws ecr create-repository --repository-name miniwdl-aws 5 | 6 | set -euo pipefail 7 | 8 | # build local image 9 | cd "$(dirname "$0")/.." 10 | >&2 python3 setup.py check 11 | >&2 docker pull public.ecr.aws/amazonlinux/amazonlinux:2023 12 | >&2 docker build -t miniwdl-aws . 13 | 14 | # login to ECR 15 | AWS_REGION="$(aws configure get region)" 16 | ECR_REGISTRY_ID="$(aws ecr describe-registry | jq -r .registryId)" 17 | ECR_REPO="${ECR_REGISTRY_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/miniwdl-aws" 18 | aws ecr get-login-password --region $(aws configure get region) \ 19 | | >&2 docker login --username AWS --password-stdin $ECR_REPO 20 | 21 | # set ECR tag & push 22 | >&2 docker tag miniwdl-aws:latest ${ECR_REPO}:latest 23 | >&2 docker push ${ECR_REPO} 24 | 25 | # print full RepoDigest (for use with `docker pull`) to stdout 26 | >&2 echo 27 | echo "$(docker inspect ${ECR_REPO}:latest | jq -r '.[0].RepoDigests[0]')" 28 | -------------------------------------------------------------------------------- /test/requirements.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | black 3 | flake8 4 | pylint 5 | pytest 6 | pytest-xdist 7 | boto3 8 | -------------------------------------------------------------------------------- /test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | cd "$(dirname "$0")" 6 | 7 | export AWS_DEFAULT_REGION=$(aws configure get region) 8 | if [[ -z ${MINIWDL__AWS__WORKFLOW_IMAGE:-} ]]; then 9 | export MINIWDL__AWS__WORKFLOW_IMAGE=$(./build_test_image.sh) 10 | if [[ -z $MINIWDL__AWS__WORKFLOW_IMAGE ]]; then 11 | exit 1 12 | fi 13 | fi 14 | export MINIWDL_AWS_TEST_BUCKET="miniwdl-test-$(aws sts get-caller-identity | jq -r .Account)" 15 | >&2 echo "Creating S3 bucket $MINIWDL_AWS_TEST_BUCKET (BucketAlreadyOwnedByYou error is OK):" 16 | aws s3api create-bucket --bucket "$MINIWDL_AWS_TEST_BUCKET" \ 17 | --region "$AWS_DEFAULT_REGION" --create-bucket-configuration LocationConstraint="$AWS_DEFAULT_REGION" \ 18 | || true 19 | # NOTE: workflow IAM role needs to be able to write to that bucket... 
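# (e.g. an IAM policy allowing s3:PutObject and s3:ListBucket on the test bucket; the exact statements needed depend on your deployment)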
20 | 21 | pytest -sxv test*.py $@ 22 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import subprocess 4 | import time 5 | import pytest 6 | import boto3 7 | import random 8 | from datetime import datetime 9 | from urllib.parse import urlparse 10 | 11 | assert "AWS_DEFAULT_REGION" in os.environ 12 | assert ( 13 | "MINIWDL__AWS__WORKFLOW_IMAGE" in os.environ 14 | and "miniwdl-aws" in os.environ["MINIWDL__AWS__WORKFLOW_IMAGE"] 15 | ), "set environment MINIWDL__AWS__WORKFLOW_IMAGE to repo:digest" 16 | assert ( 17 | "MINIWDL__AWS__WORKFLOW_QUEUE" in os.environ 18 | ), "set MINIWDL__AWS__WORKFLOW_QUEUE to Batch queue name" 19 | assert ( 20 | "MINIWDL_AWS_TEST_BUCKET" in os.environ 21 | ), "set MINIWDL_AWS_TEST_BUCKET to test S3 bucket (name only)" 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def aws_batch(): 26 | return boto3.client("batch", region_name=os.environ["AWS_DEFAULT_REGION"]) 27 | 28 | 29 | def batch_miniwdl(aws_batch, args, env=None, upload=None, cache=False): 30 | """ 31 | Submit & await a Batch job to run cmd in the miniwdl_aws container (usually ~miniwdl run~ 32 | to launch other Batch jobs in turn) 33 | """ 34 | cmd = ["python3", "-m", "miniwdl_aws"] 35 | cmd.extend(args) 36 | cmd.append("--follow") 37 | if not cache: 38 | cmd.append("--no-cache") 39 | if upload: 40 | if not upload.endswith("/"): 41 | upload += "/" 42 | cmd.extend(["--s3upload", upload]) 43 | 44 | exit_code = subprocess.run( 45 | cmd, cwd=os.path.dirname(os.path.dirname(__file__)), check=False, env=env 46 | ).returncode 47 | 48 | if exit_code != 0: 49 | ans = {"success": False, "exit_code": exit_code} 50 | if upload: 51 | error = get_s3uri(upload + "error.json") 52 | if error: 53 | ans["error"] = json.loads(error) 54 | return ans 55 | 56 | ans = {"success": True} 57 | if upload: 58 | outputs = get_s3uri(upload + "outputs.json") 59 | if outputs: 60 | ans["outputs"] = json.loads(outputs) 61 | return ans 62 | 63 | 64 | def get_s3uri(uri): 65 | """ 66 | Download bytes from s3:// URI 67 | """ 68 | try: 69 | assert uri.startswith("s3://") 70 | parts = urlparse(uri) 71 | obj = boto3.resource("s3", region_name=os.environ["AWS_DEFAULT_REGION"]).Object( 72 | parts.netloc, parts.path.lstrip("/") 73 | ) 74 | return obj.get()["Body"].read() 75 | except Exception as exn: 76 | if "NoSuchKey" in str(exn): 77 | return None 78 | raise 79 | 80 | 81 | def test_miniwdl_run_self_test(aws_batch): 82 | subprocess.run( 83 | [ 84 | "python3", 85 | "-m", 86 | "miniwdl_aws", 87 | "--follow", 88 | "--self-test", 89 | "--no-cache", 90 | "--mount", 91 | "/mnt/shared", 92 | ], 93 | cwd=os.path.dirname(os.path.dirname(__file__)), 94 | check=True, 95 | ) 96 | 97 | 98 | @pytest.fixture(scope="session") 99 | def test_s3_folder(): 100 | """ 101 | S3 folder for this test session 102 | """ 103 | return f"s3://{os.environ['MINIWDL_AWS_TEST_BUCKET']}/{datetime.today().strftime('%Y%m%d_%H%M%S')}/" 104 | 105 | 106 | def test_retry_streams(aws_batch, test_s3_folder): 107 | env = dict(os.environ) 108 | env["MINIWDL__AWS__RETRY_WAIT"] = "1" 109 | rslt = batch_miniwdl( 110 | aws_batch, 111 | [ 112 | "/var/miniwdl_aws_test_assets/test_retry_streams.wdl", 113 | "--dir", 114 | "/mnt/efs/miniwdl_aws_tests", 115 | "--verbose", 116 | ], 117 | upload=test_s3_folder + "test_retry_streams/", 118 | env=env, 119 | ) 120 | assert rslt["success"] 121 | assert 
len(rslt["outputs"]["test_retry_streams.messages"]) == 4 122 | assert len(rslt["outputs"]["test_retry_streams.stdouts"]) == 4 123 | assert len(rslt["outputs"]["test_retry_streams.stderrs"]) == 4 124 | for i in range(4): 125 | assert ( 126 | get_s3uri(rslt["outputs"]["test_retry_streams.messages"][i]).decode().strip() 127 | == "Hello, stdout!" 128 | ) 129 | assert ( 130 | get_s3uri(rslt["outputs"]["test_retry_streams.stdouts"][i]).decode().strip() 131 | == "Hello, stdout!" 132 | ) 133 | assert ( 134 | get_s3uri(rslt["outputs"]["test_retry_streams.stderrs"][i]).decode().strip() 135 | == "Hello, stderr!" 136 | ) 137 | 138 | 139 | def test_assemble_refbased(aws_batch, test_s3_folder): 140 | rslt = batch_miniwdl( 141 | aws_batch, 142 | [ 143 | "https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/pipes/WDL/workflows/assemble_refbased.wdl", 144 | "reads_unmapped_bams=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/G5012.3.testreads.bam", 145 | "reference_fasta=https://github.com/broadinstitute/viral-pipelines/raw/v2.1.19.0/test/input/ebov-makona.fasta", 146 | "sample_name=G5012.3", 147 | "--dir", 148 | "/mnt/efs/miniwdl_aws_tests", 149 | "--verbose", 150 | ], 151 | upload=test_s3_folder + "test_assemble_refbased/", 152 | ) 153 | assert rslt["success"] 154 | # TODO: more assertions 155 | 156 | 157 | def test_termination(aws_batch, test_s3_folder): 158 | """ 159 | Upon a CommandFailed task failure, the workflow with parallel tasks quickly self-terminates. 160 | """ 161 | t0 = time.time() 162 | env = dict(os.environ) 163 | env["MINIWDL__AWS__CONTAINER_SYNC"] = "true" 164 | rslt = batch_miniwdl( 165 | aws_batch, 166 | [ 167 | "/var/miniwdl_aws_test_assets/test_termination.wdl", 168 | "--dir", 169 | "/mnt/efs/miniwdl_aws_tests", 170 | "--verbose", 171 | ], 172 | upload=test_s3_folder + "test_termination/", 173 | env=env, 174 | ) 175 | assert not rslt["success"] 176 | assert rslt["error"]["cause"]["error"] == "CommandFailed" 177 | assert rslt["error"]["cause"]["exit_status"] == 42 178 | assert ( 179 | "This is the end, my only friend" 180 | in get_s3uri(rslt["error"]["cause"]["stderr_s3file"]).decode() 181 | ) 182 | assert ( 183 | "I'll never look into your eyes again" 184 | in get_s3uri(rslt["error"]["cause"]["stdout_s3file"]).decode() 185 | ) 186 | assert time.time() - t0 < 600 187 | 188 | 189 | def test_nonexistent_docker(aws_batch, test_s3_folder): 190 | """ 191 | Workflow specifies a docker image that doesn't exist; does this error bubble up from AWS Batch 192 | in a reasonable way? 
193 | """ 194 | rslt = batch_miniwdl( 195 | aws_batch, 196 | [ 197 | "/var/miniwdl_aws_test_assets/test_nonexistent_docker.wdl", 198 | "docker=nonexistent_bogus_12345", 199 | "--dir", 200 | "/mnt/efs/miniwdl_aws_tests", 201 | "--delete-after", 202 | "failure", 203 | "--verbose", 204 | ], 205 | upload=test_s3_folder + "test_nonexistent_docker/", 206 | ) 207 | assert not rslt["success"] 208 | assert "CannotPullContainerError" in str(rslt["error"]) 209 | 210 | 211 | def test_call_cache(aws_batch, test_s3_folder): 212 | """ 213 | Call cache works (short-term, where previous outputs remain on /mnt/shared) 214 | """ 215 | t0 = int(time.time()) 216 | # run once to prime cache 217 | rslt = batch_miniwdl( 218 | aws_batch, 219 | [ 220 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl", 221 | "timestamp_in=" 222 | + str(t0), 223 | "names=Alice", 224 | "names=Bob", 225 | "names=Carol", 226 | "fail=true", 227 | "--verbose", 228 | "--dir", 229 | "/mnt/efs/miniwdl_aws_tests", 230 | ], 231 | cache=False, 232 | ) 233 | assert not rslt["success"] 234 | 235 | # run again where a subset of calls should be reused 236 | t1 = int(time.time()) 237 | rslt = batch_miniwdl( 238 | aws_batch, 239 | [ 240 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl", 241 | "timestamp_in=" 242 | + str(t0), 243 | "names=Alice", 244 | "names=Bob", 245 | "names=Xavier", 246 | "--verbose", 247 | "--dir", 248 | "/mnt/efs/miniwdl_aws_tests", 249 | ], 250 | cache=True, 251 | upload=test_s3_folder + "test_call_cache/", 252 | ) 253 | assert rslt["success"] 254 | 255 | # Alice and Bob were cached, Xavier was not: 256 | assert t0 <= rslt["outputs"]["test_call_cache.timestamps_out"][0] <= t1 257 | assert t0 <= rslt["outputs"]["test_call_cache.timestamps_out"][1] <= t1 258 | assert rslt["outputs"]["test_call_cache.timestamps_out"][2] > t1 259 | assert "Hello, Alice!" in get_s3uri(rslt["outputs"]["test_call_cache.messages"][0]).decode() 260 | assert "Hello, Bob!" in get_s3uri(rslt["outputs"]["test_call_cache.messages"][1]).decode() 261 | assert "Hello, Xavier!" in get_s3uri(rslt["outputs"]["test_call_cache.messages"][2]).decode() 262 | 263 | 264 | def test_call_cache_one_task(aws_batch, test_s3_folder): 265 | """ 266 | Short-term call cache of one task (where the entire run's outputs, not just a portion thereof, 267 | are sourced from the cache). 268 | """ 269 | t0 = int(time.time()) 270 | rslt = batch_miniwdl( 271 | aws_batch, 272 | [ 273 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl", 274 | "timestamp_in=" 275 | + str(t0), 276 | "name=Alyssa", 277 | "--task", 278 | "write_name", 279 | "--verbose", 280 | "--dir", 281 | "/mnt/efs/miniwdl_aws_tests", 282 | ], 283 | cache=False, 284 | ) 285 | assert rslt["success"] 286 | 287 | t1 = int(time.time()) 288 | rslt = batch_miniwdl( 289 | aws_batch, 290 | [ 291 | "/var/miniwdl_aws_test_assets/test_call_cache.wdl", 292 | "timestamp_in=" 293 | + str(t0), 294 | "name=Alyssa", 295 | "--task", 296 | "write_name", 297 | "--verbose", 298 | "--dir", 299 | "/mnt/efs/miniwdl_aws_tests", 300 | ], 301 | cache=True, 302 | upload=test_s3_folder + "test_call_cache_one_task/", 303 | ) 304 | assert rslt["success"] 305 | 306 | assert t0 <= rslt["outputs"]["write_name.timestamp_out"] <= t1 307 | assert "Alyssa" in get_s3uri(rslt["outputs"]["write_name.name_file"]).decode() 308 | 309 | 310 | def test_download(aws_batch): 311 | """ 312 | Test that a workflow can use https:// and s3:// input files. This functionality is built into miniwdl 313 | so it ought to just work, but it's nice to cover it here.
314 | """ 315 | rslt = batch_miniwdl( 316 | aws_batch, 317 | [ 318 | "/var/miniwdl_aws_test_assets/count_lines.wdl", 319 | "files=https://raw.githubusercontent.com/chanzuckerberg/miniwdl/main/tests/alyssa_ben.txt", 320 | "files=s3://1000genomes/CHANGELOG", 321 | "--dir", 322 | "/mnt/efs/miniwdl_aws_tests", 323 | "--verbose", 324 | ], 325 | ) 326 | assert rslt["success"] 327 | 328 | 329 | def test_directory(aws_batch, test_s3_folder): 330 | """ 331 | Test Directory I/O 332 | """ 333 | 334 | rslt = batch_miniwdl( 335 | aws_batch, 336 | [ 337 | "/var/miniwdl_aws_test_assets/test_directory.wdl", 338 | "--dir", 339 | "/mnt/efs/miniwdl_aws_tests", 340 | "--verbose", 341 | ], 342 | upload=test_s3_folder + "test_directory/", 343 | ) 344 | assert rslt["success"] 345 | assert rslt["outputs"]["test_directory_workflow.dir"].startswith("s3://") 346 | assert rslt["outputs"]["test_directory_workflow.file_count"] == 3 347 | 348 | rslt = batch_miniwdl( 349 | aws_batch, 350 | [ 351 | "/var/miniwdl_aws_test_assets/test_directory.wdl", 352 | "dir=s3://1000genomes/changelog_details/", 353 | "--task", 354 | "test_directory", 355 | "--dir", 356 | "/mnt/efs/miniwdl_aws_tests", 357 | "--verbose", 358 | ], 359 | upload=test_s3_folder + "test_directory/", 360 | ) 361 | assert rslt["success"] 362 | assert rslt["outputs"]["test_directory.file_count"] > 100 363 | 364 | 365 | def test_shipping_local_wdl(aws_batch, tmp_path, test_s3_folder): 366 | with open(tmp_path / "outer.wdl", "w") as outfile: 367 | print( 368 | """ 369 | version development 370 | import "inner.wdl" 371 | 372 | workflow outer { 373 | input { 374 | String who 375 | } 376 | call inner.hello { input: who } 377 | output { 378 | String message = hello.message 379 | } 380 | } 381 | """, 382 | file=outfile, 383 | ) 384 | with open(tmp_path / "inner.wdl", "w") as outfile: 385 | print( 386 | """ 387 | version development 388 | 389 | task hello { 390 | input { 391 | String who 392 | } 393 | command { 394 | echo 'Hello, ~{who}!' 395 | } 396 | output { 397 | String message = read_string(stdout()) 398 | } 399 | } 400 | """, 401 | file=outfile, 402 | ) 403 | rslt = batch_miniwdl( 404 | aws_batch, 405 | [ 406 | str(tmp_path / "outer.wdl"), 407 | "who=world", 408 | "--dir", 409 | "/mnt/efs/miniwdl_aws_tests", 410 | ], 411 | upload=test_s3_folder + "test_shipping_local_wdl/", 412 | ) 413 | assert rslt["outputs"]["outer.message"] == "Hello, world!" 
414 | 415 | 416 | def test_shipping_local_wdl_error(aws_batch, tmp_path, test_s3_folder): 417 | almost_big_str = "".join(chr(random.randrange(ord("A"), ord("Z"))) for _ in range(42000)) 418 | with open(tmp_path / "almost_big.wdl", "w") as outfile: 419 | print( 420 | """ 421 | version development 422 | 423 | workflow outer { 424 | input { 425 | } 426 | output { 427 | String big = "XXX" 428 | } 429 | } 430 | """.replace( 431 | "XXX", almost_big_str 432 | ), 433 | file=outfile, 434 | ) 435 | rslt = batch_miniwdl( 436 | aws_batch, 437 | [ 438 | str(tmp_path / "almost_big.wdl"), 439 | "--dir", 440 | "/mnt/efs/miniwdl_aws_tests", 441 | ], 442 | upload=test_s3_folder + "test_shipping_local_wdl_error/", 443 | ) 444 | assert rslt["success"] 445 | assert rslt["outputs"]["outer.big"] == almost_big_str 446 | 447 | # Test for reasonable error when zipped WDL is too large 448 | big_str = "".join(chr(random.randrange(ord("A"), ord("Z"))) for _ in range(50000)) 449 | with open(tmp_path / "big.wdl", "w") as outfile: 450 | print( 451 | """ 452 | version development 453 | 454 | workflow outer { 455 | input { 456 | } 457 | output { 458 | String big = "XXX" 459 | } 460 | } 461 | """.replace( 462 | "XXX", big_str 463 | ), 464 | file=outfile, 465 | ) 466 | rslt = batch_miniwdl( 467 | aws_batch, 468 | [ 469 | str(tmp_path / "big.wdl"), 470 | "--dir", 471 | "/mnt/efs/miniwdl_aws_tests", 472 | ], 473 | ) 474 | assert rslt["exit_code"] == 123 475 | 476 | 477 | def test_log_task_usage(aws_batch, test_s3_folder): 478 | env = dict(os.environ) 479 | env["MINIWDL__LOG_TASK_USAGE__PERIOD"] = "2" 480 | rslt = batch_miniwdl( 481 | aws_batch, 482 | [ 483 | os.path.join(os.path.dirname(__file__), "../plugin_log_task_usage/StressTest.wdl"), 484 | "--dir", 485 | "/mnt/efs/miniwdl_aws_tests", 486 | "--verbose", 487 | "--delete-after", 488 | "always", 489 | ], 490 | upload=test_s3_folder + "test_log_task_usage/", 491 | env=env, 492 | ) 493 | assert rslt["success"] 494 | assert "container usage ::" in get_s3uri(rslt["outputs"]["StressTest.stderr_txt"]).decode() 495 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Calculates the current version number. 4 | 5 | If possible, uses output of “git describe” modified to conform to the 6 | versioning scheme that setuptools uses (see PEP 386). Releases must be 7 | labelled with annotated tags (signed tags are annotated) of the following 8 | format: 9 | 10 | v<num>(.<num>)+ [ {a|b|c|rc} <num> (.<num>)* ] 11 | 12 | If “git describe” returns an error (likely because we're in an unpacked copy 13 | of a release tarball, rather than a git working copy), or returns a tag that 14 | does not match the above format, version is read from RELEASE-VERSION file. 15 | 16 | To use this script, simply import it in your setup.py file, and use the results 17 | of get_version() as your package version: 18 | 19 | import version 20 | setup( 21 | version=version.get_version(), 22 | . 23 | . 24 | . 25 | ) 26 | 27 | This will automatically update the RELEASE-VERSION file. The RELEASE-VERSION 28 | file should *not* be checked into git but it *should* be included in sdist 29 | tarballs (as should version.py file).
To do this, run: 30 | 31 | echo include RELEASE-VERSION version.py >>MANIFEST.in 32 | echo RELEASE-VERSION >>.gitignore 33 | 34 | With that setup, a new release can be labelled by simply invoking: 35 | 36 | git tag -s v1.0 37 | """ 38 | 39 | __author__ = ("Douglas Creager <dcreager@dcreager.net>", "Michal Nazarewicz <mina86@mina86.com>") 40 | __license__ = "This file is placed into the public domain." 41 | __maintainer__ = "Michal Nazarewicz" 42 | __email__ = "mina86@mina86.com" 43 | 44 | __all__ = ("get_version",) 45 | 46 | 47 | import re 48 | import subprocess 49 | import sys 50 | 51 | 52 | RELEASE_VERSION_FILE = "RELEASE-VERSION" 53 | 54 | # http://www.python.org/dev/peps/pep-0386/ 55 | _PEP386_SHORT_VERSION_RE = r"\d+(?:\.\d+)+(?:(?:[abc]|rc)\d+(?:\.\d+)*)?" 56 | _PEP386_VERSION_RE = r"^%s(?:\.post\d+)?(?:\.dev\d+)?$" % (_PEP386_SHORT_VERSION_RE) 57 | _GIT_DESCRIPTION_RE = r"^v(?P<ver>%s)-(?P<commits>\d+)-g(?P<sha>[\da-f]+)$" % ( 58 | _PEP386_SHORT_VERSION_RE 59 | ) 60 | 61 | 62 | def read_git_version(): 63 | try: 64 | proc = subprocess.Popen( # pylint: disable=R1732 65 | ("git", "describe", "--long", "--tags", "--match", "v[0-9]*.*"), 66 | stdout=subprocess.PIPE, 67 | stderr=subprocess.PIPE, 68 | ) 69 | data, _ = proc.communicate() 70 | if proc.returncode: 71 | return None 72 | ver = data.decode().splitlines()[0].strip() 73 | except: 74 | return None 75 | 76 | if not ver: 77 | return None 78 | match = re.search(_GIT_DESCRIPTION_RE, ver) 79 | if not match: 80 | sys.stderr.write("version: git description (%s) is invalid, " "ignoring\n" % ver) 81 | return None 82 | 83 | commits = int(match.group("commits")) 84 | if not commits: 85 | return match.group("ver") 86 | return "%s.post%d.dev%d" % (match.group("ver"), commits, int(match.group("sha"), 16)) 87 | 88 | 89 | def read_release_version(): 90 | try: 91 | with open(RELEASE_VERSION_FILE) as infile: 92 | ver = infile.readline().strip() 93 | if not re.search(_PEP386_VERSION_RE, ver): 94 | sys.stderr.write( 95 | "version: release version (%s) is invalid, " "will use it anyway\n" % ver 96 | ) 97 | return ver 98 | except: 99 | return None 100 | 101 | 102 | def write_release_version(version): 103 | with open(RELEASE_VERSION_FILE, "w") as outfile: 104 | outfile.write("%s\n" % version) 105 | 106 | 107 | def get_version(): 108 | release_version = read_release_version() 109 | version = read_git_version() or release_version 110 | if not version: 111 | raise ValueError("Cannot find the version number") 112 | if version != release_version: 113 | write_release_version(version) 114 | return version 115 | 116 | 117 | if __name__ == "__main__": 118 | print(get_version()) 119 | --------------------------------------------------------------------------------