├── .pre-commit-config.yaml ├── LICENSE ├── README.md └── aws ├── cloudformation ├── README.md └── metaflow-cfn-template.yml └── terraform ├── .gitignore ├── README.md ├── infra ├── README.md ├── data.tf ├── example.tfvars ├── locals.tf ├── main.tf ├── outputs.tf ├── variables.tf ├── versions.tf └── vpc.tf ├── metaflow ├── .gitignore ├── data.tf ├── example.tfvars ├── iam-custom-role.tf ├── locals.tf ├── main.tf ├── outputs.tf ├── variables.tf └── versions.tf ├── modules ├── common │ ├── locals.tf │ ├── main.tf │ ├── outputs.tf │ └── variables.tf └── metaflow │ ├── .terraform-docs.yml │ ├── README.md │ ├── data.tf │ ├── ecr.tf │ ├── iam.tf │ ├── locals.tf │ ├── main.tf │ ├── modules │ ├── .terraform-docs.yml │ ├── README.md │ ├── computation │ │ ├── README.md │ │ ├── batch.tf │ │ ├── data.tf │ │ ├── ec2.tf │ │ ├── iam-batch-execution.tf │ │ ├── iam-ecs-execution.tf │ │ ├── iam-ecs-instance.tf │ │ ├── locals.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── datastore │ │ ├── README.md │ │ ├── kms.tf │ │ ├── locals.tf │ │ ├── outputs.tf │ │ ├── rds.tf │ │ ├── s3.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── metadata-service │ │ ├── .gitignore │ │ ├── README.md │ │ ├── api-gateway.tf │ │ ├── cloud-watch.tf │ │ ├── data.tf │ │ ├── ec2.tf │ │ ├── ecs.tf │ │ ├── iam.tf │ │ ├── lambda.tf │ │ ├── locals.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf │ └── step-functions │ │ ├── README.md │ │ ├── data.tf │ │ ├── dynamodb.tf │ │ ├── iam-eventbridge.tf │ │ ├── iam-step-functions.tf │ │ ├── locals.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf └── sagemaker-notebook ├── README.md ├── data.tf ├── example.tfvars ├── iam.tf ├── locals.tf ├── main.tf ├── outputs.tf ├── sagemaker.tf ├── variables.tf └── versions.tf /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: https://github.com/terraform-docs/terraform-docs 8 | rev: "v0.15.0" 9 | hooks: 10 | - id: terraform-docs-go 11 | name: "Main terraform module docs" 12 | files: "^aws/terraform/modules/metaflow/" 13 | args: ["-c", "aws/terraform/modules/metaflow/.terraform-docs.yml", "markdown", "aws/terraform/modules/metaflow"] 14 | - id: terraform-docs-go 15 | name: "Computation terraform module docs" 16 | files: "^aws/terraform/modules/metaflow/" 17 | args: ["-c", "aws/terraform/modules/metaflow/modules/.terraform-docs.yml", "markdown", "aws/terraform/modules/metaflow/modules/computation"] 18 | - id: terraform-docs-go 19 | name: "Datastore terraform module docs" 20 | files: "^aws/terraform/modules/metaflow/" 21 | args: ["-c", "aws/terraform/modules/metaflow/modules/.terraform-docs.yml", "markdown", "aws/terraform/modules/metaflow/modules/datastore"] 22 | - id: terraform-docs-go 23 | name: "Metadata Service terraform module docs" 24 | files: "^aws/terraform/modules/metaflow/" 25 | args: ["-c", "aws/terraform/modules/metaflow/modules/.terraform-docs.yml", "markdown", "aws/terraform/modules/metaflow/modules/metadata-service"] 26 | - id: terraform-docs-go 27 | name: "Step Functions terraform module docs" 28 | files: "^aws/terraform/modules/metaflow/" 29 | args: ["-c", "aws/terraform/modules/metaflow/modules/.terraform-docs.yml", "markdown", "aws/terraform/modules/metaflow/modules/step-functions"] 30 | 
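The hooks above regenerate the terraform-docs sections of the module READMEs. A minimal sketch of exercising them locally (assuming the `pre-commit` tool itself is installed, e.g. via `pip`):

```
pip install pre-commit
pre-commit install          # register the hooks as a git pre-commit hook
pre-commit run --all-files  # or run them on demand across the whole repo
```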
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Netflix, Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Metaflow Tools 2 | 3 | DevOps tools and utilities for operating Metaflow in the cloud. 4 | 5 | ### This git repository is archived, but the tools continue to be actively maintained at [outerbounds/metaflow-tools](https://github.com/outerbounds/metaflow-tools) 6 | -------------------------------------------------------------------------------- /aws/cloudformation/README.md: -------------------------------------------------------------------------------- 1 | ## Using This Template 2 | --- 3 | ### Overview 4 | 5 | This CloudFormation template deploys all the necessary infrastructure in AWS to support Metaflow's integration points and extend its capabilities into the cloud. A brief snapshot of its components is as follows: 6 | 7 | - **Amazon S3 Bucket** - Metaflow uses Amazon S3 as a centralized data repository for all data that's leveraged by and generated for its flows. This template creates a dedicated private bucket and all appropriate permissions. 8 | 9 | - **AWS Batch Compute Environment** - In order to extend Metaflow's compute capabilities to the cloud, AWS Batch provides a simple API that runs container-based jobs to completion on AWS Elastic Container Service. 10 | 11 | - **AWS Step Functions and EventBridge IAM Resources** - While Step Functions state machines aren't explicitly created by this template, Metaflow's 2.0+ releases include functionality to allow a 1:1 Flow <--> State Machine relationship. In order to facilitate this, there are some IAM roles and policies specific to allowing Metaflow to deploy and trigger Step Functions State Machines. 12 | 13 | - **Amazon DynamoDB Table** - Metaflow leverages DynamoDB to store information related to branching paths in flows executed by AWS Step Functions. This template deploys the appropriate table and overlays necessary permissions for AWS Batch and AWS Step Functions to communicate with it. 14 | 15 | - **Amazon Sagemaker Notebook Instance** - Metaflow's API allows for easy access to flow results and information, which can be cleanly displayed in a Jupyter notebook. Amazon Sagemaker Notebook instances provide a fully managed notebook environment with dedicated and customizable compute resources.
16 | 17 | - **Metadata and Database Services on AWS Fargate and Amazon Relational Database Service** - To facilitate persistent programmatic access to flow information, Metaflow provides a Metadata service that can be run on cloud resources and enable remote accessibility. This CloudFormation template leverages AWS Fargate and Amazon Relational Database Service to deploy the Metadata Service automatically. 18 | 19 | - **Amazon API Gateway** - To provide secure, encrypted access to a user's Metadata Service, this CloudFormation template uses Amazon API Gateway as a TLS termination point and an optional point of basic API authentication via key. 20 | 21 | - **Amazon VPC Networking** - All underlying network components are deployed to facilitate connectivity for the resources leveraged by Metaflow. Specifically, a VPC with two customizable subnets and Internet connectivity will be leveraged for this template. 22 | 23 | - **AWS Identity and Access Management** - Roles specific to Metaflow will be provisioned by this template in order to provide "principle of least privilege" access to resources such as AWS Batch and Amazon Sagemaker Notebook instances. Additionally, an optional role can be created that provides restricted access to only the resources Metaflow requires. This provides an easy adoption path for users who don't need full access to all AWS resources. 24 | 25 | ### Prerequisites 26 | 27 | 1. Adequate permissions to deploy all CloudFormation resources within an AWS account. 28 | 29 | ### How To Deploy from the AWS Console 30 | 31 | 1. Navigate to "Services" and select "CloudFormation" under the "Management and Governance" heading (or search for it in the search bar). 32 | 2. Click "Create stack" and select "With new resources (standard)". 33 | 3. Ensure "Template is ready" remains selected, choose "Upload a template file", and click "Choose file". 34 | 4. Feel free to explore with "View in Designer" if you so choose; otherwise, click "Next". 35 | 5. Name your stack, select your parameters, and click "Next", noting that if you enable "APIBasicAuth" and/or "CustomRole", further configuration will be required after deployment. More info below. 36 | 6. If desired, feel free to tag your stack in whatever way best fits your organization. When finished, click "Next". 37 | 7. Ensure you select the check box next to "I acknowledge that AWS CloudFormation might create IAM resources." and click "Create stack". 38 | 8. Wait roughly 10-15 minutes for deployment to complete. The stack status will eventually change to "CREATE_COMPLETE". 39 | 40 | Once complete, you'll find an "Outputs" tab that contains values for the components generated by this CloudFormation template. Those values correlate to respective environment variables (listed next to the outputs) you'll set to enable cloud features within Metaflow. 41 | 42 | ### Additional Configuration 43 | 44 | Did you choose to enable "APIBasicAuth" and/or "CustomRole" and are wondering how they work? Then you're in the right place! Below are some details on what happens when those features are enabled and how to make use of them. 45 | 46 | - **APIBasicAuth** - In addition to TLS termination, Amazon API Gateway provides the ability to generate an API key that restricts access only to requests that pass that API key in the 'x-api-key' HTTP header. This is useful in that it restricts access to flow information from the general Internet while still allowing remote connectivity to authenticated clients.
However, enabling this feature means that you'll need to request the API Key from Amazon API Gateway, as exposing a credential as an output from CloudFormation is a potential security problem. CloudFormation does, however, output the ID of the API Key that correlates to your stack, making it easy to get the key and pass it to Metaflow. Follow one of the two instructions below to output the key, and then export it to the `METAFLOW_SERVICE_AUTH_KEY` environment variable. 47 | 48 | 1. From the AWS CLI, run the following (where `<api-key-id>` is the key ID from the stack's "Outputs" tab): `aws apigateway get-api-key --api-key <api-key-id> --include-value | grep value` 49 | 2. From the AWS Console, navigate to "Services" and select "API Gateway" from "Networking & Content Delivery" (or search for it in the search bar). Click your API, select "API Keys" from the left side, select the API that corresponds to your stack name, and click "show" next to "API Key". 50 | 51 | - **CustomRole** - This template can create an optional role that can be assumed by users (or applications) that includes limited permissions to only the resources required by Metaflow, including access only to the Amazon S3 bucket, AWS Batch Compute Environment, and Amazon Sagemaker Notebook Instance created by this template. You will, however, need to modify the trust policy for the role to grant access to the principals (users/roles/accounts) who will assume it, and you'll also need to have your users configure an appropriate role-assumption profile. The ARN of the Custom Role can be found in the "Outputs" tab of the CloudFormation stack under `MetaflowUserRoleArn`. To modify the trust policy to allow new principals, follow the directions [here](https://docs.aws.amazon.com/IAM/latest/UserGuide/roles-managingrole-editing-console.html#roles-managingrole_edit-trust-policy). Once you've granted access to the principals of your choice, have your users create a new profile for the AWS CLI that assumes the role ARN by following the directions [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-role.html). 52 | 53 | ### Optional Metaflow User Interface (`EnableUI` parameter) 54 | 55 | Please note: This section can be ignored if the `EnableUI` parameter is disabled (this is the default value). 56 | 57 | The user interface is provided as part of the `metaflow-cfn-template.yml` template and doesn't require any additional 58 | configuration besides enabling the `EnableUI` parameter. You can follow the [AWS CloudFormation Deployment](https://admin-docs.metaflow.org/metaflow-on-aws/deployment-guide/aws-cloudformation-deployment#steps-for-aws-cloudformation-deployment) instructions. 59 | 60 | Once deployed, the CloudFormation stack will provide two outputs: 61 | - `UIServiceUrl` - Application Load Balancer endpoint 62 | - `UIServiceCloudfrontUrl` - CloudFront distribution (in front of the ALB) endpoint with HTTPS enabled (preferred) 63 | 64 | Please note: The Metaflow User Interface doesn't provide any authentication by default.
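For the **APIBasicAuth** option above, a minimal shell sketch for wiring the key into Metaflow is shown below; `<api-key-id>` is a placeholder for the key ID reported in your stack's "Outputs" tab, not a real value.

```
# Fetch the key value by its ID and export it for the Metaflow client to use.
export METAFLOW_SERVICE_AUTH_KEY=$(aws apigateway get-api-key \
  --api-key <api-key-id> \
  --include-value \
  --query value \
  --output text)
```

Using `--query value --output text` returns just the key material, avoiding the `grep` step shown above.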
-------------------------------------------------------------------------------- /aws/terraform/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignores any local metaflow datastore & metadata service that could be created here when testing terraform stack 2 | .metaflow 3 | 4 | # Created by https://www.toptal.com/developers/gitignore/api/terraform 5 | # Edit at https://www.toptal.com/developers/gitignore?templates=terraform 6 | 7 | ### Terraform ### 8 | # Local .terraform directories 9 | **/.terraform 10 | .terraform.lock.hcl 11 | 12 | # .tfstate files 13 | *.tfstate 14 | *.tfstate.* 15 | 16 | # Ignore override files as they are usually used to override resources locally and so 17 | # are not checked in 18 | override.tf 19 | override.tf.json 20 | *_override.tf 21 | *_override.tf.json 22 | 23 | # Include override files you do wish to add to version control using negated pattern 24 | # !example_override.tf 25 | 26 | # Crash log files 27 | crash.log 28 | 29 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 30 | *.plan 31 | 32 | # End of https://www.toptal.com/developers/gitignore/api/terraform 33 | -------------------------------------------------------------------------------- /aws/terraform/README.md: -------------------------------------------------------------------------------- 1 | ## Pre-requisites 2 | 3 | ### Terraform 4 | 5 | [Download](https://www.terraform.io/downloads.html) and install Terraform 0.14.x. 6 | 7 | ### AWS 8 | 9 | AWS should be [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) with a profile. The `AWS_PROFILE` environment variable should be set to the profile that has been configured. 10 | 11 | The [awscli](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) can be used to 12 | confirm that it has been configured properly by running: 13 | 14 | ``` 15 | aws sts get-caller-identity 16 | ``` 17 | 18 | which should output your account information. 19 | 20 | ## Setup 21 | 22 | ### Infrastructure stack 23 | 24 | The infra sub-project provides some pre-requisite infrastructure for the Metaflow service. For more details, see the [README](aws/terraform/infra/README.md). 25 | 26 | Copy `example.tfvars` to `prod.tfvars` (or whatever environment name you prefer) and update the `env` name and the `region` as needed. These variables are used to construct unique names for infrastructure resources. 27 | 28 | Initialize Terraform: 29 | 30 | `cd infra && terraform init` 31 | 32 | Apply it: 33 | 34 | ``` 35 | terraform apply --var-file prod.tfvars 36 | ``` 37 | 38 | ### Metaflow stack 39 | 40 | The metaflow sub-project provisions the metadata API, AWS Step Functions, and an AWS Batch queue. For more details, see the 41 | [README](aws/terraform/metaflow/README.md). 42 | 43 | Copy `example.tfvars` to `prod.tfvars` (or whatever environment name you prefer) and update the `env` name and the `region` as needed. These variables are used to construct unique names for infrastructure resources. 44 | 45 | **Protecting the Metadata API:** By default, the Metadata API has basic authentication enabled (recommended), but it is exposed to the public internet via Amazon API Gateway. To further restrict access to the API, the `access_list_cidr_blocks` variable can be set to specify IPs or network CIDR blocks that are allowed to access the endpoint, blocking all other access.
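For example, a minimal sketch of restricting the endpoint to a single VPN CIDR (the address range below is purely illustrative):

```
# In prod.tfvars, set the access list to your own CIDR block(s), e.g.:
#   access_list_cidr_blocks = ["203.0.113.0/24"]
# then re-apply the stack:
terraform apply --var-file prod.tfvars
```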
47 | 48 | Additionally: 49 | * There are variables that govern the compute environment associated with the AWS Batch queue; they can be adjusted based on your needs. 50 | * The `enable_step_functions` flag can be set to `false` to skip provisioning the AWS Step Functions infrastructure. 51 | 52 | Initialize Terraform: 53 | 54 | `cd metaflow && terraform init` 55 | 56 | Apply it: 57 | 58 | ``` 59 | terraform apply --var-file prod.tfvars 60 | ``` 61 | 62 | Once Terraform finishes applying, configure Metaflow by running `metaflow configure import ./metaflow_config_<env>_<region>.json` (the generated file is named after your environment and region). 63 | 64 | ### Custom Default Batch Image 65 | 66 | A custom default batch image can be used by setting the variable `enable_custom_batch_container_registry` to `true`. This will provision an Amazon ECR registry, and the generated Metaflow AWS Batch configuration will have `METAFLOW_BATCH_CONTAINER_IMAGE` and `METAFLOW_BATCH_CONTAINER_REGISTRY` set to the Amazon ECR repository. The Metaflow AWS Batch image must then be pushed into the repository before the first flow can be executed. 67 | 68 | To do this, first copy the output of `metaflow_batch_container_image`; it is used as the `<metaflow_batch_container_image>` placeholder below. 69 | 70 | Then log in to the Amazon ECR registry (the host portion of that URI, i.e. everything before the first `/`): 71 | ``` 72 | aws ecr get-login-password | docker login --username AWS --password-stdin <ecr-registry> 73 | ``` 74 | 75 | Pull the appropriate image from Docker Hub. In this case, we are using `continuumio/miniconda3:latest`: 76 | 77 | ``` 78 | docker pull continuumio/miniconda3 79 | ``` 80 | 81 | Tag the image: 82 | 83 | ``` 84 | docker tag continuumio/miniconda3:latest <metaflow_batch_container_image> 85 | ``` 86 | 87 | Push the image: 88 | 89 | ``` 90 | docker push <metaflow_batch_container_image> 91 | ``` 92 | 93 | ### Amazon Sagemaker Notebook stack 94 | 95 | The sagemaker-notebook sub-project provisions an optional Jupyter notebook with access to the Metaflow API. 96 | 97 | Copy `example.tfvars` to `prod.tfvars` (or whatever environment name you prefer) and update the `env` name and the `region` as needed. These variables are used to construct unique names for infrastructure resources. 98 | 99 | Initialize Terraform: 100 | 101 | `cd sagemaker-notebook && terraform init` 102 | 103 | Apply it: 104 | 105 | ``` 106 | terraform apply --var-file prod.tfvars 107 | ``` 108 | 109 | The Amazon Sagemaker notebook URL is output as `SAGEMAKER_NOTEBOOK_URL`. Open it to access the notebook. 110 | -------------------------------------------------------------------------------- /aws/terraform/infra/README.md: -------------------------------------------------------------------------------- 1 | # Infra 2 | 3 | Stands up the base infrastructure required to deploy a Metaflow stack. 4 | 5 | Mostly stands up and configures the Amazon VPC. 6 | 7 | ## AWS Resources 8 | 9 | ### Amazon VPC 10 | 11 | Amazon Virtual Private Cloud with two public subnets in different availability zones. Also includes an 12 | Elastic IP address for Amazon VPC egress (`elastic_ip_allocation_id`) to allow external services to whitelist access by IP.
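Putting the workflow above together, a minimal end-to-end shell sketch for standing up this base `infra` stack (assuming a `prod.tfvars` copied from `example.tfvars`, as described in the parent README):

```
cd aws/terraform/infra
cp example.tfvars prod.tfvars          # then edit env / aws_region as needed
terraform init
terraform apply --var-file prod.tfvars
terraform output                       # VPC and subnet ids consumed by the metaflow stack via remote state
```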
13 | -------------------------------------------------------------------------------- /aws/terraform/infra/data.tf: -------------------------------------------------------------------------------- 1 | /* 2 | Grant us ability to yield different availability zones for a region 3 | */ 4 | data "aws_availability_zones" "available" { 5 | state = "available" 6 | 7 | filter { 8 | name = "opt-in-status" 9 | values = ["opt-in-not-required"] 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /aws/terraform/infra/example.tfvars: -------------------------------------------------------------------------------- 1 | env = "prod" 2 | aws_region = "us-west-2" 3 | -------------------------------------------------------------------------------- /aws/terraform/infra/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | resource_prefix = var.app 3 | resource_suffix = "${var.env}${module.common_vars.workspace_suffix}" 4 | 5 | subnet1_name = "${local.resource_prefix}-subnet-1-${local.resource_suffix}" 6 | subnet2_name = "${local.resource_prefix}-subnet-2-${local.resource_suffix}" 7 | } 8 | -------------------------------------------------------------------------------- /aws/terraform/infra/main.tf: -------------------------------------------------------------------------------- 1 | module "common_vars" { 2 | source = "../modules/common" 3 | 4 | app = var.app 5 | env = var.env 6 | } 7 | -------------------------------------------------------------------------------- /aws/terraform/infra/outputs.tf: -------------------------------------------------------------------------------- 1 | output "subnet1_id" { 2 | value = aws_subnet.subnet1.id 3 | description = "First subnet used for availability zone redundancy" 4 | } 5 | 6 | output "subnet2_id" { 7 | value = aws_subnet.subnet2.id 8 | description = "Second subnet used for availability zone redundancy" 9 | } 10 | 11 | output "vpc_cidr_block" { 12 | value = aws_vpc.this.cidr_block 13 | description = "The CIDR block we've designated for this VPC" 14 | } 15 | 16 | output "vpc_id" { 17 | value = aws_vpc.this.id 18 | description = "The id of the single VPC we stood up for all Metaflow resources to exist in." 19 | } 20 | -------------------------------------------------------------------------------- /aws/terraform/infra/variables.tf: -------------------------------------------------------------------------------- 1 | variable "app" { 2 | type = string 3 | default = "metaflow-infra" 4 | description = "Name of the application" 5 | } 6 | 7 | variable "aws_region" { 8 | type = string 9 | default = "us-west-2" 10 | description = "AWS region we will deploy to." 11 | } 12 | 13 | variable "env" { 14 | type = string 15 | default = "dev" 16 | description = "The environment for this stack to be created in. Used for the tfstate bucket and naming scope of resources." 
17 | } 18 | 19 | variable "subnet1_cidr" { 20 | type = string 21 | default = "10.20.0.0/24" 22 | description = "CIDR for Metaflow VPC Subnet 1" 23 | } 24 | 25 | variable "subnet2_cidr" { 26 | type = string 27 | default = "10.20.1.0/24" 28 | description = "CIDR for Metaflow VPC Subnet 2" 29 | } 30 | 31 | variable "vpc_cidr" { 32 | type = string 33 | default = "10.20.0.0/16" 34 | description = "CIDR for the Metaflow VPC" 35 | } 36 | -------------------------------------------------------------------------------- /aws/terraform/infra/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = ">= 3.38.0" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /aws/terraform/infra/vpc.tf: -------------------------------------------------------------------------------- 1 | resource "aws_vpc" "this" { 2 | cidr_block = var.vpc_cidr 3 | enable_dns_support = true 4 | enable_dns_hostnames = true 5 | 6 | tags = merge( 7 | module.common_vars.tags, 8 | { 9 | Name = "${local.resource_prefix}-vpc-${local.resource_suffix}" 10 | } 11 | ) 12 | } 13 | 14 | # Two subnets are used to leverage two separate availability zones 15 | 16 | resource "aws_subnet" "subnet1" { 17 | vpc_id = aws_vpc.this.id 18 | cidr_block = var.subnet1_cidr 19 | availability_zone = data.aws_availability_zones.available.names[0] 20 | map_public_ip_on_launch = true 21 | 22 | tags = merge( 23 | module.common_vars.tags, 24 | { 25 | Name = local.subnet1_name 26 | Metaflow = "true" 27 | } 28 | ) 29 | } 30 | 31 | resource "aws_subnet" "subnet2" { 32 | availability_zone = data.aws_availability_zones.available.names[1] 33 | cidr_block = var.subnet2_cidr 34 | map_public_ip_on_launch = true 35 | vpc_id = aws_vpc.this.id 36 | 37 | tags = merge( 38 | module.common_vars.tags, 39 | { 40 | Name = local.subnet2_name 41 | Metaflow = "true" 42 | } 43 | ) 44 | } 45 | 46 | /* 47 | Set up a gateway between the Amazon VPC and the internet. Allow access to and from resources 48 | in subnets with public IP addresses. 49 | Ref: https://nickcharlton.net/posts/terraform-aws-vpc.html 50 | */ 51 | resource "aws_internet_gateway" "internet_gateway" { 52 | vpc_id = aws_vpc.this.id 53 | 54 | tags = merge( 55 | module.common_vars.tags, 56 | { 57 | Name = "${local.resource_prefix}-internet-gateway-${local.resource_suffix}" 58 | } 59 | ) 60 | } 61 | 62 | resource "aws_route_table" "public_route_table" { 63 | vpc_id = aws_vpc.this.id 64 | 65 | tags = merge( 66 | module.common_vars.tags, 67 | { 68 | Name = "Public Subnet" 69 | Metaflow = "true" 70 | } 71 | ) 72 | } 73 | 74 | /* 75 | Map all traffic to the internet gateway for egress. 76 | This allows all traffic to appear to come from the associated EIP.
77 | */ 78 | resource "aws_route" "this" { 79 | destination_cidr_block = "0.0.0.0/0" 80 | gateway_id = aws_internet_gateway.internet_gateway.id 81 | route_table_id = aws_route_table.public_route_table.id 82 | } 83 | 84 | resource "aws_route_table_association" "subnet1_rta" { 85 | subnet_id = aws_subnet.subnet1.id 86 | route_table_id = aws_route_table.public_route_table.id 87 | } 88 | 89 | resource "aws_route_table_association" "subnet2_rta" { 90 | subnet_id = aws_subnet.subnet2.id 91 | route_table_id = aws_route_table.public_route_table.id 92 | } 93 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/.gitignore: -------------------------------------------------------------------------------- 1 | metaflow_config* 2 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/data.tf: -------------------------------------------------------------------------------- 1 | data "aws_region" "current" {} 2 | 3 | data "aws_caller_identity" "current" {} 4 | 5 | data "terraform_remote_state" "infra" { 6 | backend = "local" 7 | 8 | config = { 9 | path = "../infra/terraform.tfstate" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/example.tfvars: -------------------------------------------------------------------------------- 1 | env = "prod" 2 | 3 | aws_region = "us-west-2" 4 | 5 | # Setting min vcpus and desired vcpus to 0 to prevent accidental cost accumulation. 6 | # These settings will result in longer job startup times as AWS boots up the necessary compute resources. 7 | cpu_max_compute_vcpus = 128 8 | cpu_min_compute_vcpus = 0 9 | cpu_desired_compute_vcpus = 0 10 | 11 | large_cpu_max_compute_vcpus = 128 12 | large_cpu_min_compute_vcpus = 0 13 | large_cpu_desired_compute_vcpus = 0 14 | 15 | gpu_max_compute_vcpus = 128 16 | gpu_min_compute_vcpus = 0 17 | gpu_desired_compute_vcpus = 0 18 | 19 | enable_step_functions = true 20 | 21 | access_list_cidr_blocks = [] 22 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/iam-custom-role.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "metaflow_user_role_assume_role" { 2 | statement { 3 | actions = [ 4 | "sts:AssumeRole" 5 | ] 6 | 7 | effect = "Allow" 8 | 9 | principals { 10 | identifiers = [ 11 | module.metaflow.metadata_svc_ecs_task_role_arn 12 | ] 13 | type = "AWS" 14 | } 15 | } 16 | } 17 | 18 | resource "aws_iam_role" "metaflow_user_role" { 19 | count = var.custom_role ?
1 : 0 20 | name = local.metaflow_user_role_name 21 | # Read more about ECS' `task_role` and `execution_role` here https://stackoverflow.com/a/49947471 22 | assume_role_policy = data.aws_iam_policy_document.metaflow_user_role_assume_role.json 23 | 24 | tags = module.common_vars.tags 25 | } 26 | 27 | data "aws_iam_policy_document" "metaflow_policy" { 28 | statement { 29 | effect = "Allow" 30 | 31 | actions = [ 32 | "cloudformation:DescribeStacks", 33 | "cloudformation:*Stack", 34 | "cloudformation:*ChangeSet" 35 | ] 36 | 37 | resources = [ 38 | "arn:${var.iam_partition}:cloudformation:${local.aws_region}:${local.aws_account_id}:stack/${local.resource_prefix}*${local.resource_suffix}" 39 | ] 40 | } 41 | 42 | statement { 43 | actions = [ 44 | "s3:*Object" 45 | ] 46 | 47 | effect = "Allow" 48 | 49 | resources = [ 50 | "${module.metaflow.metaflow_s3_bucket_arn}/*" 51 | ] 52 | } 53 | 54 | statement { 55 | effect = "Allow" 56 | 57 | actions = [ 58 | "sagemaker:DescribeNotebook*", 59 | "sagemaker:StartNotebookInstance", 60 | "sagemaker:StopNotebookInstance", 61 | "sagemaker:UpdateNotebookInstance", 62 | "sagemaker:CreatePresignedNotebookInstanceUrl", 63 | ] 64 | 65 | resources = [ 66 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:notebook-instance/${local.resource_prefix}*${local.resource_suffix}", 67 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:notebook-instance-lifecycle-config/basic*" 68 | ] 69 | } 70 | 71 | statement { 72 | effect = "Allow" 73 | 74 | actions = [ 75 | "iam:PassRole", 76 | ] 77 | 78 | resources = [ 79 | "arn:${var.iam_partition}:iam::${local.aws_account_id}:role/${local.resource_prefix}*${local.resource_suffix}" 80 | ] 81 | } 82 | 83 | statement { 84 | effect = "Allow" 85 | 86 | actions = [ 87 | "kms:Decrypt", 88 | "kms:Encrypt", 89 | "kms:GenerateDataKey" 90 | ] 91 | 92 | resources = [ 93 | "arn:${var.iam_partition}:kms:${local.aws_region}:${local.aws_account_id}:key/" 94 | ] 95 | } 96 | } 97 | 98 | data "aws_iam_policy_document" "batch_perms" { 99 | statement { 100 | sid = "JobsPermissions" 101 | 102 | effect = "Allow" 103 | 104 | actions = [ 105 | "batch:TerminateJob", 106 | "batch:DescribeJobs", 107 | "batch:DescribeJobDefinitions", 108 | "batch:DescribeJobQueues", 109 | "batch:RegisterJobDefinition", 110 | "batch:DescribeComputeEnvironments", 111 | ] 112 | 113 | resources = [ 114 | "*" 115 | ] 116 | } 117 | 118 | statement { 119 | sid = "DefinitionsPermissions" 120 | 121 | effect = "Allow" 122 | 123 | actions = [ 124 | "batch:SubmitJob" 125 | ] 126 | 127 | resources = [ 128 | module.metaflow.METAFLOW_BATCH_JOB_QUEUE, 129 | "arn:${var.iam_partition}:batch:${local.aws_region}:${local.aws_account_id}:job-definition/*:*", 130 | ] 131 | } 132 | } 133 | 134 | data "aws_iam_policy_document" "custom_s3_list_access" { 135 | statement { 136 | sid = "BucketAccess" 137 | 138 | effect = "Allow" 139 | 140 | actions = [ 141 | "s3:ListBucket" 142 | ] 143 | 144 | resources = [ 145 | module.metaflow.metaflow_s3_bucket_arn 146 | ] 147 | } 148 | } 149 | 150 | data "aws_iam_policy_document" "log_perms" { 151 | statement { 152 | sid = "GetLogs" 153 | 154 | effect = "Allow" 155 | 156 | actions = [ 157 | "logs:GetLogEvents" 158 | ] 159 | 160 | resources = [ 161 | "arn:${var.iam_partition}:logs:${local.aws_region}:${local.aws_account_id}:log-group:*:log-stream:*", 162 | ] 163 | } 164 | } 165 | 166 | data "aws_iam_policy_document" "allow_sagemaker" { 167 | statement { 168 | sid = "AllowSagemakerCreate" 169 | 170 | effect = 
"Allow" 171 | 172 | actions = [ 173 | "sagemaker:CreateTrainingJob" 174 | ] 175 | 176 | resources = [ 177 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:training-job/*", 178 | ] 179 | } 180 | 181 | statement { 182 | sid = "AllowSagemakerDescribe" 183 | 184 | effect = "Allow" 185 | 186 | actions = [ 187 | "sagemaker:DescribeTrainingJob" 188 | ] 189 | 190 | resources = [ 191 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:training-job/*", 192 | ] 193 | } 194 | } 195 | 196 | data "aws_iam_policy_document" "allow_step_functions" { 197 | statement { 198 | sid = "TasksAndExecutionsGlobal" 199 | 200 | effect = "Allow" 201 | 202 | actions = [ 203 | "states:ListStateMachines" 204 | ] 205 | 206 | resources = [ 207 | "*", 208 | ] 209 | } 210 | 211 | statement { 212 | sid = "StateMachines" 213 | 214 | effect = "Allow" 215 | 216 | actions = [ 217 | "states:DescribeStateMachine", 218 | "states:UpdateStateMachine", 219 | "states:StartExecution", 220 | "states:CreateStateMachine", 221 | "states:ListExecutions", 222 | "states:StopExecution" 223 | ] 224 | 225 | resources = [ 226 | "arn:${var.iam_partition}:states:${local.aws_region}:${local.aws_account_id}:stateMachine:*", 227 | ] 228 | } 229 | } 230 | 231 | data "aws_iam_policy_document" "allow_event_bridge" { 232 | statement { 233 | sid = "RuleMaintenance" 234 | 235 | effect = "Allow" 236 | 237 | actions = [ 238 | "events:PutTargets", 239 | "events:DisableRule", 240 | ] 241 | 242 | resources = [ 243 | "arn:${var.iam_partition}:events:${local.aws_region}:${local.aws_account_id}:rule/*", 244 | ] 245 | } 246 | 247 | statement { 248 | sid = "PutRule" 249 | 250 | effect = "Allow" 251 | 252 | actions = [ 253 | "events:PutRule", 254 | ] 255 | 256 | resources = [ 257 | "arn:${var.iam_partition}:events:${local.aws_region}:${local.aws_account_id}:rule/*", 258 | ] 259 | 260 | condition { 261 | test = "Null" 262 | values = [ 263 | true 264 | ] 265 | variable = "events:source" 266 | } 267 | } 268 | } 269 | 270 | resource "aws_iam_role_policy" "grant_metaflow_policy" { 271 | count = var.custom_role ? 1 : 0 272 | name = "metaflow" 273 | role = aws_iam_role.metaflow_user_role[0].name 274 | policy = data.aws_iam_policy_document.metaflow_policy.json 275 | } 276 | 277 | resource "aws_iam_role_policy" "grant_batch_perms" { 278 | count = var.custom_role ? 1 : 0 279 | name = "batch" 280 | role = aws_iam_role.metaflow_user_role[0].name 281 | policy = data.aws_iam_policy_document.batch_perms.json 282 | } 283 | 284 | resource "aws_iam_role_policy" "grant_custom_s3_list_access" { 285 | count = var.custom_role ? 1 : 0 286 | name = "s3_list" 287 | role = aws_iam_role.metaflow_user_role[0].name 288 | policy = data.aws_iam_policy_document.custom_s3_list_access.json 289 | } 290 | 291 | resource "aws_iam_role_policy" "grant_log_perms" { 292 | count = var.custom_role ? 1 : 0 293 | name = "log" 294 | role = aws_iam_role.metaflow_user_role[0].name 295 | policy = data.aws_iam_policy_document.log_perms.json 296 | } 297 | 298 | resource "aws_iam_role_policy" "grant_allow_sagemaker" { 299 | count = var.custom_role ? 1 : 0 300 | name = "sagemaker" 301 | role = aws_iam_role.metaflow_user_role[0].name 302 | policy = data.aws_iam_policy_document.allow_sagemaker.json 303 | } 304 | 305 | resource "aws_iam_role_policy" "grant_allow_step_functions" { 306 | count = var.custom_role ? 
1 : 0 307 | name = "step_functions" 308 | role = aws_iam_role.metaflow_user_role[0].name 309 | policy = data.aws_iam_policy_document.allow_step_functions.json 310 | } 311 | 312 | resource "aws_iam_role_policy" "grant_allow_event_bridge" { 313 | count = var.custom_role ? 1 : 0 314 | name = "event_bridge" 315 | role = aws_iam_role.metaflow_user_role[0].name 316 | policy = data.aws_iam_policy_document.allow_event_bridge.json 317 | } 318 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | resource_prefix = var.app 3 | resource_suffix = "${var.env}${module.common_vars.workspace_suffix}-${lookup(module.common_vars.aws_regions, data.aws_region.current.name, "")}" 4 | 5 | aws_region = data.aws_region.current.name 6 | aws_account_id = data.aws_caller_identity.current.account_id 7 | 8 | metaflow_config_filename = "metaflow_config_${var.env}_${data.aws_region.current.name}.json" 9 | metaflow_user_role_name = "${local.resource_prefix}-metaflow_user-${local.resource_suffix}" 10 | } 11 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/main.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = var.aws_region 3 | } 4 | 5 | module "common_vars" { 6 | source = "../modules/common" 7 | 8 | app = var.app 9 | env = var.env 10 | } 11 | 12 | module "metaflow" { 13 | source = "../modules/metaflow" 14 | 15 | resource_prefix = local.resource_prefix 16 | resource_suffix = local.resource_suffix 17 | 18 | access_list_cidr_blocks = var.access_list_cidr_blocks 19 | api_basic_auth = var.api_basic_auth 20 | batch_type = var.batch_type 21 | compute_environment_desired_vcpus = var.compute_environment_desired_vcpus 22 | compute_environment_instance_types = var.compute_environment_instance_types 23 | compute_environment_max_vcpus = var.compute_environment_max_vcpus 24 | compute_environment_min_vcpus = var.compute_environment_min_vcpus 25 | enable_custom_batch_container_registry = var.enable_custom_batch_container_registry 26 | enable_step_functions = var.enable_step_functions 27 | iam_partition = var.iam_partition 28 | subnet1_id = data.terraform_remote_state.infra.outputs.subnet1_id 29 | subnet2_id = data.terraform_remote_state.infra.outputs.subnet2_id 30 | vpc_cidr_block = data.terraform_remote_state.infra.outputs.vpc_cidr_block 31 | vpc_id = data.terraform_remote_state.infra.outputs.vpc_id 32 | 33 | tags = module.common_vars.tags 34 | } 35 | 36 | resource "local_file" "metaflow_config" { 37 | content = module.metaflow.metaflow_profile_json 38 | filename = "${path.module}/${local.metaflow_config_filename}" 39 | } 40 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/outputs.tf: -------------------------------------------------------------------------------- 1 | output "METAFLOW_BATCH_JOB_QUEUE" { 2 | value = module.metaflow.METAFLOW_BATCH_JOB_QUEUE 3 | description = "AWS Batch Job Queue ARN for Metaflow" 4 | } 5 | 6 | output "METAFLOW_DATASTORE_SYSROOT_S3" { 7 | value = module.metaflow.METAFLOW_DATASTORE_SYSROOT_S3 8 | description = "Amazon S3 URL for Metaflow DataStore" 9 | } 10 | 11 | output "METAFLOW_DATATOOLS_S3ROOT" { 12 | value = module.metaflow.METAFLOW_DATATOOLS_S3ROOT 13 | description = "Amazon S3 URL for Metaflow DataTools" 14 | } 15 | 16 | output 
"METAFLOW_ECS_S3_ACCESS_IAM_ROLE" { 17 | value = module.metaflow.METAFLOW_ECS_S3_ACCESS_IAM_ROLE 18 | description = "Role for AWS Batch to Access Amazon S3 ARN" 19 | } 20 | 21 | output "METAFLOW_EVENTS_SFN_ACCESS_IAM_ROLE" { 22 | value = module.metaflow.METAFLOW_EVENTS_SFN_ACCESS_IAM_ROLE 23 | description = "IAM role for Amazon EventBridge to access AWS Step Functions." 24 | } 25 | 26 | output "METAFLOW_SERVICE_INTERNAL_URL" { 27 | value = module.metaflow.METAFLOW_SERVICE_INTERNAL_URL 28 | description = "URL for Metadata Service (Accessible in VPC)" 29 | } 30 | 31 | output "METAFLOW_SERVICE_URL" { 32 | value = module.metaflow.METAFLOW_SERVICE_URL 33 | description = "URL for Metadata Service (Accessible in VPC)" 34 | } 35 | 36 | output "METAFLOW_SFN_DYNAMO_DB_TABLE" { 37 | value = module.metaflow.METAFLOW_SFN_DYNAMO_DB_TABLE 38 | description = "AWS DynamoDB table name for tracking AWS Step Functions execution metadata." 39 | } 40 | 41 | output "METAFLOW_SFN_IAM_ROLE" { 42 | value = module.metaflow.METAFLOW_SFN_IAM_ROLE 43 | description = "IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB)." 44 | } 45 | 46 | output "api_gateway_rest_api_id_key_id" { 47 | value = module.metaflow.api_gateway_rest_api_id_key_id 48 | description = "API Gateway Key ID for Metadata Service. Fetch Key from AWS Console [METAFLOW_SERVICE_AUTH_KEY]" 49 | } 50 | 51 | output "datastore_s3_bucket_kms_key_arn" { 52 | value = module.metaflow.datastore_s3_bucket_kms_key_arn 53 | description = "The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket" 54 | } 55 | 56 | output "metaflow_api_gateway_rest_api_id" { 57 | value = module.metaflow.metaflow_api_gateway_rest_api_id 58 | description = "The ID of the API Gateway REST API we'll use to accept MetaData service requests to forward to the Fargate API instance" 59 | } 60 | 61 | output "metaflow_batch_container_image" { 62 | value = module.metaflow.metaflow_batch_container_image 63 | description = "The ECR repo containing the metaflow batch image" 64 | } 65 | 66 | output "metaflow_profile_configuration" { 67 | value = "Run this command in a shell to import the Metaflow configuration: metaflow configure import ${path.module}/${local.metaflow_config_filename}" 68 | description = "Instructions to import the generated Metaflow configuration" 69 | } 70 | 71 | output "metaflow_s3_bucket_arn" { 72 | value = module.metaflow.metaflow_s3_bucket_arn 73 | description = "The ARN of the bucket we'll be using as blob storage" 74 | } 75 | 76 | output "metaflow_s3_bucket_name" { 77 | value = module.metaflow.metaflow_s3_bucket_name 78 | description = "The name of the bucket we'll be using as blob storage" 79 | } 80 | 81 | output "metaflow_user_role_arn" { 82 | value = var.custom_role ? aws_iam_role.metaflow_user_role[0].arn : "" 83 | description = "IAM Role for Metaflow Stack" 84 | } 85 | 86 | output "migration_function_arn" { 87 | value = module.metaflow.migration_function_arn 88 | description = "ARN of DB Migration Function" 89 | } 90 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/variables.tf: -------------------------------------------------------------------------------- 1 | variable "access_list_cidr_blocks" { 2 | type = list(string) 3 | description = "List of CIDRs we want to grant access to our Metaflow Metadata Service. Usually this is your VPN's CIDR blocks." 
4 | default = [] 5 | } 6 | 7 | variable "api_basic_auth" { 8 | type = bool 9 | default = true 10 | description = "Enable basic auth for API Gateway? (requires key export)" 11 | } 12 | 13 | variable "app" { 14 | default = "metaflow" 15 | description = "Name of the application" 16 | } 17 | 18 | variable "aws_region" { 19 | type = string 20 | default = "us-west-2" 21 | description = "AWS region we will deploy to." 22 | } 23 | 24 | variable "batch_type" { 25 | type = string 26 | description = "AWS Batch Compute Type ('ec2', 'fargate')" 27 | default = "ec2" 28 | } 29 | 30 | variable "compute_environment_desired_vcpus" { 31 | type = number 32 | description = "Desired Starting VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)" 33 | default = 8 34 | } 35 | 36 | variable "compute_environment_instance_types" { 37 | type = list(string) 38 | description = "The instance types for the compute environment" 39 | default = ["c4.large", "c4.xlarge", "c4.2xlarge", "c4.4xlarge", "c4.8xlarge"] 40 | } 41 | 42 | variable "compute_environment_max_vcpus" { 43 | type = number 44 | description = "Maximum VCPUs for Batch Compute Environment [16-96]" 45 | default = 64 46 | } 47 | 48 | variable "compute_environment_min_vcpus" { 49 | type = number 50 | description = "Minimum VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)" 51 | default = 8 52 | } 53 | 54 | variable "custom_role" { 55 | type = bool 56 | default = false 57 | description = "Enable custom role with restricted permissions?" 58 | } 59 | 60 | variable "enable_custom_batch_container_registry" { 61 | type = bool 62 | default = false 63 | description = "Provisions infrastructure for custom ECR container registry if enabled" 64 | } 65 | 66 | variable "enable_step_functions" { 67 | type = bool 68 | default = true 69 | description = "Provisions infrastructure for step functions if enabled" 70 | } 71 | 72 | variable "env" { 73 | type = string 74 | default = "dev" 75 | description = "The environment for this stack to be created in. Used for the tfstate bucket and naming scope of resources." 
76 | } 77 | 78 | variable "iam_partition" { 79 | type = string 80 | default = "aws" 81 | description = "IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is)" 82 | } 83 | -------------------------------------------------------------------------------- /aws/terraform/metaflow/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /aws/terraform/modules/common/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | aws_regions = { 3 | "us-east-1" = "usea1" 4 | "us-east-2" = "usea2" 5 | "us-west-1" = "uswe1" 6 | "us-west-2" = "uswe2" 7 | "us-gov-west-1" = "ugwe2" 8 | "ca-central-1" = "cace1" 9 | "eu-west-1" = "euwe1" 10 | "eu-west-2" = "euwe2" 11 | "eu-central-1" = "euce1" 12 | "ap-southeast-1" = "apse1" 13 | "ap-southeast-2" = "apse2" 14 | "ap-south-1" = "apso1" 15 | "ap-northeast-1" = "apne1" 16 | "ap-northeast-2" = "apne2" 17 | "sa-east-1" = "saea1" 18 | "cn-north-1" = "cnno1" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /aws/terraform/modules/common/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.13" 3 | } 4 | -------------------------------------------------------------------------------- /aws/terraform/modules/common/outputs.tf: -------------------------------------------------------------------------------- 1 | output "app" { 2 | value = var.app 3 | } 4 | 5 | output "env" { 6 | value = var.env 7 | } 8 | 9 | output "tags" { 10 | value = merge( 11 | var.tags, 12 | { 13 | "application" = var.app, 14 | "environment" = var.env, 15 | "tf.workspace" = terraform.workspace 16 | } 17 | ) 18 | } 19 | 20 | output "workspace_suffix" { 21 | value = terraform.workspace == "default" ? 
"" : "-${terraform.workspace}" 22 | } 23 | 24 | output "aws_regions" { 25 | value = local.aws_regions 26 | } 27 | -------------------------------------------------------------------------------- /aws/terraform/modules/common/variables.tf: -------------------------------------------------------------------------------- 1 | variable "app" { 2 | default = "metaflow" 3 | } 4 | 5 | variable "env" { 6 | default = "dev" 7 | } 8 | 9 | variable "tags" { 10 | default = { 11 | repo = "metaflow-tools" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/.terraform-docs.yml: -------------------------------------------------------------------------------- 1 | formatter: markdown 2 | sections: 3 | show: 4 | - modules 5 | - inputs 6 | - outputs 7 | output: 8 | file: "README.md" 9 | mode: inject 10 | template: |- 11 | 12 | {{ .Content }} 13 | 14 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/README.md: -------------------------------------------------------------------------------- 1 | # Metaflow Terraform module 2 | 3 | Provides the core functionality for Metaflow which includes: 4 | 5 | - on demand processing (`computation`) 6 | - blob and tabular storage (`datastore`) 7 | - an API to record and query past executions (`metadata-service`) 8 | - orchestrated processing (`step-functions`) 9 | - other bits of infra like Amazon Elastic Container Registry (ECR) to hold the Docker image we wish to use with Metaflow. 10 | 11 | This module is composed of submodules which break up the responsibility into logical parts listed above. 12 | You can either use this high-level module, or submodules individually. See each module's corresponding `README.md` for more details. 13 | 14 | This module requires an Amazon VPC to be set up by the module user beforehand. The output of the project `infra` is an example configuration of an Amazon VPC that can be passed to this module. 15 | 16 | 17 | ## Modules 18 | 19 | | Name | Source | Version | 20 | |------|--------|---------| 21 | | [metaflow-computation](#module\_metaflow-computation) | ./modules/computation | n/a | 22 | | [metaflow-datastore](#module\_metaflow-datastore) | ./modules/datastore | n/a | 23 | | [metaflow-metadata-service](#module\_metaflow-metadata-service) | ./modules/metadata-service | n/a | 24 | | [metaflow-step-functions](#module\_metaflow-step-functions) | ./modules/step-functions | n/a | 25 | 26 | ## Inputs 27 | 28 | | Name | Description | Type | Default | Required | 29 | |------|-------------|------|---------|:--------:| 30 | | [access\_list\_cidr\_blocks](#input\_access\_list\_cidr\_blocks) | List of CIDRs we want to grant access to our Metaflow Metadata Service. Usually this is our VPN's CIDR blocks. | `list(string)` | `[]` | no | 31 | | [api\_basic\_auth](#input\_api\_basic\_auth) | Enable basic auth for API Gateway? (requires key export) | `bool` | `true` | no | 32 | | [batch\_type](#input\_batch\_type) | AWS Batch Compute Type ('ec2', 'fargate') | `string` | `"ec2"` | no | 33 | | [compute\_environment\_desired\_vcpus](#input\_compute\_environment\_desired\_vcpus) | Desired Starting VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate) | `number` | `8` | no | 34 | | [compute\_environment\_instance\_types](#input\_compute\_environment\_instance\_types) | The instance types for the compute environment | `list(string)` |
<pre>[<br>  "c4.large",<br>  "c4.xlarge",<br>  "c4.2xlarge",<br>  "c4.4xlarge",<br>  "c4.8xlarge"<br>]</pre>
| no | 35 | | [compute\_environment\_max\_vcpus](#input\_compute\_environment\_max\_vcpus) | Maximum VCPUs for Batch Compute Environment [16-96] | `number` | `64` | no | 36 | | [compute\_environment\_min\_vcpus](#input\_compute\_environment\_min\_vcpus) | Minimum VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate) | `number` | `8` | no | 37 | | [enable\_custom\_batch\_container\_registry](#input\_enable\_custom\_batch\_container\_registry) | Provisions infrastructure for custom Amazon ECR container registry if enabled | `bool` | `false` | no | 38 | | [enable\_step\_functions](#input\_enable\_step\_functions) | Provisions infrastructure for step functions if enabled | `bool` | n/a | yes | 39 | | [iam\_partition](#input\_iam\_partition) | IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is) | `string` | `"aws"` | no | 40 | | [resource\_prefix](#input\_resource\_prefix) | string prefix for all resources | `string` | `"metaflow"` | no | 41 | | [resource\_suffix](#input\_resource\_suffix) | string suffix for all resources | `string` | `""` | no | 42 | | [subnet1\_id](#input\_subnet1\_id) | First subnet used for availability zone redundancy | `string` | n/a | yes | 43 | | [subnet2\_id](#input\_subnet2\_id) | Second subnet used for availability zone redundancy | `string` | n/a | yes | 44 | | [tags](#input\_tags) | aws tags | `map(string)` | n/a | yes | 45 | | [vpc\_cidr\_block](#input\_vpc\_cidr\_block) | The VPC CIDR block that we'll access list on our Metadata Service API to allow all internal communications | `string` | n/a | yes | 46 | | [vpc\_id](#input\_vpc\_id) | The id of the single VPC we stood up for all Metaflow resources to exist in. | `string` | n/a | yes | 47 | 48 | ## Outputs 49 | 50 | | Name | Description | 51 | |------|-------------| 52 | | [METAFLOW\_BATCH\_JOB\_QUEUE](#output\_METAFLOW\_BATCH\_JOB\_QUEUE) | AWS Batch Job Queue ARN for Metaflow | 53 | | [METAFLOW\_DATASTORE\_SYSROOT\_S3](#output\_METAFLOW\_DATASTORE\_SYSROOT\_S3) | Amazon S3 URL for Metaflow DataStore | 54 | | [METAFLOW\_DATATOOLS\_S3ROOT](#output\_METAFLOW\_DATATOOLS\_S3ROOT) | Amazon S3 URL for Metaflow DataTools | 55 | | [METAFLOW\_ECS\_S3\_ACCESS\_IAM\_ROLE](#output\_METAFLOW\_ECS\_S3\_ACCESS\_IAM\_ROLE) | Role for AWS Batch to Access Amazon S3 | 56 | | [METAFLOW\_EVENTS\_SFN\_ACCESS\_IAM\_ROLE](#output\_METAFLOW\_EVENTS\_SFN\_ACCESS\_IAM\_ROLE) | IAM role for Amazon EventBridge to access AWS Step Functions. | 57 | | [METAFLOW\_SERVICE\_INTERNAL\_URL](#output\_METAFLOW\_SERVICE\_INTERNAL\_URL) | URL for Metadata Service (Accessible in VPC) | 58 | | [METAFLOW\_SERVICE\_URL](#output\_METAFLOW\_SERVICE\_URL) | URL for Metadata Service (Accessible in VPC) | 59 | | [METAFLOW\_SFN\_DYNAMO\_DB\_TABLE](#output\_METAFLOW\_SFN\_DYNAMO\_DB\_TABLE) | AWS DynamoDB table name for tracking AWS Step Functions execution metadata. | 60 | | [METAFLOW\_SFN\_IAM\_ROLE](#output\_METAFLOW\_SFN\_IAM\_ROLE) | IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB). | 61 | | [api\_gateway\_rest\_api\_id\_key\_id](#output\_api\_gateway\_rest\_api\_id\_key\_id) | API Gateway Key ID for Metadata Service. 
Fetch Key from AWS Console [METAFLOW\_SERVICE\_AUTH\_KEY] | 62 | | [datastore\_s3\_bucket\_kms\_key\_arn](#output\_datastore\_s3\_bucket\_kms\_key\_arn) | The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket | 63 | | [metadata\_svc\_ecs\_task\_role\_arn](#output\_metadata\_svc\_ecs\_task\_role\_arn) | n/a | 64 | | [metaflow\_api\_gateway\_rest\_api\_id](#output\_metaflow\_api\_gateway\_rest\_api\_id) | The ID of the API Gateway REST API we'll use to accept MetaData service requests to forward to the Fargate API instance | 65 | | [metaflow\_batch\_container\_image](#output\_metaflow\_batch\_container\_image) | The ECR repo containing the metaflow batch image | 66 | | [metaflow\_profile\_json](#output\_metaflow\_profile\_json) | Metaflow profile JSON object that can be used to communicate with this Metaflow Stack. Store this in `~/.metaflow/config_[stack-name]` and select with `$ export METAFLOW_PROFILE=[stack-name]`. | 67 | | [metaflow\_s3\_bucket\_arn](#output\_metaflow\_s3\_bucket\_arn) | The ARN of the bucket we'll be using as blob storage | 68 | | [metaflow\_s3\_bucket\_name](#output\_metaflow\_s3\_bucket\_name) | The name of the bucket we'll be using as blob storage | 69 | | [migration\_function\_arn](#output\_migration\_function\_arn) | ARN of DB Migration Function | 70 | 71 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/data.tf: -------------------------------------------------------------------------------- 1 | data "aws_region" "current" {} 2 | 3 | data "aws_caller_identity" "current" {} 4 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/ecr.tf: -------------------------------------------------------------------------------- 1 | resource "aws_ecr_repository" "metaflow_batch_image" { 2 | count = var.enable_custom_batch_container_registry ? 
1 : 0 3 | 4 | name = local.metaflow_batch_image_name 5 | 6 | tags = var.tags 7 | } 8 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/iam.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "batch_s3_task_role_assume_role" { 2 | statement { 3 | actions = [ 4 | "sts:AssumeRole" 5 | ] 6 | 7 | effect = "Allow" 8 | 9 | principals { 10 | identifiers = [ 11 | "ecs-tasks.amazonaws.com", 12 | ] 13 | type = "Service" 14 | } 15 | } 16 | } 17 | 18 | resource "aws_iam_role" "batch_s3_task_role" { 19 | name = local.batch_s3_task_role_name 20 | 21 | description = "Role for AWS Batch to Access Amazon S3 [METAFLOW_ECS_S3_ACCESS_IAM_ROLE]" 22 | 23 | assume_role_policy = data.aws_iam_policy_document.batch_s3_task_role_assume_role.json 24 | 25 | tags = var.tags 26 | } 27 | 28 | data "aws_iam_policy_document" "custom_s3_list_batch" { 29 | statement { 30 | sid = "BucketAccessBatch" 31 | actions = [ 32 | "s3:ListBucket" 33 | ] 34 | 35 | effect = "Allow" 36 | 37 | resources = [ 38 | module.metaflow-datastore.s3_bucket_arn 39 | ] 40 | } 41 | } 42 | 43 | data "aws_iam_policy_document" "custom_s3_batch" { 44 | statement { 45 | sid = "ObjectAccessBatch" 46 | actions = [ 47 | "s3:PutObject", 48 | "s3:GetObject", 49 | "s3:DeleteObject" 50 | ] 51 | 52 | effect = "Allow" 53 | 54 | resources = [ 55 | "${module.metaflow-datastore.s3_bucket_arn}/*" 56 | ] 57 | } 58 | } 59 | 60 | data "aws_iam_policy_document" "s3_kms" { 61 | statement { 62 | effect = "Allow" 63 | 64 | actions = [ 65 | "kms:Decrypt", 66 | "kms:Encrypt", 67 | "kms:GenerateDataKey" 68 | ] 69 | 70 | resources = [ 71 | module.metaflow-datastore.datastore_s3_bucket_kms_key_arn 72 | ] 73 | } 74 | } 75 | 76 | data "aws_iam_policy_document" "deny_presigned_batch" { 77 | statement { 78 | sid = "DenyPresignedBatch" 79 | actions = [ 80 | "s3:*" 81 | ] 82 | 83 | effect = "Deny" 84 | 85 | resources = [ 86 | "*", 87 | ] 88 | 89 | condition { 90 | test = "StringNotEquals" 91 | values = [ 92 | "REST-HEADER" 93 | ] 94 | variable = "s3:authType" 95 | } 96 | } 97 | } 98 | 99 | data "aws_iam_policy_document" "allow_sagemaker" { 100 | statement { 101 | sid = "AllowSagemakerCreate" 102 | actions = [ 103 | "sagemaker:CreateTrainingJob" 104 | ] 105 | 106 | effect = "Allow" 107 | 108 | resources = [ 109 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:training-job/*", 110 | ] 111 | } 112 | 113 | statement { 114 | sid = "AllowSagemakerDescribe" 115 | actions = [ 116 | "sagemaker:DescribeTrainingJob" 117 | ] 118 | 119 | effect = "Allow" 120 | 121 | resources = [ 122 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:training-job/*", 123 | ] 124 | } 125 | 126 | statement { 127 | sid = "AllowSagemakerDeploy" 128 | actions = [ 129 | "sagemaker:CreateModel", 130 | "sagemaker:CreateEndpointConfig", 131 | "sagemaker:CreateEndpoint", 132 | "sagemaker:DescribeModel", 133 | "sagemaker:DescribeEndpoint", 134 | "sagemaker:InvokeEndpoint" 135 | ] 136 | 137 | effect = "Allow" 138 | 139 | resources = [ 140 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:endpoint/*", 141 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:model/*", 142 | "arn:${var.iam_partition}:sagemaker:${local.aws_region}:${local.aws_account_id}:endpoint-config/*", 143 | ] 144 | } 145 | } 146 | 147 | data "aws_iam_policy_document" "iam_pass_role" { 148 | statement { 149 
| sid = "AllowPassRole" 150 | actions = [ 151 | "iam:PassRole", 152 | ] 153 | 154 | effect = "Allow" 155 | 156 | resources = [ 157 | "*" 158 | ] 159 | 160 | condition { 161 | test = "StringEquals" 162 | values = [ 163 | "sagemaker.amazonaws.com" 164 | ] 165 | variable = "iam:PassedToService" 166 | } 167 | } 168 | } 169 | 170 | data "aws_iam_policy_document" "dynamodb" { 171 | statement { 172 | sid = "Items" 173 | actions = [ 174 | "dynamodb:PutItem", 175 | "dynamodb:GetItem", 176 | "dynamodb:UpdateItem", 177 | ] 178 | 179 | effect = "Allow" 180 | 181 | resources = [ 182 | module.metaflow-step-functions.metaflow_step_functions_dynamodb_table_arn 183 | ] 184 | 185 | condition { 186 | test = "StringEquals" 187 | values = [ 188 | "sagemaker.amazonaws.com" 189 | ] 190 | variable = "iam:PassedToService" 191 | } 192 | } 193 | } 194 | 195 | data "aws_iam_policy_document" "cloudwatch" { 196 | statement { 197 | sid = "AllowPutLogs" 198 | actions = [ 199 | "logs:CreateLogStream", 200 | "logs:PutLogEvents", 201 | ] 202 | 203 | effect = "Allow" 204 | 205 | resources = [ 206 | "*" 207 | ] 208 | } 209 | } 210 | 211 | resource "aws_iam_role_policy" "grant_custom_s3_list_batch" { 212 | name = "s3_list" 213 | role = aws_iam_role.batch_s3_task_role.name 214 | policy = data.aws_iam_policy_document.custom_s3_list_batch.json 215 | } 216 | 217 | resource "aws_iam_role_policy" "grant_custom_s3_batch" { 218 | name = "custom_s3" 219 | role = aws_iam_role.batch_s3_task_role.name 220 | policy = data.aws_iam_policy_document.custom_s3_batch.json 221 | } 222 | 223 | resource "aws_iam_role_policy" "grant_s3_kms" { 224 | name = "s3_kms" 225 | role = aws_iam_role.batch_s3_task_role.name 226 | policy = data.aws_iam_policy_document.s3_kms.json 227 | } 228 | 229 | resource "aws_iam_role_policy" "grant_deny_presigned_batch" { 230 | name = "deny_presigned" 231 | role = aws_iam_role.batch_s3_task_role.name 232 | policy = data.aws_iam_policy_document.deny_presigned_batch.json 233 | } 234 | 235 | resource "aws_iam_role_policy" "grant_allow_sagemaker" { 236 | name = "sagemaker" 237 | role = aws_iam_role.batch_s3_task_role.name 238 | policy = data.aws_iam_policy_document.allow_sagemaker.json 239 | } 240 | 241 | resource "aws_iam_role_policy" "grant_iam_pass_role" { 242 | name = "iam_pass_role" 243 | role = aws_iam_role.batch_s3_task_role.name 244 | policy = data.aws_iam_policy_document.iam_pass_role.json 245 | } 246 | 247 | resource "aws_iam_role_policy" "grant_dynamodb" { 248 | count = var.enable_step_functions ? 1 : 0 249 | name = "dynamodb" 250 | role = aws_iam_role.batch_s3_task_role.name 251 | policy = data.aws_iam_policy_document.dynamodb.json 252 | } 253 | 254 | resource "aws_iam_role_policy" "grant_cloudwatch" { 255 | name = "cloudwatch" 256 | role = aws_iam_role.batch_s3_task_role.name 257 | policy = data.aws_iam_policy_document.cloudwatch.json 258 | } 259 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | resource_prefix = length(var.resource_prefix) > 0 ? "${var.resource_prefix}-" : "" 3 | resource_suffix = length(var.resource_suffix) > 0 ? 
"-${var.resource_suffix}" : "" 4 | 5 | aws_region = data.aws_region.current.name 6 | aws_account_id = data.aws_caller_identity.current.account_id 7 | 8 | batch_s3_task_role_name = "${local.resource_prefix}batch_s3_task_role${local.resource_suffix}" 9 | metaflow_batch_image_name = "${local.resource_prefix}batch${local.resource_suffix}" 10 | } 11 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/main.tf: -------------------------------------------------------------------------------- 1 | module "metaflow-datastore" { 2 | source = "./modules/datastore" 3 | 4 | resource_prefix = local.resource_prefix 5 | resource_suffix = local.resource_suffix 6 | 7 | ecs_execution_role_arn = module.metaflow-computation.ecs_execution_role_arn 8 | ecs_instance_role_arn = module.metaflow-computation.ecs_instance_role_arn 9 | metadata_service_security_group_id = module.metaflow-metadata-service.metadata_service_security_group_id 10 | metaflow_vpc_id = var.vpc_id 11 | subnet1_id = var.subnet1_id 12 | subnet2_id = var.subnet2_id 13 | 14 | standard_tags = var.tags 15 | } 16 | 17 | module "metaflow-metadata-service" { 18 | source = "./modules/metadata-service" 19 | 20 | resource_prefix = local.resource_prefix 21 | resource_suffix = local.resource_suffix 22 | 23 | access_list_cidr_blocks = var.access_list_cidr_blocks 24 | api_basic_auth = var.api_basic_auth 25 | database_password = module.metaflow-datastore.database_password 26 | database_username = module.metaflow-datastore.database_username 27 | datastore_s3_bucket_kms_key_arn = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn 28 | fargate_execution_role_arn = module.metaflow-computation.ecs_execution_role_arn 29 | iam_partition = var.iam_partition 30 | metaflow_vpc_id = var.vpc_id 31 | rds_master_instance_endpoint = module.metaflow-datastore.rds_master_instance_endpoint 32 | s3_bucket_arn = module.metaflow-datastore.s3_bucket_arn 33 | subnet1_id = var.subnet1_id 34 | subnet2_id = var.subnet2_id 35 | vpc_cidr_block = var.vpc_cidr_block 36 | 37 | standard_tags = var.tags 38 | } 39 | 40 | module "metaflow-computation" { 41 | source = "./modules/computation" 42 | 43 | resource_prefix = local.resource_prefix 44 | resource_suffix = local.resource_suffix 45 | 46 | batch_type = var.batch_type 47 | compute_environment_desired_vcpus = var.compute_environment_desired_vcpus 48 | compute_environment_instance_types = var.compute_environment_instance_types 49 | compute_environment_max_vcpus = var.compute_environment_max_vcpus 50 | compute_environment_min_vcpus = var.compute_environment_min_vcpus 51 | enable_step_functions = var.enable_step_functions 52 | iam_partition = var.iam_partition 53 | metaflow_step_functions_dynamodb_policy = module.metaflow-step-functions.metaflow_step_functions_dynamodb_policy 54 | metaflow_vpc_id = var.vpc_id 55 | subnet1_id = var.subnet1_id 56 | subnet2_id = var.subnet2_id 57 | 58 | standard_tags = var.tags 59 | } 60 | 61 | module "metaflow-step-functions" { 62 | source = "./modules/step-functions" 63 | 64 | resource_prefix = local.resource_prefix 65 | resource_suffix = local.resource_suffix 66 | 67 | active = var.enable_step_functions 68 | batch_job_queue_arn = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE 69 | iam_partition = var.iam_partition 70 | s3_bucket_arn = module.metaflow-datastore.s3_bucket_arn 71 | s3_bucket_kms_arn = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn 72 | 73 | standard_tags = var.tags 74 | } 75 | 
-------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/.terraform-docs.yml: -------------------------------------------------------------------------------- 1 | formatter: markdown 2 | sections: 3 | show: 4 | - inputs 5 | - outputs 6 | output: 7 | file: "README.md" 8 | mode: inject 9 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/README.md: -------------------------------------------------------------------------------- 1 | # Modules 2 | 3 | Our Metaflow Terraform code has been separated into separate modules based on the service architecture. 4 | 5 | ## Computation 6 | 7 | Sets up remote computation resources so flows can be run on Amazon EC2 instances. These resources do not perform 8 | orchestration and rely on the data scientist's computer to perform this coordination. 9 | 10 | ## Datastore 11 | 12 | Sets up blob and tabular data storage. Records all flows, the steps they took, their conda environments, artifacts 13 | and results. 14 | 15 | Should exist for the lifetime of the stack. 16 | 17 | ## Metadata Service 18 | 19 | Sets up an API entrypoint to interact with all other services, both for running flows and interacting with the 20 | Datastore to explore historic runs. 21 | 22 | ## Step Functions 23 | 24 | Sets up remote computation resources that come with orchestration. This allows data scientists to schedule flows 25 | using crons as well as being able to kick off flows and shut down their machine, as the remote resources will handle 26 | all coordination. 27 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/README.md: -------------------------------------------------------------------------------- 1 | # Computation 2 | 3 | This module sets up the resources to run Metaflow steps on AWS Batch. 
One can modify how many resources 4 | we want to have available, as well as configure autoscaling 5 | 6 | This module is not required to use Metaflow, as you can also run steps locally and leverage the Datastore 7 | 8 | To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/metaflow-on-aws#compute) 9 | 10 | 11 | ## Inputs 12 | 13 | | Name | Description | Type | Default | Required | 14 | |------|-------------|------|---------|:--------:| 15 | | [batch\_type](#input\_batch\_type) | AWS Batch Compute Type ('ec2', 'fargate') | `string` | `"ec2"` | no | 16 | | [compute\_environment\_desired\_vcpus](#input\_compute\_environment\_desired\_vcpus) | Desired Starting VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate) | `number` | n/a | yes | 17 | | [compute\_environment\_instance\_types](#input\_compute\_environment\_instance\_types) | The instance types for the compute environment as a comma-separated list | `list(string)` | n/a | yes | 18 | | [compute\_environment\_max\_vcpus](#input\_compute\_environment\_max\_vcpus) | Maximum VCPUs for Batch Compute Environment [16-96] | `number` | n/a | yes | 19 | | [compute\_environment\_min\_vcpus](#input\_compute\_environment\_min\_vcpus) | Minimum VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate) | `number` | n/a | yes | 20 | | [enable\_step\_functions](#input\_enable\_step\_functions) | If true, apply policies required for step functions | `bool` | `false` | no | 21 | | [iam\_partition](#input\_iam\_partition) | IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is) | `string` | `"aws"` | no | 22 | | [metaflow\_step\_functions\_dynamodb\_policy](#input\_metaflow\_step\_functions\_dynamodb\_policy) | IAM policy allowing access to the step functions dynamodb policy | `string` | n/a | yes | 23 | | [metaflow\_vpc\_id](#input\_metaflow\_vpc\_id) | ID of the Metaflow VPC this SageMaker notebook instance is to be deployed in | `string` | n/a | yes | 24 | | [resource\_prefix](#input\_resource\_prefix) | Prefix given to all AWS resources to differentiate between applications | `string` | n/a | yes | 25 | | [resource\_suffix](#input\_resource\_suffix) | Suffix given to all AWS resources to differentiate between environment and workspace | `string` | n/a | yes | 26 | | [standard\_tags](#input\_standard\_tags) | The standard tags to apply to every AWS resource. | `map(string)` | n/a | yes | 27 | | [subnet1\_id](#input\_subnet1\_id) | The first private subnet used for redundancy | `string` | n/a | yes | 28 | | [subnet2\_id](#input\_subnet2\_id) | The second private subnet used for redundancy | `string` | n/a | yes | 29 | 30 | ## Outputs 31 | 32 | | Name | Description | 33 | |------|-------------| 34 | | [METAFLOW\_BATCH\_JOB\_QUEUE](#output\_METAFLOW\_BATCH\_JOB\_QUEUE) | AWS Batch Job Queue ARN for Metaflow | 35 | | [batch\_job\_queue\_arn](#output\_batch\_job\_queue\_arn) | The ARN of the job queue we'll use to accept Metaflow tasks | 36 | | [ecs\_execution\_role\_arn](#output\_ecs\_execution\_role\_arn) | The IAM role that grants access to ECS and Batch services which we'll use as our Metadata Service API's execution\_role for our Fargate instance | 37 | | [ecs\_instance\_role\_arn](#output\_ecs\_instance\_role\_arn) | This role will be granted access to our S3 Bucket which acts as our blob storage. 
| 38 | 39 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/batch.tf: -------------------------------------------------------------------------------- 1 | resource "aws_batch_compute_environment" "this" { 2 | /* Unique name for compute environment. 3 | We use compute_environment_name_prefix opposed to just compute_environment_name as batch compute environments must 4 | be created and destroyed, never edited. This way when we go to make a "modification" we will stand up a new 5 | batch compute environment with a new unique name and once that succeeds, the old one will be torn down. If we had 6 | just used compute_environment_name, then there would be a conflict when we went to stand up the new 7 | compute_environment that had the modifications applied and the process would fail. 8 | */ 9 | compute_environment_name_prefix = local.compute_env_prefix_name 10 | 11 | # Give permissions so the batch service can make API calls. 12 | service_role = aws_iam_role.batch_execution_role.arn 13 | type = "MANAGED" 14 | 15 | # On destroy, this avoids removing these policies below until compute environments are destroyed 16 | depends_on = [ 17 | aws_iam_role_policy.grant_iam_pass_role, 18 | aws_iam_role_policy.grant_custom_access_policy, 19 | aws_iam_role_policy.grant_iam_custom_policies, 20 | aws_iam_role_policy.grant_ec2_custom_policies, 21 | ] 22 | 23 | compute_resources { 24 | # Give permissions so the ECS container instances can make API call. 25 | instance_role = !local.enable_fargate_on_batch ? aws_iam_instance_profile.ecs_instance_role.arn : null 26 | 27 | # List of types that can be launched. 28 | instance_type = !local.enable_fargate_on_batch ? var.compute_environment_instance_types : null 29 | 30 | # Range of number of CPUs. 31 | max_vcpus = var.compute_environment_max_vcpus 32 | min_vcpus = !local.enable_fargate_on_batch ? var.compute_environment_min_vcpus : null 33 | desired_vcpus = !local.enable_fargate_on_batch ? var.compute_environment_desired_vcpus : null 34 | 35 | # Prefers cheap vCPU approaches 36 | allocation_strategy = !local.enable_fargate_on_batch ? "BEST_FIT" : null 37 | 38 | /* Links to a launch template who has more than the standard 8GB of disk space. So we can download training data. 39 | Always uses the "default version", which means we can update the Launch Template to a smaller or larger disk size 40 | and this compute environment will not have to be destroyed and then created to point to a new Launch Template. 41 | */ 42 | dynamic "launch_template" { 43 | for_each = aws_launch_template.cpu 44 | content { 45 | launch_template_id = launch_template.value.id 46 | version = launch_template.value.latest_version 47 | } 48 | } 49 | 50 | # Security group to apply to the instances launched. 51 | security_group_ids = [ 52 | data.aws_security_group.vpc_default.id, 53 | ] 54 | 55 | # Which subnet to launch the instances into. 56 | subnets = [ 57 | var.subnet1_id, 58 | var.subnet2_id 59 | ] 60 | 61 | # Type of instance Amazon EC2 for on-demand. Can use "SPOT" to use unused instances at discount if available 62 | type = local.enable_fargate_on_batch ? "FARGATE" : "EC2" 63 | 64 | tags = !local.enable_fargate_on_batch ? 
var.standard_tags : null 65 | } 66 | 67 | lifecycle { 68 | /* From here https://github.com/terraform-providers/terraform-provider-aws/issues/11077#issuecomment-560416740 69 | helps with "modifying" batch compute environments which requires creating new ones and deleting old ones 70 | as no inplace modification can be made 71 | */ 72 | create_before_destroy = true 73 | # To ensure terraform redeploys do not silently overwrite an up to date desired_vcpus that metaflow may modify 74 | ignore_changes = [compute_resources.0.desired_vcpus] 75 | } 76 | } 77 | 78 | resource "aws_batch_job_queue" "this" { 79 | name = local.batch_queue_name 80 | state = "ENABLED" 81 | priority = 1 82 | compute_environments = [ 83 | aws_batch_compute_environment.this.arn 84 | ] 85 | 86 | tags = var.standard_tags 87 | } 88 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/data.tf: -------------------------------------------------------------------------------- 1 | data "aws_region" "current" {} 2 | 3 | data "aws_ssm_parameter" "ecs_optimized_cpu_ami" { 4 | name = "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended" 5 | } 6 | 7 | data "aws_ssm_parameter" "ecs_optimized_gpu_ami" { 8 | name = "/aws/service/ecs/optimized-ami/amazon-linux-2/gpu/recommended" 9 | } 10 | 11 | data "aws_security_group" "vpc_default" { 12 | name = "default" 13 | vpc_id = var.metaflow_vpc_id 14 | } 15 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/ec2.tf: -------------------------------------------------------------------------------- 1 | resource "aws_launch_template" "cpu" { 2 | count = local.enable_fargate_on_batch ? 0 : 1 3 | 4 | /* To provide a large disk space than the default 8GB for AWS Batch. 5 | AWS Batch points to this using the latest version, so we can update the disk size here 6 | and AWS Batch will use that. 7 | 8 | This is used for all Metaflow AWS CPU Batch remote jobs. 9 | */ 10 | name = "${var.resource_prefix}batch-launch-tmpl-cpu-100gb${var.resource_suffix}" 11 | 12 | # Defines what IAM Role to assume to grant an Amazon EC2 instance 13 | # This role must have a policy to access the kms_key_id used to encrypt the EBS volume 14 | iam_instance_profile { 15 | arn = aws_iam_instance_profile.ecs_instance_role.arn 16 | } 17 | 18 | image_id = jsondecode(data.aws_ssm_parameter.ecs_optimized_cpu_ami.value)["image_id"] 19 | 20 | block_device_mappings { 21 | device_name = "/dev/xvda" 22 | 23 | ebs { 24 | volume_size = 100 25 | delete_on_termination = true 26 | encrypted = true 27 | } 28 | } 29 | 30 | tags = var.standard_tags 31 | } 32 | 33 | /* 34 | Instance profile is a container for an IAM role. On console when we define role 35 | instance profile is generated but here we have to manually generate. The instance 36 | profile passes role info to the instance when it starts. 
37 | Ref: 38 | https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html 39 | */ 40 | resource "aws_iam_instance_profile" "ecs_instance_role" { 41 | name = local.ecs_instance_role_name 42 | role = aws_iam_role.ecs_instance_role.name 43 | } 44 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/iam-batch-execution.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "batch_execution_role_assume_role" { 2 | statement { 3 | actions = [ 4 | "sts:AssumeRole" 5 | ] 6 | 7 | effect = "Allow" 8 | 9 | principals { 10 | identifiers = [ 11 | "batch.amazonaws.com", 12 | ] 13 | type = "Service" 14 | } 15 | } 16 | } 17 | 18 | resource "aws_iam_role" "batch_execution_role" { 19 | name = local.batch_execution_role_name 20 | # Learn more by reading this Terraform documentation https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/batch_compute_environment#argument-reference 21 | # Learn more by reading this AWS Batch documentation https://docs.aws.amazon.com/batch/latest/userguide/service_IAM_role.html 22 | description = "This role is passed to AWS Batch as a `service_role`. This allows AWS Batch to make calls to other AWS services on our behalf." 23 | 24 | assume_role_policy = data.aws_iam_policy_document.batch_execution_role_assume_role.json 25 | 26 | tags = var.standard_tags 27 | } 28 | 29 | data "aws_iam_policy_document" "iam_pass_role" { 30 | statement { 31 | actions = [ 32 | "iam:PassRole" 33 | ] 34 | 35 | effect = "Allow" 36 | 37 | resources = [ 38 | "*" 39 | ] 40 | 41 | condition { 42 | test = "StringEquals" 43 | variable = "iam:PassedToService" 44 | values = ["ec2.amazonaws.com", "ec2.amazonaws.com.cn", "ecs-tasks.amazonaws.com"] 45 | } 46 | } 47 | } 48 | 49 | data "aws_iam_policy_document" "custom_access_policy" { 50 | statement { 51 | actions = [ 52 | "ec2:DescribeAccountAttributes", 53 | "ec2:DescribeInstances", 54 | "ec2:DescribeInstanceAttribute", 55 | "ec2:DescribeSubnets", 56 | "ec2:DescribeSecurityGroups", 57 | "ec2:DescribeKeyPairs", 58 | "ec2:DescribeImages", 59 | "ec2:DescribeImageAttribute", 60 | "ec2:DescribeSpotInstanceRequests", 61 | "ec2:DescribeSpotFleetInstances", 62 | "ec2:DescribeSpotFleetRequests", 63 | "ec2:DescribeSpotPriceHistory", 64 | "ec2:DescribeVpcClassicLink", 65 | "ec2:DescribeLaunchTemplateVersions", 66 | "ec2:CreateLaunchTemplate", 67 | "ec2:DeleteLaunchTemplate", 68 | "ec2:RequestSpotFleet", 69 | "ec2:CancelSpotFleetRequests", 70 | "ec2:ModifySpotFleetRequest", 71 | "ec2:TerminateInstances", 72 | "ec2:RunInstances", 73 | "autoscaling:DescribeAccountLimits", 74 | "autoscaling:DescribeAutoScalingGroups", 75 | "autoscaling:DescribeLaunchConfigurations", 76 | "autoscaling:DescribeAutoScalingInstances", 77 | "autoscaling:CreateLaunchConfiguration", 78 | "autoscaling:CreateAutoScalingGroup", 79 | "autoscaling:UpdateAutoScalingGroup", 80 | "autoscaling:SetDesiredCapacity", 81 | "autoscaling:DeleteLaunchConfiguration", 82 | "autoscaling:DeleteAutoScalingGroup", 83 | "autoscaling:CreateOrUpdateTags", 84 | "autoscaling:SuspendProcesses", 85 | "autoscaling:PutNotificationConfiguration", 86 | "autoscaling:TerminateInstanceInAutoScalingGroup", 87 | "ecs:DescribeClusters", 88 | "ecs:DescribeContainerInstances", 89 | "ecs:DescribeTaskDefinition", 90 | "ecs:DescribeTasks", 91 | "ecs:ListClusters", 92 | "ecs:ListContainerInstances", 93 | "ecs:ListTaskDefinitionFamilies", 94 | 
"ecs:ListTaskDefinitions", 95 | "ecs:ListTasks", 96 | "ecs:CreateCluster", 97 | "ecs:DeleteCluster", 98 | "ecs:RegisterTaskDefinition", 99 | "ecs:DeregisterTaskDefinition", 100 | "ecs:RunTask", 101 | "ecs:StartTask", 102 | "ecs:StopTask", 103 | "ecs:UpdateContainerAgent", 104 | "ecs:DeregisterContainerInstance", 105 | "logs:CreateLogGroup", 106 | "logs:CreateLogStream", 107 | "logs:PutLogEvents", 108 | "logs:DescribeLogGroups", 109 | "iam:GetInstanceProfile", 110 | "iam:GetRole", 111 | ] 112 | 113 | effect = "Allow" 114 | 115 | resources = [ 116 | "*" 117 | ] 118 | } 119 | } 120 | 121 | data "aws_iam_policy_document" "iam_custom_policies" { 122 | statement { 123 | actions = [ 124 | "iam:CreateServiceLinkedRole" 125 | ] 126 | 127 | effect = "Allow" 128 | 129 | resources = [ 130 | "*", 131 | ] 132 | 133 | condition { 134 | test = "StringEquals" 135 | variable = "iam:AWSServiceName" 136 | values = ["autoscaling.amazonaws.com", "ecs.amazonaws.com"] 137 | } 138 | } 139 | } 140 | 141 | data "aws_iam_policy_document" "ec2_custom_policies" { 142 | statement { 143 | actions = [ 144 | "ec2:CreateTags" 145 | ] 146 | 147 | effect = "Allow" 148 | 149 | resources = [ 150 | "*", 151 | ] 152 | 153 | condition { 154 | test = "StringEquals" 155 | variable = "ec2:CreateAction" 156 | values = ["RunInstances"] 157 | } 158 | } 159 | } 160 | 161 | resource "aws_iam_role_policy" "grant_iam_pass_role" { 162 | name = "iam_pass_role" 163 | role = aws_iam_role.batch_execution_role.name 164 | policy = data.aws_iam_policy_document.iam_pass_role.json 165 | } 166 | 167 | resource "aws_iam_role_policy" "grant_custom_access_policy" { 168 | name = "custom_access" 169 | role = aws_iam_role.batch_execution_role.name 170 | policy = data.aws_iam_policy_document.custom_access_policy.json 171 | } 172 | 173 | resource "aws_iam_role_policy" "grant_iam_custom_policies" { 174 | name = "iam_custom" 175 | role = aws_iam_role.batch_execution_role.name 176 | policy = data.aws_iam_policy_document.iam_custom_policies.json 177 | } 178 | 179 | resource "aws_iam_role_policy" "grant_ec2_custom_policies" { 180 | name = "ec2_custom" 181 | role = aws_iam_role.batch_execution_role.name 182 | policy = data.aws_iam_policy_document.ec2_custom_policies.json 183 | } 184 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/iam-ecs-execution.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "ecs_execution_role_assume_role" { 2 | statement { 3 | actions = [ 4 | "sts:AssumeRole" 5 | ] 6 | 7 | effect = "Allow" 8 | 9 | principals { 10 | identifiers = [ 11 | "ec2.amazonaws.com", 12 | "ecs.amazonaws.com", 13 | "ecs-tasks.amazonaws.com", 14 | "batch.amazonaws.com" 15 | ] 16 | type = "Service" 17 | } 18 | } 19 | } 20 | 21 | resource "aws_iam_role" "ecs_execution_role" { 22 | name = local.ecs_execution_role_name 23 | # Read more about ECS' `task_role` and `execution_role` here https://stackoverflow.com/a/49947471 24 | description = "This role is passed to our AWS ECS' task definition as the `execution_role`. This allows things like the correct image to be pulled and logs to be stored." 
25 | assume_role_policy = data.aws_iam_policy_document.ecs_execution_role_assume_role.json 26 | 27 | tags = var.standard_tags 28 | } 29 | 30 | data "aws_iam_policy_document" "ecs_task_execution_policy" { 31 | statement { 32 | effect = "Allow" 33 | 34 | actions = [ 35 | "ecr:GetAuthorizationToken", 36 | "ecr:BatchCheckLayerAvailability", 37 | "ecr:GetDownloadUrlForLayer", 38 | "ecr:BatchGetImage", 39 | "logs:CreateLogStream", 40 | "logs:PutLogEvents" 41 | ] 42 | 43 | # The `"Resource": "*"` is not a concern and the policy that Amazon suggests using 44 | # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html 45 | resources = [ 46 | "*" 47 | ] 48 | } 49 | } 50 | 51 | resource "aws_iam_role_policy" "grant_ecs_access" { 52 | name = "ecs_access" 53 | role = aws_iam_role.ecs_execution_role.name 54 | policy = data.aws_iam_policy_document.ecs_task_execution_policy.json 55 | } 56 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/iam-ecs-instance.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "ecs_instance_role_assume_role" { 2 | statement { 3 | actions = [ 4 | "sts:AssumeRole" 5 | ] 6 | 7 | effect = "Allow" 8 | 9 | principals { 10 | identifiers = [ 11 | "ec2.amazonaws.com" 12 | ] 13 | type = "Service" 14 | } 15 | } 16 | } 17 | 18 | resource "aws_iam_role" "ecs_instance_role" { 19 | name = local.ecs_instance_role_name 20 | # Learn more by reading this Terraform documentation https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/batch_compute_environment#argument-reference 21 | # Learn more by reading this AWS Batch documentation https://docs.aws.amazon.com/batch/latest/userguide/service_IAM_role.html 22 | description = "This role is passed to AWS Batch as a `instance_role`. This allows our Metaflow Batch jobs to execute with proper permissions." 23 | 24 | assume_role_policy = data.aws_iam_policy_document.ecs_instance_role_assume_role.json 25 | } 26 | 27 | /* 28 | Attach policy AmazonEC2ContainerServiceforEC2Role to ecs_instance_role. The 29 | policy is what the role is allowed to do similar to rwx for a user. 30 | AmazonEC2ContainerServiceforEC2Role is a predefined set of permissions by aws the 31 | permissions given are at: 32 | https://docs.aws.amazon.com/AmazonECS/latest/developerguide/instance_IAM_role.html 33 | */ 34 | resource "aws_iam_role_policy_attachment" "ecs_instance_role" { 35 | role = aws_iam_role.ecs_instance_role.name 36 | policy_arn = "arn:${var.iam_partition}:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" 37 | } 38 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | # Name of Batch service's security group used on the compute environment 3 | batch_security_group_name = "${var.resource_prefix}batch-compute-environment-security-group${var.resource_suffix}" 4 | 5 | # Prefix name of Batch compute environment 6 | compute_env_prefix_name = "${var.resource_prefix}cpu${var.resource_suffix}" 7 | 8 | # Name of Batch Queue. 
9 | # replace() ensures names that are composed of just prefix + suffix do not have duplicate dashes 10 | batch_queue_name = replace("${var.resource_prefix}${var.resource_suffix}", "--", "-") 11 | 12 | # Name of IAM role to create to manage ECS tasks 13 | ecs_execution_role_name = "${var.resource_prefix}ecs-execution-role${var.resource_suffix}" 14 | 15 | # Name of Batch service IAM role 16 | batch_execution_role_name = "${var.resource_prefix}batch-execution-role${var.resource_suffix}" 17 | 18 | # Name of ECS IAM role 19 | ecs_instance_role_name = "${var.resource_prefix}ecs-iam-role${var.resource_suffix}" 20 | 21 | enable_fargate_on_batch = var.batch_type == "fargate" 22 | } 23 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/outputs.tf: -------------------------------------------------------------------------------- 1 | output "METAFLOW_BATCH_JOB_QUEUE" { 2 | value = aws_batch_job_queue.this.arn 3 | description = "AWS Batch Job Queue ARN for Metaflow" 4 | } 5 | 6 | output "batch_job_queue_arn" { 7 | value = aws_batch_job_queue.this.arn 8 | description = "The ARN of the job queue we'll use to accept Metaflow tasks" 9 | } 10 | 11 | output "ecs_execution_role_arn" { 12 | value = aws_iam_role.ecs_execution_role.arn 13 | description = "The IAM role that grants access to ECS and Batch services which we'll use as our Metadata Service API's execution_role for our Fargate instance" 14 | } 15 | 16 | output "ecs_instance_role_arn" { 17 | value = aws_iam_role.ecs_instance_role.arn 18 | description = "This role will be granted access to our S3 Bucket which acts as our blob storage." 19 | } 20 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/variables.tf: -------------------------------------------------------------------------------- 1 | variable "batch_type" { 2 | type = string 3 | description = "AWS Batch Compute Type ('ec2', 'fargate')" 4 | default = "ec2" 5 | } 6 | 7 | variable "compute_environment_desired_vcpus" { 8 | type = number 9 | description = "Desired Starting VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)" 10 | } 11 | 12 | variable "compute_environment_instance_types" { 13 | type = list(string) 14 | description = "The instance types for the compute environment as a comma-separated list" 15 | } 16 | 17 | variable "compute_environment_max_vcpus" { 18 | type = number 19 | description = "Maximum VCPUs for Batch Compute Environment [16-96]" 20 | } 21 | 22 | variable "compute_environment_min_vcpus" { 23 | type = number 24 | description = "Minimum VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)" 25 | } 26 | 27 | variable "enable_step_functions" { 28 | default = false 29 | description = "If true, apply policies required for step functions" 30 | type = bool 31 | } 32 | 33 | variable "iam_partition" { 34 | type = string 35 | default = "aws" 36 | description = "IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is)" 37 | } 38 | 39 | variable "metaflow_step_functions_dynamodb_policy" { 40 | type = string 41 | description = "IAM policy allowing access to the step functions dynamodb policy" 42 | } 43 | 44 | variable "resource_prefix" { 45 | type = string 46 | description = "Prefix given to all AWS resources to differentiate between applications" 47 | } 48 | 49 | variable "resource_suffix" { 50 | type = string 51 | 
description = "Suffix given to all AWS resources to differentiate between environment and workspace" 52 | } 53 | 54 | variable "metaflow_vpc_id" { 55 | type = string 56 | description = "ID of the Metaflow VPC this SageMaker notebook instance is to be deployed in" 57 | } 58 | 59 | variable "standard_tags" { 60 | type = map(string) 61 | description = "The standard tags to apply to every AWS resource." 62 | } 63 | 64 | variable "subnet1_id" { 65 | type = string 66 | description = "The first private subnet used for redundancy" 67 | } 68 | 69 | variable "subnet2_id" { 70 | type = string 71 | description = "The second private subnet used for redundancy" 72 | } 73 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/computation/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = ">= 3.38.0" 6 | } 7 | } 8 | required_version = ">= 0.13" 9 | } 10 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/README.md: -------------------------------------------------------------------------------- 1 | # Datastore 2 | 3 | Stores Metaflow state, acting as Metaflow's remote Datastore. The data stored includes but is not limited: 4 | 5 | - for each flow 6 | - for each version 7 | - conda environments 8 | - dependencies 9 | - artifacts 10 | - input 11 | - output 12 | 13 | No duplicate data is stored thanks to automatic deduplication built into Metaflow. 14 | 15 | To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/metaflow-on-aws#datastore) 16 | 17 | 18 | ## Inputs 19 | 20 | | Name | Description | Type | Default | Required | 21 | |------|-------------|------|---------|:--------:| 22 | | [db\_instance\_type](#input\_db\_instance\_type) | RDS instance type to launch for PostgresQL database. | `string` | `"db.t2.small"` | no | 23 | | [db\_name](#input\_db\_name) | Name of PostgresQL database for Metaflow service. | `string` | `"metaflow"` | no | 24 | | [db\_username](#input\_db\_username) | PostgresQL username; defaults to 'metaflow' | `string` | `"metaflow"` | no | 25 | | [ecs\_execution\_role\_arn](#input\_ecs\_execution\_role\_arn) | This role will be granted access to our S3 Bucket which acts as our blob storage. | `string` | n/a | yes | 26 | | [ecs\_instance\_role\_arn](#input\_ecs\_instance\_role\_arn) | This role will be granted access to our S3 Bucket which acts as our blob storage. | `string` | n/a | yes | 27 | | [metadata\_service\_security\_group\_id](#input\_metadata\_service\_security\_group\_id) | The security group ID used by the MetaData service. We'll grant this access to our DB. | `string` | n/a | yes | 28 | | [metaflow\_vpc\_id](#input\_metaflow\_vpc\_id) | ID of the Metaflow VPC this SageMaker notebook instance is to be deployed in | `string` | n/a | yes | 29 | | [resource\_prefix](#input\_resource\_prefix) | Prefix given to all AWS resources to differentiate between applications | `string` | n/a | yes | 30 | | [resource\_suffix](#input\_resource\_suffix) | Suffix given to all AWS resources to differentiate between environment and workspace | `string` | n/a | yes | 31 | | [standard\_tags](#input\_standard\_tags) | The standard tags to apply to every AWS resource. 
| `map(string)` | n/a | yes | 32 | | [subnet1\_id](#input\_subnet1\_id) | First subnet used for availability zone redundancy | `string` | n/a | yes | 33 | | [subnet2\_id](#input\_subnet2\_id) | Second subnet used for availability zone redundancy | `string` | n/a | yes | 34 | 35 | ## Outputs 36 | 37 | | Name | Description | 38 | |------|-------------| 39 | | [METAFLOW\_DATASTORE\_SYSROOT\_S3](#output\_METAFLOW\_DATASTORE\_SYSROOT\_S3) | Amazon S3 URL for Metaflow DataStore | 40 | | [METAFLOW\_DATATOOLS\_S3ROOT](#output\_METAFLOW\_DATATOOLS\_S3ROOT) | Amazon S3 URL for Metaflow DataTools | 41 | | [database\_password](#output\_database\_password) | The database password | 42 | | [database\_username](#output\_database\_username) | The database username | 43 | | [datastore\_s3\_bucket\_kms\_key\_arn](#output\_datastore\_s3\_bucket\_kms\_key\_arn) | The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket | 44 | | [rds\_master\_instance\_endpoint](#output\_rds\_master\_instance\_endpoint) | The database connection endpoint in address:port format | 45 | | [s3\_bucket\_arn](#output\_s3\_bucket\_arn) | The ARN of the bucket we'll be using as blob storage | 46 | | [s3\_bucket\_name](#output\_s3\_bucket\_name) | The name of the bucket we'll be using as blob storage | 47 | 48 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/kms.tf: -------------------------------------------------------------------------------- 1 | resource "aws_kms_key" "s3" { 2 | description = "This key is used to encrypt and decrypt the S3 bucket used to store blobs." 3 | 4 | tags = var.standard_tags 5 | } 6 | 7 | resource "aws_kms_key" "rds" { 8 | description = "This key is used to encrypt and decrypt the RDS database used to store flow execution data." 9 | 10 | tags = var.standard_tags 11 | } 12 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | # Name of PostgresQL subnet group. 
3 | pg_subnet_group_name = "${var.resource_prefix}main${var.resource_suffix}" 4 | 5 | # Name of the RDS security group 6 | rds_security_group_name = "${var.resource_prefix}rds-security-group${var.resource_suffix}" 7 | 8 | # Name of S3 bucket 9 | s3_bucket_name = "${var.resource_prefix}s3${var.resource_suffix}" 10 | } 11 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/outputs.tf: -------------------------------------------------------------------------------- 1 | output "METAFLOW_DATATOOLS_S3ROOT" { 2 | value = "s3://${aws_s3_bucket.this.bucket}/data" 3 | description = "Amazon S3 URL for Metaflow DataTools" 4 | } 5 | 6 | output "METAFLOW_DATASTORE_SYSROOT_S3" { 7 | value = "s3://${aws_s3_bucket.this.bucket}/metaflow" 8 | description = "Amazon S3 URL for Metaflow DataStore" 9 | } 10 | 11 | output "database_password" { 12 | value = random_password.this.result 13 | description = "The database password" 14 | } 15 | 16 | output "database_username" { 17 | value = var.db_username 18 | description = "The database username" 19 | } 20 | 21 | output "datastore_s3_bucket_kms_key_arn" { 22 | value = aws_kms_key.s3.arn 23 | description = "The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket" 24 | } 25 | 26 | output "rds_master_instance_endpoint" { 27 | value = aws_db_instance.this.endpoint 28 | description = "The database connection endpoint in address:port format" 29 | } 30 | 31 | output "s3_bucket_arn" { 32 | value = aws_s3_bucket.this.arn 33 | description = "The ARN of the bucket we'll be using as blob storage" 34 | } 35 | 36 | output "s3_bucket_name" { 37 | value = aws_s3_bucket.this.bucket 38 | description = "The name of the bucket we'll be using as blob storage" 39 | } 40 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/rds.tf: -------------------------------------------------------------------------------- 1 | /* 2 | A subnet is attached to an availability zone so for db redundancy and 3 | performance we need to define additional subnet(s) and aws_db_subnet_group 4 | is how we define this. 5 | */ 6 | resource "aws_db_subnet_group" "this" { 7 | name = local.pg_subnet_group_name 8 | subnet_ids = [var.subnet1_id, var.subnet2_id] 9 | 10 | tags = merge( 11 | var.standard_tags, 12 | { 13 | Name = local.pg_subnet_group_name 14 | Metaflow = "true" 15 | } 16 | ) 17 | } 18 | 19 | /* 20 | Define a new firewall for our database instance. 21 | */ 22 | resource "aws_security_group" "rds_security_group" { 23 | name = local.rds_security_group_name 24 | vpc_id = var.metaflow_vpc_id 25 | 26 | # ingress only from port 5432 27 | ingress { 28 | from_port = 5432 29 | to_port = 5432 30 | protocol = "tcp" 31 | security_groups = [var.metadata_service_security_group_id] 32 | } 33 | 34 | # egress to anywhere 35 | egress { 36 | from_port = 0 37 | to_port = 0 38 | protocol = "-1" 39 | cidr_blocks = ["0.0.0.0/0"] 40 | } 41 | 42 | tags = var.standard_tags 43 | } 44 | 45 | resource "random_password" "this" { 46 | length = 64 47 | special = true 48 | # redefines the `special` variable by removing the `@` 49 | # this documentation https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/CHAP_Limits.html 50 | # shows that the `/`, `"`, `@` and ` ` cannot be used in the password 51 | override_special = "!#$%&*()-_=+[]{}<>:?" 52 | } 53 | 54 | resource "random_pet" "final_snapshot_id" {} 55 | 56 | /* 57 | Define rds db instance. 
58 | */ 59 | resource "aws_db_instance" "this" { 60 | publicly_accessible = false 61 | allocated_storage = 20 # Allocate 20GB 62 | storage_type = "gp2" # general purpose SSD 63 | storage_encrypted = true 64 | kms_key_id = aws_kms_key.rds.arn 65 | engine = "postgres" 66 | engine_version = "11" 67 | instance_class = var.db_instance_type # Hardware configuration 68 | identifier = "${var.resource_prefix}${var.db_name}${var.resource_suffix}" # used for dns hostname needs to be customer unique in region 69 | name = var.db_name # unique id for CLI commands (name of DB table which is why we're not adding the prefix as no conflicts will occur and the API expects this table name) 70 | username = var.db_username 71 | password = random_password.this.result 72 | db_subnet_group_name = aws_db_subnet_group.this.id 73 | max_allocated_storage = 1000 # Upper limit of automatic scaled storage 74 | multi_az = true # Multiple availability zone? 75 | final_snapshot_identifier = "${var.resource_prefix}${var.db_name}-final-snapshot${var.resource_suffix}-${random_pet.final_snapshot_id.id}" # Snapshot upon delete 76 | vpc_security_group_ids = [aws_security_group.rds_security_group.id] 77 | 78 | tags = merge( 79 | var.standard_tags, 80 | { 81 | Name = "${var.resource_prefix}${var.db_name}${var.resource_suffix}" 82 | Metaflow = "true" 83 | } 84 | ) 85 | } 86 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/s3.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "this" { 2 | bucket = local.s3_bucket_name 3 | acl = "private" 4 | server_side_encryption_configuration { 5 | rule { 6 | apply_server_side_encryption_by_default { 7 | kms_master_key_id = aws_kms_key.s3.arn 8 | sse_algorithm = "aws:kms" 9 | } 10 | } 11 | } 12 | 13 | tags = merge( 14 | var.standard_tags, 15 | { 16 | Metaflow = "true" 17 | } 18 | ) 19 | } 20 | 21 | resource "aws_s3_bucket_public_access_block" "this" { 22 | bucket = aws_s3_bucket.this.id 23 | 24 | block_public_policy = true 25 | block_public_acls = true 26 | ignore_public_acls = true 27 | restrict_public_buckets = true 28 | } 29 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/variables.tf: -------------------------------------------------------------------------------- 1 | variable "db_instance_type" { 2 | type = string 3 | description = "RDS instance type to launch for PostgresQL database." 4 | default = "db.t2.small" 5 | } 6 | 7 | variable "db_name" { 8 | type = string 9 | description = "Name of PostgresQL database for Metaflow service." 10 | default = "metaflow" 11 | } 12 | 13 | variable "db_username" { 14 | type = string 15 | description = "PostgresQL username; defaults to 'metaflow'" 16 | default = "metaflow" 17 | } 18 | 19 | variable "ecs_execution_role_arn" { 20 | type = string 21 | description = "This role will be granted access to our S3 Bucket which acts as our blob storage." 22 | } 23 | 24 | variable "ecs_instance_role_arn" { 25 | type = string 26 | description = "This role will be granted access to our S3 Bucket which acts as our blob storage." 27 | } 28 | 29 | variable "metadata_service_security_group_id" { 30 | type = string 31 | description = "The security group ID used by the MetaData service. We'll grant this access to our DB." 
32 | } 33 | 34 | variable "metaflow_vpc_id" { 35 | type = string 36 | description = "ID of the Metaflow VPC this SageMaker notebook instance is to be deployed in" 37 | } 38 | 39 | variable "resource_prefix" { 40 | type = string 41 | description = "Prefix given to all AWS resources to differentiate between applications" 42 | } 43 | 44 | variable "resource_suffix" { 45 | type = string 46 | description = "Suffix given to all AWS resources to differentiate between environment and workspace" 47 | } 48 | 49 | variable "standard_tags" { 50 | type = map(string) 51 | description = "The standard tags to apply to every AWS resource." 52 | } 53 | 54 | variable "subnet1_id" { 55 | type = string 56 | description = "First subnet used for availability zone redundancy" 57 | } 58 | 59 | variable "subnet2_id" { 60 | type = string 61 | description = "Second subnet used for availability zone redundancy" 62 | } 63 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/datastore/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | } 6 | random = { 7 | source = "hashicorp/random" 8 | } 9 | } 10 | required_version = ">= 0.13" 11 | } 12 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/metadata-service/.gitignore: -------------------------------------------------------------------------------- 1 | index.py 2 | db_migrate_lambda.zip 3 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/metadata-service/README.md: -------------------------------------------------------------------------------- 1 | # Metadata Service 2 | 3 | The Metadata Service is a central store for the Metaflow metadata. Namely, it contains information about past runs, and pointers to data artifacts they produced. Metaflow client talks to the Metadata service over an HTTP API endpoint. Metadata service is not strictly required to use Metaflow, especially in the local mode, but it enables a lot of useful functionality, especially if there is more than person using Metaflow in your team. 4 | 5 | This terraform module provisions infrastructure to run Metadata service on AWS Fargate. 6 | 7 | To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/metaflow-on-aws#metadata) 8 | 9 | ### Access control 10 | 11 | If the `access_list_cidr_blocks` variable is set, only traffic originating from the specified IP addresses will be accepted. Services internal to AWS can directly access the load balancer used by the API. 12 | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | Type | Default | Required | 17 | |------|-------------|------|---------|:--------:| 18 | | [access\_list\_cidr\_blocks](#input\_access\_list\_cidr\_blocks) | List of CIDRs we want to grant access to our Metaflow Metadata Service. Usually this is our VPN's CIDR blocks. | `list(string)` | n/a | yes | 19 | | [api\_basic\_auth](#input\_api\_basic\_auth) | Enable basic auth for API Gateway? 
(requires key export) | `bool` | `true` | no | 20 | | [database\_password](#input\_database\_password) | The database password | `string` | n/a | yes | 21 | | [database\_username](#input\_database\_username) | The database username | `string` | n/a | yes | 22 | | [datastore\_s3\_bucket\_kms\_key\_arn](#input\_datastore\_s3\_bucket\_kms\_key\_arn) | The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket | `string` | n/a | yes | 23 | | [fargate\_execution\_role\_arn](#input\_fargate\_execution\_role\_arn) | The IAM role that grants access to ECS and Batch services which we'll use as our Metadata Service API's execution\_role for our Fargate instance | `string` | n/a | yes | 24 | | [iam\_partition](#input\_iam\_partition) | IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is) | `string` | `"aws"` | no | 25 | | [is\_gov](#input\_is\_gov) | Set to true if IAM partition is 'aws-us-gov' | `bool` | `false` | no | 26 | | [metaflow\_vpc\_id](#input\_metaflow\_vpc\_id) | ID of the Metaflow VPC this SageMaker notebook instance is to be deployed in | `string` | n/a | yes | 27 | | [rds\_master\_instance\_endpoint](#input\_rds\_master\_instance\_endpoint) | The database connection endpoint in address:port format | `string` | n/a | yes | 28 | | [resource\_prefix](#input\_resource\_prefix) | Prefix given to all AWS resources to differentiate between applications | `string` | n/a | yes | 29 | | [resource\_suffix](#input\_resource\_suffix) | Suffix given to all AWS resources to differentiate between environment and workspace | `string` | n/a | yes | 30 | | [s3\_bucket\_arn](#input\_s3\_bucket\_arn) | The ARN of the bucket we'll be using as blob storage | `string` | n/a | yes | 31 | | [standard\_tags](#input\_standard\_tags) | The standard tags to apply to every AWS resource. | `map(string)` | n/a | yes | 32 | | [subnet1\_id](#input\_subnet1\_id) | First private subnet used for availability zone redundancy | `string` | n/a | yes | 33 | | [subnet2\_id](#input\_subnet2\_id) | Second private subnet used for availability zone redundancy | `string` | n/a | yes | 34 | | [vpc\_cidr\_block](#input\_vpc\_cidr\_block) | The VPC CIDR block that we'll access list on our Metadata Service API to allow all internal communications | `string` | n/a | yes | 35 | 36 | ## Outputs 37 | 38 | | Name | Description | 39 | |------|-------------| 40 | | [METAFLOW\_SERVICE\_INTERNAL\_URL](#output\_METAFLOW\_SERVICE\_INTERNAL\_URL) | URL for Metadata Service (Accessible in VPC) | 41 | | [METAFLOW\_SERVICE\_URL](#output\_METAFLOW\_SERVICE\_URL) | URL for Metadata Service (Open to Public Access) | 42 | | [api\_gateway\_rest\_api\_id](#output\_api\_gateway\_rest\_api\_id) | The ID of the API Gateway REST API we'll use to accept MetaData service requests to forward to the Fargate API instance | 43 | | [api\_gateway\_rest\_api\_id\_key\_id](#output\_api\_gateway\_rest\_api\_id\_key\_id) | API Gateway Key ID for Metadata Service. Fetch Key from AWS Console [METAFLOW\_SERVICE\_AUTH\_KEY] | 44 | | [metadata\_service\_security\_group\_id](#output\_metadata\_service\_security\_group\_id) | The security group ID used by the MetaData service. We'll grant this access to our DB. | 45 | | [metadata\_svc\_ecs\_task\_role\_arn](#output\_metadata\_svc\_ecs\_task\_role\_arn) | This role is passed to AWS ECS' task definition as the `task_role`. This allows the running of the Metaflow Metadata Service to have the proper permissions to speak to other AWS resources. 
| 46 | | [migration\_function\_arn](#output\_migration\_function\_arn) | ARN of DB Migration Function | 47 | | [network\_load\_balancer\_dns\_name](#output\_network\_load\_balancer\_dns\_name) | The DNS addressable name for the Network Load Balancer that accepts requests and forwards them to our Fargate MetaData service instance(s) | 48 | 49 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/metadata-service/api-gateway.tf: -------------------------------------------------------------------------------- 1 | resource "aws_api_gateway_rest_api_policy" "this" { 2 | count = length(var.access_list_cidr_blocks) > 0 ? 1 : 0 3 | rest_api_id = aws_api_gateway_rest_api.this.id 4 | policy = < 10 | ## Inputs 11 | 12 | | Name | Description | Type | Default | Required | 13 | |------|-------------|------|---------|:--------:| 14 | | [active](#input\_active) | When true step function infrastructure is provisioned. | `bool` | `false` | no | 15 | | [batch\_job\_queue\_arn](#input\_batch\_job\_queue\_arn) | Batch job queue arn | `string` | n/a | yes | 16 | | [iam\_partition](#input\_iam\_partition) | IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is) | `string` | `"aws"` | no | 17 | | [resource\_prefix](#input\_resource\_prefix) | Prefix given to all AWS resources to differentiate between applications | `string` | n/a | yes | 18 | | [resource\_suffix](#input\_resource\_suffix) | Suffix given to all AWS resources to differentiate between environment and workspace | `string` | n/a | yes | 19 | | [s3\_bucket\_arn](#input\_s3\_bucket\_arn) | arn of the metaflow datastore s3 bucket | `string` | n/a | yes | 20 | | [s3\_bucket\_kms\_arn](#input\_s3\_bucket\_kms\_arn) | arn of the metaflow datastore s3 bucket's kms key | `string` | n/a | yes | 21 | | [standard\_tags](#input\_standard\_tags) | The standard tags to apply to every AWS resource. | `map(string)` | n/a | yes | 22 | 23 | ## Outputs 24 | 25 | | Name | Description | 26 | |------|-------------| 27 | | [metaflow\_eventbridge\_role\_arn](#output\_metaflow\_eventbridge\_role\_arn) | IAM role for Amazon EventBridge to access AWS Step Functions. | 28 | | [metaflow\_step\_functions\_dynamodb\_policy](#output\_metaflow\_step\_functions\_dynamodb\_policy) | Policy json allowing access to the step functions dynamodb table. | 29 | | [metaflow\_step\_functions\_dynamodb\_table\_arn](#output\_metaflow\_step\_functions\_dynamodb\_table\_arn) | AWS DynamoDB table arn for tracking AWS Step Functions execution metadata. | 30 | | [metaflow\_step\_functions\_dynamodb\_table\_name](#output\_metaflow\_step\_functions\_dynamodb\_table\_name) | AWS DynamoDB table name for tracking AWS Step Functions execution metadata. | 31 | | [metaflow\_step\_functions\_role\_arn](#output\_metaflow\_step\_functions\_role\_arn) | IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB). 
| 32 | 33 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/data.tf: -------------------------------------------------------------------------------- 1 | data "aws_caller_identity" "current" {} 2 | 3 | data "aws_region" "current" {} 4 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/dynamodb.tf: -------------------------------------------------------------------------------- 1 | resource "aws_dynamodb_table" "step_functions_state_table" { 2 | count = var.active ? 1 : 0 3 | name = local.dynamodb_step_functions_state_db_name 4 | billing_mode = "PAY_PER_REQUEST" 5 | hash_key = "pathspec" 6 | 7 | server_side_encryption { 8 | enabled = true 9 | } 10 | 11 | point_in_time_recovery { 12 | enabled = false 13 | } 14 | 15 | attribute { 16 | name = "pathspec" 17 | type = "S" 18 | } 19 | 20 | ttl { 21 | attribute_name = "ttl" 22 | enabled = true 23 | } 24 | 25 | tags = var.standard_tags 26 | } 27 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/iam-eventbridge.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "eventbridge_assume_role_policy" { 2 | statement { 3 | effect = "Allow" 4 | 5 | principals { 6 | identifiers = [ 7 | "events.amazonaws.com" 8 | ] 9 | type = "Service" 10 | } 11 | 12 | actions = [ 13 | "sts:AssumeRole" 14 | ] 15 | } 16 | } 17 | 18 | data "aws_iam_policy_document" "eventbridge_step_functions_policy" { 19 | statement { 20 | actions = [ 21 | "states:StartExecution" 22 | ] 23 | 24 | resources = [ 25 | "arn:${var.iam_partition}:states:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:stateMachine:*" 26 | ] 27 | } 28 | } 29 | 30 | resource "aws_iam_role" "eventbridge_role" { 31 | count = var.active ? 1 : 0 32 | name = "${var.resource_prefix}eventbridge_role${var.resource_suffix}" 33 | description = "IAM role for Amazon EventBridge to access AWS Step Functions." 34 | assume_role_policy = data.aws_iam_policy_document.eventbridge_assume_role_policy.json 35 | 36 | tags = var.standard_tags 37 | } 38 | 39 | resource "aws_iam_role_policy" "eventbridge_step_functions_policy" { 40 | count = var.active ? 
1 : 0 41 | name = "step_functions" 42 | role = aws_iam_role.eventbridge_role[0].id 43 | policy = data.aws_iam_policy_document.eventbridge_step_functions_policy.json 44 | } 45 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/iam-step-functions.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "step_functions_assume_role_policy" { 2 | statement { 3 | effect = "Allow" 4 | 5 | principals { 6 | identifiers = [ 7 | "states.amazonaws.com" 8 | ] 9 | type = "Service" 10 | } 11 | 12 | actions = [ 13 | "sts:AssumeRole" 14 | ] 15 | } 16 | } 17 | 18 | data "aws_iam_policy_document" "step_functions_batch_policy" { 19 | statement { 20 | actions = [ 21 | "batch:TerminateJob", 22 | "batch:DescribeJobs", 23 | "batch:DescribeJobDefinitions", 24 | "batch:DescribeJobQueues", 25 | "batch:RegisterJobDefinition" 26 | ] 27 | 28 | resources = [ 29 | "*" 30 | ] 31 | } 32 | 33 | statement { 34 | actions = [ 35 | "batch:SubmitJob" 36 | ] 37 | 38 | resources = [ 39 | var.batch_job_queue_arn, 40 | "arn:${var.iam_partition}:batch:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:job-definition/*" 41 | ] 42 | } 43 | } 44 | 45 | data "aws_iam_policy_document" "step_functions_s3" { 46 | statement { 47 | actions = [ 48 | "s3:ListBucket" 49 | ] 50 | 51 | resources = [ 52 | var.s3_bucket_arn 53 | ] 54 | } 55 | 56 | statement { 57 | actions = [ 58 | "s3:*Object" 59 | ] 60 | 61 | resources = [ 62 | var.s3_bucket_arn, "${var.s3_bucket_arn}/*" 63 | ] 64 | } 65 | 66 | statement { 67 | actions = [ 68 | "kms:Decrypt" 69 | ] 70 | 71 | resources = [ 72 | var.s3_bucket_kms_arn 73 | ] 74 | } 75 | } 76 | 77 | data "aws_iam_policy_document" "step_functions_cloudwatch" { 78 | statement { 79 | actions = [ 80 | "logs:CreateLogDelivery", 81 | "logs:GetLogDelivery", 82 | "logs:UpdateLogDelivery", 83 | "logs:DeleteLogDelivery", 84 | "logs:ListLogDeliveries", 85 | "logs:PutResourcePolicy", 86 | "logs:DescribeResourcePolicies", 87 | "logs:DescribeLogGroups" 88 | ] 89 | 90 | resources = [ 91 | "*" 92 | ] 93 | } 94 | } 95 | 96 | data "aws_iam_policy_document" "step_functions_eventbridge" { 97 | statement { 98 | actions = [ 99 | "events:PutTargets", 100 | "events:DescribeRule" 101 | ] 102 | 103 | resources = [ 104 | "arn:${var.iam_partition}:events:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:rule/StepFunctionsGetEventsForBatchJobsRule", 105 | ] 106 | } 107 | 108 | statement { 109 | actions = [ 110 | "events:PutRule" 111 | ] 112 | 113 | resources = [ 114 | "arn:${var.iam_partition}:events:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:rule/StepFunctionsGetEventsForBatchJobsRule" 115 | ] 116 | 117 | condition { 118 | test = "StringEquals" 119 | variable = "events:detail-type" 120 | values = ["Batch Job State Change"] 121 | } 122 | } 123 | } 124 | 125 | data "aws_iam_policy_document" "step_functions_dynamodb" { 126 | statement { 127 | actions = [ 128 | "dynamodb:PutItem", 129 | "dynamodb:GetItem", 130 | "dynamodb:UpdateItem" 131 | ] 132 | 133 | resources = [ 134 | join("", [for arn in aws_dynamodb_table.step_functions_state_table.*.arn : arn]) 135 | ] 136 | } 137 | } 138 | 139 | resource "aws_iam_role" "step_functions_role" { 140 | count = var.active ? 
1 : 0 141 | name = "${var.resource_prefix}step_functions_role${var.resource_suffix}" 142 | description = "IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB)." 143 | assume_role_policy = data.aws_iam_policy_document.step_functions_assume_role_policy.json 144 | 145 | tags = var.standard_tags 146 | } 147 | 148 | resource "aws_iam_role_policy" "step_functions_batch" { 149 | count = var.active ? 1 : 0 150 | name = "aws_batch" 151 | role = aws_iam_role.step_functions_role[0].id 152 | policy = data.aws_iam_policy_document.step_functions_batch_policy.json 153 | } 154 | 155 | resource "aws_iam_role_policy" "step_functions_s3" { 156 | count = var.active ? 1 : 0 157 | name = "s3" 158 | role = aws_iam_role.step_functions_role[0].id 159 | policy = data.aws_iam_policy_document.step_functions_s3.json 160 | } 161 | 162 | resource "aws_iam_role_policy" "step_functions_cloudwatch" { 163 | count = var.active ? 1 : 0 164 | name = "cloudwatch" 165 | role = aws_iam_role.step_functions_role[0].id 166 | policy = data.aws_iam_policy_document.step_functions_cloudwatch.json 167 | } 168 | 169 | resource "aws_iam_role_policy" "step_functions_eventbridge" { 170 | count = var.active ? 1 : 0 171 | name = "event_bridge" 172 | role = aws_iam_role.step_functions_role[0].id 173 | policy = data.aws_iam_policy_document.step_functions_eventbridge.json 174 | } 175 | 176 | resource "aws_iam_role_policy" "step_functions_dynamodb" { 177 | count = var.active ? 1 : 0 178 | name = "dynamodb" 179 | role = aws_iam_role.step_functions_role[0].id 180 | policy = data.aws_iam_policy_document.step_functions_dynamodb.json 181 | } 182 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | dynamodb_step_functions_state_db_name = "${var.resource_prefix}step_functions_state${var.resource_suffix}" 3 | } 4 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/outputs.tf: -------------------------------------------------------------------------------- 1 | output "metaflow_eventbridge_role_arn" { 2 | value = join("", [for arn in aws_iam_role.eventbridge_role.*.arn : arn]) 3 | description = "IAM role for Amazon EventBridge to access AWS Step Functions." 4 | } 5 | 6 | output "metaflow_step_functions_dynamodb_policy" { 7 | value = var.active ? data.aws_iam_policy_document.step_functions_dynamodb.json : "" 8 | description = "Policy json allowing access to the step functions dynamodb table." 9 | } 10 | 11 | output "metaflow_step_functions_dynamodb_table_arn" { 12 | value = join("", [for arn in aws_dynamodb_table.step_functions_state_table.*.arn : arn]) 13 | description = "AWS DynamoDB table arn for tracking AWS Step Functions execution metadata." 14 | } 15 | 16 | output "metaflow_step_functions_dynamodb_table_name" { 17 | value = join("", [for name in aws_dynamodb_table.step_functions_state_table.*.name : name]) 18 | description = "AWS DynamoDB table name for tracking AWS Step Functions execution metadata." 19 | } 20 | 21 | output "metaflow_step_functions_role_arn" { 22 | value = join("", [for arn in aws_iam_role.step_functions_role.*.arn : arn]) 23 | description = "IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB)." 
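# Added explanatory comment (not in the upstream file): most outputs in this file wrap the
# resource attribute in join("", [for ... in resource.*.attr : ...]). Because every resource here
# is created with count = var.active ? 1 : 0, the splat list is empty when var.active is false,
# so the output resolves to an empty string instead of failing on an index into a missing resource.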
24 | } 25 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/variables.tf: -------------------------------------------------------------------------------- 1 | variable "active" { 2 | default = false 3 | description = "When true step function infrastructure is provisioned." 4 | type = bool 5 | } 6 | 7 | variable "batch_job_queue_arn" { 8 | type = string 9 | description = "Batch job queue arn" 10 | } 11 | 12 | variable "iam_partition" { 13 | type = string 14 | default = "aws" 15 | description = "IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is)" 16 | } 17 | 18 | variable "resource_prefix" { 19 | type = string 20 | description = "Prefix given to all AWS resources to differentiate between applications" 21 | } 22 | 23 | variable "resource_suffix" { 24 | type = string 25 | description = "Suffix given to all AWS resources to differentiate between environment and workspace" 26 | } 27 | 28 | variable "s3_bucket_arn" { 29 | type = string 30 | description = "arn of the metaflow datastore s3 bucket" 31 | } 32 | 33 | variable "s3_bucket_kms_arn" { 34 | type = string 35 | description = "arn of the metaflow datastore s3 bucket's kms key" 36 | } 37 | 38 | variable "standard_tags" { 39 | type = map(string) 40 | description = "The standard tags to apply to every AWS resource." 41 | } 42 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/modules/step-functions/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/outputs.tf: -------------------------------------------------------------------------------- 1 | output "METAFLOW_BATCH_JOB_QUEUE" { 2 | value = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE 3 | description = "AWS Batch Job Queue ARN for Metaflow" 4 | } 5 | 6 | output "METAFLOW_DATASTORE_SYSROOT_S3" { 7 | value = module.metaflow-datastore.METAFLOW_DATASTORE_SYSROOT_S3 8 | description = "Amazon S3 URL for Metaflow DataStore" 9 | } 10 | 11 | output "METAFLOW_DATATOOLS_S3ROOT" { 12 | value = module.metaflow-datastore.METAFLOW_DATATOOLS_S3ROOT 13 | description = "Amazon S3 URL for Metaflow DataTools" 14 | } 15 | 16 | output "METAFLOW_ECS_S3_ACCESS_IAM_ROLE" { 17 | value = aws_iam_role.batch_s3_task_role.arn 18 | description = "Role for AWS Batch to Access Amazon S3" 19 | } 20 | 21 | output "METAFLOW_EVENTS_SFN_ACCESS_IAM_ROLE" { 22 | value = module.metaflow-step-functions.metaflow_eventbridge_role_arn 23 | description = "IAM role for Amazon EventBridge to access AWS Step Functions." 24 | } 25 | 26 | output "METAFLOW_SERVICE_INTERNAL_URL" { 27 | value = module.metaflow-metadata-service.METAFLOW_SERVICE_INTERNAL_URL 28 | description = "URL for Metadata Service (Accessible in VPC)" 29 | } 30 | 31 | output "METAFLOW_SERVICE_URL" { 32 | value = module.metaflow-metadata-service.METAFLOW_SERVICE_URL 33 | description = "URL for Metadata Service (Open to Public Access)" 34 | } 35 | 36 | output "METAFLOW_SFN_DYNAMO_DB_TABLE" { 37 | value = module.metaflow-step-functions.metaflow_step_functions_dynamodb_table_name 38 | description = "AWS DynamoDB table name for tracking AWS Step Functions execution metadata."
39 | } 40 | 41 | output "METAFLOW_SFN_IAM_ROLE" { 42 | value = module.metaflow-step-functions.metaflow_step_functions_role_arn 43 | description = "IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB)." 44 | } 45 | 46 | output "api_gateway_rest_api_id_key_id" { 47 | value = module.metaflow-metadata-service.api_gateway_rest_api_id_key_id 48 | description = "API Gateway Key ID for Metadata Service. Fetch Key from AWS Console [METAFLOW_SERVICE_AUTH_KEY]" 49 | } 50 | 51 | output "datastore_s3_bucket_kms_key_arn" { 52 | value = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn 53 | description = "The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket" 54 | } 55 | 56 | output "metadata_svc_ecs_task_role_arn" { 57 | value = module.metaflow-metadata-service.metadata_svc_ecs_task_role_arn 58 | } 59 | 60 | output "metaflow_api_gateway_rest_api_id" { 61 | value = module.metaflow-metadata-service.api_gateway_rest_api_id 62 | description = "The ID of the API Gateway REST API we'll use to accept MetaData service requests to forward to the Fargate API instance" 63 | } 64 | 65 | output "metaflow_batch_container_image" { 66 | value = var.enable_custom_batch_container_registry ? aws_ecr_repository.metaflow_batch_image[0].repository_url : "" 67 | description = "The ECR repo containing the metaflow batch image" 68 | } 69 | 70 | output "metaflow_profile_json" { 71 | value = jsonencode( 72 | merge( 73 | var.enable_custom_batch_container_registry ? { 74 | "METAFLOW_BATCH_CONTAINER_REGISTRY" = element(split("/", aws_ecr_repository.metaflow_batch_image[0].repository_url), 0), 75 | "METAFLOW_BATCH_CONTAINER_IMAGE" = element(split("/", aws_ecr_repository.metaflow_batch_image[0].repository_url), 1) 76 | } : {}, 77 | var.api_basic_auth ? { 78 | "METAFLOW_SERVICE_AUTH_KEY" = "## Replace with output from 'aws apigateway get-api-key --api-key ${module.metaflow-metadata-service.api_gateway_rest_api_id_key_id} --include-value | grep value' ##" 79 | } : {}, 80 | var.batch_type == "fargate" ? { 81 | "METAFLOW_ECS_FARGATE_EXECUTION_ROLE" = module.metaflow-computation.ecs_execution_role_arn 82 | } : {}, 83 | { 84 | "METAFLOW_DATASTORE_SYSROOT_S3" = module.metaflow-datastore.METAFLOW_DATASTORE_SYSROOT_S3, 85 | "METAFLOW_DATATOOLS_S3ROOT" = module.metaflow-datastore.METAFLOW_DATATOOLS_S3ROOT, 86 | "METAFLOW_BATCH_JOB_QUEUE" = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE, 87 | "METAFLOW_ECS_S3_ACCESS_IAM_ROLE" = aws_iam_role.batch_s3_task_role.arn 88 | "METAFLOW_SERVICE_URL" = module.metaflow-metadata-service.METAFLOW_SERVICE_URL, 89 | "METAFLOW_SERVICE_INTERNAL_URL" = module.metaflow-metadata-service.METAFLOW_SERVICE_INTERNAL_URL, 90 | "METAFLOW_SFN_IAM_ROLE" = module.metaflow-step-functions.metaflow_step_functions_role_arn, 91 | "METAFLOW_SFN_STATE_MACHINE_PREFIX" = replace("${local.resource_prefix}${local.resource_suffix}", "--", "-"), 92 | "METAFLOW_EVENTS_SFN_ACCESS_IAM_ROLE" = module.metaflow-step-functions.metaflow_eventbridge_role_arn, 93 | "METAFLOW_SFN_DYNAMO_DB_TABLE" = module.metaflow-step-functions.metaflow_step_functions_dynamodb_table_name, 94 | "METAFLOW_DEFAULT_DATASTORE" = "s3", 95 | "METAFLOW_DEFAULT_METADATA" = "service" 96 | } 97 | ) 98 | ) 99 | description = "Metaflow profile JSON object that can be used to communicate with this Metaflow Stack. Store this in `~/.metaflow/config_[stack-name]` and select with `$ export METAFLOW_PROFILE=[stack-name]`." 
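# Illustrative usage only (an assumption, not part of the upstream module): per the description
# above, the rendered profile can be written to ~/.metaflow/config_[stack-name] and selected with
# the METAFLOW_PROFILE environment variable. With Terraform >= 0.14 (for the -raw flag) and
# "mystack" as a placeholder stack name, that might look like:
#   terraform output -raw metaflow_profile_json > ~/.metaflow/config_mystack
#   export METAFLOW_PROFILE=mystack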
100 | } 101 | 102 | output "metaflow_s3_bucket_name" { 103 | value = module.metaflow-datastore.s3_bucket_name 104 | description = "The name of the bucket we'll be using as blob storage" 105 | } 106 | 107 | output "metaflow_s3_bucket_arn" { 108 | value = module.metaflow-datastore.s3_bucket_arn 109 | description = "The ARN of the bucket we'll be using as blob storage" 110 | } 111 | 112 | output "migration_function_arn" { 113 | value = module.metaflow-metadata-service.migration_function_arn 114 | description = "ARN of DB Migration Function" 115 | } 116 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/variables.tf: -------------------------------------------------------------------------------- 1 | variable "access_list_cidr_blocks" { 2 | type = list(string) 3 | description = "List of CIDRs we want to grant access to our Metaflow Metadata Service. Usually this is our VPN's CIDR blocks." 4 | default = [] 5 | } 6 | 7 | variable "api_basic_auth" { 8 | type = bool 9 | default = true 10 | description = "Enable basic auth for API Gateway? (requires key export)" 11 | } 12 | 13 | variable "batch_type" { 14 | type = string 15 | description = "AWS Batch Compute Type ('ec2', 'fargate')" 16 | default = "ec2" 17 | } 18 | 19 | variable "enable_custom_batch_container_registry" { 20 | type = bool 21 | default = false 22 | description = "Provisions infrastructure for custom Amazon ECR container registry if enabled" 23 | } 24 | 25 | variable "enable_step_functions" { 26 | type = bool 27 | description = "Provisions infrastructure for step functions if enabled" 28 | } 29 | 30 | variable "resource_prefix" { 31 | default = "metaflow" 32 | description = "string prefix for all resources" 33 | } 34 | 35 | variable "resource_suffix" { 36 | default = "" 37 | description = "string suffix for all resources" 38 | } 39 | 40 | variable "compute_environment_desired_vcpus" { 41 | type = number 42 | description = "Desired Starting VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)" 43 | default = 8 44 | } 45 | 46 | variable "compute_environment_instance_types" { 47 | type = list(string) 48 | description = "The instance types for the compute environment" 49 | default = ["c4.large", "c4.xlarge", "c4.2xlarge", "c4.4xlarge", "c4.8xlarge"] 50 | } 51 | 52 | variable "compute_environment_min_vcpus" { 53 | type = number 54 | description = "Minimum VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate)" 55 | default = 8 56 | } 57 | 58 | variable "compute_environment_max_vcpus" { 59 | type = number 60 | description = "Maximum VCPUs for Batch Compute Environment [16-96]" 61 | default = 64 62 | } 63 | 64 | variable "iam_partition" { 65 | type = string 66 | default = "aws" 67 | description = "IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is)" 68 | } 69 | 70 | variable "tags" { 71 | description = "aws tags" 72 | type = map(string) 73 | } 74 | 75 | # variables from infra project that defines the VPC we will deploy to 76 | 77 | variable "subnet1_id" { 78 | type = string 79 | description = "First subnet used for availability zone redundancy" 80 | } 81 | 82 | variable "subnet2_id" { 83 | type = string 84 | description = "Second subnet used for availability zone redundancy" 85 | } 86 | 87 | variable "vpc_cidr_block" { 88 | type = string 89 | description = "The VPC CIDR block that we'll access list on our Metadata Service API to allow all internal communications" 90 | 
} 91 | 92 | variable "vpc_id" { 93 | type = string 94 | description = "The id of the single VPC we stood up for all Metaflow resources to exist in." 95 | } 96 | -------------------------------------------------------------------------------- /aws/terraform/modules/metaflow/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | # grants us access to the new `aws_api_gateway_rest_api_policy` resource which allows us to more easily avoid 6 | # hard-coding values and avoid self-referential issues when attempting to get the `aws_api_gateway_rest_api`'s 7 | # id for writing the policy. Previously we wrote the policy inline which is the old style. We're jumping from `v3.7.0` to 8 | # `v3.16.0` which is only a minor upgrade. 9 | version = ">= 3.38.0" 10 | } 11 | } 12 | required_version = ">= 0.13" 13 | } 14 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/README.md: -------------------------------------------------------------------------------- 1 | # SageMaker Notebook 2 | 3 | Provides an always-on SageMaker Notebook instance that can speak to our internal Metadata Service and Datastore. 4 | This allows data scientists to use this notebook to interact with historical flows, inspecting executions and artifacts. 5 | 6 | Depends on the outputs of the `infra` and `metaflow` projects. 7 | 8 | To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/metaflow-on-aws#notebooks) 9 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/data.tf: -------------------------------------------------------------------------------- 1 | data "aws_caller_identity" "current" {} 2 | 3 | data "aws_region" "current" {} 4 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/example.tfvars: -------------------------------------------------------------------------------- 1 | env = "prod" 2 | aws_region = "us-west-2" 3 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/iam.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy_document" "sagemaker_execution_role_assume_role" { 2 | statement { 3 | effect = "Allow" 4 | 5 | principals { 6 | identifiers = [ 7 | "sagemaker.amazonaws.com" 8 | ] 9 | type = "Service" 10 | } 11 | 12 | actions = [ 13 | "sts:AssumeRole" 14 | ] 15 | } 16 | } 17 | 18 | resource "aws_iam_role" "sagemaker_execution_role" { 19 | name = local.sagemaker_execution_role_name 20 | description = "The role that our SageMaker instance uses" 21 | 22 | assume_role_policy = data.aws_iam_policy_document.sagemaker_execution_role_assume_role.json 23 | 24 | tags = local.standard_tags 25 | } 26 | 27 | data "aws_iam_policy_document" "iam_pass_role" { 28 | statement { 29 | sid = "AllowPassRole" 30 | 31 | actions = [ 32 | "iam:PassRole" 33 | ] 34 | 35 | resources = [ 36 | "*" 37 | ] 38 | 39 | condition { 40 | test = "StringEquals" 41 | values = [ 42 | "sagemaker.amazonaws.com" 43 | ] 44 | variable = "iam:PassedToService" 45 | } 46 | } 47 | } 48 | 49 | data "aws_iam_policy_document" "misc_permissions" { 50 | statement { 51 | sid = "MiscPermissions" 52 | 53 | effect = "Allow" 54 | 55 | actions = [ 56 | "cloudwatch:PutMetricData", 57 |
"ecr:GetDownloadUrlForLayer", 58 | "ecr:BatchGetImage", 59 | "ecr:GetAuthorizationToken", 60 | "ecr:BatchCheckLayerAvailability" 61 | ] 62 | 63 | resources = [ 64 | "*" 65 | ] 66 | } 67 | } 68 | 69 | data "aws_iam_policy_document" "logs_roles_policy" { 70 | statement { 71 | sid = "CreateLogStream" 72 | 73 | effect = "Allow" 74 | 75 | actions = [ 76 | "logs:CreateLogStream" 77 | ] 78 | 79 | resources = [ 80 | "arn:${var.iam_partition}:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/batch/job:log-stream:*", 81 | "arn:${var.iam_partition}:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/sagemaker/NotebookInstances:log-stream:*", 82 | ] 83 | } 84 | 85 | statement { 86 | sid = "LogEvents" 87 | 88 | effect = "Allow" 89 | 90 | actions = [ 91 | "logs:PutLogEvents", 92 | "logs:GetLogEvents", 93 | ] 94 | 95 | resources = [ 96 | "${aws_sagemaker_notebook_instance.this.arn}/jupyter.log", 97 | "${aws_sagemaker_notebook_instance.this.arn}/LifecycleConfigOnCreate", 98 | "arn:${var.iam_partition}:logs:${local.aws_region}:${local.aws_account_id}:log-group:/aws/batch/job:log-stream:job-queue-", 99 | ] 100 | } 101 | 102 | statement { 103 | sid = "LogGroup" 104 | 105 | effect = "Allow" 106 | 107 | actions = [ 108 | "logs:DescribeLogGroups", 109 | "logs:DescribeLogStreams", 110 | "logs:CreateLogGroup", 111 | ] 112 | 113 | resources = [ 114 | "*" 115 | ] 116 | } 117 | } 118 | 119 | data "aws_iam_policy_document" "sagemaker_permissions" { 120 | statement { 121 | sid = "SageMakerNotebook" 122 | 123 | effect = "Allow" 124 | 125 | actions = [ 126 | "sagemaker:DescribeNotebook*", 127 | "sagemaker:StartNotebookInstance", 128 | "sagemaker:StopNotebookInstance", 129 | "sagemaker:UpdateNotebookInstance", 130 | "sagemaker:CreatePresignedNotebookInstanceUrl" 131 | ] 132 | 133 | resources = [ 134 | aws_sagemaker_notebook_instance.this.arn, 135 | "arn:${var.iam_partition}:logs:${local.aws_region}:${local.aws_account_id}:notebook-instance-lifecycle-config/basic*", 136 | ] 137 | } 138 | } 139 | 140 | data "aws_iam_policy_document" "custom_s3_list_access" { 141 | statement { 142 | sid = "BucketAccess" 143 | 144 | effect = "Allow" 145 | 146 | actions = [ 147 | "s3:ListBucket" 148 | ] 149 | 150 | resources = [ 151 | data.terraform_remote_state.metaflow.outputs.metaflow_s3_bucket_arn 152 | ] 153 | } 154 | } 155 | 156 | data "aws_iam_policy_document" "custom_s3_access" { 157 | statement { 158 | sid = "ObjectAccess" 159 | 160 | effect = "Allow" 161 | 162 | actions = [ 163 | "s3:PutObject", 164 | "s3:GetObject", 165 | "s3:DeleteObject" 166 | ] 167 | 168 | resources = [ 169 | "${data.terraform_remote_state.metaflow.outputs.metaflow_s3_bucket_arn}/*" 170 | ] 171 | } 172 | } 173 | 174 | data "aws_iam_policy_document" "deny_presigned" { 175 | statement { 176 | sid = "DenyPresigned" 177 | 178 | effect = "Deny" 179 | 180 | actions = [ 181 | "s3:*" 182 | ] 183 | 184 | resources = [ 185 | "*" 186 | ] 187 | 188 | condition { 189 | test = "StringNotEquals" 190 | values = [ 191 | "REST-HEADER" 192 | ] 193 | variable = "s3:authType" 194 | } 195 | } 196 | } 197 | 198 | data "aws_iam_policy_document" "s3_kms" { 199 | statement { 200 | effect = "Allow" 201 | 202 | actions = [ 203 | "kms:Decrypt", 204 | "kms:Encrypt", 205 | "kms:GenerateDataKey" 206 | ] 207 | 208 | resources = [ 209 | data.terraform_remote_state.metaflow.outputs.datastore_s3_bucket_kms_key_arn 210 | ] 211 | } 212 | } 213 | 214 | resource "aws_iam_role_policy" "grant_iam_pass_role" { 215 | name = "iam_pass_role" 216 | role = 
aws_iam_role.sagemaker_execution_role.name 217 | policy = data.aws_iam_policy_document.iam_pass_role.json 218 | } 219 | 220 | resource "aws_iam_role_policy" "grant_misc_permissions_role" { 221 | name = "misc_permissions" 222 | role = aws_iam_role.sagemaker_execution_role.name 223 | policy = data.aws_iam_policy_document.misc_permissions.json 224 | } 225 | 226 | resource "aws_iam_role_policy" "grant_logs_roles_policy" { 227 | name = "logs" 228 | role = aws_iam_role.sagemaker_execution_role.name 229 | policy = data.aws_iam_policy_document.logs_roles_policy.json 230 | } 231 | 232 | resource "aws_iam_role_policy" "grant_sagemaker_permissions" { 233 | name = "sagemaker" 234 | role = aws_iam_role.sagemaker_execution_role.name 235 | policy = data.aws_iam_policy_document.sagemaker_permissions.json 236 | } 237 | 238 | resource "aws_iam_role_policy" "grant_custom_s3_list_access" { 239 | name = "s3_list" 240 | role = aws_iam_role.sagemaker_execution_role.name 241 | policy = data.aws_iam_policy_document.custom_s3_list_access.json 242 | } 243 | 244 | resource "aws_iam_role_policy" "grant_custom_s3_access" { 245 | name = "s3" 246 | role = aws_iam_role.sagemaker_execution_role.name 247 | policy = data.aws_iam_policy_document.custom_s3_access.json 248 | } 249 | 250 | resource "aws_iam_role_policy" "grant_deny_presigned" { 251 | name = "deny_presigned" 252 | role = aws_iam_role.sagemaker_execution_role.name 253 | policy = data.aws_iam_policy_document.deny_presigned.json 254 | } 255 | 256 | resource "aws_iam_role_policy" "grant_s3_kms" { 257 | name = "s3_kms" 258 | role = aws_iam_role.sagemaker_execution_role.name 259 | policy = data.aws_iam_policy_document.s3_kms.json 260 | } 261 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | resource_prefix = var.app 3 | resource_suffix = "${var.env}${module.common_vars.workspace_suffix}" 4 | 5 | aws_region = data.aws_region.current.name 6 | aws_account_id = data.aws_caller_identity.current.account_id 7 | standard_tags = module.common_vars.tags 8 | 9 | # Name of Sagemaker IAM role 10 | sagemaker_execution_role_name = "${local.resource_prefix}-sm-execution-role-${local.resource_suffix}-${var.aws_region}" 11 | } 12 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/main.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = var.aws_region 3 | } 4 | 5 | module "common_vars" { 6 | source = "../modules/common" 7 | 8 | app = var.app 9 | env = var.env 10 | } 11 | 12 | data "terraform_remote_state" "infra" { 13 | backend = "local" 14 | 15 | config = { 16 | path = "../infra/terraform.tfstate" 17 | } 18 | } 19 | 20 | data "terraform_remote_state" "metaflow" { 21 | backend = "local" 22 | 23 | config = { 24 | path = "../metaflow/terraform.tfstate" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/outputs.tf: -------------------------------------------------------------------------------- 1 | output "SAGEMAKER_NOTEBOOK_URL" { 2 | value = "https://${aws_sagemaker_notebook_instance.this.name}.notebook.${var.aws_region}.sagemaker.aws/tree" 3 | description = "URL used to access the SageMaker notebook instance" 4 | } 5 | -------------------------------------------------------------------------------- 
/aws/terraform/sagemaker-notebook/sagemaker.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "sagemaker" { 2 | name = "${local.resource_prefix}-sagemaker-security-group-${local.resource_suffix}" 3 | description = "SageMaker notebook security group" 4 | vpc_id = data.terraform_remote_state.infra.outputs.vpc_id 5 | 6 | ingress { 7 | from_port = 8080 8 | to_port = 8080 9 | protocol = "TCP" 10 | cidr_blocks = ["0.0.0.0/0"] 11 | } 12 | 13 | # egress to anywhere 14 | egress { 15 | from_port = 0 16 | to_port = 0 17 | protocol = "-1" 18 | cidr_blocks = ["0.0.0.0/0"] 19 | } 20 | 21 | tags = local.standard_tags 22 | } 23 | 24 | resource "aws_sagemaker_notebook_instance_lifecycle_configuration" "this" { 25 | name = "${local.resource_prefix}-nb-instance-lc-conf-${local.resource_suffix}" 26 | 27 | # recreates what our outputs produce 28 | # this is acceptable repetition as we cannot reference our outputs until the Terraform stack is applied 29 | # Note: We are purposefully providing the value of METAFLOW_SERVICE_INTERNAL_URL to the key METAFLOW_SERVICE_URL; 30 | # this way our Jupyter Notebooks will speak to our internal Load Balancer as if it were our API Gateway. 31 | # This ensures our Jupyter Notebook queries stay within our VPC. 32 | on_start = base64encode( 33 | <<EOF 34 | #!/bin/bash 35 | echo 'export METAFLOW_DATASTORE_SYSROOT_S3=s3://${data.terraform_remote_state.metaflow.outputs.metaflow_s3_bucket_name}/metaflow/' >> /etc/profile.d/jupyter-env.sh 36 | echo 'export METAFLOW_DATATOOLS_S3ROOT=s3://${data.terraform_remote_state.metaflow.outputs.metaflow_s3_bucket_name}/data/' >> /etc/profile.d/jupyter-env.sh 37 | echo 'export METAFLOW_SERVICE_URL=${data.terraform_remote_state.metaflow.outputs.METAFLOW_SERVICE_INTERNAL_URL}' >> /etc/profile.d/jupyter-env.sh 38 | echo 'export AWS_DEFAULT_REGION=${var.aws_region}' >> /etc/profile.d/jupyter-env.sh 39 | echo 'export METAFLOW_DEFAULT_DATASTORE=s3' >> /etc/profile.d/jupyter-env.sh 40 | echo 'export METAFLOW_DEFAULT_METADATA=service' >> /etc/profile.d/jupyter-env.sh 41 | initctl restart jupyter-server --no-wait 42 | EOF 43 | ) 44 | } 45 | 46 | resource "random_pet" "this" { 47 | } 48 | 49 | 50 | resource "aws_sagemaker_notebook_instance" "this" { 51 | # Random Pet name is added to make it easier to deploy changes to this instance without having name conflicts 52 | # names must be unique, so the "Random Pet" helps us here 53 | name = "${local.resource_prefix}-nb-inst-${random_pet.this.id}-${local.resource_suffix}" 54 | 55 | instance_type = var.ec2_instance_type 56 | 57 | role_arn = aws_iam_role.sagemaker_execution_role.arn 58 | 59 | lifecycle_config_name = aws_sagemaker_notebook_instance_lifecycle_configuration.this.name 60 | 61 | subnet_id = data.terraform_remote_state.infra.outputs.subnet1_id 62 | 63 | security_groups = [ 64 | aws_security_group.sagemaker.id 65 | ] 66 | 67 | # The standard tags to apply to every AWS resource. 68 | tags = local.standard_tags 69 | } 70 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/variables.tf: -------------------------------------------------------------------------------- 1 | variable "app" { 2 | default = "sm-notebook" 3 | description = "Name of the application" 4 | } 5 | 6 | variable "aws_region" { 7 | type = string 8 | description = "AWS region we will deploy to." 9 | } 10 | 11 | variable "env" { 12 | type = string 13 | default = "dev" 14 | description = "The environment for this stack to be created in. Used for the tfstate bucket and naming scope of resources."
15 | } 16 | 17 | variable "ec2_instance_type" { 18 | type = string 19 | description = "Amazon EC2 instance type used to stand up SageMaker instance" 20 | default = "ml.t3.medium" 21 | } 22 | 23 | variable "iam_partition" { 24 | type = string 25 | default = "aws" 26 | description = "IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is)" 27 | } 28 | -------------------------------------------------------------------------------- /aws/terraform/sagemaker-notebook/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | } 6 | random = { 7 | source = "hashicorp/random" 8 | } 9 | } 10 | required_version = ">= 0.13" 11 | } 12 | --------------------------------------------------------------------------------
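A minimal deployment sketch for the `sagemaker-notebook` project above (illustrative only; the command sequence and working directory are assumptions, not part of the repository). It expects the `infra` and `metaflow` projects to have been applied first, since `main.tf` reads their local state files, and `example.tfvars` only supplies `env` and `aws_region`, which can be adjusted as needed:

    cd aws/terraform/sagemaker-notebook
    terraform init
    terraform apply -var-file=example.tfvars
    # prints the URL of the provisioned notebook instance
    terraform output SAGEMAKER_NOTEBOOK_URL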