├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── assets
│   └── auto_shutdown_template
│       └── autoshutdown-script.sh
├── diagrams
│   ├── sagemaker_auto_shutdown.png
│   └── sagemaker_domain_vpc_only.png
├── examples
│   ├── main.tf
│   ├── providers.tf
│   ├── variables.tf
│   └── versions.tf
├── locals.tf
├── providers.tf
├── sagemaker.tf
├── submodules
│   ├── iam
│   │   ├── iam.tf
│   │   ├── outputs.tf
│   │   └── variables.tf
│   ├── s3_upload
│   │   ├── outputs.tf
│   │   ├── s3_upload.tf
│   │   └── variables.tf
│   └── vpc
│       ├── outputs.tf
│       ├── variables.tf
│       └── vpc.tf
├── variables.tf
└── versions.tf

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Local .terraform directories
**/.terraform/*

# .tfstate files
*.tfstate
*.tfstate.*

# Crash log files
crash.log
crash.*.log

# Exclude all .tfvars files, which are likely to contain sensitive data, such as
# passwords, private keys, and other secrets. These should not be part of version
# control as they are data points which are potentially sensitive and subject
# to change depending on the environment.
*.tfvars
*.tfvars.json

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
override.tf
override.tf.json
*_override.tf
*_override.tf.json

# Include override files you do wish to add to version control using a negated pattern
# !example_override.tf

# Ignore tfplan files produced by: terraform plan -out=tfplan
# example: *tfplan*

# Ignore CLI configuration files
.terraformrc
terraform.rc
.terraform.lock.hcl

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# Files created by the environment
.DS_Store

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# The JetBrains-specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Drawio
# backup files for diagrams
**/.$*.bkp

*.tar.gz

--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing Guidelines

Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
documentation, we greatly value feedback and contributions from our community.

Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
information to effectively respond to your bug report or contribution.


## Reporting Bugs/Feature Requests

We welcome you to use the GitHub issue tracker to report bugs or suggest features.

When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:

* A reproducible test case or series of steps
* The version of our code being used
* Any modifications you've made relevant to the bug
* Anything unusual about your environment or deployment


## Contributing via Pull Requests
Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:

1. You are working against the latest source on the *main* branch.
2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
3. You open an issue to discuss any significant work - we would hate for your time to be wasted.

To send us a pull request, please:

1. Fork the repository.
2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
3. Ensure local tests pass.
4. Commit to your fork using clear commit messages.
5. Send us a pull request, answering any default questions in the pull request interface.
6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.

GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).


## Finding contributions to work on
Looking at the existing issues is a great way to find something to work on. Our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), so looking at any 'help wanted' issues is a great place to start.


## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.


## Security issue notifications
If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.


## Licensing

See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT No Attribution

Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Feature-Rich Secure SageMaker Domain with Terraform

This solution uses Terraform to create:
- A VPC with subnets, security groups, and the VPC endpoints needed to support VPC Only mode for the SageMaker Domain
- A SageMaker Domain in VPC Only mode with a user profile
- An AWS Key Management Service (KMS) key to encrypt the SageMaker Studio's Amazon Elastic File System (EFS) volume
- A Lifecycle Configuration attached to the SageMaker Domain to automatically shut down idle Studio notebook instances
- A SageMaker Domain execution role and IAM policies to enable SageMaker Studio and Canvas functionalities

The repository is structured as a reusable root module; the `examples` folder shows how it is consumed (see the excerpt below).
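Abridged from `examples/main.tf` (reproduced in full later in this document), with the example's variable indirection replaced by the default values from `examples/variables.tf` for illustration:

```
module "sagemaker_domain" {
  source = "../" # path to this root module

  domain_name             = "sagemaker-domain"
  auth_mode               = "IAM"     # or "SSO"
  app_network_access_type = "VpcOnly" # or "PublicInternetOnly"
  efs_retention_policy    = "Retain"  # or "Delete"
}
```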
## VPC Requirements to Use VPC Only Mode

Creating a SageMaker Domain in VPC Only mode requires a VPC with the following configuration:

1. At least two private subnets, each in a different Availability Zone, to ensure high availability.
2. Subnets with enough IP addresses. We recommend between 2 and 4 IP addresses per user. The total IP address capacity for a Studio domain is the sum of available IP addresses across the subnets provided when the domain is created.
3. One or more security groups with inbound and outbound rules that together allow the following traffic:
   - NFS traffic over TCP on port 2049 between the domain and the Amazon EFS volume.
   - TCP traffic within the security group. This is required for connectivity between the JupyterServer app and the KernelGateway apps. At a minimum, you must allow traffic on ports in the range 8192-65535.
4. A Gateway endpoint for S3. SageMaker Studio needs to access Amazon S3 from your VPC through a Gateway VPC endpoint. After you create the gateway endpoint, you need to add it as a target in your route table for traffic from your VPC to Amazon S3.
5. Interface VPC endpoints (AWS PrivateLink) that allow Studio to access the following services, with the corresponding service names. You must also associate a security group with these endpoints that allows all inbound traffic on port 443:
   - SageMaker API: com.amazonaws.region.sagemaker.api
   - SageMaker runtime: com.amazonaws.region.sagemaker.runtime. This is required to run Studio notebooks and to train and host models.
   - SageMaker Feature Store: com.amazonaws.region.sagemaker.featurestore-runtime
   - To use SageMaker Projects: com.amazonaws.region.servicecatalog

In addition to the above VPC endpoints, to use SageMaker Canvas you also need to create the following interface VPC endpoints:

- Amazon Forecast: com.amazonaws.region.forecast
- Amazon Forecast Query: com.amazonaws.region.forecastquery
- Amazon Rekognition: com.amazonaws.region.rekognition
- Amazon Textract: com.amazonaws.region.textract
- Amazon Comprehend: com.amazonaws.region.comprehend
- AWS Security Token Service (STS): com.amazonaws.region.sts
- Amazon Redshift: com.amazonaws.region.redshift-data
- Amazon Athena: com.amazonaws.region.athena
- AWS Glue: com.amazonaws.region.glue

To view all VPC endpoints for each service you can use with SageMaker Canvas, see https://docs.aws.amazon.com/sagemaker/latest/dg/canvas-vpc.html.
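All of these endpoints are provisioned by this solution's VPC submodule. As an illustration, here is the (abridged) resource from `submodules/vpc/vpc.tf` that creates the four Studio interface endpoints:

```
resource "aws_vpc_endpoint" "interface_endpoints" {
  for_each = toset([
    "com.amazonaws.${data.aws_region.current.name}.sagemaker.api",
    "com.amazonaws.${data.aws_region.current.name}.sagemaker.runtime",
    "com.amazonaws.${data.aws_region.current.name}.sagemaker.featurestore-runtime",
    "com.amazonaws.${data.aws_region.current.name}.servicecatalog"
  ])

  vpc_id              = aws_vpc.vpc.id
  service_name        = each.key
  vpc_endpoint_type   = "Interface"
  subnet_ids          = aws_subnet.private_subnets[*].id
  private_dns_enabled = true

  # vpc_endpoint_sg allows inbound 443 from the VPC CIDR
  security_group_ids = [aws_security_group.vpc_endpoint_sg.id]
}
```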
## SageMaker Domain

### SageMaker Domain in VPC Only Mode

*(Diagram: diagrams/sagemaker_domain_vpc_only.png)*

By launching your SageMaker Domain in your own VPC, you can control the data flow from your SageMaker Studio and Canvas environments. This allows you to restrict internet access, monitor and inspect traffic using standard AWS networking and security capabilities, and connect to other AWS resources through VPC endpoints.

### KMS Encryption for SageMaker Studio's EFS Volume

The first time a user on your team onboards to SageMaker Studio, SageMaker creates an EFS volume for the team. A home directory is created in the volume for each user who onboards to Studio as part of your team. Notebook files and data files are stored in these directories.

You can encrypt your SageMaker Studio's EFS volume with a KMS key so that the data in your home directories is encrypted at rest. This Terraform solution creates a KMS key and uses it to encrypt the SageMaker Studio EFS volume.

### SageMaker Domain Lifecycle Configuration to Auto-shutdown Idle Studio Notebooks

*(Diagram: diagrams/sagemaker_auto_shutdown.png)*

Lifecycle Configurations are shell scripts triggered by Amazon SageMaker Studio lifecycle events, such as starting a new Studio notebook. You can use Lifecycle Configurations to automate customization of your Studio environment.

This Terraform solution creates a SageMaker Lifecycle Configuration to detect and stop idle resources that are incurring costs within Studio, using an auto-shutdown Jupyter extension. Under the hood, the following resources are created/configured to achieve the desired result:

1. Create an S3 bucket and upload the latest version of the auto-shutdown extension, sagemaker_studio_autoshutdown-0.1.5.tar.gz, to it. On Jupyter Server start-up, the auto-shutdown script runs an `aws s3 cp` command to download the extension from this bucket. Please refer to the following GitHub repos for more information on the [auto-shutdown extension](https://github.com/aws-samples/sagemaker-studio-auto-shutdown-extension) and the [auto-shutdown script](https://github.com/aws-samples/sagemaker-studio-lifecycle-config-examples/blob/main/scripts/install-autoshutdown-server-extension/on-jupyter-server-start.sh).
2. Create an [aws_sagemaker_studio_lifecycle_config](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sagemaker_studio_lifecycle_config) resource named "auto_shutdown". This resource base64-encodes autoshutdown-script.sh and creates a Lifecycle Configuration for the SageMaker domain, as shown in the snippet below.
3. In the SageMaker domain's default user settings, specify the Lifecycle Configuration ARN and set it as the default.
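Steps 2 and 3 are implemented in this repo's `sagemaker.tf`. The resource below (quoted from that file) renders the script with `templatefile` so the S3 bucket and object key are injected before the base64 encoding:

```
resource "aws_sagemaker_studio_lifecycle_config" "auto_shutdown" {
  studio_lifecycle_config_name     = "auto-shutdown"
  studio_lifecycle_config_app_type = "JupyterServer"
  studio_lifecycle_config_content = base64encode(templatefile(
    "${path.module}/assets/auto_shutdown_template/autoshutdown-script.sh",
    {
      tar_file_bucket = module.auto_shutdown_s3_upload.tar_file_bucket
      tar_file_id     = module.auto_shutdown_s3_upload.tar_file_id
    }
  ))
}
```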
### SageMaker Execution Role IAM Permissions

As a managed service, SageMaker performs operations on your behalf on the AWS hardware that is managed by SageMaker. SageMaker can perform only operations that the user permits.

A SageMaker user grants these permissions through an IAM role (referred to as an execution role). By default, SageMaker lets you create this execution role when you create a SageMaker Studio domain, and you can restrict access per user by assigning a different role to each SageMaker user profile. This Terraform solution attaches the following IAM policies to the SageMaker execution role (see the sketch after this list):

- The SageMaker-managed AmazonSageMakerFullAccess policy. This policy grants the execution role full access to use SageMaker Studio.
- A custom managed IAM policy granting access to the KMS key used to encrypt the SageMaker Studio's EFS volume.
- The SageMaker-managed AmazonSageMakerCanvasFullAccess and AmazonSageMakerCanvasAIServicesAccess policies. These policies grant the execution role full access to use SageMaker Canvas.
- To enable time series analysis in SageMaker Canvas, the role's IAM trust policy additionally allows Amazon Forecast to assume the role.
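The role is defined in `submodules/iam/iam.tf`; abridged, it looks like this:

```
resource "aws_iam_role" "sagemaker_domain_default_execution_role" {
  name = "sagemaker_domain_exec_role_default"

  # Trust policy allows both sagemaker.amazonaws.com and forecast.amazonaws.com
  # to assume the role (the latter enables Canvas time series forecasting).
  assume_role_policy = data.aws_iam_policy_document.sagemaker_domain_assume_role_policy.json

  managed_policy_arns = [
    data.aws_iam_policy.AmazonSageMakerFullAccess.arn,
    data.aws_iam_policy.AmazonSageMakerCanvasFullAccess.arn,
    data.aws_iam_policy.AmazonSageMakerCanvasAIServicesAccess.arn,
    aws_iam_policy.sagemaker_kms.arn, # kms:Decrypt, kms:GenerateDataKey, kms:CreateGrant on the EFS key
  ]
}
```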
## Deployment

### Prerequisites
- An AWS account.
- An IAM user with administrative access.
- AWS CLI. Check [this guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) for up-to-date instructions to install the AWS CLI.
- Terraform CLI. Check [this guide](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) for up-to-date instructions to install Terraform for Amazon Linux.
- You must establish how the AWS CLI authenticates with AWS when you deploy this solution. To configure credentials for programmatic access for the AWS CLI, choose one of the options from [this guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-authentication.html).

### Clone the Code Repo and Download the Auto-shutdown Extension
- Clone the repo and navigate to the sagemaker-domain-vpconly-canvas-with-terraform folder:
```
git clone https://github.com/aws-samples/sagemaker-domain-vpconly-canvas-with-terraform.git

cd sagemaker-domain-vpconly-canvas-with-terraform
```
- Download the auto-shutdown extension and place it in the assets/auto_shutdown_template folder:
```
wget https://github.com/aws-samples/sagemaker-studio-auto-shutdown-extension/raw/main/sagemaker_studio_autoshutdown-0.1.5.tar.gz -P assets/auto_shutdown_template
```
- Your file structure should look like:
```
.
├── LICENSE
├── README.md
├── assets
│   └── auto_shutdown_template
│       ├── autoshutdown-script.sh
│       └── sagemaker_studio_autoshutdown-0.1.5.tar.gz
...
```

### Deployment Steps
In a terminal, run the following Terraform commands:

```
terraform init
```
You should see a success message like:
```
Terraform has been successfully initialized!

You may now begin working with Terraform. Try running "terraform plan" to see
any changes that are required for your infrastructure. All Terraform commands
should now work.

If you ever set or change modules or backend configuration for Terraform,
rerun this command to reinitialize your working directory. If you forget, other
commands will detect it and remind you to do so if necessary.
```
Now you can run:
```
terraform plan
```
After you are satisfied with the resources the plan proposes to create, run:
```
terraform apply
```
Enter "yes" when prompted to confirm the deployment.

If successfully deployed, you should see an output that looks like:
```
Apply complete! Resources: X added, 0 changed, 0 destroyed.
```

## Cleaning up
Run the following command to clean up your resources:
```
terraform destroy
```

*Tip: If you set the EFS retention policy to "Retain" (the default), you will run into issues during `terraform destroy`, because Terraform tries to delete the subnets and VPC while the EFS volume, and the security groups SageMaker created for it, still exist. To fix this, first delete the EFS volume manually, and then delete the subnets and VPC manually in the AWS console.*

--------------------------------------------------------------------------------
/assets/auto_shutdown_template/autoshutdown-script.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This script installs the idle-notebook auto-checker server extension to SageMaker Studio.
# The original extension also has a lab extension part where users can set the idle timeout via a Jupyter Lab widget.
# This version of the script installs only the server side of the extension. The idle timeout
# can be set via a command-line script, which is also created by this script and placed into the
# user's home folder.
#
# Installing the server-side extension does not require an Internet connection (all the dependencies
# are stored in the install tarball), so it works in VPCOnly mode.

set -eux

# timeout in minutes
export TIMEOUT_IN_MINS=120

# Should already be running in the user home directory, but just to check:
cd /home/sagemaker-user

# By working in a directory starting with ".", we won't clutter up users' Jupyter file tree views
mkdir -p .auto-shutdown

# Create the command-line script for setting the idle timeout
cat > .auto-shutdown/set-time-interval.sh << EOF
#!/opt/conda/bin/python
import json
import requests
TIMEOUT=$${TIMEOUT_IN_MINS}
session = requests.Session()
# Get the xsrf token first from the Jupyter Server
response = session.get("http://localhost:8888/jupyter/default/tree")
# Call the idle_checker extension's interface to set the timeout value
response = session.post("http://localhost:8888/jupyter/default/sagemaker-studio-autoshutdown/idle_checker",
                        json={"idle_time": TIMEOUT, "keep_terminals": False},
                        params={"_xsrf": response.headers['Set-Cookie'].split(";")[0].split("=")[1]})
if response.status_code == 200:
    print("Succeeded, idle timeout set to {} minutes".format(TIMEOUT))
else:
    print("Error!")
    print(response.status_code)
EOF
chmod +x .auto-shutdown/set-time-interval.sh

# "wget" is not part of the base Jupyter Server image; you would need to install it first
# if you wanted to download the tarball directly from GitHub:
# sudo yum install -y wget
# wget -O .auto-shutdown/extension.tar.gz https://github.com/aws-samples/sagemaker-studio-auto-shutdown-extension/raw/main/sagemaker_studio_autoshutdown-0.1.5.tar.gz

# Instead, this solution serves the tarball from an S3 bucket, so "wget" is not needed:
aws s3 cp s3://${tar_file_bucket}/${tar_file_id} .auto-shutdown/extension.tar.gz

# Install the extension
cd .auto-shutdown
tar xzf extension.tar.gz
cd sagemaker_studio_autoshutdown-0.1.5

# Activate the studio environment just for installing the extension
export AWS_SAGEMAKER_JUPYTERSERVER_IMAGE="$${AWS_SAGEMAKER_JUPYTERSERVER_IMAGE:-'jupyter-server'}"
if [ "$AWS_SAGEMAKER_JUPYTERSERVER_IMAGE" = "jupyter-server-3" ] ; then
    eval "$(conda shell.bash hook)"
    conda activate studio
fi;
pip install --no-dependencies --no-build-isolation -e .
jupyter serverextension enable --py sagemaker_studio_autoshutdown
if [ "$AWS_SAGEMAKER_JUPYTERSERVER_IMAGE" = "jupyter-server-3" ] ; then
    conda deactivate
fi;

# Restart the Jupyter server
nohup supervisorctl -c /etc/supervisor/conf.d/supervisord.conf restart jupyterlabserver

# Wait 30 seconds to make sure the Jupyter Server is up and running
sleep 30

# Call the script to set the idle timeout and activate the extension
/home/sagemaker-user/.auto-shutdown/set-time-interval.sh
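A note on changing the timeout after deployment: the generated `set-time-interval.sh` has the timeout value baked in at render time (`TIMEOUT=120` with the defaults above). As a hypothetical illustration, not part of this repo, a user could edit and re-run it from a Studio system terminal:

```
# Hypothetical example: lower the idle timeout to 60 minutes for the current user
sed -i 's/^TIMEOUT=.*/TIMEOUT=60/' ~/.auto-shutdown/set-time-interval.sh
~/.auto-shutdown/set-time-interval.sh
```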
--------------------------------------------------------------------------------
/diagrams/sagemaker_auto_shutdown.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/sagemaker-domain-vpconly-canvas-with-terraform/03b098b22601fcec0c3ec68c26a57e37783db8b7/diagrams/sagemaker_auto_shutdown.png

--------------------------------------------------------------------------------
/diagrams/sagemaker_domain_vpc_only.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/sagemaker-domain-vpconly-canvas-with-terraform/03b098b22601fcec0c3ec68c26a57e37783db8b7/diagrams/sagemaker_domain_vpc_only.png

--------------------------------------------------------------------------------
/examples/main.tf:
--------------------------------------------------------------------------------
module "sagemaker_domain" {
  source                  = "../"
  domain_name             = var.domain_name
  auth_mode               = var.auth_mode
  app_network_access_type = var.app_network_access_type
  efs_retention_policy    = var.efs_retention_policy
}

--------------------------------------------------------------------------------
/examples/providers.tf:
--------------------------------------------------------------------------------
provider "aws" {
  region = var.aws_region
}

--------------------------------------------------------------------------------
/examples/variables.tf:
--------------------------------------------------------------------------------
variable "domain_name" {
  description = "SageMaker Domain name"
  type        = string
  default     = "sagemaker-domain"
}

variable "auth_mode" {
  description = "The mode of authentication that members use to access the domain. Valid values are IAM and SSO."
  type        = string
  default     = "IAM"
}

variable "app_network_access_type" {
  description = "Specifies the VPC used for non-EFS traffic. Valid values are PublicInternetOnly and VpcOnly."
  type        = string
  default     = "VpcOnly"
}

variable "efs_retention_policy" {
  description = "The retention policy for data stored on an EFS volume. Valid values are Retain and Delete."
  type        = string
  default     = "Retain"
}

variable "aws_region" {
  description = "AWS Region."
  type        = string
  default     = "us-east-1"
}
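Since `*.tfvars` files are gitignored (they may contain sensitive values), none ships with this example. A minimal, hypothetical `terraform.tfvars` for overriding the defaults above might look like the following; note that the root module's locals.tf pins the VPC Availability Zones to us-east-1, so changing `aws_region` also requires editing locals.tf:

```
# Hypothetical terraform.tfvars -- variable names are defined in examples/variables.tf
domain_name          = "my-studio-domain"
efs_retention_policy = "Delete" # lets terraform destroy remove the EFS volume
```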
--------------------------------------------------------------------------------
/examples/versions.tf:
--------------------------------------------------------------------------------
terraform {
  required_version = ">=1.4.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.10.0"
    }
  }
}

--------------------------------------------------------------------------------
/locals.tf:
--------------------------------------------------------------------------------
locals {
  vpc = {
    cidr_block           = "10.0.0.0/23"
    private_subnet_cidrs = ["10.0.1.0/25", "10.0.1.128/25"]
    availability_zones   = ["us-east-1a", "us-east-1b"]
  }

  sagemaker = {
    jupyter_image_tag = "jupyter-server-3"
    image_arn_prefixes = {
      us-east-1      = "arn:aws:sagemaker:us-east-1:081325390199:image"
      us-east-2      = "arn:aws:sagemaker:us-east-2:429704687514:image"
      us-west-1      = "arn:aws:sagemaker:us-west-1:742091327244:image"
      us-west-2      = "arn:aws:sagemaker:us-west-2:236514542706:image"
      af-south-1     = "arn:aws:sagemaker:af-south-1:559312083959:image"
      ap-east-1      = "arn:aws:sagemaker:ap-east-1:493642496378:image"
      ap-south-1     = "arn:aws:sagemaker:ap-south-1:394103062818:image"
      ap-northeast-2 = "arn:aws:sagemaker:ap-northeast-2:806072073708:image"
      ap-southeast-1 = "arn:aws:sagemaker:ap-southeast-1:492261229750:image"
      ap-southeast-2 = "arn:aws:sagemaker:ap-southeast-2:452832661640:image"
      ap-northeast-1 = "arn:aws:sagemaker:ap-northeast-1:102112518831:image"
      ca-central-1   = "arn:aws:sagemaker:ca-central-1:310906938811:image"
      eu-central-1   = "arn:aws:sagemaker:eu-central-1:936697816551:image"
      eu-west-1      = "arn:aws:sagemaker:eu-west-1:470317259841:image"
      eu-west-2      = "arn:aws:sagemaker:eu-west-2:712779665605:image"
      eu-west-3      = "arn:aws:sagemaker:eu-west-3:615547856133:image"
      eu-north-1     = "arn:aws:sagemaker:eu-north-1:243637512696:image"
      eu-south-1     = "arn:aws:sagemaker:eu-south-1:592751261982:image"
      sa-east-1      = "arn:aws:sagemaker:sa-east-1:782484402741:image"
      cn-north-1     = "arn:aws-cn:sagemaker:cn-north-1:390048526115:image"
      cn-northwest-1 = "arn:aws-cn:sagemaker:cn-northwest-1:390780980154:image"
    }
  }

  # Fall back to the us-east-1 prefix (rather than the bare string "us-east-1",
  # which would not produce a valid ARN) when the current region is not in the map.
  sagemaker_image_arn_prefix = lookup(local.sagemaker.image_arn_prefixes, data.aws_region.current.name, local.sagemaker.image_arn_prefixes["us-east-1"])

  sagemaker_image_arn = "${local.sagemaker_image_arn_prefix}/${local.sagemaker.jupyter_image_tag}"
}
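Worked example: in us-east-1, the two locals above resolve to `arn:aws:sagemaker:us-east-1:081325390199:image/jupyter-server-3`, the regional ARN of the JupyterServer 3 Studio image, which sagemaker.tf then uses as the default resource spec for the domain's JupyterServer app.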
Resource = "*" 23 | Sid = "Enable IAM User Permissions" 24 | }, 25 | ] 26 | Version = "2012-10-17" 27 | }) 28 | } 29 | 30 | module "sagemaker_domain_execution_role" { 31 | source = "./submodules/iam" 32 | kms_arn = aws_kms_key.sagemaker_efs_kms_key.arn 33 | } 34 | 35 | module "sagemaker_domain_vpc" { 36 | source = "./submodules/vpc" 37 | cidr_block = local.vpc.cidr_block 38 | private_subnet_cidrs = local.vpc.private_subnet_cidrs 39 | azs = local.vpc.availability_zones 40 | } 41 | 42 | module "auto_shutdown_s3_upload" { 43 | source = "./submodules/s3_upload" 44 | kms_arn = aws_kms_key.sagemaker_efs_kms_key.arn 45 | } 46 | 47 | resource "aws_sagemaker_studio_lifecycle_config" "auto_shutdown" { 48 | studio_lifecycle_config_name = "auto-shutdown" 49 | studio_lifecycle_config_app_type = "JupyterServer" 50 | studio_lifecycle_config_content = base64encode(templatefile("${path.module}/assets/auto_shutdown_template/autoshutdown-script.sh", { tar_file_bucket = module.auto_shutdown_s3_upload.tar_file_bucket, tar_file_id = module.auto_shutdown_s3_upload.tar_file_id })) 51 | } 52 | 53 | resource "aws_sagemaker_domain" "sagemaker_domain" { 54 | domain_name = var.domain_name 55 | auth_mode = var.auth_mode 56 | vpc_id = module.sagemaker_domain_vpc.vpc_id 57 | subnet_ids = module.sagemaker_domain_vpc.subnet_ids 58 | 59 | default_user_settings { 60 | execution_role = module.sagemaker_domain_execution_role.default_execution_role 61 | jupyter_server_app_settings { 62 | default_resource_spec { 63 | lifecycle_config_arn = aws_sagemaker_studio_lifecycle_config.auto_shutdown.arn 64 | sagemaker_image_arn = local.sagemaker_image_arn 65 | } 66 | lifecycle_config_arns = [aws_sagemaker_studio_lifecycle_config.auto_shutdown.arn] 67 | } 68 | 69 | canvas_app_settings { 70 | time_series_forecasting_settings { 71 | status = "ENABLED" 72 | } 73 | } 74 | } 75 | 76 | domain_settings { 77 | security_group_ids = [module.sagemaker_domain_vpc.security_group_id] 78 | } 79 | 80 | kms_key_id = aws_kms_key.sagemaker_efs_kms_key.arn 81 | 82 | app_network_access_type = var.app_network_access_type 83 | 84 | retention_policy { 85 | home_efs_file_system = var.efs_retention_policy 86 | } 87 | } 88 | 89 | resource "aws_sagemaker_user_profile" "default_user" { 90 | domain_id = aws_sagemaker_domain.sagemaker_domain.id 91 | user_profile_name = "defaultuser" 92 | 93 | user_settings { 94 | execution_role = module.sagemaker_domain_execution_role.default_execution_role 95 | security_groups = [module.sagemaker_domain_vpc.security_group_id] 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /submodules/iam/iam.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy" "AmazonSageMakerFullAccess" { 2 | name = "AmazonSageMakerFullAccess" 3 | } 4 | 5 | data "aws_iam_policy" "AmazonSageMakerCanvasFullAccess" { 6 | name = "AmazonSageMakerCanvasFullAccess" 7 | } 8 | 9 | data "aws_iam_policy" "AmazonSageMakerCanvasAIServicesAccess" { 10 | name = "AmazonSageMakerCanvasAIServicesAccess" 11 | } 12 | 13 | data "aws_iam_policy_document" "sagemaker_domain_assume_role_policy" { 14 | statement { 15 | actions = ["sts:AssumeRole"] 16 | 17 | principals { 18 | type = "Service" 19 | identifiers = ["sagemaker.amazonaws.com", "forecast.amazonaws.com"] 20 | } 21 | } 22 | } 23 | 24 | resource "aws_iam_policy" "sagemaker_kms" { 25 | name = "sagemaker_kms_policy" 26 | path = "/" 27 | description = "KMS policy for SageMaker" 28 | policy = jsonencode({ 29 | Version = 
"2012-10-17" 30 | Statement = [ 31 | { 32 | Action = [ 33 | "kms:Decrypt", 34 | "kms:GenerateDataKey", 35 | "kms:CreateGrant" 36 | ] 37 | Effect = "Allow" 38 | Resource = [ 39 | var.kms_arn 40 | ] 41 | } 42 | ] 43 | }) 44 | } 45 | 46 | resource "aws_iam_role" "sagemaker_domain_default_execution_role" { 47 | name = "sagemaker_domain_exec_role_default" 48 | path = "/" 49 | assume_role_policy = data.aws_iam_policy_document.sagemaker_domain_assume_role_policy.json 50 | 51 | managed_policy_arns = [ 52 | data.aws_iam_policy.AmazonSageMakerFullAccess.arn, 53 | data.aws_iam_policy.AmazonSageMakerCanvasFullAccess.arn, 54 | data.aws_iam_policy.AmazonSageMakerCanvasAIServicesAccess.arn, 55 | aws_iam_policy.sagemaker_kms.arn 56 | ] 57 | } 58 | -------------------------------------------------------------------------------- /submodules/iam/outputs.tf: -------------------------------------------------------------------------------- 1 | output "default_execution_role" { 2 | value = aws_iam_role.sagemaker_domain_default_execution_role.arn 3 | } 4 | -------------------------------------------------------------------------------- /submodules/iam/variables.tf: -------------------------------------------------------------------------------- 1 | variable "kms_arn" { 2 | description = "kms key to encrypt EFS" 3 | type = string 4 | } 5 | -------------------------------------------------------------------------------- /submodules/s3_upload/outputs.tf: -------------------------------------------------------------------------------- 1 | output "tar_file_id" { 2 | value = aws_s3_object.autoshutdown_tar_upload.id 3 | } 4 | 5 | output "tar_file_bucket" { 6 | value = aws_s3_bucket.auto_shutdown_bucket.id 7 | } -------------------------------------------------------------------------------- /submodules/s3_upload/s3_upload.tf: -------------------------------------------------------------------------------- 1 | data "aws_caller_identity" "current" {} 2 | 3 | data "aws_region" "current" {} 4 | 5 | resource "aws_s3_bucket" "auto_shutdown_bucket" { 6 | bucket = "sagemaker-auto-shutdown-${data.aws_region.current.name}-${data.aws_caller_identity.current.account_id}" 7 | } 8 | 9 | resource "aws_s3_bucket_server_side_encryption_configuration" "auto_shutdown_bucket" { 10 | bucket = aws_s3_bucket.auto_shutdown_bucket.bucket 11 | rule { 12 | apply_server_side_encryption_by_default { 13 | kms_master_key_id = var.kms_arn 14 | sse_algorithm = "aws:kms" 15 | } 16 | } 17 | } 18 | 19 | resource "aws_s3_bucket_public_access_block" "block_public_auto_shutdown" { 20 | bucket = aws_s3_bucket.auto_shutdown_bucket.id 21 | block_public_acls = true 22 | block_public_policy = true 23 | ignore_public_acls = true 24 | restrict_public_buckets = true 25 | } 26 | 27 | resource "aws_s3_bucket_lifecycle_configuration" "auto_shutdown_bucket_life_cycle_config" { 28 | bucket = aws_s3_bucket.auto_shutdown_bucket.id 29 | 30 | rule { 31 | id = "life-cycle-configuration-rule" 32 | abort_incomplete_multipart_upload { 33 | days_after_initiation = 1 34 | } 35 | status = "Enabled" 36 | } 37 | } 38 | 39 | resource "aws_s3_bucket_logging" "auto_shutdown_bucket_access_log" { 40 | bucket = aws_s3_bucket.auto_shutdown_bucket.id 41 | 42 | target_bucket = aws_s3_bucket.auto_shutdown_bucket.id 43 | target_prefix = "log/" 44 | } 45 | 46 | resource "aws_s3_bucket_versioning" "auto_shutdown_bucket_versioning" { 47 | bucket = aws_s3_bucket.auto_shutdown_bucket.id 48 | versioning_configuration { 49 | status = "Enabled" 50 | } 51 | } 52 | 53 | resource "aws_s3_object" 
"autoshutdown_tar_upload" { 54 | bucket = aws_s3_bucket.auto_shutdown_bucket.id 55 | key = "sagemaker_studio_autoshutdown-0.1.5.tar.gz" 56 | source = "${path.module}/../../assets/auto_shutdown_template/sagemaker_studio_autoshutdown-0.1.5.tar.gz" 57 | kms_key_id = var.kms_arn 58 | } 59 | 60 | -------------------------------------------------------------------------------- /submodules/s3_upload/variables.tf: -------------------------------------------------------------------------------- 1 | variable "kms_arn" { 2 | description = "kms key to encrypt EFS" 3 | type = string 4 | } -------------------------------------------------------------------------------- /submodules/vpc/outputs.tf: -------------------------------------------------------------------------------- 1 | output "vpc_id" { 2 | value = aws_vpc.vpc.id 3 | } 4 | 5 | output "subnet_ids" { 6 | value = aws_subnet.private_subnets[*].id 7 | } 8 | 9 | output "security_group_id" { 10 | value = aws_security_group.sagemaker_sg.id 11 | } -------------------------------------------------------------------------------- /submodules/vpc/variables.tf: -------------------------------------------------------------------------------- 1 | variable "cidr_block" { 2 | type = string 3 | description = "CIDR block for SageMaker VPC" 4 | default = "10.0.0.0/23" 5 | } 6 | 7 | variable "private_subnet_cidrs" { 8 | type = list(string) 9 | description = "Private Subnet CIDR values" 10 | default = ["10.0.1.0/25", "10.0.1.128/25"] 11 | } 12 | 13 | variable "azs" { 14 | type = list(string) 15 | description = "Availability Zones" 16 | default = ["us-east-1a", "us-east-1b"] 17 | } 18 | -------------------------------------------------------------------------------- /submodules/vpc/vpc.tf: -------------------------------------------------------------------------------- 1 | data "aws_caller_identity" "current" {} 2 | 3 | data "aws_region" "current" {} 4 | 5 | resource "aws_vpc" "vpc" { 6 | cidr_block = var.cidr_block 7 | 8 | enable_dns_support = true 9 | enable_dns_hostnames = true 10 | 11 | tags = { 12 | Name = "SageMaker VPC" 13 | } 14 | } 15 | 16 | resource "aws_default_security_group" "default" { 17 | vpc_id = aws_vpc.vpc.id 18 | } 19 | 20 | resource "aws_flow_log" "vpc_flow_log" { 21 | iam_role_arn = aws_iam_role.vpc_flow_log_role.arn 22 | log_destination = aws_cloudwatch_log_group.vpc_flow_log_group.arn 23 | traffic_type = "ALL" 24 | vpc_id = aws_vpc.vpc.id 25 | } 26 | 27 | resource "aws_cloudwatch_log_group" "vpc_flow_log_group" { 28 | name = "vpc_flow_log_group" 29 | } 30 | 31 | data "aws_iam_policy_document" "assume_role" { 32 | statement { 33 | effect = "Allow" 34 | 35 | principals { 36 | type = "Service" 37 | identifiers = ["vpc-flow-logs.amazonaws.com"] 38 | } 39 | 40 | actions = ["sts:AssumeRole"] 41 | } 42 | } 43 | 44 | resource "aws_iam_role" "vpc_flow_log_role" { 45 | name = "vpc_flow_log_role" 46 | assume_role_policy = data.aws_iam_policy_document.assume_role.json 47 | } 48 | 49 | data "aws_iam_policy_document" "log_policy" { 50 | statement { 51 | effect = "Allow" 52 | 53 | actions = [ 54 | "logs:CreateLogGroup", 55 | "logs:CreateLogStream", 56 | "logs:PutLogEvents", 57 | "logs:DescribeLogGroups", 58 | "logs:DescribeLogStreams", 59 | ] 60 | 61 | resources = ["*"] 62 | } 63 | } 64 | 65 | resource "aws_iam_role_policy" "log_policy" { 66 | name = "log_policy" 67 | role = aws_iam_role.vpc_flow_log_role.id 68 | policy = data.aws_iam_policy_document.log_policy.json 69 | } 70 | 71 | resource "aws_subnet" "private_subnets" { 72 | count = 
--------------------------------------------------------------------------------
/submodules/vpc/vpc.tf:
--------------------------------------------------------------------------------
data "aws_caller_identity" "current" {}

data "aws_region" "current" {}

resource "aws_vpc" "vpc" {
  cidr_block = var.cidr_block

  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name = "SageMaker VPC"
  }
}

# Adopt the VPC's default security group with no rules, so it denies all traffic
resource "aws_default_security_group" "default" {
  vpc_id = aws_vpc.vpc.id
}

resource "aws_flow_log" "vpc_flow_log" {
  iam_role_arn    = aws_iam_role.vpc_flow_log_role.arn
  log_destination = aws_cloudwatch_log_group.vpc_flow_log_group.arn
  traffic_type    = "ALL"
  vpc_id          = aws_vpc.vpc.id
}

resource "aws_cloudwatch_log_group" "vpc_flow_log_group" {
  name = "vpc_flow_log_group"
}

data "aws_iam_policy_document" "assume_role" {
  statement {
    effect = "Allow"

    principals {
      type        = "Service"
      identifiers = ["vpc-flow-logs.amazonaws.com"]
    }

    actions = ["sts:AssumeRole"]
  }
}

resource "aws_iam_role" "vpc_flow_log_role" {
  name               = "vpc_flow_log_role"
  assume_role_policy = data.aws_iam_policy_document.assume_role.json
}

data "aws_iam_policy_document" "log_policy" {
  statement {
    effect = "Allow"

    actions = [
      "logs:CreateLogGroup",
      "logs:CreateLogStream",
      "logs:PutLogEvents",
      "logs:DescribeLogGroups",
      "logs:DescribeLogStreams",
    ]

    resources = ["*"]
  }
}

resource "aws_iam_role_policy" "log_policy" {
  name   = "log_policy"
  role   = aws_iam_role.vpc_flow_log_role.id
  policy = data.aws_iam_policy_document.log_policy.json
}

resource "aws_subnet" "private_subnets" {
  count             = length(var.private_subnet_cidrs)
  vpc_id            = aws_vpc.vpc.id
  cidr_block        = element(var.private_subnet_cidrs, count.index)
  availability_zone = element(var.azs, count.index)
  tags = {
    Name = "SageMaker Private Subnet ${count.index + 1}"
  }
}

resource "aws_route_table" "private_subnets_rt" {
  vpc_id = aws_vpc.vpc.id

  tags = {
    Name = "SageMaker Private Subnet Route Table"
  }
}

resource "aws_route_table_association" "private_rt_associations" {
  count          = length(var.private_subnet_cidrs)
  subnet_id      = element(aws_subnet.private_subnets[*].id, count.index)
  route_table_id = aws_route_table.private_subnets_rt.id
}

resource "aws_security_group" "sagemaker_sg" {
  name        = "sagemaker_sg"
  description = "Allow certain NFS and TCP inbound traffic"
  vpc_id      = aws_vpc.vpc.id

  ingress {
    description = "NFS traffic over TCP on port 2049 between the domain and EFS volume"
    from_port   = 2049
    to_port     = 2049
    protocol    = "tcp"
    self        = true
  }

  ingress {
    description = "TCP traffic between JupyterServer app and the KernelGateway apps"
    from_port   = 8192
    to_port     = 65535
    protocol    = "tcp"
    self        = true
  }

  egress {
    description      = "Allow all outbound traffic"
    from_port        = 0
    to_port          = 0
    protocol         = "-1"
    cidr_blocks      = ["0.0.0.0/0"]
    ipv6_cidr_blocks = ["::/0"]
  }

  tags = {
    Name = "SageMaker sg"
  }
}

resource "aws_security_group" "vpc_endpoint_sg" {
  name        = "vpc_endpoint_sg"
  description = "Allow incoming connections on port 443 from VPC"
  vpc_id      = aws_vpc.vpc.id

  ingress {
    description = "Allow incoming connections on port 443 from VPC"
    from_port   = 443
    to_port     = 443
    protocol    = "tcp"
    cidr_blocks = [aws_vpc.vpc.cidr_block]
  }

  egress {
    description      = "Allow all outbound traffic"
    from_port        = 0
    to_port          = 0
    protocol         = "-1"
    cidr_blocks      = ["0.0.0.0/0"]
    ipv6_cidr_blocks = ["::/0"]
  }

  tags = {
    Name = "VPC endpoint sg"
  }
}

resource "aws_vpc_endpoint" "interface_endpoints" {
  for_each = toset([
    "com.amazonaws.${data.aws_region.current.name}.sagemaker.api",
    "com.amazonaws.${data.aws_region.current.name}.sagemaker.runtime",
    "com.amazonaws.${data.aws_region.current.name}.sagemaker.featurestore-runtime",
    "com.amazonaws.${data.aws_region.current.name}.servicecatalog"
  ])

  vpc_id              = aws_vpc.vpc.id
  service_name        = each.key
  vpc_endpoint_type   = "Interface"
  subnet_ids          = aws_subnet.private_subnets[*].id
  private_dns_enabled = true

  security_group_ids = [
    aws_security_group.vpc_endpoint_sg.id
  ]
}

resource "aws_vpc_endpoint" "s3" {
  vpc_id            = aws_vpc.vpc.id
  service_name      = "com.amazonaws.${data.aws_region.current.name}.s3"
  vpc_endpoint_type = "Gateway"
}

resource "aws_vpc_endpoint_route_table_association" "s3_vpce_route_table_association" {
  route_table_id  = aws_route_table.private_subnets_rt.id
  vpc_endpoint_id = aws_vpc_endpoint.s3.id
}
"com.amazonaws.${data.aws_region.current.name}.forecastquery", 192 | "com.amazonaws.${data.aws_region.current.name}.rekognition", 193 | "com.amazonaws.${data.aws_region.current.name}.textract", 194 | "com.amazonaws.${data.aws_region.current.name}.comprehend", 195 | "com.amazonaws.${data.aws_region.current.name}.sts", 196 | "com.amazonaws.${data.aws_region.current.name}.redshift-data", 197 | "com.amazonaws.${data.aws_region.current.name}.athena", 198 | "com.amazonaws.${data.aws_region.current.name}.glue" 199 | ]) 200 | 201 | vpc_id = aws_vpc.vpc.id 202 | service_name = each.key 203 | vpc_endpoint_type = "Interface" 204 | subnet_ids = aws_subnet.private_subnets[*].id 205 | private_dns_enabled = true 206 | 207 | security_group_ids = [ 208 | aws_security_group.vpc_endpoint_sg.id 209 | ] 210 | } 211 | -------------------------------------------------------------------------------- /variables.tf: -------------------------------------------------------------------------------- 1 | variable "domain_name" { 2 | description = "Sagemaker Domain Name" 3 | type = string 4 | default = "sagemaker-domain" 5 | } 6 | 7 | variable "auth_mode" { 8 | description = "The mode of authentication that members use to access the domain. Valid values are IAM and SSO" 9 | type = string 10 | default = "IAM" 11 | } 12 | 13 | variable "app_network_access_type" { 14 | description = "Specifies the VPC used for non-EFS traffic. Valid values are PublicInternetOnly and VpcOnly" 15 | type = string 16 | default = "VpcOnly" 17 | } 18 | 19 | variable "efs_retention_policy" { 20 | description = "The retention policy for data stored on an EFS volume. Valid values are Retain or Delete." 21 | type = string 22 | default = "Retain" 23 | } 24 | 25 | variable "aws_region" { 26 | description = "AWS Region." 27 | type = string 28 | default = "us-east-1" 29 | } 30 | -------------------------------------------------------------------------------- /versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">=1.4.0" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 5.10.0" 8 | } 9 | } 10 | } 11 | --------------------------------------------------------------------------------