├── .gitignore
├── .npmignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── cdk
│   ├── .gitignore
│   ├── .npmignore
│   └── nginxAuthentication
│       ├── README.md
│       ├── bin
│       │   └── app.ts
│       ├── cdk.json
│       ├── deploy.sh
│       ├── destroy.sh
│       ├── diff.sh
│       ├── lib
│       │   ├── http-gateway-stack.ts
│       │   ├── mlflow-vpc-stack.ts
│       │   └── sagemaker-studio-user-stack.ts
│       ├── package-lock.json
│       ├── package.json
│       ├── resize-cloud9.sh
│       └── tsconfig.json
├── images
│   ├── Architecture.png
│   ├── HttpApiGatewayURL.png
│   ├── HttpApiStack.png
│   ├── MlflowVpclinkStack.png
│   ├── SageMakerNotebookInstance.png
│   ├── clone-repo-studio-ui.png
│   ├── launch-sm-studio.png
│   ├── sm-studio-user.png
│   └── trialcomponent-output-artifacts-mlflow.png
├── lab
│   └── nginxBasicAuth
│       ├── sagemaker_studio_and_mlflow.ipynb
│       └── source_dir
│           ├── requirements.txt
│           ├── setup.py
│           └── train.py
├── resize-cloud9.sh
└── src
    ├── mlflow
    │   ├── .dockerignore
    │   └── Dockerfile
    └── nginx
        └── basic_auth
            ├── Dockerfile
            ├── nginx.conf
            └── script.sh
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | !jest.config.js
3 | *.d.ts
4 | **/node_modules/
5 | **/.vscode/
6 |
7 | # CDK asset staging directory
8 | .cdk.staging
9 | cdk.out
10 |
11 | # Parcel default cache directory
12 | .parcel-cache
13 |
14 | # Jupyter Notebook
15 | .ipynb_checkpoints
16 |
17 | # Environments
18 | .env
19 | .venv
20 | env/
21 | venv/
22 |
23 | **/.DS_Store
24 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | *.ts
2 | !*.d.ts
3 |
4 | # CDK asset staging directory
5 | .cdk.staging
6 | cdk.out
7 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MLflow (Open Machine Learning Platform) on AWS
2 | 
3 | [](https://gitpod.io/#https://github.com/aws/aws-cdk)
4 | [](https://badge.fury.io/js/aws-cdk)
5 | [](https://badge.fury.io/py/aws-cdk.core)
6 | [](https://badge.fury.io/nu/Amazon.CDK)
7 |
8 | ## Introduction
9 |
10 | MLflow is an open-source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry.
11 | [MLflow](https://mlflow.org/) is a framework for end-to-end development and tracking of machine learning projects and a natural companion to [Amazon SageMaker](https://aws.amazon.com/sagemaker/), the AWS fully managed service for data science. MLflow solves the problem of tracking experiment evolution and of deploying environment-agnostic, fully reproducible ML scoring solutions. It includes the following components.
12 |
13 | * Tracking – Record and query experiments: code, data, configuration, and results
14 | * Projects – Package data science code in a format to reproduce runs on any platform
15 | * Models – Deploy ML models in diverse serving environments
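As a quick illustration of the Tracking component, the Python sketch below logs a parameter and a metric to a tracking server. This is a minimal sketch; the tracking URI is a placeholder for the endpoint you will deploy later in this project.

```python
import mlflow

# Placeholder URI: in this project it will be the HTTP API Gateway endpoint
mlflow.set_tracking_uri("https://<your-mlflow-endpoint>")

with mlflow.start_run(run_name="demo-run"):
    mlflow.log_param("alpha", 0.5)    # experiment configuration
    mlflow.log_metric("rmse", 0.82)   # experiment result
```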
16 |
17 | ## Architecture
18 |
19 | In this project, we show how to deploy MLflow on [AWS Fargate](https://aws.amazon.com/fargate) with basic authentication and use it during your ML project with [Amazon SageMaker](https://aws.amazon.com/sagemaker).
20 | Our solution is based on three main high level components:
21 | * MLFlow server;
22 | * HTTP API Gateway; and
23 | * SageMaker Studio domain and SageMaker Studio user.
24 |
25 | ### MLflow Server
26 | MLflow is provisioned in a VPC on an [Amazon ECS](https://aws.amazon.com/ecs/) cluster using AWS Fargate as the serverless compute engine.
27 | The MLflow application is shielded by an internal Application Load Balancer.
28 | We use [Amazon Simple Storage Service](http://aws.amazon.com/s3) (Amazon S3) and [Amazon Aurora](https://aws.amazon.com/rds/aurora/) as the MLflow artifact and backend stores, respectively.
29 | Finally, we provide a simple authentication mechanism leveraging NGINX's reverse proxy and basic authentication capabilities.
30 |
31 | ### Amazon HTTP API Gateway
32 | In order to implement the private integration, we create an AWS PrivateLink to encapsulate connections between the MLflow server and the outside world through an [Amazon HTTP API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/http-api.html).
33 |
34 | ### SageMaker Studio
35 |
36 | [Amazon SageMaker Studio](https://aws.amazon.com/sagemaker/studio/) provides a single, web-based visual interface where you can perform all ML development steps.
37 | SageMaker Studio gives you complete access, control, and visibility into each step required to build, train, and deploy models.
38 | You can quickly upload data, create new notebooks, train and tune models, move back and forth between steps to adjust experiments, compare results, and deploy models to production all in one place.
39 | We will use Amazon SageMaker Studio to experiment with our sample machine learning problem, and to show how you can integrate SageMaker with MLFlow and handle user authentication.
40 |
41 | ## Implementation
42 |
43 | Prior to the availability of AWS PrivateLink, services residing in a single Amazon VPC were connected to multiple Amazon VPCs either (1) through public IP addresses using each VPC’s internet gateway or (2) by private IP addresses using VPC peering.
44 |
45 | With AWS PrivateLink, service connectivity over Transmission Control Protocol (TCP) can be established from the service provider’s VPC (Producer) to the service consumer’s VPC (Consumer) in a secure and scalable manner.
46 | Tom Adamski has provided an [architecture](https://aws.amazon.com/blogs/networking-and-content-delivery/how-to-securely-publish-internet-applications-at-scale-using-application-load-balancer-and-aws-privatelink/) where he shows one way of using AWS PrivateLink along with ALB and NLBs to publish Internet applications at scale.
47 | Mani Chandrasekaran provided a [solution](https://aws.amazon.com/blogs/compute/access-private-applications-on-aws-fargate-using-amazon-api-gateway-privatelink/) where he uses API Gateway to expose applications running on AWS Fargate using REST APIs, but it uses NLB since ALB is not yet supported by this architecture.
48 |
49 | Our solution leverages the existing applications / APIs running in AWS Fargate behind a Private ALB inside a VPC and proposes an architecture to expose these APIs securely through HTTP APIs using Amazon API Gateway and AWS PrivateLink.
50 |
51 | The target audience for this workshop is developers and architects who want to architect API-based services using existing applications running inside Amazon VPCs.
52 |
53 | ## Prerequisites
54 | In order to implement the instructions laid out in this post, you will need the following:
55 | - An [AWS account](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/)
56 | - An IAM user with Administrator permissions
57 |
58 | ## Architecture
59 | As shown in Fig 1, we shall create one AWS CDK application consisting of three AWS CDK stacks: **MLflowVpcStack**, **HttpGatewayStack**, and **SageMakerStudioUserStack**.
60 |
61 | Inside the `MLflowVpcStack`, we deploy the mlflowService using Amazon Fargate within the MLFlowVPC.
62 | An internal load balancer distributes incoming application traffic to the mlflowService.
63 | In order to implement the private integration, we create a VpcLink to encapsulate connections between API Gateway and the mlflowService.
64 | Inside the `HttpGatewayStack`, we create an HTTP API Gateway that integrates with the mlflowService Amazon Fargate service running inside the `MLflowVpcStack` using the VpcLink and the internal load balancer listener.
65 |
66 | 
67 | *Fig 1 - Architecture*
68 |
69 | Here are the steps we’ll be following to implement the above architecture:
70 |
71 | - Create and configure AWS Cloud9 environment
72 | - Provisioning AWS resources using the AWS CDK
73 | - Testing the Http Api
74 | - Cleanup
75 | - Conclusion
76 |
77 |
78 | ## Create and configure AWS Cloud9 environment
79 | Log into the AWS Management Console and search for Cloud9 service in the search bar.
80 |
81 | Click Cloud9 and create an AWS Cloud9 environment in your region based on Amazon Linux 2.
82 | Create an IAM role for the Cloud9 workspace as explained [here](https://www.eksworkshop.com/020_prerequisites/iamrole/).
83 | Attach the IAM role to your workspace as explained [here](https://www.eksworkshop.com/020_prerequisites/ec2instance/).
84 | Turn off the AWS managed temporary credentials of the Cloud9 environment as explained [here](https://www.eksworkshop.com/020_prerequisites/workspaceiam/).
85 |
86 | ## Provisioning AWS resources using the AWS CDK
87 |
88 | ### Clone the GitHub repository
89 |
90 | Open a new terminal inside AWS Cloud9 IDE and run:
91 | ```bash
92 | git clone https://github.com/aws-samples/aws-mlflow-sagemaker-cdk
93 | ```
94 |
95 | ### Setting the region
96 |
97 | The default region used by the CDK app is `us-west-2`.
98 | If you are already working in `us-west-2` you can skip this section.
99 | However, you can change the default region by setting the `AWS_REGION` environment variable.
100 | When working on Cloud9, you can specify the same region where your Cloud9 environment is running as follows:
101 |
102 | ```bash
103 | sudo yum install jq -y
104 | export AWS_REGION=$(curl -s 169.254.169.254/latest/dynamic/instance-identity/document | jq -r '.region')
105 | echo "export AWS_REGION=${AWS_REGION}" | tee -a ~/.bash_profile
106 | export AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
107 | echo "export AWS_ACCOUNT=${AWS_ACCOUNT}" | tee -a ~/.bash_profile
108 | ```
109 |
110 | The CDK script expects to find the ENV variable `DOMAIN_ID` in order to figure out if a new SageMaker Studio domain is needed or not.
111 | ```bash
112 | export DOMAIN_ID=$(aws sagemaker list-domains | jq -r 'select(.Domains[0] != null) .Domains[0].DomainId | tostring')
113 | echo "export DOMAIN_ID=${DOMAIN_ID}" | tee -a ~/.bash_profile
114 | ```
115 |
116 | ### Resizing the Cloud9
117 | Before deploying, since we use CDK constructs to build the container images locally, we need a larger disk than the 20 GB provided by Cloud9 in its default environment configuration, which is not enough.
118 | To resize it on the fly without rebooting the instance, you can run the following script specifying a new desired size.
119 |
120 | ```
121 | cd ~/environment/aws-mlflow-sagemaker-cdk/
122 | ./resize-cloud9.sh 100
123 | ```
124 | Where `100` represents the new desired disk size in GB.
125 |
126 | ### Install and bootstrap AWS CDK
127 |
128 | The AWS Cloud Development Kit (AWS CDK) is an open-source software development framework to model and provision your cloud application resources using familiar programming languages.
129 | If you would like to familiarize yourself with it, the [CDKWorkshop](https://cdkworkshop.com/) is a great place to start.
130 |
131 | In your Cloud9 environment, open a new terminal and use the following commands:
132 | ```bash
133 | cd ~/environment/aws-mlflow-sagemaker-cdk/cdk/nginxAuthentication
134 | npm install -g aws-cdk@2.114.1 --force
135 | cdk --version
136 | ```
137 |
138 | Take note of the version that you install; at the time of writing this post it is `2.114.1`.
139 | Open the `package.json` file and make sure the versions of the following modules match the version you installed above.
140 |
141 | ```typescript
142 | "aws-cdk-lib": "2.114.1",
143 | "@aws-cdk/aws-apigatewayv2-alpha": "2.114.1-alpha.0",
144 | "@aws-cdk/aws-apigatewayv2-integrations-alpha": "2.114.1-alpha.0",
145 | ```
146 |
147 | The following commands install all the CDK modules under the `node_modules` directory (`npm install`) and prepare your AWS account to deploy resources with CDK (`cdk bootstrap`).
148 |
149 | ```bash
150 | cd ~/environment/aws-mlflow-sagemaker-cdk/cdk/nginxAuthentication
151 | npm install
152 | cdk bootstrap
153 | ```
154 |
155 | #### Ensure Python 3.8 (or greater) is installed
156 |
157 | In order to deploy an mlflow model to SageMaker, you need to create a serving container that implements what the SageMaker runtime expects to find.
158 | MLflow makes this effort easier by providing a CLI command that builds the image locally and pushes it to Amazon ECR.
159 | Most recent versions of MLflow have dependencies on `Python 3.8`.
160 |
161 | ```bash
162 | python --version
163 | ```
164 |
165 | If running this sample on Cloud9, you need to ensure you have Python `3.8` installed.
166 | You can do so with the following commands.
167 |
168 | ```bash
169 | sudo yum install -y amazon-linux-extras
170 | sudo amazon-linux-extras enable python3.8
171 | sudo yum install -y python3.8
172 | ```
173 |
174 | #### Push the `mlflow-pyfunc` container to ECR
175 |
176 | If on Cloud9, run the following (after installing Python 3.8):
177 | ```bash
178 | # install the libraries
179 | pip3.8 install mlflow==2.14.2 boto3 # or pip install mlflow==2.14.2 boto3 if your default pip comes alongside a python version >= 3.8
180 | ```
181 |
182 | ```bash
183 | # build and push the container to ECR into your account
184 | mlflow sagemaker build-and-push-container
185 | ```
186 |
187 | ## Deploying the solution
188 |
189 | Now we are ready to deploy our full solution.
190 |
191 | ```bash
192 | cdk deploy --all --require-approval never
193 | ```
194 |
195 | Alternatively, you can also deploy individual stacks by passing the stack name to CDK, e.g.
196 |
197 | ```bash
198 | cdk deploy MLflowVpcStack HttpGatewayStack
199 | ```
200 |
201 | Use this, for example, if you want to deploy only the `MLflowVpcStack` and `HttpGatewayStack` and leave out the `SageMakerStudioUserStack`.
202 |
203 | ## Detailed walkthrough of the solution
204 |
205 | We have implemented this architecture using an AWS CDK application comprising three individual CDK stacks:
206 |
207 | - **MLflowVpcStack** deploys the MLflow server on Fargate in a VPC.
208 | - **HttpGatewayStack** deploys the HTTP API integrated with the Fargate service using a VpcLink.
209 | - **SageMakerStudioUserStack** provisions the SageMaker Studio domain and user (covered later under *Integration with SageMaker*).
209 |
210 | Let us discuss these stacks one by one.
211 |
212 | ### **MLflowVpcStack**
213 |
214 | 
215 | *Fig 2 - MLFlow VPC*
216 |
217 | Under the `./cdk/nginxAuthentication/lib` folder, open the `mlflow-vpc-stack.ts` file and let us explore the following CDK constructs.
218 |
219 | ```typescript
220 | // Export Vpc, ALB Listener, and Mlflow secret ARN
221 | public readonly httpApiListener: elbv2.ApplicationListener;
222 | public readonly mlflowSecretArn: string;
223 | public readonly vpc: ec2.Vpc;
224 | ```
225 |
226 | These three variables export the provisioned VPC, the ALB listener, and the MLflow secret ARN from the **MLflowVpcStack**, so that we can use them to create the HTTP API in the **HttpGatewayStack**.
227 |
228 | **VPC:**
229 |
230 | This code creates the MLFlowVPC with public, private, and isolated subnets across 2 Availability Zones.
231 | The public subnets are used to run the NAT gateway which allows our components in the private subnets to get internet connectivity.
232 | The application code and the database run in the private subnets and isolated subnets respectively.
233 |
234 | ```typescript
235 | // VPC
236 | const vpc = new ec2.Vpc(this, 'MLFlowVPC', {
237 | cidr: cidr,
238 | natGateways: 1,
239 | maxAzs: 2,
240 | subnetConfiguration: [
241 | {
242 | name: 'public',
243 | subnetType: ec2.SubnetType.PUBLIC,
244 | cidrMask: 24,
245 | },
246 | {
247 | name: 'private',
248 | subnetType: ec2.SubnetType.PRIVATE_WITH_NAT,
249 | cidrMask: 26,
250 | },
251 | {
252 | name: 'isolated',
253 | subnetType: ec2.SubnetType.PRIVATE_ISOLATED,
254 | cidrMask: 28,
255 | },
256 | ],
257 | });
258 | ```
259 |
260 | **AWS Secrets Manager**
261 | We need to generate two kinds of credentials, i.e., one for the database backend and one for the application server.
262 | [AWS Secrets Manager](https://aws.amazon.com/secrets-manager/) helps generate and store secrets securely.
263 | Furthermore, it allows users and applications to retrieve secrets with a call to the AWS Secrets Manager APIs, eliminating the need to hardcode sensitive information in plain text.
264 |
265 | ```typescript
266 | // DB Credentials
267 | const databaseCredentialsSecret = new secretsmanager.Secret(this, 'DBCredentialsSecret', {
268 | secretName: `${serviceName}-credentials`,
269 | generateSecretString: {
270 | secretStringTemplate: JSON.stringify({
271 | username: dbUsername,
272 | }),
273 | excludePunctuation: true,
274 | includeSpace: false,
275 | generateStringKey: 'password'
276 | }
277 | });
278 |
279 | // MLflow credentials
280 | const mlflowCredentialsSecret = new secretsmanager.Secret(this, 'MlflowCredentialsSecret', {
281 | secretName: mlflowSecretName,
282 | generateSecretString: {
283 | secretStringTemplate: JSON.stringify({
284 | username: mlflowUsername,
285 | }),
286 | excludePunctuation: true,
287 | includeSpace: false,
288 | generateStringKey: 'password'
289 | }
290 | });
291 | ```
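Applications and users can later read these auto-generated secrets at runtime instead of hardcoding them. Here is a minimal boto3 sketch, assuming the `mlflow-server-credentials` secret name that this app uses:

```python
import json
import boto3

# Read the MLflow credentials generated by the stack at deploy time
client = boto3.client("secretsmanager")
response = client.get_secret_value(SecretId="mlflow-server-credentials")
credentials = json.loads(response["SecretString"])

username = credentials["username"]
password = credentials["password"]
```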
292 |
293 | **Aurora RDS Cluster:**
294 |
295 | This creates an Aurora RDS cluster inside the MLFlowVPC.
296 | This database is used by MLflow to store metadata about the ML models generated in our experiments.
297 |
298 | ```typescript
299 | const dbConfig = {
300 | dbClusterIdentifier: `${serviceName}-cluster`,
301 | engineMode: 'serverless',
302 | engine: 'aurora-mysql',
303 | engineVersion: '5.7.12',
304 | databaseName: dbName,
305 | masterUsername: databaseCredentialsSecret.secretValueFromJson('username').toString(),
306 | masterUserPassword: databaseCredentialsSecret.secretValueFromJson('password').toString(),
307 | dbSubnetGroupName: dbSubnetGroup.dbSubnetGroupName,
308 | scalingConfiguration: {
309 | autoPause: true,
310 | maxCapacity: 2,
311 | minCapacity: 2,
312 | secondsUntilAutoPause: 3600,
313 | },
314 | vpcSecurityGroupIds: [
315 | dbClusterSecurityGroup.securityGroupId
316 | ]
317 | };
318 |
319 | // RDS Cluster
320 | const rdsCluster = new CfnDBCluster(this, 'DBCluster', dbConfig);
321 | rdsCluster.addDependsOn(dbSubnetGroup)
322 | ```
323 |
324 | **ECS Cluster:**
325 |
326 | This creates an Amazon ECS cluster inside the MLFlowVPC; we shall run the mlflow service inside this ECS cluster using AWS Fargate.
327 |
328 | ```typescript
329 | const cluster = new ecs.Cluster(this, "Fargate Cluster" , {
330 | vpc : vpc,
331 | });
332 | ```
333 |
334 | **Cloud Map Namespace:**
335 |
336 | AWS Cloud Map allows us to register any application resources, such as microservices, and other cloud resources, with custom names. Using AWS Cloud Map, we can define custom names for our application microservice, and it maintains the updated location of the dynamically changing microservice.
337 |
338 | ```typescript
339 | const dnsNamespace = new servicediscovery.PrivateDnsNamespace(this,"DnsNamespace",{
340 | name: "http-api.local",
341 | vpc: vpc,
342 | description: "Private DnsNamespace for Microservices",
343 | }
344 | );
345 | ```
346 | **ECS Task Role:**
347 |
348 | We define in an IAM role the set of permissions that our AWS Fargate task is allowed to be granted.
349 | Please also note the inline policy that grants access to the database and MLflow credentials.
350 |
351 | ```typescript
352 | const taskrole = new iam.Role(this, "ecsTaskExecutionRole", {
353 | assumedBy: new iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
354 | managedPolicies: [
355 | iam.ManagedPolicy.fromAwsManagedPolicyName("service-role/AmazonECSTaskExecutionRolePolicy"),
356 | iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonS3FullAccess") // for a production environment, you might want to restrict this policy to only the bucket you need.
357 | ],
358 | inlinePolicies: {
359 | secretsManagerRestricted: new iam.PolicyDocument({
360 | statements: [
361 | new iam.PolicyStatement({
362 | effect: iam.Effect.ALLOW,
363 | resources: [
364 | mlflowCredentialsSecret.secretArn,
365 | databaseCredentialsSecret.secretArn
366 | ],
367 | actions: [
368 | "secretsmanager:GetResourcePolicy",
369 | "secretsmanager:GetSecretValue",
370 | "secretsmanager:DescribeSecret",
371 | "secretsmanager:ListSecretVersionIds"
372 | ]
373 | }),
374 | new iam.PolicyStatement({
375 | effect: iam.Effect.ALLOW,
376 | resources: ["*"],
377 | actions: ["secretsmanager:ListSecrets"]
378 | }),
379 | ]
380 | })
381 | }
382 | });
383 | ```
384 |
385 | **Task Definition:**
386 |
387 | A task definition is required to run Docker containers in Amazon ECS; we shall create the task definition (mlflowTaskDefinition) for the mlflow service.
388 |
389 | ```typescript
390 | const mlflowTaskDefinition = new ecs.FargateTaskDefinition(
391 | this,
392 | "mlflowTaskDef",
393 | {
394 | taskRole: taskrole,
395 | family: "mlFlowStack"
396 | },
397 | );
398 | ```
399 | **Log Groups:**
400 |
401 | Let us create a log group mlflowServiceLogGroup and the associated log driver.
402 |
403 | ```typescript
404 | const mlflowServiceLogGroup = new logs.LogGroup(this, "mlflowServiceLogGroup", {
405 | logGroupName: "/ecs/mlflowService",
406 | removalPolicy: cdk.RemovalPolicy.DESTROY,
407 | });
408 |
409 | const mlflowServiceLogDriver = new ecs.AwsLogDriver({
410 | logGroup: mlflowServiceLogGroup,
411 | streamPrefix: "mlflowService",
412 | });
413 | ```
414 | **Task Containers:**
415 |
416 | Let us define two containers in the `mlflowTaskDefinition` task definition, i.e., the NGINX container, responsible for authenticating requests and acting as a reverse proxy, and the MLFlow container, where the MLFlow server application code runs.
417 | It is important to note how we securely pass credentials (generated and stored securely in AWS Secrets Manager) to the task definition.
418 | The ECS Task Role described earlier defines, among others, the custom policy that grants the Fargate task read access to exclusively the two secrets needed by the containers.
419 | In this way, all credentials are transparently handled by Fargate, avoiding exposing any sensitive information in the task definition in clear text.
420 |
421 | For readers interested in how to secure credentials with Fargate and AWS Secrets Manager, [this blogpost](https://aws.amazon.com/blogs/compute/securing-credentials-using-aws-secrets-manager-with-aws-fargate/) provides a more in-depth discussion.
422 |
423 | ```typescript
424 | const nginxContainer = mlflowTaskDefinition.addContainer(
425 | "nginxContainer",
426 | {
427 | containerName: "nginxContainer",
428 | essential: true,
429 | // memoryReservationMiB: 512,
430 | // cpu: 512,
431 | portMappings: [{
432 | containerPort: 80,
433 | protocol: ecs.Protocol.TCP
434 | }],
435 | image: ecs.ContainerImage.fromAsset('../../src/nginx', {}),
436 | secrets: {
437 | MLFLOW_USERNAME: ecs.Secret.fromSecretsManager(mlflowCredentialsSecret, 'username'),
438 | MLFLOW_PASSWORD: ecs.Secret.fromSecretsManager(mlflowCredentialsSecret, 'password')
439 | },
440 | logging: mlflowServiceLogDriver,
441 | }
442 | );
443 |
444 | const mlflowServiceContainer = mlflowTaskDefinition.addContainer(
445 | "mlflowContainer",
446 | {
447 | containerName: "mlflowContainer",
448 | essential: true,
449 | // memoryReservationMiB: 512,
450 | // cpu: 512,
451 | portMappings: [{
452 | containerPort: containerPort,
453 | protocol: ecs.Protocol.TCP,
454 | }],
455 | image: ecs.ContainerImage.fromAsset('../../src/mlflow', {}),
456 | environment: {
457 | 'BUCKET': `s3://${mlFlowBucket.bucketName}`,
458 | 'HOST': rdsCluster.attrEndpointAddress,
459 | 'PORT': `${dbPort}`,
460 | 'DATABASE': dbName
461 | },
462 | secrets: {
463 | USERNAME: ecs.Secret.fromSecretsManager(databaseCredentialsSecret, 'username'),
464 | PASSWORD: ecs.Secret.fromSecretsManager(databaseCredentialsSecret, 'password')
465 | },
466 | logging: mlflowServiceLogDriver,
467 | });
468 | ```
469 |
470 | ***NGINX container:***
471 |
472 | The NGINX container acts as a reverse proxy in front of the MLFlow container, also providing a simple and straightforward way to add basic authentication to the MLFlow server.
473 |
474 | In order to securely inject the credentials auto-generated in AWS Secrets Manager into the container, the credential file for NGINX is created at container startup.
475 | Its creation is handled by a script whose content is provided below:
476 |
477 | ```bash
478 | #!/bin/sh
479 | echo -n $MLFLOW_USERNAME: >> /etc/nginx/.htpasswd
480 | openssl passwd -1 $MLFLOW_PASSWORD >> /etc/nginx/.htpasswd
481 | ```
482 |
483 | In the `Dockerfile`, since we are using the `CMD` command to run the script at startup time, we must include `-g 'daemon off;'` in order to allow nginx to stay in the foreground.
484 | As such, Docker can track the process properly (otherwise your container will stop immediately after starting).
485 | Generating the credential file at container startup rather than during the build process adds a little overhead; however, this is for the greater good, since we now have the security and the flexibility to change credentials without the need to rebuild the container.
486 |
487 | ```dockerfile
488 | FROM nginx:1.17.6
489 | RUN apt-get update
490 | RUN apt-get install openssl -y
491 | # Remove default Nginx config
492 | RUN rm /etc/nginx/nginx.conf
493 | # Copy the modified Nginx conf
494 | COPY nginx.conf /etc/nginx/nginx.conf
495 | RUN ln -sf /dev/stdout /var/log/nginx/access.log \
496 | && ln -sf /dev/stderr /var/log/nginx/error.log
497 |
498 | COPY script.sh /root/script.sh
499 | RUN chmod +x /root/script.sh
500 |
501 | CMD /root/script.sh && nginx -g 'daemon off;'
502 | ```
503 |
504 | Finally, let us highlight an important setting in the `nginx.conf` file, i.e., `resolver 169.254.169.253;`, needed to allow DNS resolution within the AWS VPC.
505 | In this way, the NGINX container can resolve the MLFlow container IP address via DNS.
506 |
507 | ```
508 | location / {
509 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
510 | proxy_set_header Host $http_host;
511 | proxy_redirect off;
512 | resolver 169.254.169.253;
513 | set $mlflow mlflowservice.http-api.local;
514 | proxy_pass http://$mlflow:5000;
515 | auth_basic "Administrator’s Area";
516 | auth_basic_user_file /etc/nginx/.htpasswd;
517 | }
518 | ```
519 | ***MLFlow container:***
520 |
521 | The MLFlow container pip-installs the MLFlow server.
522 | The MLFlow server is then started at container startup.
523 | Similarly to what we did for the NGINX credentials, we ensure that the credentials for the Aurora database are injected at startup time by Fargate from AWS Secrets Manager.
524 |
525 | ```dockerfile
526 | FROM python:3.9.0
527 |
528 | RUN pip install \
529 | mlflow==2.14.2 \
530 | pymysql==1.0.2 \
531 | boto3 && \
532 | mkdir /mlflow/
533 |
534 | EXPOSE 5000
535 |
536 | CMD mlflow server \
537 | --host 0.0.0.0 \
538 | --port 5000 \
539 | --default-artifact-root ${BUCKET} \
540 | --backend-store-uri mysql+pymysql://${USERNAME}:${PASSWORD}@${HOST}:${PORT}/${DATABASE}
541 | ```
542 |
543 | **Security Groups:**
544 |
545 | In order to control the inbound and outbound traffic to the Fargate tasks, we shall create a security group that acts as a virtual firewall.
546 |
547 | ```typescript
548 | const mlflowServiceSecGrp = new ec2.SecurityGroup(
549 | this,
550 | "mlflowServiceSecurityGroup",
551 | {
552 | allowAllOutbound: true,
553 | securityGroupName: "mlflowServiceSecurityGroup",
554 | vpc: vpc,
555 | }
556 | );
557 | mlflowServiceSecGrp.connections.allowFromAnyIpv4(ec2.Port.tcp(containerPort));
558 | mlflowServiceSecGrp.connections.allowFromAnyIpv4(ec2.Port.tcp(80));
559 | ```
560 |
561 | **Fargate Service:**
562 |
563 | Let us create an ECS Fargate service (mlflowService) based on the task definition created above.
564 | An Amazon ECS service enables you to run and maintain a specified number of instances of a task definition simultaneously in an Amazon ECS cluster. If any of your tasks should fail or stop for any reason, the Amazon ECS service scheduler launches another instance of your task definition to replace it in order to maintain the desired number of tasks in the service.
565 |
566 | ```typescript
567 | const mlflowService = new ecs.FargateService(this, "mlflowService", {
568 | cluster: cluster,
569 | serviceName: serviceName,
570 | taskDefinition: mlflowTaskDefinition,
571 | assignPublicIp: false,
572 | desiredCount: 2,
573 | securityGroup: mlflowServiceSecGrp,
574 | cloudMapOptions: {
575 | name: "mlflowService",
576 | cloudMapNamespace: dnsNamespace,
577 | },
578 | });
579 | ```
580 |
581 | **ALB:**
582 |
583 | The load balancer distributes incoming application traffic across ECS tasks in multiple Availability Zones. This increases the availability of your application. Let us add an internal Application Load Balancer.
584 |
585 | ```typescript
586 | const httpApiInternalALB = new elbv2.ApplicationLoadBalancer(
587 | this,
588 | "httpapiInternalALB",
589 | {
590 | vpc: vpc,
591 | internetFacing: false,
592 | }
593 | );
594 | ```
595 | **ALB Listener:**
596 |
597 | An ALB listener checks for connection requests from clients, using the protocol and port that we configure.
598 |
599 | ```typescript
600 | this.httpApiListener = httpApiInternalALB.addListener("httpapiListener", {
601 | port: 80,
602 | protocol: ApplicationProtocol.HTTP,
603 | });
604 | ```
605 |
606 | **Target Groups:**
607 |
608 | We shall create a target group, i.e., `mlflowServiceTargetGroup`, pointing to the NGINX container. Note that the health check probes the `/elb-status` path, which NGINX must answer for the tasks to be marked healthy.
609 |
610 | ```typescript
611 | const mlflowServiceTargetGroup = this.httpApiListener.addTargets(
612 | "mlflowServiceTargetGroup",
613 | {
614 | healthCheck: {
615 | path: "/elb-status"
616 | },
617 | targets: [
618 | mlflowService.loadBalancerTarget(
619 | {
620 | containerName: 'nginxContainer',
621 | containerPort: 80
622 | }
623 | )
624 | ],
625 | port: 80,
626 | protocol: ApplicationProtocol.HTTP,
627 | }
628 | );
629 | ```
630 | **Task Auto Scaling:**
631 |
632 | We shall configure auto scaling of the number of MLflow service tasks based on CPU utilization.
633 |
634 | ```typescript
635 | // Task Auto Scaling
636 | const autoScaling = mlflowService.autoScaleTaskCount({ maxCapacity: 6 });
637 | autoScaling.scaleOnCpuUtilization('CpuScaling', {
638 | targetUtilizationPercent: 70,
639 | scaleInCooldown: cdk.Duration.seconds(60),
640 | scaleOutCooldown: cdk.Duration.seconds(60),
641 | });
642 | ```
643 |
644 | ### **HttpGatewayStack**
645 |
646 | Under the `./cdk/nginxAuthentication/lib` folder, open the `http-gateway-stack.ts` file and let us explore the following different CDK constructs.
647 |
648 |
649 | **VPC Link:**
650 |
651 | API Gateway private integration makes it easy to expose our HTTP/HTTPS resources within an Amazon VPC for access by clients outside of the Producer VPC.
652 | To extend access to our private VPC resources beyond the VPC boundaries, we can create an HTTP API with private integration for open access or controlled access.
653 | The private integration uses an API Gateway resource of VpcLink to encapsulate connections between API Gateway and targeted VPC resources.
654 | As an owner of a VPC resource, we are responsible for creating an Application Load Balancer in our Producer VPC and adding a VPC resource as a target of an Application Load Balancer's listener.
655 | As an HTTP API developer, to set up an HTTP API with the private integration, we are responsible for creating a VpcLink targeting the specified Application Load Balancer and then treating the VpcLink as an effective integration endpoint.
656 | Let us create a Vpclink based on the private subnets of the MLFlowVPC.
657 |
658 | ```typescript
659 | this.httpVpcLink = new cdk.CfnResource(this, "HttpVpcLink", {
660 | type: "AWS::ApiGatewayV2::VpcLink",
661 | properties: {
662 | Name: "http-api-vpclink",
663 | SubnetIds: vpc.privateSubnets.map((m) => m.subnetId),
664 | },
665 | });
666 | ```
667 |
668 | **API Integration:**
669 |
670 | The following construct integrates the Amazon HTTP API Gateway with the backend mlflowService using the VpcLink and the Application Load Balancer listener.
671 |
672 | ```typescript
673 | // HTTP Integration with VpcLink
674 | const mlflowIntegration = new HttpAlbIntegration(
675 | 'MLflowIntegration',
676 | httpApiListener,
677 | { vpcLink: mlflowVpcLink }
678 | )
679 | ```
680 |
681 | **HTTP Api:**
682 |
683 | Let us create an HTTP API based on a default stage, with an HTTP integration over the VpcLink.
684 |
685 | ```typescript
686 | // HTTP Api
687 | this.api = new apig.HttpApi(this, "mlflow-api", {
688 | createDefaultStage: true,
689 | defaultIntegration: mlflowIntegration
690 | });
691 | ```
692 |
693 | **API Route:**
694 |
695 | Now let us create the Http Api proxy routes targeting the Api integration.
696 |
697 | ```typescript
698 | // HTTP Api Route
699 | this.api.addRoutes({
700 | integration: mlflowIntegration,
701 | path: "/{proxy+}"
702 | })
703 | ```
704 |
705 | After the **MLflowVpcStack** and the **HttpGatewayStack** are deployed, the MLflow server is finally accessible.
706 |
707 | ## *Integration with SageMaker*
708 |
709 | ### **SageMakerStudioUserStack**
710 |
711 | Under the `./cdk/nginxAuthentication/lib` folder, open the `sagemaker-studio-user-stack.ts` file and let us explore the following CDK construct.
712 |
713 | **SageMaker execution role**
714 |
715 | As a managed service, Amazon SageMaker performs operations on your behalf on the AWS hardware that is managed by SageMaker.
716 | SageMaker can perform only operations that the user permits.
717 | You can grant these permissions by specifying an execution role.
718 | Our use case dictates that we need access not only to the full Amazon SageMaker functionality, but also to specific S3 buckets (e.g., to store and retrieve training and test data), as well as to the MLFlow credentials stored in AWS Secrets Manager.
719 |
720 | ```typescript
721 | // SageMaker Execution Role
722 | const sagemakerExecutionRole = new iam.Role(this, "sagemaker-execution-role", {
723 | assumedBy: new iam.ServicePrincipal("sagemaker.amazonaws.com"),
724 | managedPolicies: [
725 | iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonSageMakerFullAccess")
726 | ],
727 | inlinePolicies: {
728 | retrieveApiGatewayUrl: new iam.PolicyDocument({
729 | statements: [
730 | new iam.PolicyStatement({
731 | effect: iam.Effect.ALLOW,
732 | resources: [`arn:*:cloudformation:${this.region}:${this.account}:stack/${httpGatewayStackName}/*`],
733 | actions: ["cloudformation:DescribeStacks"],
734 | })
735 | ],
736 | }),
737 | s3Buckets: new iam.PolicyDocument({
738 | statements: [
739 | new iam.PolicyStatement({
740 | effect: iam.Effect.ALLOW,
741 | resources: ["arn:aws:s3:::*mlflow*"],
742 | actions: ["s3:ListBucket","s3:GetObject", "s3:PutObject", "s3:DeleteObject", "s3:PutObjectTagging", "s3:CreateBucket"],
743 | })
744 | ],
745 | }),
746 | secretsManagerRestricted: new iam.PolicyDocument({
747 | statements: [
748 | new iam.PolicyStatement({
749 | effect: iam.Effect.ALLOW,
750 | resources: [mlflowSecretArn],
751 | actions: [
752 | "secretsmanager:GetResourcePolicy",
753 | "secretsmanager:GetSecretValue",
754 | "secretsmanager:DescribeSecret",
755 | "secretsmanager:ListSecretVersionIds"
756 | ]
757 | }),
758 | new iam.PolicyStatement({
759 | effect: iam.Effect.ALLOW,
760 | resources: ["*"],
761 | actions: ["secretsmanager:ListSecrets"]
762 | })
763 | ]
764 | })
765 | },
766 | });
767 | ```
768 |
769 | In the **SageMakerStudioUserStack** we have included logic to either deploy a new SageMaker Studio domain, or to update an existing one.
770 | The CDK script expects to find the ENV variable `DOMAIN_ID`, which we set up earlier, in order to figure out whether a new domain is needed or not.
771 |
772 | #### Provision a new SageMaker Studio domain
773 | (Skip to [Update an existing SageMaker Studio domain](#update-an-existing-sagemaker-studio-domain) if you already have an existing SageMaker Studio domain.)
774 |
775 | Provisioning a new SageMaker Studio domain will perform the following operations:
776 |
777 | 1. Create a SageMaker execution role with the correct permissions
778 |
779 | ```typescript
780 | // SageMaker Execution Role
781 | const sagemakerExecutionRole = new iam.Role(this, "sagemaker-execution-role", {
782 | assumedBy: new iam.ServicePrincipal("sagemaker.amazonaws.com"),
783 | managedPolicies: [
784 | iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonSageMakerFullAccess")
785 | ],
786 | inlinePolicies: {
787 | retrieveApiGatewayUrl: new iam.PolicyDocument({
788 | statements: [
789 | new iam.PolicyStatement({
790 | effect: iam.Effect.ALLOW,
791 | resources: [`arn:*:cloudformation:${this.region}:${this.account}:stack/${httpGatewayStackName}/*`],
792 | actions: ["cloudformation:DescribeStacks"],
793 | })
794 | ],
795 | }),
796 | s3Buckets: new iam.PolicyDocument({
797 | statements: [
798 | new iam.PolicyStatement({
799 | effect: iam.Effect.ALLOW,
800 | resources: ["arn:aws:s3:::*mlflow*"],
801 | actions: ["s3:ListBucket","s3:GetObject", "s3:PutObject", "s3:DeleteObject", "s3:PutObjectTagging", "s3:CreateBucket"],
802 | })
803 | ],
804 | }),
805 | secretsManagerRestricted: new iam.PolicyDocument({
806 | statements: [
807 | new iam.PolicyStatement({
808 | effect: iam.Effect.ALLOW,
809 | resources: [mlflowSecretArn],
810 | actions: [
811 | "secretsmanager:GetResourcePolicy",
812 | "secretsmanager:GetSecretValue",
813 | "secretsmanager:DescribeSecret",
814 | "secretsmanager:ListSecretVersionIds"
815 | ]
816 | }),
817 | new iam.PolicyStatement({
818 | effect: iam.Effect.ALLOW,
819 | resources: ["*"],
820 | actions: ["secretsmanager:ListSecrets"]
821 | })
822 | ]
823 | })
824 | },
825 | });
826 | ```
827 |
828 | 2. Create a new SageMaker Studio domain in the default VPC.
829 |
830 | ```typescript
831 | // SageMaker Studio domain
832 | const defaultVpc = ec2.Vpc.fromLookup(this, 'DefaultVPC', { isDefault: true });
833 | const subnetIds: string[] = [];
834 |
835 | defaultVpc.publicSubnets.forEach((subnet, index) => {
836 | subnetIds.push(subnet.subnetId);
837 | });
838 |
839 | const cfnStudioDomain = new sagemaker.CfnDomain(this, 'MyStudioDomain', {
840 | authMode: 'IAM',
841 | defaultUserSettings: {
842 | executionRole: sagemakerExecutionRole.roleArn,
843 | },
844 | domainName: 'StudioDomainName',
845 | vpcId: defaultVpc.vpcId,
846 | subnetIds: subnetIds,
847 | });
848 | ```
849 |
850 | 3. Create a new SageMaker Studio user attached to the domain, with the previously created execution role attached to it
851 |
852 | ```typescript
853 | // SageMaker Studio user
854 | const cfnUserProfile = new sagemaker.CfnUserProfile(this, 'MyCfnUserProfile', {
855 | domainId: cfnStudioDomain.attrDomainId,
856 | userProfileName: 'mlflow-user',
857 | userSettings: {
858 | executionRole: sagemakerExecutionRole.roleArn,
859 | }
860 | }
861 | );
862 | ```
863 |
864 | #### Update an existing SageMaker Studio Domain
865 |
866 | Updating an existing SageMaker Studio domain will do the following operations:
867 |
868 | 1. Create a SageMaker execution role with the correct permissions
869 | 2. Create a new SageMaker Studio user attached to the domain, with the previously created execution role attached to it
870 |
871 | ## Push the `mlflow-pyfunc` container to ECR
872 |
873 | In order to deploy an mlflow model to SageMaker, you need to create a serving container that implements what the SageMaker runtime expects to find.
874 | MLFlow makes this effort easier by providing a CLI command that builds the image locally and pushes it to Amazon ECR.
875 |
876 | ```bash
877 | # install the libraries
878 | pip install mlflow==2.14.2 boto3
879 |
880 | # build and push the container to ECR into your account
881 | mlflow sagemaker build-and-push-container
882 | ```
883 |
884 | ## Testing the Http Api
885 |
886 | ### Accessing the MLFlow UI
887 | In order to access the MLFlow UI, we need the URL of the Amazon HTTP API Gateway and the credentials generated for user authentication.
888 |
889 | The HTTP API Gateway URL can be retrieved from the **HttpGatewayStack** CloudFormation output as shown in the figure below.
890 |
891 | 
892 | *Fig 3 - Retrieve Amazon HTTP API Gateway URL*
893 |
894 | The MLFlow credentials can be retrieved either by navigating to AWS Secrets Manager in the console, or by using the AWS CLI.
895 | ```bash
896 | # username
897 | aws secretsmanager get-secret-value --secret-id mlflow-server-credentials | jq -r '.SecretString' | jq -r '.username'
898 | # password
899 | aws secretsmanager get-secret-value --secret-id mlflow-server-credentials | jq -r '.SecretString' | jq -r '.password'
900 | ```
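You can also verify the deployment end to end programmatically. The following is a minimal Python sketch using `requests`; the gateway URL is a placeholder for the value from the **HttpGatewayStack** output:

```python
import json
import boto3
import requests

# Placeholder: replace with the endpoint from the HttpGatewayStack output
MLFLOW_URL = "https://<api-id>.execute-api.<region>.amazonaws.com"

secret = boto3.client("secretsmanager").get_secret_value(
    SecretId="mlflow-server-credentials"
)
creds = json.loads(secret["SecretString"])

# NGINX enforces basic authentication in front of the MLflow UI
response = requests.get(MLFLOW_URL, auth=(creds["username"], creds["password"]))
print(response.status_code)  # 200 means authentication succeeded
```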
901 | ### MLFlow / Amazon SageMaker Studio integration lab
902 |
903 | In the AWS console, navigate to Amazon SageMaker Studio and open Studio for the `mlflow-user` user as shown in the pictures below.
904 |
905 | 
906 | *Fig 4 - Navigate to Amazon SageMaker Studio*
907 |
908 | 
909 | *Fig 5 - Launch Amazon SageMaker Studio for the `mlflow-user`*
910 |
911 | Clone this repository either from the terminal or from the Studio UI.
912 |
913 | 
914 | *Fig 6 - Clone repo in SageMaker Studio*
915 |
916 | Navigate to the `./aws-mlflow-sagemaker-cdk/lab/nginxBasicAuth` folder and open the `sagemaker_studio_and_mlflow.ipynb` notebook.
917 | It shows how to train models in Amazon SageMaker and store them in MLFlow (retrieving the credentials at runtime), and how to deploy models stored in MLFlow to Amazon SageMaker endpoints using the MLFlow SDK.
918 | Furthermore, the lab shows how you can enrich MLFlow metadata with SageMaker metadata, and vice versa, by storing MLFlow specifics in SageMaker via the SageMaker Experiments SDK and visualizing them in the SageMaker Studio UI.
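For reference, the credential handoff in the notebook follows this general pattern (a sketch, not the notebook's exact code): the MLflow client reads the `MLFLOW_TRACKING_USERNAME` and `MLFLOW_TRACKING_PASSWORD` environment variables to authenticate against the NGINX basic-auth layer.

```python
import json
import os
import boto3
import mlflow

# Retrieve the MLflow credentials stored in AWS Secrets Manager
secret = boto3.client("secretsmanager").get_secret_value(
    SecretId="mlflow-server-credentials"
)
creds = json.loads(secret["SecretString"])

# The MLflow client picks these up for HTTP basic authentication
os.environ["MLFLOW_TRACKING_USERNAME"] = creds["username"]
os.environ["MLFLOW_TRACKING_PASSWORD"] = creds["password"]

# Placeholder: the HTTP API Gateway endpoint from the HttpGatewayStack output
mlflow.set_tracking_uri("https://<api-id>.execute-api.<region>.amazonaws.com")
mlflow.set_experiment("sagemaker-mlflow-demo")
```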
919 |
920 | ## Cleanup
921 |
922 | Before removing all the resources created, you need to make sure that all apps are deleted from the `mlflow-user` user, i.e., all `KernelGateway` apps as well as the default `JupyterServer`. You can do this from the SageMaker console, or script it as sketched below.
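A minimal boto3 sketch for this step, assuming the `mlflow-user` profile created by the stack (the domain ID placeholder must be filled in, e.g., with the `DOMAIN_ID` value set earlier):

```python
import boto3

sm = boto3.client("sagemaker")
domain_id = "<your-domain-id>"  # placeholder

# Delete every app (KernelGateway, JupyterServer, ...) of the mlflow-user profile
apps = sm.list_apps(DomainIdEquals=domain_id, UserProfileNameEquals="mlflow-user")
for app in apps["Apps"]:
    if app["Status"] != "Deleted":
        sm.delete_app(
            DomainId=domain_id,
            UserProfileName="mlflow-user",
            AppType=app["AppType"],
            AppName=app["AppName"],
        )
```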
923 |
924 | Once done, you can destroy the CDK stack by running
925 |
926 | ```bash
927 | cd ~/environment/aws-mlflow-sagemaker-cdk/cdk/nginxAuthentication
928 | ./destroy.sh
929 | ```
930 |
931 | At the prompt, enter `y`.
932 |
933 | ## Conclusion
934 |
935 | The benefit of this serverless architecture is that it takes away the overhead of having to manage underlying servers and helps reduce costs, as you only pay for the time in which your code executes.
936 |
937 |
--------------------------------------------------------------------------------
/cdk/.gitignore:
--------------------------------------------------------------------------------
1 | *.js
2 | !jest.config.js
3 | *.d.ts
4 | **/node_modules/
5 | **/.vscode/
6 |
7 | # CDK asset staging directory
8 | .cdk.staging
9 | cdk.out
10 |
--------------------------------------------------------------------------------
/cdk/.npmignore:
--------------------------------------------------------------------------------
1 | *.ts
2 | !*.d.ts
3 |
4 | # CDK asset staging directory
5 | .cdk.staging
6 | cdk.out
7 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/README.md:
--------------------------------------------------------------------------------
1 | # Welcome to your CDK TypeScript project!
2 |
3 | This is a blank project for TypeScript development with CDK.
4 |
5 | The `cdk.json` file tells the CDK Toolkit how to execute your app.
6 |
7 | ## Useful commands
8 |
9 | * `npm run build` compile typescript to js
10 | * `npm run watch` watch for changes and compile
11 | * `npm run test` perform the jest unit tests
12 | * `cdk deploy` deploy this stack to your default AWS account/region
13 | * `cdk diff` compare deployed stack with current state
14 | * `cdk synth` emits the synthesized CloudFormation template
15 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/bin/app.ts:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import 'source-map-support/register';
3 | import * as cdk from 'aws-cdk-lib';
4 | import { MLflowVpcStack } from '../lib/mlflow-vpc-stack';
5 | import { HttpGatewayStack } from '../lib/http-gateway-stack';
6 | import { SageMakerStudioUserStack } from '../lib/sagemaker-studio-user-stack';
7 | const env = { region: (process.env['AWS_REGION'] || 'us-west-2'), account: process.env['AWS_ACCOUNT'] };
8 |
9 | const domainId = (process.env['DOMAIN_ID'] || "" )
10 | const mlflowSecretName = 'mlflow-server-credentials'
11 |
12 | const app = new cdk.App();
13 |
14 | const mlflowVpcStack = new MLflowVpcStack(
15 | app,
16 | 'MLflowVpcStack',
17 | mlflowSecretName,
18 | { env: env }
19 | );
20 |
21 | const httpGatewayStack = new HttpGatewayStack(
22 | app,
23 | 'HttpGatewayStack',
24 | mlflowVpcStack.vpc,
25 | mlflowVpcStack.httpApiListener,
26 | { env: env }
27 | );
28 |
29 | new SageMakerStudioUserStack(
30 | app,
31 | 'SageMakerStudioUserStack',
32 | mlflowVpcStack.mlflowSecretArn,
33 | 'HttpGatewayStack',
34 | domainId,
35 | { env: env }
36 | )
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/cdk.json:
--------------------------------------------------------------------------------
1 | {
2 | "app": "npx ts-node --prefer-ts-exts bin/app.ts",
3 | "context": {
4 | "aws-cdk:enableDiffNoFail": "true",
5 | "@aws-cdk/core:stackRelativeExports": "true",
6 | "@aws-cdk/aws-kms:defaultKeyPolicies": true
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/deploy.sh:
--------------------------------------------------------------------------------
1 | cdk deploy --all --require-approval never
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/destroy.sh:
--------------------------------------------------------------------------------
1 | cdk destroy --all
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/diff.sh:
--------------------------------------------------------------------------------
1 | cdk diff --all
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/lib/http-gateway-stack.ts:
--------------------------------------------------------------------------------
1 | import * as cdk from 'aws-cdk-lib';
2 | import { Construct } from 'constructs';
3 |
4 | import * as elbv2 from "aws-cdk-lib/aws-elasticloadbalancingv2";
5 | import * as apig from "@aws-cdk/aws-apigatewayv2-alpha";
6 | import { HttpAlbIntegration } from "@aws-cdk/aws-apigatewayv2-integrations-alpha";
7 |
8 | import * as ec2 from "aws-cdk-lib/aws-ec2";
9 |
10 | export class HttpGatewayStack extends cdk.Stack {
11 | public readonly api: apig.HttpApi;
12 | public readonly mlflowSecretArn: string;
13 |
14 | constructor(
15 | scope: Construct,
16 | id: string,
17 | vpc: ec2.Vpc,
18 | httpApiListener: elbv2.ApplicationListener,
19 | props?: cdk.StackProps
20 | ) {
21 | super(scope, id, props);
22 |
23 | const httpVpcLink = new cdk.CfnResource(this, 'HttpVpcLink', {
24 | type: "AWS::ApiGatewayV2::VpcLink",
25 | properties: {
26 | Name: "http-api-vpclink",
27 | SubnetIds: vpc.privateSubnets.map((m) => m.subnetId)
28 | },
29 | });
30 |
31 | const mlflowVpcLink = apig.VpcLink.fromVpcLinkAttributes(this, 'MLFlowVpcLink', {
32 | vpcLinkId: httpVpcLink.ref,
33 | vpc: vpc
34 | });
35 |
36 | // HTTP Integration with VpcLink
37 | const mlflowIntegration = new HttpAlbIntegration(
38 | 'MLflowIntegration',
39 | httpApiListener,
40 | { vpcLink: mlflowVpcLink }
41 | )
42 |
43 | // HTTP Api
44 | this.api = new apig.HttpApi(this, "mlflow-api", {
45 | createDefaultStage: true,
46 | defaultIntegration: mlflowIntegration
47 | });
48 |
49 | this.api.addRoutes({
50 | integration: mlflowIntegration,
51 | path: "/{proxy+}"
52 | })
53 |
54 | // 👇 API and Service Endpoints
55 | const httpApiEndpoint = this.api.apiEndpoint;
56 |
57 | new cdk.CfnOutput(this, "MLflow API endpoint: ", {
58 | value: httpApiEndpoint,
59 | });
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/lib/mlflow-vpc-stack.ts:
--------------------------------------------------------------------------------
1 | import * as cdk from 'aws-cdk-lib';
2 | import { Construct } from 'constructs';
3 |
4 | import * as elbv2 from "aws-cdk-lib/aws-elasticloadbalancingv2";
5 | import * as ec2 from "aws-cdk-lib/aws-ec2";
6 | import * as ecs from "aws-cdk-lib/aws-ecs";
7 | import * as iam from "aws-cdk-lib/aws-iam";
8 | import * as logs from "aws-cdk-lib/aws-logs";
9 | import * as servicediscovery from "aws-cdk-lib/aws-servicediscovery";
10 | import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
11 | import * as s3 from 'aws-cdk-lib/aws-s3';
12 | import { CfnDBCluster, CfnDBSubnetGroup } from 'aws-cdk-lib/aws-rds';
13 | import { Platform } from "aws-cdk-lib/aws-ecr-assets";
14 |
15 | const { ApplicationProtocol } = elbv2;
16 | const dbName = "mlflowdb"
17 | const dbPort = 3306
18 | const dbUsername = "master"
19 | const clusterName = "mlflowCluster"
20 | const serviceName = "mlflowService"
21 | const cidr = "10.0.0.0/16"
22 | const containerPort = 5000
23 |
24 | const mlflowUsername = "admin"
25 |
26 | export class MLflowVpcStack extends cdk.Stack {
27 |
28 | // Export Vpc, ALB Listener, and Mlflow secret ARN
29 | public readonly httpApiListener: elbv2.ApplicationListener;
30 | public readonly mlflowSecretArn: string;
31 | public readonly vpc: ec2.Vpc;
32 |
33 | readonly bucketName = `mlflow-${this.account}-${this.region}`
34 |
35 | constructor(
36 | scope: Construct,
37 | id: string,
38 | mlflowSecretName: string,
39 | props?: cdk.StackProps
40 | ) {
41 | super(scope, id, props);
42 |
43 | // VPC
44 | this.vpc = new ec2.Vpc(this, 'MLFlowVPC', {
45 | cidr: cidr,
46 | natGateways: 1,
47 | maxAzs: 2,
48 | subnetConfiguration: [
49 | {
50 | name: 'public',
51 | subnetType: ec2.SubnetType.PUBLIC,
52 | cidrMask: 24,
53 | },
54 | {
55 | name: 'private',
56 | subnetType: ec2.SubnetType.PRIVATE_WITH_NAT,
57 | cidrMask: 26,
58 | },
59 | {
60 | name: 'isolated',
61 | subnetType: ec2.SubnetType.PRIVATE_ISOLATED,
62 | cidrMask: 28,
63 | },
64 | ],
65 | });
66 |
67 | // S3 bucket
68 | const mlFlowBucket = new s3.Bucket(this, "mlFlowBucket", {
69 | versioned: false,
70 | bucketName: this.bucketName,
71 | publicReadAccess: false,
72 | blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
73 | removalPolicy: cdk.RemovalPolicy.DESTROY,
74 | autoDeleteObjects: true,
75 | encryption: s3.BucketEncryption.KMS_MANAGED
76 | })
77 |
78 | // DB SubnetGroup
79 | const subnetIds: string[] = [];
80 | this.vpc.isolatedSubnets.forEach((subnet, index) => {
81 | subnetIds.push(subnet.subnetId);
82 | });
83 |
84 | const dbSubnetGroup: CfnDBSubnetGroup = new CfnDBSubnetGroup(this, 'AuroraSubnetGroup', {
85 | dbSubnetGroupDescription: 'Subnet group to access aurora',
86 | dbSubnetGroupName: 'aurora-serverless-subnet-group',
87 | subnetIds
88 | });
89 |
90 | // DB Credentials
91 | const databaseCredentialsSecret = new secretsmanager.Secret(this, 'DBCredentialsSecret', {
92 | secretName: `mlflow-database-credentials`,
93 | generateSecretString: {
94 | secretStringTemplate: JSON.stringify({
95 | username: dbUsername,
96 | }),
97 | excludePunctuation: true,
98 | includeSpace: false,
99 | generateStringKey: 'password'
100 | }
101 | });
102 |
103 | // MLflow credentials
104 | const mlflowCredentialsSecret = new secretsmanager.Secret(this, 'MlflowCredentialsSecret', {
105 | secretName: mlflowSecretName,
106 | generateSecretString: {
107 | secretStringTemplate: JSON.stringify({
108 | username: mlflowUsername,
109 | }),
110 | excludePunctuation: true,
111 | includeSpace: false,
112 | generateStringKey: 'password'
113 | }
114 | });
115 |
116 | // 👇 DB SecurityGroup
117 | const dbClusterSecurityGroup = new ec2.SecurityGroup(this, 'DBClusterSecurityGroup', { vpc: this.vpc });
118 | dbClusterSecurityGroup.addIngressRule(ec2.Peer.ipv4(cidr), ec2.Port.tcp(dbPort));
119 |
120 | const dbConfig = {
121 | dbClusterIdentifier: `${serviceName}-cluster`,
122 | engineMode: 'serverless',
123 | engine: 'aurora-mysql',
124 | engineVersion: '5.7.12',
125 | databaseName: dbName,
126 | masterUsername: databaseCredentialsSecret.secretValueFromJson('username').toString(),
127 | masterUserPassword: databaseCredentialsSecret.secretValueFromJson('password').toString(),
128 | // Note: aurora serverless cluster can be accessed within its VPC only
129 | // https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless.html
130 | dbSubnetGroupName: dbSubnetGroup.dbSubnetGroupName,
131 | scalingConfiguration: {
132 | autoPause: true,
133 | maxCapacity: 2,
134 | minCapacity: 2,
135 | secondsUntilAutoPause: 3600,
136 | },
137 | vpcSecurityGroupIds: [
138 | dbClusterSecurityGroup.securityGroupId
139 | ]
140 | };
141 |
142 | // 👇 RDS Cluster
143 | const rdsCluster = new CfnDBCluster(this, 'DBCluster', dbConfig);
144 | rdsCluster.addDependsOn(dbSubnetGroup)
145 |
146 | // 👇 ECS Cluster
147 | const cluster = new ecs.Cluster(this, "Fargate Cluster", {
148 | vpc: this.vpc,
149 | clusterName: clusterName,
150 | });
151 |
152 | // 👇 Cloud Map Namespace
153 | const dnsNamespace = new servicediscovery.PrivateDnsNamespace(
154 | this,
155 | "DnsNamespace",
156 | {
157 | name: "http-api.local",
158 | vpc: this.vpc,
159 | description: "Private DnsNamespace for Microservices",
160 | }
161 | );
162 |
163 | // 👇 Fargate Task Role
164 | const taskrole = new iam.Role(this, "ecsTaskExecutionRole", {
165 | assumedBy: new iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
166 | managedPolicies: [
167 | iam.ManagedPolicy.fromAwsManagedPolicyName("service-role/AmazonECSTaskExecutionRolePolicy")
168 | ],
169 | inlinePolicies: {
170 | s3Bucket: new iam.PolicyDocument({
171 | statements:[
172 | new iam.PolicyStatement({
173 | effect: iam.Effect.ALLOW,
174 | resources: [`arn:aws:s3:::${this.bucketName}`,`arn:aws:s3:::${this.bucketName}/*`],
175 | actions: ["s3:*"]
176 | })
177 | ]
178 | }),
179 | secretsManagerRestricted: new iam.PolicyDocument({
180 | statements: [
181 | new iam.PolicyStatement({
182 | effect: iam.Effect.ALLOW,
183 | resources: [
184 | mlflowCredentialsSecret.secretArn,
185 | databaseCredentialsSecret.secretArn
186 | ],
187 | actions: [
188 | "secretsmanager:GetResourcePolicy",
189 | "secretsmanager:GetSecretValue",
190 | "secretsmanager:DescribeSecret",
191 | "secretsmanager:ListSecretVersionIds"
192 | ]
193 | }),
194 | new iam.PolicyStatement({
195 | effect: iam.Effect.ALLOW,
196 | resources: ["*"],
197 | actions: ["secretsmanager:ListSecrets"]
198 | })
199 | ]
200 | })
201 | }
202 | });
203 |
204 | // 👇 Task Definitions
205 | const mlflowTaskDefinition = new ecs.FargateTaskDefinition(
206 | this,
207 | "mlflowTaskDef",
208 | {
209 | taskRole: taskrole,
210 | family: "mlFlowStack",
211 | cpu: 1024,
212 | memoryLimitMiB: 2048
213 | },
214 | );
215 |
216 | // 👇 Log Groups
217 | const mlflowServiceLogGroup = new logs.LogGroup(this, "mlflowServiceLogGroup", {
218 | logGroupName: "/ecs/mlflowService",
219 | removalPolicy: cdk.RemovalPolicy.DESTROY,
220 | });
221 |
222 | const mlflowServiceLogDriver = new ecs.AwsLogDriver({
223 | logGroup: mlflowServiceLogGroup,
224 | streamPrefix: "mlflowService",
225 | });
226 |
227 | // 👇 nginx Task Container
228 | const nginxContainer = mlflowTaskDefinition.addContainer(
229 | "nginxContainer",
230 | {
231 | containerName: "nginxContainer",
232 | essential: true,
233 | // memoryReservationMiB: 512,
234 | // cpu: 512,
235 | portMappings: [{
236 | containerPort: 80,
237 | protocol: ecs.Protocol.TCP
238 | }],
239 | image: ecs.ContainerImage.fromAsset('../../src/nginx/basic_auth',
240 | {
241 | platform: Platform.LINUX_AMD64
242 | }
243 | ),
244 | secrets: {
245 | MLFLOW_USERNAME: ecs.Secret.fromSecretsManager(mlflowCredentialsSecret, 'username'),
246 | MLFLOW_PASSWORD: ecs.Secret.fromSecretsManager(mlflowCredentialsSecret, 'password')
247 | },
248 | logging: mlflowServiceLogDriver,
249 | }
250 | );
251 |
252 | // 👇 MlFlow Task Container
253 | const mlflowServiceContainer = mlflowTaskDefinition.addContainer(
254 | "mlflowContainer",
255 | {
256 | containerName: "mlflowContainer",
257 | essential: true,
258 | memoryReservationMiB: 1024,
259 | cpu: 512,
260 | portMappings: [{
261 | containerPort: containerPort,
262 | protocol: ecs.Protocol.TCP,
263 | }],
264 | image: ecs.ContainerImage.fromAsset('../../src/mlflow',
265 | {
266 | platform: Platform.LINUX_AMD64
267 | }
268 | ),
269 | environment: {
270 | 'BUCKET': `s3://${mlFlowBucket.bucketName}`,
271 | 'HOST': rdsCluster.attrEndpointAddress,
272 | 'PORT': `${dbPort}`,
273 | 'DATABASE': dbName
274 | },
275 | secrets: {
276 | USERNAME: ecs.Secret.fromSecretsManager(databaseCredentialsSecret, 'username'),
277 | PASSWORD: ecs.Secret.fromSecretsManager(databaseCredentialsSecret, 'password')
278 | },
279 | logging: mlflowServiceLogDriver,
280 | });
281 |
282 | // 👇 Security Group
283 | const mlflowServiceSecGrp = new ec2.SecurityGroup(
284 | this,
285 | "mlflowServiceSecurityGroup",
286 | {
287 | allowAllOutbound: true,
288 | securityGroupName: "mlflowServiceSecurityGroup",
289 | vpc: this.vpc,
290 | }
291 | );
292 | mlflowServiceSecGrp.connections.allowFromAnyIpv4(ec2.Port.tcp(containerPort));
293 | mlflowServiceSecGrp.connections.allowFromAnyIpv4(ec2.Port.tcp(80));
294 |
295 | // 👇 Fargate Services
296 | const mlflowService = new ecs.FargateService(this, "mlflowService", {
297 | cluster: cluster,
298 | serviceName: serviceName,
299 | taskDefinition: mlflowTaskDefinition,
300 | assignPublicIp: false,
301 | desiredCount: 2,
302 | securityGroups: [mlflowServiceSecGrp],
303 | cloudMapOptions: {
304 | name: "mlflowService",
305 | cloudMapNamespace: dnsNamespace,
306 | },
307 | });
308 |
309 | // 👇 ALB
310 | const httpApiInternalALB = new elbv2.ApplicationLoadBalancer(
311 | this,
312 | "httpapiInternalALB",
313 | {
314 | vpc: this.vpc,
315 | internetFacing: false,
316 | }
317 | );
318 |
319 | // 👇 ALB Listener
320 | this.httpApiListener = httpApiInternalALB.addListener("httpapiListener", {
321 | port: 80,
322 | protocol: ApplicationProtocol.HTTP,
323 |
324 | });
325 |
326 | // 👇 Target Groups
327 | const mlflowServiceTargetGroup = this.httpApiListener.addTargets(
328 | "mlflowServiceTargetGroup",
329 | {
330 | healthCheck: {
331 | path: "/elb-status"
332 | },
333 | targets: [
334 | mlflowService.loadBalancerTarget(
335 | {
336 | containerName: 'nginxContainer',
337 | containerPort: 80
338 | }
339 | )
340 | ],
341 | port: 80,
342 | protocol: ApplicationProtocol.HTTP,
343 | }
344 | );
345 |
346 | // 👇 Task Auto Scaling
347 | const autoScaling = mlflowService.autoScaleTaskCount({ maxCapacity: 6 });
348 | autoScaling.scaleOnCpuUtilization('CpuScaling', {
349 | targetUtilizationPercent: 70,
350 | scaleInCooldown: cdk.Duration.seconds(60),
351 | scaleOutCooldown: cdk.Duration.seconds(60),
352 | });
353 |
354 | this.mlflowSecretArn = mlflowCredentialsSecret.secretArn
355 |
356 | new cdk.CfnOutput(this, "ALB Dns Name : ", {
357 | value: httpApiInternalALB.loadBalancerDnsName,
358 | });
359 |
360 | }
361 | }
362 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/lib/sagemaker-studio-user-stack.ts:
--------------------------------------------------------------------------------
1 | import * as cdk from 'aws-cdk-lib';
2 | import { Construct } from 'constructs';
3 |
4 | import * as sagemaker from 'aws-cdk-lib/aws-sagemaker';
5 | import * as ec2 from 'aws-cdk-lib/aws-ec2';
6 |
7 | import * as iam from "aws-cdk-lib/aws-iam";
8 |
9 | export class SageMakerStudioUserStack extends cdk.Stack {
10 | constructor(
11 | scope: Construct,
12 | id: string,
13 | mlflowSecretArn: string,
14 | httpGatewayStackName: string,
15 | domainId: string,
16 | props?: cdk.StackProps
17 | ){
18 | super(scope, id, props);
19 |
20 | // SageMaker Execution Role
21 | const sagemakerExecutionRole = new iam.Role(this, "sagemaker-execution-role", {
22 | assumedBy: new iam.ServicePrincipal("sagemaker.amazonaws.com"),
23 | managedPolicies: [
24 | iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonSageMakerFullAccess")
25 | ],
26 | inlinePolicies: {
27 | retrieveApiGatewayUrl: new iam.PolicyDocument({
28 | statements: [
29 | new iam.PolicyStatement({
30 | effect: iam.Effect.ALLOW,
31 | resources: [`arn:*:cloudformation:${this.region}:${this.account}:stack/${httpGatewayStackName}/*`],
32 | actions: ["cloudformation:DescribeStacks"],
33 | })
34 | ],
35 | }),
36 | s3Buckets: new iam.PolicyDocument({
37 | statements: [
38 | new iam.PolicyStatement({
39 | effect: iam.Effect.ALLOW,
40 | resources: ["arn:aws:s3:::*mlflow*"],
41 | actions: ["s3:ListBucket","s3:GetObject", "s3:PutObject", "s3:DeleteObject", "s3:PutObjectTagging", "s3:CreateBucket"],
42 | })
43 | ],
44 | }),
45 | secretsManagerRestricted: new iam.PolicyDocument({
46 | statements: [
47 | new iam.PolicyStatement({
48 | effect: iam.Effect.ALLOW,
49 | resources: [mlflowSecretArn],
50 | actions: [
51 | "secretsmanager:GetResourcePolicy",
52 | "secretsmanager:GetSecretValue",
53 | "secretsmanager:DescribeSecret",
54 | "secretsmanager:ListSecretVersionIds"
55 | ]
56 | }),
57 | new iam.PolicyStatement({
58 | effect: iam.Effect.ALLOW,
59 | resources: ["*"],
60 | actions: ["secretsmanager:ListSecrets"]
61 | })
62 | ]
63 | })
64 | },
65 | });
66 |
67 | if (domainId == "") {
68 | const defaultVpc = ec2.Vpc.fromLookup(this, 'DefaultVPC', { isDefault: true });
69 | const subnetIds: string[] = [];
70 |
71 | defaultVpc.publicSubnets.forEach((subnet, index) => {
72 | subnetIds.push(subnet.subnetId);
73 | });
74 |
75 | const cfnStudioDomain = new sagemaker.CfnDomain(this, 'MyStudioDomain', {
76 | authMode: 'IAM',
77 | defaultUserSettings: {
78 | executionRole: sagemakerExecutionRole.roleArn,
79 | },
80 | domainName: 'StudioDomainName',
81 | vpcId: defaultVpc.vpcId,
82 | subnetIds: subnetIds,
83 | });
84 |
85 | const cfnUserProfile = new sagemaker.CfnUserProfile(this, 'MyCfnUserProfile', {
86 | domainId: cfnStudioDomain.attrDomainId,
87 | userProfileName: 'mlflow-user',
88 | userSettings: {
89 | executionRole: sagemakerExecutionRole.roleArn,
90 | }
91 | }
92 | );
93 | }
94 | else {
95 | const cfnUserProfile = new sagemaker.CfnUserProfile(this, 'MyCfnUserProfile', {
96 | domainId: domainId,
97 | userProfileName: 'mlflow-user',
98 | userSettings: {
99 | executionRole: sagemakerExecutionRole.roleArn,
100 | }
101 | }
102 | );
103 | }
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "cdk",
3 | "version": "0.1.0",
4 | "lockfileVersion": 2,
5 | "requires": true,
6 | "packages": {
7 | "": {
8 | "name": "cdk",
9 | "version": "0.1.0",
10 | "dependencies": {
11 | "source-map-support": "^0.5.16"
12 | },
13 | "bin": {
14 | "cdk": "bin/cdk.js"
15 | },
16 | "devDependencies": {
17 | "@aws-cdk/aws-apigatewayv2-alpha": "2.114.1-alpha.0",
18 | "@aws-cdk/aws-apigatewayv2-integrations-alpha": "2.114.1-alpha.0",
19 | "@types/node": "10.17.27",
20 | "aws-cdk": "2.114.1",
21 | "aws-cdk-lib": "2.114.1",
22 | "typescript": "~3.9.7"
23 | }
24 | },
25 | "node_modules/@aws-cdk/asset-awscli-v1": {
26 | "version": "2.2.202",
27 | "resolved": "https://registry.npmjs.org/@aws-cdk/asset-awscli-v1/-/asset-awscli-v1-2.2.202.tgz",
28 | "integrity": "sha512-JqlF0D4+EVugnG5dAsNZMqhu3HW7ehOXm5SDMxMbXNDMdsF0pxtQKNHRl52z1U9igsHmaFpUgSGjbhAJ+0JONg==",
29 | "dev": true
30 | },
31 | "node_modules/@aws-cdk/asset-kubectl-v20": {
32 | "version": "2.1.2",
33 | "resolved": "https://registry.npmjs.org/@aws-cdk/asset-kubectl-v20/-/asset-kubectl-v20-2.1.2.tgz",
34 | "integrity": "sha512-3M2tELJOxQv0apCIiuKQ4pAbncz9GuLwnKFqxifWfe77wuMxyTRPmxssYHs42ePqzap1LT6GDcPygGs+hHstLg==",
35 | "dev": true
36 | },
37 | "node_modules/@aws-cdk/asset-node-proxy-agent-v6": {
38 | "version": "2.0.3",
39 | "resolved": "https://registry.npmjs.org/@aws-cdk/asset-node-proxy-agent-v6/-/asset-node-proxy-agent-v6-2.0.3.tgz",
40 | "integrity": "sha512-twhuEG+JPOYCYPx/xy5uH2+VUsIEhPTzDY0F1KuB+ocjWWB/KEDiOVL19nHvbPCB6fhWnkykXEMJ4HHcKvjtvg==",
41 | "dev": true
42 | },
43 | "node_modules/@aws-cdk/aws-apigatewayv2-alpha": {
44 | "version": "2.114.1-alpha.0",
45 | "resolved": "https://registry.npmjs.org/@aws-cdk/aws-apigatewayv2-alpha/-/aws-apigatewayv2-alpha-2.114.1-alpha.0.tgz",
46 | "integrity": "sha512-+urpw7rGrtdGvnHQlDXVfpI3TmQJpjuT9jTOeuuG5dNDczLJrUokBvQdj6H6KsngdmBC07WfWU+yL2MBp71ozA==",
47 | "dev": true,
48 | "engines": {
49 | "node": ">= 14.15.0"
50 | },
51 | "peerDependencies": {
52 | "aws-cdk-lib": "^2.114.1",
53 | "constructs": "^10.0.0"
54 | }
55 | },
56 | "node_modules/@aws-cdk/aws-apigatewayv2-integrations-alpha": {
57 | "version": "2.114.1-alpha.0",
58 | "resolved": "https://registry.npmjs.org/@aws-cdk/aws-apigatewayv2-integrations-alpha/-/aws-apigatewayv2-integrations-alpha-2.114.1-alpha.0.tgz",
59 | "integrity": "sha512-iB7vHoTguDKLeatJS4p/8OBqH4oLCG2xBn1toUFwMD9iWoRGahGiK0pYuq24baab3SuNS1LFThJw5Zu0R+cZGA==",
60 | "dev": true,
61 | "engines": {
62 | "node": ">= 14.15.0"
63 | },
64 | "peerDependencies": {
65 | "@aws-cdk/aws-apigatewayv2-alpha": "2.114.1-alpha.0",
66 | "aws-cdk-lib": "^2.114.1",
67 | "constructs": "^10.0.0"
68 | }
69 | },
70 | "node_modules/@types/node": {
71 | "version": "10.17.27",
72 | "resolved": "https://registry.npmjs.org/@types/node/-/node-10.17.27.tgz",
73 | "integrity": "sha512-J0oqm9ZfAXaPdwNXMMgAhylw5fhmXkToJd06vuDUSAgEDZ/n/69/69UmyBZbc+zT34UnShuDSBqvim3SPnozJg==",
74 | "dev": true
75 | },
76 | "node_modules/aws-cdk": {
77 | "version": "2.114.1",
78 | "resolved": "https://registry.npmjs.org/aws-cdk/-/aws-cdk-2.114.1.tgz",
79 | "integrity": "sha512-iLOCPb3WAJOgVYQ4GvAnrjtScJfPwcczlB4995h3nUYQdHbus0jNffFv13zBShdWct3cuX+bqLuZ4JyEmJ9+rg==",
80 | "dev": true,
81 | "bin": {
82 | "cdk": "bin/cdk"
83 | },
84 | "engines": {
85 | "node": ">= 14.15.0"
86 | },
87 | "optionalDependencies": {
88 | "fsevents": "2.3.2"
89 | }
90 | },
91 | "node_modules/aws-cdk-lib": {
92 | "version": "2.114.1",
93 | "resolved": "https://registry.npmjs.org/aws-cdk-lib/-/aws-cdk-lib-2.114.1.tgz",
94 | "integrity": "sha512-pJy+Sa3+s6K9I0CXYGU8J5jumw9uQEbl8zPK8EMA+A6hP9qb1JN+a8ohyw6a1O1cb4D5S6gwH+hE7Fq7hGPY3A==",
95 | "bundleDependencies": [
96 | "@balena/dockerignore",
97 | "case",
98 | "fs-extra",
99 | "ignore",
100 | "jsonschema",
101 | "minimatch",
102 | "punycode",
103 | "semver",
104 | "table",
105 | "yaml"
106 | ],
107 | "dev": true,
108 | "dependencies": {
109 | "@aws-cdk/asset-awscli-v1": "^2.2.201",
110 | "@aws-cdk/asset-kubectl-v20": "^2.1.2",
111 | "@aws-cdk/asset-node-proxy-agent-v6": "^2.0.1",
112 | "@balena/dockerignore": "^1.0.2",
113 | "case": "1.6.3",
114 | "fs-extra": "^11.1.1",
115 | "ignore": "^5.3.0",
116 | "jsonschema": "^1.4.1",
117 | "minimatch": "^3.1.2",
118 | "punycode": "^2.3.1",
119 | "semver": "^7.5.4",
120 | "table": "^6.8.1",
121 | "yaml": "1.10.2"
122 | },
123 | "engines": {
124 | "node": ">= 14.15.0"
125 | },
126 | "peerDependencies": {
127 | "constructs": "^10.0.0"
128 | }
129 | },
130 | "node_modules/aws-cdk-lib/node_modules/@balena/dockerignore": {
131 | "version": "1.0.2",
132 | "dev": true,
133 | "inBundle": true,
134 | "license": "Apache-2.0"
135 | },
136 | "node_modules/aws-cdk-lib/node_modules/ajv": {
137 | "version": "8.12.0",
138 | "dev": true,
139 | "inBundle": true,
140 | "license": "MIT",
141 | "dependencies": {
142 | "fast-deep-equal": "^3.1.1",
143 | "json-schema-traverse": "^1.0.0",
144 | "require-from-string": "^2.0.2",
145 | "uri-js": "^4.2.2"
146 | },
147 | "funding": {
148 | "type": "github",
149 | "url": "https://github.com/sponsors/epoberezkin"
150 | }
151 | },
152 | "node_modules/aws-cdk-lib/node_modules/ansi-regex": {
153 | "version": "5.0.1",
154 | "dev": true,
155 | "inBundle": true,
156 | "license": "MIT",
157 | "engines": {
158 | "node": ">=8"
159 | }
160 | },
161 | "node_modules/aws-cdk-lib/node_modules/ansi-styles": {
162 | "version": "4.3.0",
163 | "dev": true,
164 | "inBundle": true,
165 | "license": "MIT",
166 | "dependencies": {
167 | "color-convert": "^2.0.1"
168 | },
169 | "engines": {
170 | "node": ">=8"
171 | },
172 | "funding": {
173 | "url": "https://github.com/chalk/ansi-styles?sponsor=1"
174 | }
175 | },
176 | "node_modules/aws-cdk-lib/node_modules/astral-regex": {
177 | "version": "2.0.0",
178 | "dev": true,
179 | "inBundle": true,
180 | "license": "MIT",
181 | "engines": {
182 | "node": ">=8"
183 | }
184 | },
185 | "node_modules/aws-cdk-lib/node_modules/balanced-match": {
186 | "version": "1.0.2",
187 | "dev": true,
188 | "inBundle": true,
189 | "license": "MIT"
190 | },
191 | "node_modules/aws-cdk-lib/node_modules/brace-expansion": {
192 | "version": "1.1.11",
193 | "dev": true,
194 | "inBundle": true,
195 | "license": "MIT",
196 | "dependencies": {
197 | "balanced-match": "^1.0.0",
198 | "concat-map": "0.0.1"
199 | }
200 | },
201 | "node_modules/aws-cdk-lib/node_modules/case": {
202 | "version": "1.6.3",
203 | "dev": true,
204 | "inBundle": true,
205 | "license": "(MIT OR GPL-3.0-or-later)",
206 | "engines": {
207 | "node": ">= 0.8.0"
208 | }
209 | },
210 | "node_modules/aws-cdk-lib/node_modules/color-convert": {
211 | "version": "2.0.1",
212 | "dev": true,
213 | "inBundle": true,
214 | "license": "MIT",
215 | "dependencies": {
216 | "color-name": "~1.1.4"
217 | },
218 | "engines": {
219 | "node": ">=7.0.0"
220 | }
221 | },
222 | "node_modules/aws-cdk-lib/node_modules/color-name": {
223 | "version": "1.1.4",
224 | "dev": true,
225 | "inBundle": true,
226 | "license": "MIT"
227 | },
228 | "node_modules/aws-cdk-lib/node_modules/concat-map": {
229 | "version": "0.0.1",
230 | "dev": true,
231 | "inBundle": true,
232 | "license": "MIT"
233 | },
234 | "node_modules/aws-cdk-lib/node_modules/emoji-regex": {
235 | "version": "8.0.0",
236 | "dev": true,
237 | "inBundle": true,
238 | "license": "MIT"
239 | },
240 | "node_modules/aws-cdk-lib/node_modules/fast-deep-equal": {
241 | "version": "3.1.3",
242 | "dev": true,
243 | "inBundle": true,
244 | "license": "MIT"
245 | },
246 | "node_modules/aws-cdk-lib/node_modules/fs-extra": {
247 | "version": "11.1.1",
248 | "dev": true,
249 | "inBundle": true,
250 | "license": "MIT",
251 | "dependencies": {
252 | "graceful-fs": "^4.2.0",
253 | "jsonfile": "^6.0.1",
254 | "universalify": "^2.0.0"
255 | },
256 | "engines": {
257 | "node": ">=14.14"
258 | }
259 | },
260 | "node_modules/aws-cdk-lib/node_modules/graceful-fs": {
261 | "version": "4.2.11",
262 | "dev": true,
263 | "inBundle": true,
264 | "license": "ISC"
265 | },
266 | "node_modules/aws-cdk-lib/node_modules/ignore": {
267 | "version": "5.3.0",
268 | "dev": true,
269 | "inBundle": true,
270 | "license": "MIT",
271 | "engines": {
272 | "node": ">= 4"
273 | }
274 | },
275 | "node_modules/aws-cdk-lib/node_modules/is-fullwidth-code-point": {
276 | "version": "3.0.0",
277 | "dev": true,
278 | "inBundle": true,
279 | "license": "MIT",
280 | "engines": {
281 | "node": ">=8"
282 | }
283 | },
284 | "node_modules/aws-cdk-lib/node_modules/json-schema-traverse": {
285 | "version": "1.0.0",
286 | "dev": true,
287 | "inBundle": true,
288 | "license": "MIT"
289 | },
290 | "node_modules/aws-cdk-lib/node_modules/jsonfile": {
291 | "version": "6.1.0",
292 | "dev": true,
293 | "inBundle": true,
294 | "license": "MIT",
295 | "dependencies": {
296 | "universalify": "^2.0.0"
297 | },
298 | "optionalDependencies": {
299 | "graceful-fs": "^4.1.6"
300 | }
301 | },
302 | "node_modules/aws-cdk-lib/node_modules/jsonschema": {
303 | "version": "1.4.1",
304 | "dev": true,
305 | "inBundle": true,
306 | "license": "MIT",
307 | "engines": {
308 | "node": "*"
309 | }
310 | },
311 | "node_modules/aws-cdk-lib/node_modules/lodash.truncate": {
312 | "version": "4.4.2",
313 | "dev": true,
314 | "inBundle": true,
315 | "license": "MIT"
316 | },
317 | "node_modules/aws-cdk-lib/node_modules/lru-cache": {
318 | "version": "6.0.0",
319 | "dev": true,
320 | "inBundle": true,
321 | "license": "ISC",
322 | "dependencies": {
323 | "yallist": "^4.0.0"
324 | },
325 | "engines": {
326 | "node": ">=10"
327 | }
328 | },
329 | "node_modules/aws-cdk-lib/node_modules/minimatch": {
330 | "version": "3.1.2",
331 | "dev": true,
332 | "inBundle": true,
333 | "license": "ISC",
334 | "dependencies": {
335 | "brace-expansion": "^1.1.7"
336 | },
337 | "engines": {
338 | "node": "*"
339 | }
340 | },
341 | "node_modules/aws-cdk-lib/node_modules/punycode": {
342 | "version": "2.3.1",
343 | "dev": true,
344 | "inBundle": true,
345 | "license": "MIT",
346 | "engines": {
347 | "node": ">=6"
348 | }
349 | },
350 | "node_modules/aws-cdk-lib/node_modules/require-from-string": {
351 | "version": "2.0.2",
352 | "dev": true,
353 | "inBundle": true,
354 | "license": "MIT",
355 | "engines": {
356 | "node": ">=0.10.0"
357 | }
358 | },
359 | "node_modules/aws-cdk-lib/node_modules/semver": {
360 | "version": "7.5.4",
361 | "dev": true,
362 | "inBundle": true,
363 | "license": "ISC",
364 | "dependencies": {
365 | "lru-cache": "^6.0.0"
366 | },
367 | "bin": {
368 | "semver": "bin/semver.js"
369 | },
370 | "engines": {
371 | "node": ">=10"
372 | }
373 | },
374 | "node_modules/aws-cdk-lib/node_modules/slice-ansi": {
375 | "version": "4.0.0",
376 | "dev": true,
377 | "inBundle": true,
378 | "license": "MIT",
379 | "dependencies": {
380 | "ansi-styles": "^4.0.0",
381 | "astral-regex": "^2.0.0",
382 | "is-fullwidth-code-point": "^3.0.0"
383 | },
384 | "engines": {
385 | "node": ">=10"
386 | },
387 | "funding": {
388 | "url": "https://github.com/chalk/slice-ansi?sponsor=1"
389 | }
390 | },
391 | "node_modules/aws-cdk-lib/node_modules/string-width": {
392 | "version": "4.2.3",
393 | "dev": true,
394 | "inBundle": true,
395 | "license": "MIT",
396 | "dependencies": {
397 | "emoji-regex": "^8.0.0",
398 | "is-fullwidth-code-point": "^3.0.0",
399 | "strip-ansi": "^6.0.1"
400 | },
401 | "engines": {
402 | "node": ">=8"
403 | }
404 | },
405 | "node_modules/aws-cdk-lib/node_modules/strip-ansi": {
406 | "version": "6.0.1",
407 | "dev": true,
408 | "inBundle": true,
409 | "license": "MIT",
410 | "dependencies": {
411 | "ansi-regex": "^5.0.1"
412 | },
413 | "engines": {
414 | "node": ">=8"
415 | }
416 | },
417 | "node_modules/aws-cdk-lib/node_modules/table": {
418 | "version": "6.8.1",
419 | "dev": true,
420 | "inBundle": true,
421 | "license": "BSD-3-Clause",
422 | "dependencies": {
423 | "ajv": "^8.0.1",
424 | "lodash.truncate": "^4.4.2",
425 | "slice-ansi": "^4.0.0",
426 | "string-width": "^4.2.3",
427 | "strip-ansi": "^6.0.1"
428 | },
429 | "engines": {
430 | "node": ">=10.0.0"
431 | }
432 | },
433 | "node_modules/aws-cdk-lib/node_modules/universalify": {
434 | "version": "2.0.1",
435 | "dev": true,
436 | "inBundle": true,
437 | "license": "MIT",
438 | "engines": {
439 | "node": ">= 10.0.0"
440 | }
441 | },
442 | "node_modules/aws-cdk-lib/node_modules/uri-js": {
443 | "version": "4.4.1",
444 | "dev": true,
445 | "inBundle": true,
446 | "license": "BSD-2-Clause",
447 | "dependencies": {
448 | "punycode": "^2.1.0"
449 | }
450 | },
451 | "node_modules/aws-cdk-lib/node_modules/yallist": {
452 | "version": "4.0.0",
453 | "dev": true,
454 | "inBundle": true,
455 | "license": "ISC"
456 | },
457 | "node_modules/aws-cdk-lib/node_modules/yaml": {
458 | "version": "1.10.2",
459 | "dev": true,
460 | "inBundle": true,
461 | "license": "ISC",
462 | "engines": {
463 | "node": ">= 6"
464 | }
465 | },
466 | "node_modules/buffer-from": {
467 | "version": "1.1.1",
468 | "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
469 | "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
470 | },
471 | "node_modules/constructs": {
472 | "version": "10.0.37",
473 | "resolved": "https://registry.npmjs.org/constructs/-/constructs-10.0.37.tgz",
474 | "integrity": "sha512-wXtJtGpYzV8R+krlzeFpWqyndJ7zX7OLajzRTYW3bOE8bvfubiUdLbnTCienfcLz1fpvCnyTIO1b98CLBawKQw==",
475 | "dev": true,
476 | "peer": true,
477 | "engines": {
478 | "node": ">= 12.7.0"
479 | }
480 | },
481 | "node_modules/fsevents": {
482 | "version": "2.3.2",
483 | "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
484 | "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
485 | "dev": true,
486 | "hasInstallScript": true,
487 | "optional": true,
488 | "os": [
489 | "darwin"
490 | ],
491 | "engines": {
492 | "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
493 | }
494 | },
495 | "node_modules/source-map": {
496 | "version": "0.6.1",
497 | "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
498 | "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
499 | "engines": {
500 | "node": ">=0.10.0"
501 | }
502 | },
503 | "node_modules/source-map-support": {
504 | "version": "0.5.19",
505 | "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.19.tgz",
506 | "integrity": "sha512-Wonm7zOCIJzBGQdB+thsPar0kYuCIzYvxZwlBa87yi/Mdjv7Tip2cyVbLj5o0cFPN4EVkuTwb3GDDyUx2DGnGw==",
507 | "dependencies": {
508 | "buffer-from": "^1.0.0",
509 | "source-map": "^0.6.0"
510 | }
511 | },
512 | "node_modules/typescript": {
513 | "version": "3.9.7",
514 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.9.7.tgz",
515 | "integrity": "sha512-BLbiRkiBzAwsjut4x/dsibSTB6yWpwT5qWmC2OfuCg3GgVQCSgMs4vEctYPhsaGtd0AeuuHMkjZ2h2WG8MSzRw==",
516 | "dev": true,
517 | "bin": {
518 | "tsc": "bin/tsc",
519 | "tsserver": "bin/tsserver"
520 | },
521 | "engines": {
522 | "node": ">=4.2.0"
523 | }
524 | }
525 | },
526 | "dependencies": {
527 | "@aws-cdk/asset-awscli-v1": {
528 | "version": "2.2.202",
529 | "resolved": "https://registry.npmjs.org/@aws-cdk/asset-awscli-v1/-/asset-awscli-v1-2.2.202.tgz",
530 | "integrity": "sha512-JqlF0D4+EVugnG5dAsNZMqhu3HW7ehOXm5SDMxMbXNDMdsF0pxtQKNHRl52z1U9igsHmaFpUgSGjbhAJ+0JONg==",
531 | "dev": true
532 | },
533 | "@aws-cdk/asset-kubectl-v20": {
534 | "version": "2.1.2",
535 | "resolved": "https://registry.npmjs.org/@aws-cdk/asset-kubectl-v20/-/asset-kubectl-v20-2.1.2.tgz",
536 | "integrity": "sha512-3M2tELJOxQv0apCIiuKQ4pAbncz9GuLwnKFqxifWfe77wuMxyTRPmxssYHs42ePqzap1LT6GDcPygGs+hHstLg==",
537 | "dev": true
538 | },
539 | "@aws-cdk/asset-node-proxy-agent-v6": {
540 | "version": "2.0.3",
541 | "resolved": "https://registry.npmjs.org/@aws-cdk/asset-node-proxy-agent-v6/-/asset-node-proxy-agent-v6-2.0.3.tgz",
542 | "integrity": "sha512-twhuEG+JPOYCYPx/xy5uH2+VUsIEhPTzDY0F1KuB+ocjWWB/KEDiOVL19nHvbPCB6fhWnkykXEMJ4HHcKvjtvg==",
543 | "dev": true
544 | },
545 | "@aws-cdk/aws-apigatewayv2-alpha": {
546 | "version": "2.114.1-alpha.0",
547 | "resolved": "https://registry.npmjs.org/@aws-cdk/aws-apigatewayv2-alpha/-/aws-apigatewayv2-alpha-2.114.1-alpha.0.tgz",
548 | "integrity": "sha512-+urpw7rGrtdGvnHQlDXVfpI3TmQJpjuT9jTOeuuG5dNDczLJrUokBvQdj6H6KsngdmBC07WfWU+yL2MBp71ozA==",
549 | "dev": true,
550 | "requires": {}
551 | },
552 | "@aws-cdk/aws-apigatewayv2-integrations-alpha": {
553 | "version": "2.114.1-alpha.0",
554 | "resolved": "https://registry.npmjs.org/@aws-cdk/aws-apigatewayv2-integrations-alpha/-/aws-apigatewayv2-integrations-alpha-2.114.1-alpha.0.tgz",
555 | "integrity": "sha512-iB7vHoTguDKLeatJS4p/8OBqH4oLCG2xBn1toUFwMD9iWoRGahGiK0pYuq24baab3SuNS1LFThJw5Zu0R+cZGA==",
556 | "dev": true,
557 | "requires": {}
558 | },
559 | "@types/node": {
560 | "version": "10.17.27",
561 | "resolved": "https://registry.npmjs.org/@types/node/-/node-10.17.27.tgz",
562 | "integrity": "sha512-J0oqm9ZfAXaPdwNXMMgAhylw5fhmXkToJd06vuDUSAgEDZ/n/69/69UmyBZbc+zT34UnShuDSBqvim3SPnozJg==",
563 | "dev": true
564 | },
565 | "aws-cdk": {
566 | "version": "2.114.1",
567 | "resolved": "https://registry.npmjs.org/aws-cdk/-/aws-cdk-2.114.1.tgz",
568 | "integrity": "sha512-iLOCPb3WAJOgVYQ4GvAnrjtScJfPwcczlB4995h3nUYQdHbus0jNffFv13zBShdWct3cuX+bqLuZ4JyEmJ9+rg==",
569 | "dev": true,
570 | "requires": {
571 | "fsevents": "2.3.2"
572 | }
573 | },
574 | "aws-cdk-lib": {
575 | "version": "2.114.1",
576 | "resolved": "https://registry.npmjs.org/aws-cdk-lib/-/aws-cdk-lib-2.114.1.tgz",
577 | "integrity": "sha512-pJy+Sa3+s6K9I0CXYGU8J5jumw9uQEbl8zPK8EMA+A6hP9qb1JN+a8ohyw6a1O1cb4D5S6gwH+hE7Fq7hGPY3A==",
578 | "dev": true,
579 | "requires": {
580 | "@aws-cdk/asset-awscli-v1": "^2.2.201",
581 | "@aws-cdk/asset-kubectl-v20": "^2.1.2",
582 | "@aws-cdk/asset-node-proxy-agent-v6": "^2.0.1",
583 | "@balena/dockerignore": "^1.0.2",
584 | "case": "1.6.3",
585 | "fs-extra": "^11.1.1",
586 | "ignore": "^5.3.0",
587 | "jsonschema": "^1.4.1",
588 | "minimatch": "^3.1.2",
589 | "punycode": "^2.3.1",
590 | "semver": "^7.5.4",
591 | "table": "^6.8.1",
592 | "yaml": "1.10.2"
593 | },
594 | "dependencies": {
595 | "@balena/dockerignore": {
596 | "version": "1.0.2",
597 | "bundled": true,
598 | "dev": true
599 | },
600 | "ajv": {
601 | "version": "8.12.0",
602 | "bundled": true,
603 | "dev": true,
604 | "requires": {
605 | "fast-deep-equal": "^3.1.1",
606 | "json-schema-traverse": "^1.0.0",
607 | "require-from-string": "^2.0.2",
608 | "uri-js": "^4.2.2"
609 | }
610 | },
611 | "ansi-regex": {
612 | "version": "5.0.1",
613 | "bundled": true,
614 | "dev": true
615 | },
616 | "ansi-styles": {
617 | "version": "4.3.0",
618 | "bundled": true,
619 | "dev": true,
620 | "requires": {
621 | "color-convert": "^2.0.1"
622 | }
623 | },
624 | "astral-regex": {
625 | "version": "2.0.0",
626 | "bundled": true,
627 | "dev": true
628 | },
629 | "balanced-match": {
630 | "version": "1.0.2",
631 | "bundled": true,
632 | "dev": true
633 | },
634 | "brace-expansion": {
635 | "version": "1.1.11",
636 | "bundled": true,
637 | "dev": true,
638 | "requires": {
639 | "balanced-match": "^1.0.0",
640 | "concat-map": "0.0.1"
641 | }
642 | },
643 | "case": {
644 | "version": "1.6.3",
645 | "bundled": true,
646 | "dev": true
647 | },
648 | "color-convert": {
649 | "version": "2.0.1",
650 | "bundled": true,
651 | "dev": true,
652 | "requires": {
653 | "color-name": "~1.1.4"
654 | }
655 | },
656 | "color-name": {
657 | "version": "1.1.4",
658 | "bundled": true,
659 | "dev": true
660 | },
661 | "concat-map": {
662 | "version": "0.0.1",
663 | "bundled": true,
664 | "dev": true
665 | },
666 | "emoji-regex": {
667 | "version": "8.0.0",
668 | "bundled": true,
669 | "dev": true
670 | },
671 | "fast-deep-equal": {
672 | "version": "3.1.3",
673 | "bundled": true,
674 | "dev": true
675 | },
676 | "fs-extra": {
677 | "version": "11.1.1",
678 | "bundled": true,
679 | "dev": true,
680 | "requires": {
681 | "graceful-fs": "^4.2.0",
682 | "jsonfile": "^6.0.1",
683 | "universalify": "^2.0.0"
684 | }
685 | },
686 | "graceful-fs": {
687 | "version": "4.2.11",
688 | "bundled": true,
689 | "dev": true
690 | },
691 | "ignore": {
692 | "version": "5.3.0",
693 | "bundled": true,
694 | "dev": true
695 | },
696 | "is-fullwidth-code-point": {
697 | "version": "3.0.0",
698 | "bundled": true,
699 | "dev": true
700 | },
701 | "json-schema-traverse": {
702 | "version": "1.0.0",
703 | "bundled": true,
704 | "dev": true
705 | },
706 | "jsonfile": {
707 | "version": "6.1.0",
708 | "bundled": true,
709 | "dev": true,
710 | "requires": {
711 | "graceful-fs": "^4.1.6",
712 | "universalify": "^2.0.0"
713 | }
714 | },
715 | "jsonschema": {
716 | "version": "1.4.1",
717 | "bundled": true,
718 | "dev": true
719 | },
720 | "lodash.truncate": {
721 | "version": "4.4.2",
722 | "bundled": true,
723 | "dev": true
724 | },
725 | "lru-cache": {
726 | "version": "6.0.0",
727 | "bundled": true,
728 | "dev": true,
729 | "requires": {
730 | "yallist": "^4.0.0"
731 | }
732 | },
733 | "minimatch": {
734 | "version": "3.1.2",
735 | "bundled": true,
736 | "dev": true,
737 | "requires": {
738 | "brace-expansion": "^1.1.7"
739 | }
740 | },
741 | "punycode": {
742 | "version": "2.3.1",
743 | "bundled": true,
744 | "dev": true
745 | },
746 | "require-from-string": {
747 | "version": "2.0.2",
748 | "bundled": true,
749 | "dev": true
750 | },
751 | "semver": {
752 | "version": "7.5.4",
753 | "bundled": true,
754 | "dev": true,
755 | "requires": {
756 | "lru-cache": "^6.0.0"
757 | }
758 | },
759 | "slice-ansi": {
760 | "version": "4.0.0",
761 | "bundled": true,
762 | "dev": true,
763 | "requires": {
764 | "ansi-styles": "^4.0.0",
765 | "astral-regex": "^2.0.0",
766 | "is-fullwidth-code-point": "^3.0.0"
767 | }
768 | },
769 | "string-width": {
770 | "version": "4.2.3",
771 | "bundled": true,
772 | "dev": true,
773 | "requires": {
774 | "emoji-regex": "^8.0.0",
775 | "is-fullwidth-code-point": "^3.0.0",
776 | "strip-ansi": "^6.0.1"
777 | }
778 | },
779 | "strip-ansi": {
780 | "version": "6.0.1",
781 | "bundled": true,
782 | "dev": true,
783 | "requires": {
784 | "ansi-regex": "^5.0.1"
785 | }
786 | },
787 | "table": {
788 | "version": "6.8.1",
789 | "bundled": true,
790 | "dev": true,
791 | "requires": {
792 | "ajv": "^8.0.1",
793 | "lodash.truncate": "^4.4.2",
794 | "slice-ansi": "^4.0.0",
795 | "string-width": "^4.2.3",
796 | "strip-ansi": "^6.0.1"
797 | }
798 | },
799 | "universalify": {
800 | "version": "2.0.1",
801 | "bundled": true,
802 | "dev": true
803 | },
804 | "uri-js": {
805 | "version": "4.4.1",
806 | "bundled": true,
807 | "dev": true,
808 | "requires": {
809 | "punycode": "^2.1.0"
810 | }
811 | },
812 | "yallist": {
813 | "version": "4.0.0",
814 | "bundled": true,
815 | "dev": true
816 | },
817 | "yaml": {
818 | "version": "1.10.2",
819 | "bundled": true,
820 | "dev": true
821 | }
822 | }
823 | },
824 | "buffer-from": {
825 | "version": "1.1.1",
826 | "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
827 | "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A=="
828 | },
829 | "constructs": {
830 | "version": "10.0.37",
831 | "resolved": "https://registry.npmjs.org/constructs/-/constructs-10.0.37.tgz",
832 | "integrity": "sha512-wXtJtGpYzV8R+krlzeFpWqyndJ7zX7OLajzRTYW3bOE8bvfubiUdLbnTCienfcLz1fpvCnyTIO1b98CLBawKQw==",
833 | "dev": true,
834 | "peer": true
835 | },
836 | "fsevents": {
837 | "version": "2.3.2",
838 | "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
839 | "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
840 | "dev": true,
841 | "optional": true
842 | },
843 | "source-map": {
844 | "version": "0.6.1",
845 | "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
846 | "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g=="
847 | },
848 | "source-map-support": {
849 | "version": "0.5.19",
850 | "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.19.tgz",
851 | "integrity": "sha512-Wonm7zOCIJzBGQdB+thsPar0kYuCIzYvxZwlBa87yi/Mdjv7Tip2cyVbLj5o0cFPN4EVkuTwb3GDDyUx2DGnGw==",
852 | "requires": {
853 | "buffer-from": "^1.0.0",
854 | "source-map": "^0.6.0"
855 | }
856 | },
857 | "typescript": {
858 | "version": "3.9.7",
859 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.9.7.tgz",
860 | "integrity": "sha512-BLbiRkiBzAwsjut4x/dsibSTB6yWpwT5qWmC2OfuCg3GgVQCSgMs4vEctYPhsaGtd0AeuuHMkjZ2h2WG8MSzRw==",
861 | "dev": true
862 | }
863 | }
864 | }
865 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "cdk",
3 | "version": "0.1.0",
4 | "bin": {
5 | "cdk": "bin/cdk.js"
6 | },
7 | "scripts": {
8 | "cdk": "cdk"
9 | },
10 | "devDependencies": {
11 | "@aws-cdk/aws-apigatewayv2-alpha": "2.114.1-alpha.0",
12 | "@aws-cdk/aws-apigatewayv2-integrations-alpha": "2.114.1-alpha.0",
13 | "@types/node": "10.17.27",
14 | "aws-cdk": "2.114.1",
15 | "aws-cdk-lib": "2.114.1",
16 | "typescript": "~3.9.7"
17 | },
18 | "dependencies": {
19 | "source-map-support": "^0.5.16"
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/resize-cloud9.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Specify the desired volume size in GiB as a command line argument. If not specified, default to 20 GiB.
4 | SIZE=${1:-20}
5 |
6 | # Get the ID of the environment host Amazon EC2 instance.
7 | INSTANCEID=$(curl http://169.254.169.254/latest/meta-data/instance-id)
8 | REGION=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/\(.*\)[a-z]/\1/')
9 |
10 | # Get the ID of the Amazon EBS volume associated with the instance.
11 | VOLUMEID=$(aws ec2 describe-instances \
12 | --instance-id $INSTANCEID \
13 | --query "Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId" \
14 | --output text \
15 | --region $REGION)
16 |
17 | # Resize the EBS volume.
18 | aws ec2 modify-volume --volume-id $VOLUMEID --size $SIZE
19 |
20 | # Wait for the resize to finish.
21 | while [ \
22 | "$(aws ec2 describe-volumes-modifications \
23 | --volume-id $VOLUMEID \
24 | --filters Name=modification-state,Values="optimizing","completed" \
25 | --query "length(VolumesModifications)"\
26 | --output text)" != "1" ]; do
27 | sleep 1
28 | done
29 |
30 | # Check whether the root volume is exposed as /dev/xvda (non-NVMe) or as an NVMe device
31 | if [[ -e "/dev/xvda" && $(readlink -f /dev/xvda) = "/dev/xvda" ]]
32 | then
33 | # Rewrite the partition table so that the partition takes up all the space that it can.
34 | sudo growpart /dev/xvda 1
35 |
36 | # Expand the size of the file system.
37 | # Check if we're on AL2
38 | STR=$(cat /etc/os-release)
39 | SUB="VERSION_ID=\"2\""
40 | if [[ "$STR" == *"$SUB"* ]]
41 | then
42 | sudo xfs_growfs -d /
43 | else
44 | sudo resize2fs /dev/xvda1
45 | fi
46 |
47 | else
48 | # Rewrite the partition table so that the partition takes up all the space that it can.
49 | sudo growpart /dev/nvme0n1 1
50 |
51 | # Expand the size of the file system.
52 | # Check if we're on AL2
53 | STR=$(cat /etc/os-release)
54 | SUB="VERSION_ID=\"2\""
55 | if [[ "$STR" == *"$SUB"* ]]
56 | then
57 | sudo xfs_growfs -d /
58 | else
59 | sudo resize2fs /dev/nvme0n1p1
60 | fi
61 | fi
--------------------------------------------------------------------------------
/cdk/nginxAuthentication/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2018",
4 | "module": "commonjs",
5 | "lib": ["es2018"],
6 | "declaration": true,
7 | "strict": true,
8 | "noImplicitAny": false,
9 | "strictNullChecks": true,
10 | "noImplicitThis": true,
11 | "alwaysStrict": true,
12 | "noUnusedLocals": false,
13 | "noUnusedParameters": false,
14 | "noImplicitReturns": true,
15 | "noFallthroughCasesInSwitch": false,
16 | "inlineSourceMap": true,
17 | "inlineSources": true,
18 | "experimentalDecorators": true,
19 | "strictPropertyInitialization": false,
20 | "typeRoots": ["./node_modules/@types"]
21 | },
22 | "exclude": ["cdk.out"]
23 | }
24 |
--------------------------------------------------------------------------------
/images/Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/Architecture.png
--------------------------------------------------------------------------------
/images/HttpApiGatewayURL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/HttpApiGatewayURL.png
--------------------------------------------------------------------------------
/images/HttpApiStack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/HttpApiStack.png
--------------------------------------------------------------------------------
/images/MlflowVpclinkStack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/MlflowVpclinkStack.png
--------------------------------------------------------------------------------
/images/SageMakerNotebookInstance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/SageMakerNotebookInstance.png
--------------------------------------------------------------------------------
/images/clone-repo-studio-ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/clone-repo-studio-ui.png
--------------------------------------------------------------------------------
/images/launch-sm-studio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/launch-sm-studio.png
--------------------------------------------------------------------------------
/images/sm-studio-user.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/sm-studio-user.png
--------------------------------------------------------------------------------
/images/trialcomponent-output-artifacts-mlflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-mlflow-sagemaker-cdk/d19d38a4cc18950b7dbc0ebdc7cb352653490226/images/trialcomponent-output-artifacts-mlflow.png
--------------------------------------------------------------------------------
/lab/nginxBasicAuth/sagemaker_studio_and_mlflow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Train a Scikit-Learn model in SageMaker and track with MLFlow"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Intro\n",
15 | "\n",
16 | "The main objective of this notebook is to show how you can integrate Amazon SageMaker and MLFlow and MLFlow with SageMaker Experiments.\n",
17 | "\n",
18 | "## Pre-Requisites\n",
19 | "\n",
20 | "In order to run successfully this notebook, you must have prepared the infrastructure using CDK, which setups up for you the MLFlow server in an isolated VPC. When running this example in the SageMaker Notebook instance provisioned via CDK, you need to have access to the URI of the MLFlow server we will use for tracking purposes. In our case, this corresponds to the `HTTP API Gateway` endpoint that exposes our MLFlow server reacheable via a `PrivateLink` and have a SageMaker execution role with permissions to access the secret in `Amazon SecretsManager` from where we retrieve the username and password to interact with the MLFlow server.\n",
21 | "\n",
22 | "This notebook runs on SageMaker Studio using the `Base Python 2.0` image on a `Python 3` kernel.\n",
23 | "\n",
24 | "## The Machine Learning Problem\n",
25 | "\n",
26 | "In this example, we will solve a regression problem which aims to answer the question: \"what is the expected price of a house in the California area?\". The target variable is the house value for California districts, expressed in hundreds of thousands of dollars ($100,000).\n",
27 | "\n",
28 | "## Install required and/or update libraries\n",
29 | "\n",
30 | "At the time of writing, we have used the `sagemaker` SDK version 2. The MLFlow SDK library used is the one corresponding to our MLFlow server version, i.e., `2.14.2`"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "!pip install -q --upgrade pip\n",
40 | "!pip install sagemaker sagemaker-experiments scikit-learn==1.0.1 mlflow==2.14.2 boto3"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "Let's start by specifying:\n",
48 | "\n",
49 | "- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the notebook instance, training, and hosting.\n",
50 | "- The IAM role arn used to give training and hosting access to your data. See the [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/using-identity-based-policies.html) for more details on creating these. Note, if a role not associated with the current notebook instance, or more than one role is required for training and/or hosting, please replace `sagemaker.get_execution_role()` with a the appropriate full IAM role arn string(s).\n",
51 | "- The tracking URI where the MLFlow server runs\n",
52 | "- The experiment name as the logical entity to keep our tests grouped and organized."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "import os\n",
62 | "import pandas as pd\n",
63 | "import json\n",
64 | "import random\n",
65 | "import boto3\n",
66 | "\n",
67 | "## SageMaker and SKlearn libraries\n",
68 | "import sagemaker\n",
69 | "from sagemaker.sklearn.estimator import SKLearn\n",
70 | "from sagemaker.tuner import IntegerParameter, HyperparameterTuner\n",
71 | "\n",
72 | "## SKLearn libraries\n",
73 | "from sklearn.datasets import fetch_california_housing\n",
74 | "from sklearn.model_selection import train_test_split\n",
75 | "\n",
76 | "## MLFlow libraries\n",
77 | "import mlflow\n",
78 | "from mlflow.tracking.client import MlflowClient\n",
79 | "import mlflow.sagemaker\n",
80 | "\n",
81 | "cloudformation_client = boto3.client('cloudformation')\n",
82 | "\n",
83 | "sess = sagemaker.Session()\n",
84 | "role = sagemaker.get_execution_role()\n",
85 | "bucket = sess.default_bucket()\n",
86 | "region = sess.boto_region_name\n",
87 | "account = role.split(\"::\")[1].split(\":\")[0]\n",
88 | "tracking_uri = cloudformation_client.describe_stacks(StackName='HttpGatewayStack')['Stacks'][0]['Outputs'][0]['OutputValue']\n",
89 | "\n",
90 | "mlflow_secret_name = \"mlflow-server-credentials\"\n",
91 | "experiment_name = 'DEMO-sagemaker-mlflow'\n",
92 | "model_name = 'california-housing-model'\n",
93 | "\n",
94 | "print('SageMaker role: {}'.format(role.split(\"/\")[-1]))\n",
95 | "print('bucket: {}'.format(bucket))\n",
96 | "print('Account: {}'.format(account))\n",
97 | "print(\"Using AWS Region: {}\".format(region))\n",
98 | "print(\"MLflow server URI: {}\".format(tracking_uri))\n",
99 | "print(\"MLFLOW_SECRET_NAME: {}\".format(mlflow_secret_name))"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Data Preparation\n",
107 | "We load the dataset from sklearn, then split the data in training and testing datasets, where we allocate 75% of the data to the training dataset, and the remaining 25% to the traning dataset.\n",
108 | "\n",
109 | "The variable `target` is what we intend to estimate, which represents the value of a house, expressed in hundreds of thousands of dollars ($100,000)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "# we use the California housing dataset \n",
119 | "data = fetch_california_housing()\n",
120 | "\n",
121 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)\n",
122 | "\n",
123 | "trainX = pd.DataFrame(X_train, columns=data.feature_names)\n",
124 | "trainX['target'] = y_train\n",
125 | "\n",
126 | "testX = pd.DataFrame(X_test, columns=data.feature_names)\n",
127 | "testX['target'] = y_test"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "Finally, we save a copy of the data locally, as well as in S3. The data stored in S3 will be used SageMaker to train and test the model."
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "# save the data locally\n",
144 | "trainX.to_csv('california_train.csv', index=False)\n",
145 | "testX.to_csv('california_test.csv', index=False)\n",
146 | "\n",
147 | "# save the data to S3.\n",
148 | "train_path = sess.upload_data(path='california_train.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')\n",
149 | "test_path = sess.upload_data(path='california_test.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "### Setup SageMaker Experiments\n",
157 | "\n",
158 | "SageMaker Experiments is an AWS service for tracking machine learning Experiments. The SageMaker Experiments Python SDK is a high-level interface to this service that helps you track Experiment information using Python.\n",
159 | "\n",
160 | "Conceptually, these are the following entities within `SageMaker Experiments`:\n",
161 | "\n",
162 | "* Experiment: A collection of related Trials. Add Trials to an Experiment that you wish to compare together.\n",
163 | "* Trial: A description of a multi-step machine learning workflow. Each step in the workflow is described by a TrialComponent.\n",
164 | "* TrialComponent: A description of a single step in a machine learning workflow.\n",
165 | "* Tracker: A Python context-manager for logging information about a single TrialComponent.\n",
166 | "\n",
167 | "When running jobs (both training and processing ones) in the SageMaker managed infrastructure, SageMaker creates automatically a TrialComponent. TrialComponents includes by default jobs metadata and lineage information about the input and output data, models artifacts and metrics (for training jobs), and within your training script these data can be further enriched.\n",
168 | "\n",
169 | "We want to show how you can easily enable a two-way interaction between MLflow and SageMaker Experiments.\n",
170 | "\n",
171 | "Let us first create an `Experiment` and a `Trial`. These two entities are used to keep your experimentation organized."
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "from smexperiments.experiment import Experiment\n",
181 | "from smexperiments.trial import Trial\n",
182 | "from smexperiments.trial_component import TrialComponent\n",
183 | "from smexperiments.tracker import Tracker\n",
184 | "\n",
185 | "import time\n",
186 | "\n",
187 | "try:\n",
188 | " my_experiment = Experiment.load(experiment_name=experiment_name)\n",
189 | " print(\"existing experiment loaded\")\n",
190 | "except Exception as ex:\n",
191 | " if \"ResourceNotFound\" in str(ex):\n",
192 | " my_experiment = Experiment.create(\n",
193 | " experiment_name = experiment_name,\n",
194 | " description = \"MLFlow and SageMaker integration\"\n",
195 | " )\n",
196 | " print(\"new experiment created\")\n",
197 | " else:\n",
198 | " print(f\"Unexpected {ex}=, {type(ex)}\")\n",
199 | " print(\"Dont go forward!\")\n",
200 | " raise\n",
201 | "\n",
202 | "trial_name = \"trial-v1\"\n",
203 | "\n",
204 | "try:\n",
205 | " my_first_trial = Trial.load(trial_name=trial_name)\n",
206 | " print(\"existing trial loaded\")\n",
207 | "except Exception as ex:\n",
208 | " if \"ResourceNotFound\" in str(ex):\n",
209 | " my_first_trial = Trial.create(\n",
210 | " experiment_name=experiment_name,\n",
211 | " trial_name=trial_name,\n",
212 | " )\n",
213 | " print(\"new trial created\")\n",
214 | " else:\n",
215 | " print(f\"Unexpected {ex}=, {type(ex)}\")\n",
216 | " print(\"Dont go forward!\")\n",
217 | " raise\n",
218 | "\n",
219 | "create_date = time.strftime(\"%Y-%m-%d-%H-%M-%S\")\n",
220 | "\n",
221 | "experiment_config = {\n",
222 | " \"ExperimentName\": experiment_name,\n",
223 | " \"TrialName\": trial_name,\n",
224 | "}"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "## Training\n",
232 | "\n",
233 | "For this example, we use the `SKlearn` framework in script mode with SageMaker. Let us explore in more details the different components we need to define.\n",
234 | "\n",
235 | "### Traning script and SageMaker environment\n",
236 | "\n",
237 | "The `./source_dir/train.py` script provides all the code we need for training a SageMaker model. The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, such as:\n",
238 | "\n",
239 | "* `SM_MODEL_DIR`: A string representing the path to the directory to write model artifacts to. These artifacts are uploaded to S3 for model hosting.\n",
240 | "* `SM_CHANNEL_TRAIN`: A string representing the path to the directory containing data in the 'training' channel.\n",
241 | "* `SM_CHANNEL_TEST`: A string representing the path to the directory containing data in the 'testing' channel.\n",
242 | "\n",
243 | "\n",
244 | "For more information about training environment variables, please visit \n",
245 | "[SageMaker Training Toolkit](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md).\n",
246 | "\n",
247 | "We want to highlight in particular `SM_TRAINING_ENV` since it provides all the training information as a JSON-encoded dictionary (see [here](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md#sm_training_env) for more details).\n",
248 | "\n",
249 | "#### Hyperparmeters\n",
250 | "\n",
251 | "We are using the `RandomForestRegressor` algorithm from the SKlearn framework. For the purpose of this exercise, we are only using a subset of hyperparameters supported by this algorithm, i.e. `n-estimators` and `min-samples-leaf`\n",
252 | "\n",
253 | "If you would like to know more the different hyperparmeters for this algorithm, please refer to the [`RandomForestRegressor` official documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html).\n",
254 | "\n",
255 | "Furthermore, it is important to note that for the purpose of this excercise, we are essentially omitting completely the feature engineering step, which is an essential step in any machine learning problem.\n",
256 | "\n",
257 | "#### MLFlow interaction\n",
258 | "\n",
259 | "To interact with the MLFlow server, we use the mlflow SDK, which allows us to set the tracking URI and the experiment name. One this initial setup is completed, we can store the parameters used (`mlflow.log_params(params)`), the model that is generated (`mlflow.sklearn.log_model(model, \"model\")`) with its associated metrics (`mlflow.log_metric(f'AE-at-{str(q)}th-percentile', np.percentile(a=abs_err, q=q))`).\n",
260 | "\n",
261 | "TODO: explain the `mlflow.autolog()` and the System Tags (add link) and how to overwrite them to have the right reference in SageMaker\n",
262 | "\n",
263 | "#### SageMaker"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "!pygmentize ./source_dir/train.py"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "### SKlearn container\n",
280 | "\n",
281 | "For this example, we use the `SKlearn` framework in script mode with SageMaker. For more information please refere to [the official documentation](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/using_sklearn.html)\n",
282 | "\n",
283 | "Our training script makes use of other 3rd party libraries, i.e. `mlflow`, which are not installed by default in the `Sklearn` container SageMaker provides. However, this can be easily overcome by supplying a `requirement.txt` file in the `source_dir` folder, which then SageMaker will `pip`-install before executing the training script.\n",
284 | "\n",
285 | "### Metric definition\n",
286 | "\n",
287 | "SageMaker emits every log to CLoudWatch. Since we are using scripting mode, we need to specify a metric definition object to define the format of the metric we are interested in via regex, so that SageMaker knows how to extract this metric from the CloudWatch logs of the training job.\n",
288 | "\n",
289 | "In our case our custom metric is as follow\n",
290 | "\n",
291 | "```python\n",
292 | "metric_definitions = [{'Name': 'median-AE', 'Regex': \"AE-at-50th-percentile: ([0-9.]+).*$\"}]\n",
293 | "```"
294 | ]
295 | },
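296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "As a quick, self-contained illustration of how this regex is applied (the log line below is made up for the example):\n",
301 | "\n",
302 | "```python\n",
303 | "import re\n",
304 | "\n",
305 | "# the same regex SageMaker applies, line by line, to the CloudWatch logs\n",
306 | "pattern = re.compile(r\"AE-at-50th-percentile: ([0-9.]+).*$\")\n",
307 | "\n",
308 | "sample_log_line = 'INFO:root:AE-at-50th-percentile: 0.2871'  # hypothetical log line\n",
309 | "match = pattern.search(sample_log_line)\n",
310 | "if match:\n",
311 | "    print(float(match.group(1)))  # 0.2871, reported as the 'median-AE' metric\n",
312 | "```"
313 | ]
314 | },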
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "metric_definitions = [{'Name': 'median-AE', 'Regex': \"AE-at-50th-percentile: ([0-9.]+).*$\"}]\n",
303 | "\n",
304 | "hyperparameters = {\n",
305 | " 'tracking_uri': tracking_uri,\n",
306 | " 'experiment_name': experiment_name,\n",
307 | " 'secret_name': mlflow_secret_name,\n",
308 | " 'region': region,\n",
309 | " 'n-estimators': 100,\n",
310 | " 'min-samples-leaf': 3,\n",
311 | " 'features': 'MedInc HouseAge AveRooms AveBedrms Population AveOccup',\n",
312 | " 'target': 'target'\n",
313 | "}\n",
314 | "\n",
315 | "estimator = SKLearn(\n",
316 | " entry_point='train.py',\n",
317 | " source_dir='source_dir',\n",
318 | " role=role,\n",
319 | " metric_definitions=metric_definitions,\n",
320 | " hyperparameters=hyperparameters,\n",
321 | " instance_count=1,\n",
322 | " instance_type='ml.m5.large', # to run SageMaker in a managed infrastructure\n",
323 | " framework_version='1.0-1',\n",
324 | " base_job_name='mlflow',\n",
325 | ")"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {},
331 | "source": [
332 | "Now we are ready to execute the training locally, which in turn will save its execution data to the MLFlow server. After initializing an `SKlearn` estimator object, all we need to do is to call the `.fit` method specifying where the training and testing data are located."
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": [
341 | "estimator.fit({'train':train_path, 'test': test_path}, experiment_config=experiment_config)"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {},
347 | "source": [
348 | "### From SageMaker to MLFlow\n",
349 | "\n",
350 | "Load the TrialComponent associate with the `estimator`."
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "training_job_name = estimator.latest_training_job.name\n",
360 | "\n",
361 | "trial_component = TrialComponent.load(f\"{training_job_name}-aws-training-job\")\n",
362 | "mlflow_run_url = trial_component.parameters[\"mlflow-run-url\"]"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": null,
368 | "metadata": {},
369 | "outputs": [],
370 | "source": [
371 | "from IPython.core.display import HTML\n",
372 | "HTML(\"link to MLFlow run\".format(mlflow_run_url))"
373 | ]
374 | },
375 | {
376 | "cell_type": "markdown",
377 | "metadata": {},
378 | "source": [
379 | "### From MLFlow to SageMaker Experiments\n",
380 | "\n",
381 | "Within SageMaker Experiments, we have enriched the TrialComponent with information specific to MLFlow. For example\n",
382 | "\n",
383 | "* the experiment ID in MLFlow\n",
384 | "* the MLFlow run ID corresponding to the SageMaker training job\n",
385 | "* any additional MLFlow parameters and metrics generated by MLFlow\n",
386 | "* the list of output artifacts generated by MLFlow (e.g., the output model) with their full path to S3\n",
387 | "\n",
388 | "A visual inspection of the SageMaker Studio UI for the output artifacts can be seen below\n",
389 | "\n",
390 | ""
391 | ]
392 | },
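393 | {
394 | "cell_type": "markdown",
395 | "metadata": {},
396 | "source": [
397 | "A quick, programmatic way to inspect this enrichment is to print the parameters of the TrialComponent loaded earlier:\n",
398 | "\n",
399 | "```python\n",
400 | "# parameters logged on the TrialComponent by the training script\n",
401 | "for key, value in trial_component.parameters.items():\n",
402 | "    print(f'{key}: {value}')\n",
403 | "```"
404 | ]
405 | },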
393 | {
394 | "cell_type": "markdown",
395 | "metadata": {},
396 | "source": [
397 | "## Register the model to MLFlow\n",
398 | "\n",
399 | "At the end of the training, our model has been saved to the MLflow server and we are ready to register the model, i.e. assign it to a model package and create a version. Please refer to the [official MLFlow documentation](https://www.mlflow.org/docs/latest/model-registry.html) for furthe information."
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": null,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "def retrieve_credentials(region_name, secret_name):\n",
409 | " session = boto3.session.Session()\n",
410 | " client = session.client(\n",
411 | " service_name='secretsmanager',\n",
412 | " region_name=region_name\n",
413 | " )\n",
414 | " \n",
415 | " kwarg = {'SecretId': secret_name}\n",
416 | " secret = client.get_secret_value(**kwarg)\n",
417 | " credentials = {}\n",
418 | "\n",
419 | " credentials['username'] = json.loads(secret['SecretString'])['username']\n",
420 | " credentials['password'] = json.loads(secret['SecretString'])['password']\n",
421 | " \n",
422 | " return credentials\n",
423 | "\n",
424 | "# set the tracking token env variable will enable the mlflow SDK to set the header \"Authentication: Basic \" to authenticate.\n",
425 | "credentials = retrieve_credentials(region, mlflow_secret_name)\n",
426 | "os.environ['MLFLOW_TRACKING_USERNAME'] = credentials['username']\n",
427 | "os.environ['MLFLOW_TRACKING_PASSWORD'] = credentials['password']"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "metadata": {},
434 | "outputs": [],
435 | "source": [
436 | "mlflow.set_tracking_uri(tracking_uri)\n",
437 | "mlflow.set_experiment(experiment_name)\n",
438 | "client = MlflowClient()\n",
439 | "\n",
440 | "run = mlflow.get_run(run_id=trial_component.parameters[\"run_id\"])\n",
441 | "\n",
442 | "try:\n",
443 | " client.create_registered_model(model_name)\n",
444 | "except:\n",
445 | " print(\"Registered model already exists\")\n",
446 | "\n",
447 | "model_version = client.create_model_version(\n",
448 | " name=model_name,\n",
449 | " source=\"{}/model\".format(run.info.artifact_uri),\n",
450 | " run_id=run.info.run_uuid\n",
451 | ")\n",
452 | "\n",
453 | "print(\"model_version: {}\".format(model_version))"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {},
459 | "source": [
460 | "## Local Predictions\n",
461 | "\n",
462 | "We are now ready to make predictions with our model locally for testing purposes."
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": null,
468 | "metadata": {},
469 | "outputs": [],
470 | "source": [
471 | "# get the model URI from the MLFlow registry\n",
472 | "model_uri = model_version.source\n",
473 | "print(\"Model URI: {}\".format(model_uri))\n",
474 | "\n",
475 | "# Load model as a Sklearn model.\n",
476 | "loaded_model = mlflow.sklearn.load_model(model_uri)\n",
477 | "\n",
478 | "# get a random index to test the prediction from the test data\n",
479 | "index = random.randrange(0, len(testX))\n",
480 | "print(\"Random index value: {}\".format(index))\n",
481 | "\n",
482 | "# Prepare data on a Pandas DataFrame to make a prediction.\n",
483 | "data = testX.drop(['Latitude','Longitude','target'], axis=1).iloc[[index]]\n",
484 | "\n",
485 | "print(\"#######\\nData for prediction \\n{}\".format(data))\n",
486 | "\n",
487 | "y_hat = loaded_model.predict(data)[0]\n",
488 | "y = y_test[index]\n",
489 | "\n",
490 | "print(\"Predicted value: {}\".format(y_hat))\n",
491 | "print(\"Actual value: {}\".format(y))"
492 | ]
493 | },
494 | {
495 | "cell_type": "markdown",
496 | "metadata": {},
497 | "source": [
498 | "# Tune a Scikit-Learn model in SageMaker and track with MLFlow\n",
499 | "\n",
500 | "At this point, we are going to offload the training to the remote infrastructure managed by SageMaker. We want now to leverage SageMaker's hyperparameter tuning to kick off multiple training jobs with different hyperparameter combinations, to find the set with best model performance. This is an important step in the machine learning process as hyperparameter settings can have a large impact on model accuracy. In this example, we'll use the SageMaker Python SDK to create a hyperparameter tuning job for an SKlearn estimator.\n",
501 | "\n",
502 | "## Training\n",
503 | "We are again using `SKlearn` in script mode, with the same training script we have used in the previous section, i.e. `./source_dir/train.py`."
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": null,
509 | "metadata": {},
510 | "outputs": [],
511 | "source": [
512 | "hyperparameters = {\n",
513 | " 'tracking_uri': tracking_uri,\n",
514 | " 'experiment_name': experiment_name,\n",
515 | " 'secret_name': mlflow_secret_name,\n",
516 | " 'region': region,\n",
517 | " 'features': 'MedInc HouseAge AveRooms AveBedrms Population AveOccup',\n",
518 | " 'target': 'target'\n",
519 | "}\n",
520 | "\n",
521 | "metric_definitions = [{'Name': 'median-AE', 'Regex': \"AE-at-50th-percentile: ([0-9.]+).*$\"}]\n",
522 | "\n",
523 | "estimator = SKLearn(\n",
524 | " entry_point='train.py',\n",
525 | " source_dir='source_dir',\n",
526 | " role=role,\n",
527 | " instance_count=1,\n",
528 | " instance_type='ml.m5.xlarge',\n",
529 | " hyperparameters=hyperparameters,\n",
530 | " metric_definitions=metric_definitions,\n",
531 | " framework_version='1.0-1',\n",
532 | " py_version='py3'\n",
533 | ")"
534 | ]
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "metadata": {},
539 | "source": [
540 | "## Hyperparameter tuning\n",
541 | "\n",
542 | "Once we've defined our estimator we can specify the hyperparameters we'd like to tune and their possible values. We have three different types of hyperparameters.\n",
543 | "- Categorical parameters need to take one value from a discrete set. We define this by passing the list of possible values to `CategoricalParameter(list)`\n",
544 | "- Continuous parameters can take any real number value between the minimum and maximum value, defined by `ContinuousParameter(min, max)`\n",
545 | "- Integer parameters can take any integer value between the minimum and maximum value, defined by `IntegerParameter(min, max)`\n",
546 | "\n",
547 | "*Note, if possible, it's almost always best to specify a value as the least restrictive type. For example, tuning `thresh` as a continuous value between 0.01 and 0.2 is likely to yield a better result than tuning as a categorical parameter with possible values of 0.01, 0.1, 0.15, or 0.2.*"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "hyperparameter_ranges = {\n",
557 | " 'n-estimators': IntegerParameter(50, 200),\n",
558 | " 'min-samples-leaf': IntegerParameter(1, 10)\n",
559 | "}"
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {},
565 | "source": [
566 | "Next we'll specify the objective metric that we'd like to tune and its definition. This refers to the regular expression (Regex) needed to extract that metric from the CloudWatch logs of our training job we defined earlier, as well as whether we are looking to `Maximize` or `Minimize` the objective metric."
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "metadata": {},
573 | "outputs": [],
574 | "source": [
575 | "objective_metric_name = 'median-AE'\n",
576 | "objective_type = 'Minimize'"
577 | ]
578 | },
579 | {
580 | "cell_type": "markdown",
581 | "metadata": {},
582 | "source": [
583 | "Now, we'll create a `HyperparameterTuner` object, which we pass:\n",
584 | "- The SKLearn estimator we created earlier\n",
585 | "- Our hyperparameter ranges\n",
586 | "- Objective metric name and type\n",
587 | "- Number of training jobs to run in total and how many training jobs should be run simultaneously. More parallel jobs will finish tuning sooner, but may sacrifice accuracy. We recommend you set the parallel jobs value to less than 10% of the total number of training jobs (we'll set it higher just for this example to keep it short)."
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": null,
593 | "metadata": {},
594 | "outputs": [],
595 | "source": [
596 | "max_jobs = 5\n",
597 | "max_parallel_jobs = 5\n",
598 | "\n",
599 | "tuner = HyperparameterTuner(estimator,\n",
600 | " objective_metric_name,\n",
601 | " hyperparameter_ranges,\n",
602 | " metric_definitions,\n",
603 | " max_jobs=max_jobs,\n",
604 | " max_parallel_jobs=max_parallel_jobs,\n",
605 | " objective_type=objective_type,\n",
606 | " base_tuning_job_name='mlflow')"
607 | ]
608 | },
609 | {
610 | "cell_type": "markdown",
611 | "metadata": {},
612 | "source": [
613 | "And finally, we can start our tuning job by calling `.fit()` and passing in the S3 paths to our train and test datasets."
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": null,
619 | "metadata": {},
620 | "outputs": [],
621 | "source": [
622 | "tuner.fit({'train':train_path, 'test': test_path})"
623 | ]
624 | },
625 | {
626 | "cell_type": "markdown",
627 | "metadata": {},
628 | "source": [
629 | "We can now query the MLFlow server to see the different models and their metrics that have been stored."
630 | ]
631 | },
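632 | {
633 | "cell_type": "markdown",
634 | "metadata": {},
635 | "source": [
636 | "For example, a minimal sketch using the mlflow SDK (assuming the tracking URI, experiment, and credentials set earlier in this notebook are still in place):\n",
637 | "\n",
638 | "```python\n",
639 | "# search_runs returns a pandas DataFrame with one row per MLflow run\n",
640 | "runs_df = mlflow.search_runs(experiment_names=[experiment_name])\n",
641 | "cols = ['run_id', 'metrics.AE-at-50th-percentile']\n",
642 | "print(runs_df[cols].sort_values('metrics.AE-at-50th-percentile').head())\n",
643 | "```"
644 | ]
645 | },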
632 | {
633 | "cell_type": "markdown",
634 | "metadata": {},
635 | "source": [
636 | "# Deploy an MLflow model with SageMaker\n",
637 | "\n",
638 | "We are finally ready to deploy a MLFlow model to a SageMaker hosted endpoint ready to be consumed for online predictions."
639 | ]
640 | },
641 | {
642 | "cell_type": "markdown",
643 | "metadata": {},
644 | "source": [
645 | "## Build MLflow docker image to serve the model with SageMaker\n",
646 | "\n",
647 | "We first need to build a new MLflow Sagemaker image, assign it a name, and push to ECR.\n",
648 | "\n",
649 | "The `mlflow sagemaker build-and-push-container` function does exactly that. It first builds an MLflow Docker image. The image is built locally and it requires Docker to run. Then, the image is pushed to ECR under current active AWS account and to current active AWS region. More information on this command can be found in the official [MLflow CLI documentation for SageMaker](https://www.mlflow.org/docs/latest/cli.html#mlflow-sagemaker).\n",
650 | "\n",
651 | "Make sure that you the `mlflow-pyfunc` container has already been pushed to `ECR` from the `Cloud9` environment from where deployed the CDK stacks."
652 | ]
653 | },
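654 | {
655 | "cell_type": "markdown",
656 | "metadata": {},
657 | "source": [
658 | "If it has not been pushed yet, it can be built and pushed from an environment with Docker installed and AWS credentials configured (`mlflow-pyfunc` is the default container name used by MLflow):\n",
659 | "\n",
660 | "```\n",
661 | "mlflow sagemaker build-and-push-container\n",
662 | "```"
663 | ]
664 | },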
654 | {
655 | "cell_type": "code",
656 | "execution_count": null,
657 | "metadata": {},
658 | "outputs": [],
659 | "source": [
660 | "# URL of the ECR-hosted Docker image the model should be deployed into: make sure to include the tag 2.14.2\n",
661 | "image_uri = \"{}.dkr.ecr.{}.amazonaws.com/mlflow-pyfunc:{}\".format(account, region, mlflow.__version__)\n",
662 | "print(\"image URI: {}\".format(image_uri))"
663 | ]
664 | },
665 | {
666 | "cell_type": "markdown",
667 | "metadata": {},
668 | "source": [
669 | "## Deploy a SageMaker endpoint with our scikit-learn model\n",
670 | "\n",
671 | "We first need to get the best performing model stored in MLFlow. Once it has been identified, we register it to the Registry and then deploy to a SageMaker managed endpoint via the MLflow SDK. More information can be found [here](https://www.mlflow.org/docs/latest/python_api/mlflow.sagemaker.html)"
672 | ]
673 | },
674 | {
675 | "cell_type": "code",
676 | "execution_count": null,
677 | "metadata": {},
678 | "outputs": [],
679 | "source": [
680 | "best_training_job_name = tuner.best_training_job()\n",
681 | "\n",
682 | "best_trial_component = TrialComponent.load(f\"{best_training_job_name}-aws-training-job\")\n",
683 | "best_mlflow_run_url = best_trial_component.parameters[\"mlflow-run-url\"]"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": null,
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "from IPython.core.display import HTML\n",
693 | "HTML(\"MLFlow run corresponding to best training job\".format(best_mlflow_run_url))"
694 | ]
695 | },
696 | {
697 | "cell_type": "code",
698 | "execution_count": null,
699 | "metadata": {},
700 | "outputs": [],
701 | "source": [
702 | "experiment = mlflow.get_experiment_by_name(experiment_name)\n",
703 | "experiment_id = experiment.experiment_id\n",
704 | "\n",
705 | "run = mlflow.get_run(run_id=best_trial_component.parameters[\"run_id\"])\n",
706 | "\n",
707 | "try:\n",
708 | " client.create_registered_model(model_name)\n",
709 | "except:\n",
710 | " print(\"Registered model already exists\")\n",
711 | "\n",
712 | "model_version = client.create_model_version(\n",
713 | " name=model_name,\n",
714 | " source=\"{}/model\".format(run.info.artifact_uri),\n",
715 | " run_id=run.info.run_uuid\n",
716 | ")\n",
717 | "\n",
718 | "print(\"model_version: {}\".format(model_version))"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": null,
724 | "metadata": {},
725 | "outputs": [],
726 | "source": [
727 | "from mlflow.deployments import get_deploy_client\n",
728 | "\n",
729 | "model_uri = \"models:/{}/{}\".format(model_version.name, model_version.version)\n",
730 | "\n",
731 | "endpoint_name = 'california-housing'\n",
732 | "\n",
733 | "config={\n",
734 | " 'execution_role_arn': role,\n",
735 | " 'image_url': image_uri,\n",
736 | " 'instance_type': 'ml.m5.xlarge',\n",
737 | " 'instance_count': 1,\n",
738 | " 'region_name': region\n",
739 | "}\n",
740 | "\n",
741 | "client = get_deploy_client(\"sagemaker\")\n",
742 | "\n",
743 | "client.create_deployment(\n",
744 | " name=endpoint_name,\n",
745 | " model_uri=model_uri,\n",
746 | " flavor='python_function',\n",
747 | " config=config\n",
748 | ")"
749 | ]
750 | },
751 | {
752 | "cell_type": "markdown",
753 | "metadata": {},
754 | "source": [
755 | "## Predict\n",
756 | "\n",
757 | "We are now ready to make predictions again the endpoint."
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": null,
763 | "metadata": {},
764 | "outputs": [],
765 | "source": [
766 | "# load california dataset\n",
767 | "data = pd.read_csv('./california_test.csv')\n",
768 | "df_y = data[['target']]\n",
769 | "df = data.drop(['Latitude','Longitude','target'], axis=1)\n",
770 | "\n",
771 | "client = get_deploy_client(f\"sagemaker:/{region}\")\n",
772 | "\n",
773 | "for _ in range(0,2):\n",
774 | " # Randomly pick a row to test the prediction\n",
775 | " index = random.randrange(0, len(df_y))\n",
776 | " payload = df.iloc[[index]]\n",
777 | " y = df_y['target'][index]\n",
778 | " print(f\"payload: {payload}\")\n",
779 | " prediction = client.predict(endpoint_name, payload)\n",
780 | " print(f'This is the real value of the housing we want to predict (expressed in 100.000$): {y}')\n",
781 | " print(f\"This is the predicted value from our model (expressed in 100.000$): {prediction['predictions'][0]}\")"
782 | ]
783 | },
784 | {
785 | "cell_type": "markdown",
786 | "metadata": {},
787 | "source": [
788 | "## Delete endpoint\n",
789 | "\n",
790 | "In order to avoid unwanted costs, make sure you delete the endpoint."
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": null,
796 | "metadata": {},
797 | "outputs": [],
798 | "source": [
799 | "client.delete_deployment(endpoint_name, config=config)"
800 | ]
801 | },
802 | {
803 | "cell_type": "markdown",
804 | "metadata": {},
805 | "source": [
806 | "### Delete experiments (Optional)"
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": null,
812 | "metadata": {},
813 | "outputs": [],
814 | "source": [
815 | "my_experiment.delete_all(action=\"--force\")"
816 | ]
817 | },
818 | {
819 | "cell_type": "code",
820 | "execution_count": null,
821 | "metadata": {},
822 | "outputs": [],
823 | "source": []
824 | }
825 | ],
826 | "metadata": {
827 | "availableInstances": [
828 | {
829 | "_defaultOrder": 0,
830 | "_isFastLaunch": true,
831 | "category": "General purpose",
832 | "gpuNum": 0,
833 | "hideHardwareSpecs": false,
834 | "memoryGiB": 4,
835 | "name": "ml.t3.medium",
836 | "vcpuNum": 2
837 | },
838 | {
839 | "_defaultOrder": 1,
840 | "_isFastLaunch": false,
841 | "category": "General purpose",
842 | "gpuNum": 0,
843 | "hideHardwareSpecs": false,
844 | "memoryGiB": 8,
845 | "name": "ml.t3.large",
846 | "vcpuNum": 2
847 | },
848 | {
849 | "_defaultOrder": 2,
850 | "_isFastLaunch": false,
851 | "category": "General purpose",
852 | "gpuNum": 0,
853 | "hideHardwareSpecs": false,
854 | "memoryGiB": 16,
855 | "name": "ml.t3.xlarge",
856 | "vcpuNum": 4
857 | },
858 | {
859 | "_defaultOrder": 3,
860 | "_isFastLaunch": false,
861 | "category": "General purpose",
862 | "gpuNum": 0,
863 | "hideHardwareSpecs": false,
864 | "memoryGiB": 32,
865 | "name": "ml.t3.2xlarge",
866 | "vcpuNum": 8
867 | },
868 | {
869 | "_defaultOrder": 4,
870 | "_isFastLaunch": true,
871 | "category": "General purpose",
872 | "gpuNum": 0,
873 | "hideHardwareSpecs": false,
874 | "memoryGiB": 8,
875 | "name": "ml.m5.large",
876 | "vcpuNum": 2
877 | },
878 | {
879 | "_defaultOrder": 5,
880 | "_isFastLaunch": false,
881 | "category": "General purpose",
882 | "gpuNum": 0,
883 | "hideHardwareSpecs": false,
884 | "memoryGiB": 16,
885 | "name": "ml.m5.xlarge",
886 | "vcpuNum": 4
887 | },
888 | {
889 | "_defaultOrder": 6,
890 | "_isFastLaunch": false,
891 | "category": "General purpose",
892 | "gpuNum": 0,
893 | "hideHardwareSpecs": false,
894 | "memoryGiB": 32,
895 | "name": "ml.m5.2xlarge",
896 | "vcpuNum": 8
897 | },
898 | {
899 | "_defaultOrder": 7,
900 | "_isFastLaunch": false,
901 | "category": "General purpose",
902 | "gpuNum": 0,
903 | "hideHardwareSpecs": false,
904 | "memoryGiB": 64,
905 | "name": "ml.m5.4xlarge",
906 | "vcpuNum": 16
907 | },
908 | {
909 | "_defaultOrder": 8,
910 | "_isFastLaunch": false,
911 | "category": "General purpose",
912 | "gpuNum": 0,
913 | "hideHardwareSpecs": false,
914 | "memoryGiB": 128,
915 | "name": "ml.m5.8xlarge",
916 | "vcpuNum": 32
917 | },
918 | {
919 | "_defaultOrder": 9,
920 | "_isFastLaunch": false,
921 | "category": "General purpose",
922 | "gpuNum": 0,
923 | "hideHardwareSpecs": false,
924 | "memoryGiB": 192,
925 | "name": "ml.m5.12xlarge",
926 | "vcpuNum": 48
927 | },
928 | {
929 | "_defaultOrder": 10,
930 | "_isFastLaunch": false,
931 | "category": "General purpose",
932 | "gpuNum": 0,
933 | "hideHardwareSpecs": false,
934 | "memoryGiB": 256,
935 | "name": "ml.m5.16xlarge",
936 | "vcpuNum": 64
937 | },
938 | {
939 | "_defaultOrder": 11,
940 | "_isFastLaunch": false,
941 | "category": "General purpose",
942 | "gpuNum": 0,
943 | "hideHardwareSpecs": false,
944 | "memoryGiB": 384,
945 | "name": "ml.m5.24xlarge",
946 | "vcpuNum": 96
947 | },
948 | {
949 | "_defaultOrder": 12,
950 | "_isFastLaunch": false,
951 | "category": "General purpose",
952 | "gpuNum": 0,
953 | "hideHardwareSpecs": false,
954 | "memoryGiB": 8,
955 | "name": "ml.m5d.large",
956 | "vcpuNum": 2
957 | },
958 | {
959 | "_defaultOrder": 13,
960 | "_isFastLaunch": false,
961 | "category": "General purpose",
962 | "gpuNum": 0,
963 | "hideHardwareSpecs": false,
964 | "memoryGiB": 16,
965 | "name": "ml.m5d.xlarge",
966 | "vcpuNum": 4
967 | },
968 | {
969 | "_defaultOrder": 14,
970 | "_isFastLaunch": false,
971 | "category": "General purpose",
972 | "gpuNum": 0,
973 | "hideHardwareSpecs": false,
974 | "memoryGiB": 32,
975 | "name": "ml.m5d.2xlarge",
976 | "vcpuNum": 8
977 | },
978 | {
979 | "_defaultOrder": 15,
980 | "_isFastLaunch": false,
981 | "category": "General purpose",
982 | "gpuNum": 0,
983 | "hideHardwareSpecs": false,
984 | "memoryGiB": 64,
985 | "name": "ml.m5d.4xlarge",
986 | "vcpuNum": 16
987 | },
988 | {
989 | "_defaultOrder": 16,
990 | "_isFastLaunch": false,
991 | "category": "General purpose",
992 | "gpuNum": 0,
993 | "hideHardwareSpecs": false,
994 | "memoryGiB": 128,
995 | "name": "ml.m5d.8xlarge",
996 | "vcpuNum": 32
997 | },
998 | {
999 | "_defaultOrder": 17,
1000 | "_isFastLaunch": false,
1001 | "category": "General purpose",
1002 | "gpuNum": 0,
1003 | "hideHardwareSpecs": false,
1004 | "memoryGiB": 192,
1005 | "name": "ml.m5d.12xlarge",
1006 | "vcpuNum": 48
1007 | },
1008 | {
1009 | "_defaultOrder": 18,
1010 | "_isFastLaunch": false,
1011 | "category": "General purpose",
1012 | "gpuNum": 0,
1013 | "hideHardwareSpecs": false,
1014 | "memoryGiB": 256,
1015 | "name": "ml.m5d.16xlarge",
1016 | "vcpuNum": 64
1017 | },
1018 | {
1019 | "_defaultOrder": 19,
1020 | "_isFastLaunch": false,
1021 | "category": "General purpose",
1022 | "gpuNum": 0,
1023 | "hideHardwareSpecs": false,
1024 | "memoryGiB": 384,
1025 | "name": "ml.m5d.24xlarge",
1026 | "vcpuNum": 96
1027 | },
1028 | {
1029 | "_defaultOrder": 20,
1030 | "_isFastLaunch": false,
1031 | "category": "General purpose",
1032 | "gpuNum": 0,
1033 | "hideHardwareSpecs": true,
1034 | "memoryGiB": 0,
1035 | "name": "ml.geospatial.interactive",
1036 | "supportedImageNames": [
1037 | "sagemaker-geospatial-v1-0"
1038 | ],
1039 | "vcpuNum": 0
1040 | },
1041 | {
1042 | "_defaultOrder": 21,
1043 | "_isFastLaunch": true,
1044 | "category": "Compute optimized",
1045 | "gpuNum": 0,
1046 | "hideHardwareSpecs": false,
1047 | "memoryGiB": 4,
1048 | "name": "ml.c5.large",
1049 | "vcpuNum": 2
1050 | },
1051 | {
1052 | "_defaultOrder": 22,
1053 | "_isFastLaunch": false,
1054 | "category": "Compute optimized",
1055 | "gpuNum": 0,
1056 | "hideHardwareSpecs": false,
1057 | "memoryGiB": 8,
1058 | "name": "ml.c5.xlarge",
1059 | "vcpuNum": 4
1060 | },
1061 | {
1062 | "_defaultOrder": 23,
1063 | "_isFastLaunch": false,
1064 | "category": "Compute optimized",
1065 | "gpuNum": 0,
1066 | "hideHardwareSpecs": false,
1067 | "memoryGiB": 16,
1068 | "name": "ml.c5.2xlarge",
1069 | "vcpuNum": 8
1070 | },
1071 | {
1072 | "_defaultOrder": 24,
1073 | "_isFastLaunch": false,
1074 | "category": "Compute optimized",
1075 | "gpuNum": 0,
1076 | "hideHardwareSpecs": false,
1077 | "memoryGiB": 32,
1078 | "name": "ml.c5.4xlarge",
1079 | "vcpuNum": 16
1080 | },
1081 | {
1082 | "_defaultOrder": 25,
1083 | "_isFastLaunch": false,
1084 | "category": "Compute optimized",
1085 | "gpuNum": 0,
1086 | "hideHardwareSpecs": false,
1087 | "memoryGiB": 72,
1088 | "name": "ml.c5.9xlarge",
1089 | "vcpuNum": 36
1090 | },
1091 | {
1092 | "_defaultOrder": 26,
1093 | "_isFastLaunch": false,
1094 | "category": "Compute optimized",
1095 | "gpuNum": 0,
1096 | "hideHardwareSpecs": false,
1097 | "memoryGiB": 96,
1098 | "name": "ml.c5.12xlarge",
1099 | "vcpuNum": 48
1100 | },
1101 | {
1102 | "_defaultOrder": 27,
1103 | "_isFastLaunch": false,
1104 | "category": "Compute optimized",
1105 | "gpuNum": 0,
1106 | "hideHardwareSpecs": false,
1107 | "memoryGiB": 144,
1108 | "name": "ml.c5.18xlarge",
1109 | "vcpuNum": 72
1110 | },
1111 | {
1112 | "_defaultOrder": 28,
1113 | "_isFastLaunch": false,
1114 | "category": "Compute optimized",
1115 | "gpuNum": 0,
1116 | "hideHardwareSpecs": false,
1117 | "memoryGiB": 192,
1118 | "name": "ml.c5.24xlarge",
1119 | "vcpuNum": 96
1120 | },
1121 | {
1122 | "_defaultOrder": 29,
1123 | "_isFastLaunch": true,
1124 | "category": "Accelerated computing",
1125 | "gpuNum": 1,
1126 | "hideHardwareSpecs": false,
1127 | "memoryGiB": 16,
1128 | "name": "ml.g4dn.xlarge",
1129 | "vcpuNum": 4
1130 | },
1131 | {
1132 | "_defaultOrder": 30,
1133 | "_isFastLaunch": false,
1134 | "category": "Accelerated computing",
1135 | "gpuNum": 1,
1136 | "hideHardwareSpecs": false,
1137 | "memoryGiB": 32,
1138 | "name": "ml.g4dn.2xlarge",
1139 | "vcpuNum": 8
1140 | },
1141 | {
1142 | "_defaultOrder": 31,
1143 | "_isFastLaunch": false,
1144 | "category": "Accelerated computing",
1145 | "gpuNum": 1,
1146 | "hideHardwareSpecs": false,
1147 | "memoryGiB": 64,
1148 | "name": "ml.g4dn.4xlarge",
1149 | "vcpuNum": 16
1150 | },
1151 | {
1152 | "_defaultOrder": 32,
1153 | "_isFastLaunch": false,
1154 | "category": "Accelerated computing",
1155 | "gpuNum": 1,
1156 | "hideHardwareSpecs": false,
1157 | "memoryGiB": 128,
1158 | "name": "ml.g4dn.8xlarge",
1159 | "vcpuNum": 32
1160 | },
1161 | {
1162 | "_defaultOrder": 33,
1163 | "_isFastLaunch": false,
1164 | "category": "Accelerated computing",
1165 | "gpuNum": 4,
1166 | "hideHardwareSpecs": false,
1167 | "memoryGiB": 192,
1168 | "name": "ml.g4dn.12xlarge",
1169 | "vcpuNum": 48
1170 | },
1171 | {
1172 | "_defaultOrder": 34,
1173 | "_isFastLaunch": false,
1174 | "category": "Accelerated computing",
1175 | "gpuNum": 1,
1176 | "hideHardwareSpecs": false,
1177 | "memoryGiB": 256,
1178 | "name": "ml.g4dn.16xlarge",
1179 | "vcpuNum": 64
1180 | },
1181 | {
1182 | "_defaultOrder": 35,
1183 | "_isFastLaunch": false,
1184 | "category": "Accelerated computing",
1185 | "gpuNum": 1,
1186 | "hideHardwareSpecs": false,
1187 | "memoryGiB": 61,
1188 | "name": "ml.p3.2xlarge",
1189 | "vcpuNum": 8
1190 | },
1191 | {
1192 | "_defaultOrder": 36,
1193 | "_isFastLaunch": false,
1194 | "category": "Accelerated computing",
1195 | "gpuNum": 4,
1196 | "hideHardwareSpecs": false,
1197 | "memoryGiB": 244,
1198 | "name": "ml.p3.8xlarge",
1199 | "vcpuNum": 32
1200 | },
1201 | {
1202 | "_defaultOrder": 37,
1203 | "_isFastLaunch": false,
1204 | "category": "Accelerated computing",
1205 | "gpuNum": 8,
1206 | "hideHardwareSpecs": false,
1207 | "memoryGiB": 488,
1208 | "name": "ml.p3.16xlarge",
1209 | "vcpuNum": 64
1210 | },
1211 | {
1212 | "_defaultOrder": 38,
1213 | "_isFastLaunch": false,
1214 | "category": "Accelerated computing",
1215 | "gpuNum": 8,
1216 | "hideHardwareSpecs": false,
1217 | "memoryGiB": 768,
1218 | "name": "ml.p3dn.24xlarge",
1219 | "vcpuNum": 96
1220 | },
1221 | {
1222 | "_defaultOrder": 39,
1223 | "_isFastLaunch": false,
1224 | "category": "Memory Optimized",
1225 | "gpuNum": 0,
1226 | "hideHardwareSpecs": false,
1227 | "memoryGiB": 16,
1228 | "name": "ml.r5.large",
1229 | "vcpuNum": 2
1230 | },
1231 | {
1232 | "_defaultOrder": 40,
1233 | "_isFastLaunch": false,
1234 | "category": "Memory Optimized",
1235 | "gpuNum": 0,
1236 | "hideHardwareSpecs": false,
1237 | "memoryGiB": 32,
1238 | "name": "ml.r5.xlarge",
1239 | "vcpuNum": 4
1240 | },
1241 | {
1242 | "_defaultOrder": 41,
1243 | "_isFastLaunch": false,
1244 | "category": "Memory Optimized",
1245 | "gpuNum": 0,
1246 | "hideHardwareSpecs": false,
1247 | "memoryGiB": 64,
1248 | "name": "ml.r5.2xlarge",
1249 | "vcpuNum": 8
1250 | },
1251 | {
1252 | "_defaultOrder": 42,
1253 | "_isFastLaunch": false,
1254 | "category": "Memory Optimized",
1255 | "gpuNum": 0,
1256 | "hideHardwareSpecs": false,
1257 | "memoryGiB": 128,
1258 | "name": "ml.r5.4xlarge",
1259 | "vcpuNum": 16
1260 | },
1261 | {
1262 | "_defaultOrder": 43,
1263 | "_isFastLaunch": false,
1264 | "category": "Memory Optimized",
1265 | "gpuNum": 0,
1266 | "hideHardwareSpecs": false,
1267 | "memoryGiB": 256,
1268 | "name": "ml.r5.8xlarge",
1269 | "vcpuNum": 32
1270 | },
1271 | {
1272 | "_defaultOrder": 44,
1273 | "_isFastLaunch": false,
1274 | "category": "Memory Optimized",
1275 | "gpuNum": 0,
1276 | "hideHardwareSpecs": false,
1277 | "memoryGiB": 384,
1278 | "name": "ml.r5.12xlarge",
1279 | "vcpuNum": 48
1280 | },
1281 | {
1282 | "_defaultOrder": 45,
1283 | "_isFastLaunch": false,
1284 | "category": "Memory Optimized",
1285 | "gpuNum": 0,
1286 | "hideHardwareSpecs": false,
1287 | "memoryGiB": 512,
1288 | "name": "ml.r5.16xlarge",
1289 | "vcpuNum": 64
1290 | },
1291 | {
1292 | "_defaultOrder": 46,
1293 | "_isFastLaunch": false,
1294 | "category": "Memory Optimized",
1295 | "gpuNum": 0,
1296 | "hideHardwareSpecs": false,
1297 | "memoryGiB": 768,
1298 | "name": "ml.r5.24xlarge",
1299 | "vcpuNum": 96
1300 | },
1301 | {
1302 | "_defaultOrder": 47,
1303 | "_isFastLaunch": false,
1304 | "category": "Accelerated computing",
1305 | "gpuNum": 1,
1306 | "hideHardwareSpecs": false,
1307 | "memoryGiB": 16,
1308 | "name": "ml.g5.xlarge",
1309 | "vcpuNum": 4
1310 | },
1311 | {
1312 | "_defaultOrder": 48,
1313 | "_isFastLaunch": false,
1314 | "category": "Accelerated computing",
1315 | "gpuNum": 1,
1316 | "hideHardwareSpecs": false,
1317 | "memoryGiB": 32,
1318 | "name": "ml.g5.2xlarge",
1319 | "vcpuNum": 8
1320 | },
1321 | {
1322 | "_defaultOrder": 49,
1323 | "_isFastLaunch": false,
1324 | "category": "Accelerated computing",
1325 | "gpuNum": 1,
1326 | "hideHardwareSpecs": false,
1327 | "memoryGiB": 64,
1328 | "name": "ml.g5.4xlarge",
1329 | "vcpuNum": 16
1330 | },
1331 | {
1332 | "_defaultOrder": 50,
1333 | "_isFastLaunch": false,
1334 | "category": "Accelerated computing",
1335 | "gpuNum": 1,
1336 | "hideHardwareSpecs": false,
1337 | "memoryGiB": 128,
1338 | "name": "ml.g5.8xlarge",
1339 | "vcpuNum": 32
1340 | },
1341 | {
1342 | "_defaultOrder": 51,
1343 | "_isFastLaunch": false,
1344 | "category": "Accelerated computing",
1345 | "gpuNum": 1,
1346 | "hideHardwareSpecs": false,
1347 | "memoryGiB": 256,
1348 | "name": "ml.g5.16xlarge",
1349 | "vcpuNum": 64
1350 | },
1351 | {
1352 | "_defaultOrder": 52,
1353 | "_isFastLaunch": false,
1354 | "category": "Accelerated computing",
1355 | "gpuNum": 4,
1356 | "hideHardwareSpecs": false,
1357 | "memoryGiB": 192,
1358 | "name": "ml.g5.12xlarge",
1359 | "vcpuNum": 48
1360 | },
1361 | {
1362 | "_defaultOrder": 53,
1363 | "_isFastLaunch": false,
1364 | "category": "Accelerated computing",
1365 | "gpuNum": 4,
1366 | "hideHardwareSpecs": false,
1367 | "memoryGiB": 384,
1368 | "name": "ml.g5.24xlarge",
1369 | "vcpuNum": 96
1370 | },
1371 | {
1372 | "_defaultOrder": 54,
1373 | "_isFastLaunch": false,
1374 | "category": "Accelerated computing",
1375 | "gpuNum": 8,
1376 | "hideHardwareSpecs": false,
1377 | "memoryGiB": 768,
1378 | "name": "ml.g5.48xlarge",
1379 | "vcpuNum": 192
1380 | },
1381 | {
1382 | "_defaultOrder": 55,
1383 | "_isFastLaunch": false,
1384 | "category": "Accelerated computing",
1385 | "gpuNum": 8,
1386 | "hideHardwareSpecs": false,
1387 | "memoryGiB": 1152,
1388 | "name": "ml.p4d.24xlarge",
1389 | "vcpuNum": 96
1390 | },
1391 | {
1392 | "_defaultOrder": 56,
1393 | "_isFastLaunch": false,
1394 | "category": "Accelerated computing",
1395 | "gpuNum": 8,
1396 | "hideHardwareSpecs": false,
1397 | "memoryGiB": 1152,
1398 | "name": "ml.p4de.24xlarge",
1399 | "vcpuNum": 96
1400 | }
1401 | ],
1402 | "instance_type": "ml.t3.medium",
1403 | "interpreter": {
1404 | "hash": "04ffa0b675ec4736afd1210dd81a6f70b0b4fa83298b056bd6b4e16ede0b389c"
1405 | },
1406 | "kernelspec": {
1407 | "display_name": "Python 3 (Base Python 2.0)",
1408 | "language": "python",
1409 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/sagemaker-base-python-38"
1410 | },
1411 | "language_info": {
1412 | "codemirror_mode": {
1413 | "name": "ipython",
1414 | "version": 3
1415 | },
1416 | "file_extension": ".py",
1417 | "mimetype": "text/x-python",
1418 | "name": "python",
1419 | "nbconvert_exporter": "python",
1420 | "pygments_lexer": "ipython3",
1421 | "version": "3.8.12"
1422 | }
1423 | },
1424 | "nbformat": 4,
1425 | "nbformat_minor": 4
1426 | }
1427 |
--------------------------------------------------------------------------------
/lab/nginxBasicAuth/source_dir/requirements.txt:
--------------------------------------------------------------------------------
1 | mlflow==2.14.2
2 | matplotlib
3 | sagemaker-experiments
4 |
--------------------------------------------------------------------------------
/lab/nginxBasicAuth/source_dir/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 |
4 | from setuptools import setup, find_packages
5 |
6 | setup(name='sagemaker-example',
7 | version='1.0',
8 | description='SageMaker MLFlow Example.',
9 | author='Paolo',
10 | author_email='frpaolo@amazon.at',
11 | packages=find_packages(exclude=('tests', 'docs')))
--------------------------------------------------------------------------------
/lab/nginxBasicAuth/source_dir/train.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 |
4 | import os
5 | import logging
6 | import argparse
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.ensemble import RandomForestRegressor
10 |
11 | import mlflow
12 | import mlflow.sklearn
13 | from mlflow.tracking import MlflowClient
14 |
15 | import joblib
16 | import boto3
17 | import json
18 | import time
19 |
20 | from smexperiments.tracker import Tracker
21 |
22 | logging.basicConfig(level=logging.INFO)
23 |
24 | def retrieve_credentials(region_name, secret_name):
25 | session = boto3.session.Session()
26 | client = session.client(
27 | service_name='secretsmanager',
28 | region_name=region_name
29 | )
30 |
31 | kwarg = {'SecretId': secret_name}
32 | secret = client.get_secret_value(**kwarg)
33 | credentials = {}
34 |
35 | credentials['username'] = json.loads(secret['SecretString'])['username']
36 | credentials['password'] = json.loads(secret['SecretString'])['password']
37 |
38 | return credentials
39 |
40 | def print_auto_logged_info(r):
41 | tags = {k: v for k, v in r.data.tags.items()}
42 | artifacts = [f.path for f in MlflowClient().list_artifacts(r.info.run_id, "model")]
43 | print("run_id: {}".format(r.info.run_id))
44 | print("artifacts: {}".format(artifacts))
45 | print("params: {}".format(r.data.params))
46 | print("metrics: {}".format(r.data.metrics))
47 | #print("tags: {}".format(tags))
48 |
49 | if __name__ =='__main__':
50 | parser = argparse.ArgumentParser()
51 | # MLflow related parameters
52 | parser.add_argument("--tracking_uri", type=str)
53 | parser.add_argument("--experiment_name", type=str)
54 | parser.add_argument("--region", type=str, default='us-west-2')
55 | parser.add_argument("--secret_name", type=str)
56 | # hyperparameters sent by the client are passed as command-line arguments to the script.
57 | # to simplify the demo we don't use all sklearn RandomForest hyperparameters
58 | parser.add_argument('--n-estimators', type=int, default=10)
59 | parser.add_argument('--min-samples-leaf', type=int, default=3)
60 |
61 | # Data, model, and output directories
62 | parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
63 | parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
64 | parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
65 | parser.add_argument('--train-file', type=str, default='california_train.csv')
66 | parser.add_argument('--test-file', type=str, default='california_test.csv')
67 | parser.add_argument('--user', type=str, default='sagemaker')
68 | parser.add_argument('--features', type=str) # we ask user to explicitly name features
69 | parser.add_argument('--target', type=str) # we ask user to explicitly name the target
70 |
71 | args, _ = parser.parse_known_args()
72 |
73 | logging.info('reading data')
74 | train_df = pd.read_csv(os.path.join(args.train, args.train_file))
75 | test_df = pd.read_csv(os.path.join(args.test, args.test_file))
76 |
77 | logging.info('building training and testing datasets')
78 | X_train = train_df[args.features.split()]
79 | X_test = test_df[args.features.split()]
80 | y_train = train_df[args.target]
81 | y_test = test_df[args.target]
82 |
83 |
84 | # setting these env variables makes the mlflow SDK send the "Authorization: Basic" header
85 | credentials = retrieve_credentials(args.region, args.secret_name)
86 | os.environ['MLFLOW_TRACKING_USERNAME'] = credentials['username']
87 | os.environ['MLFLOW_TRACKING_PASSWORD'] = credentials['password']
88 |
89 | # set remote mlflow server
90 | mlflow.set_tracking_uri(args.tracking_uri)
91 | experiment = mlflow.set_experiment(args.experiment_name)
92 |
93 | mlflow.autolog()
94 |
95 | with mlflow.start_run() as run:
96 | params = {
97 | "n-estimators": args.n_estimators,
98 | "min-samples-leaf": args.min_samples_leaf,
99 | "features": args.features
100 | }
101 | mlflow.log_params(params)
102 |
103 | # TRAIN
104 | logging.info('training model')
105 | model = RandomForestRegressor(
106 | n_estimators=args.n_estimators,
107 | min_samples_leaf=args.min_samples_leaf,
108 | n_jobs=-1
109 | )
110 |
111 | model.fit(X_train, y_train)
112 |
113 | # ABS ERROR AND LOG COUPLE PERF METRICS
114 | logging.info('evaluating model')
115 | abs_err = np.abs(model.predict(X_test) - y_test)
116 |
117 | for q in [10, 50, 90]:
118 | logging.info(f'AE-at-{q}th-percentile: {np.percentile(a=abs_err, q=q)}')
119 | mlflow.log_metric(f'AE-at-{str(q)}th-percentile', np.percentile(a=abs_err, q=q))
120 |
121 | # SAVE MODEL
122 | logging.info('saving model in MLflow')
123 | mlflow.sklearn.log_model(model, "model")
124 | sm_data = json.loads(os.environ.get('SM_TRAINING_ENV'))
125 | job_name = sm_data['job_name']
126 |
127 | # Overwrite system tags
128 | mlflow.set_tags(
129 | {
130 | 'mlflow.source.name': f"https://{args.region}.console.aws.amazon.com/sagemaker/home?region={args.region}#/jobs/{job_name}",
131 | 'mlflow.source.type': 'JOB',
132 | 'mlflow.user': args.user
133 | }
134 | )
135 | # Shovel all SageMaker related data into mlflow
136 | mlflow.set_tags(sm_data)
137 |
138 | run_id = run.info.run_id
139 | experiment_id = experiment.experiment_id
140 |
141 | r = mlflow.get_run(run_id=run_id)
142 | print_auto_logged_info(r)
143 |
144 | artifacts = [f.path for f in MlflowClient().list_artifacts(r.info.run_id, "model")]
145 |
146 | tracker_parameters = {
147 | "run_id": run_id,
148 | "experiment_id": experiment_id,
149 | "mlflow-run-url": f"{args.tracking_uri}/#/experiments/{experiment_id}/runs/{run_id}"
150 | }
151 | try:
152 | with Tracker.load() as tracker:
153 | tracker.log_parameters(tracker_parameters)
154 | tracker.log_parameters(r.data.params)
155 | for metric_name, value in r.data.metrics.items():
156 | tracker.log_metric(metric_name=metric_name, value=value)
157 | for artifact in artifacts:
158 | tracker.log_output(name=f"MLFlow.{artifact}", value=f"{r.info.artifact_uri}/{artifact}")
159 | # Nullify default SageMaker.ModelArtifact
160 | tracker.log_output(name="SageMaker.ModelArtifact", value="NA")
161 | print("Loaded existing tracker")
162 | except Exception:
163 | print("Could not load tracker (likely running in local mode). Create a new one")
164 | create_date = time.strftime("%Y-%m-%d-%H-%M-%S")
165 | tracker_name = f"mlflow-tracker-{create_date}"
166 | with Tracker.create(display_name=tracker_name) as tracker:
167 | tracker.log_parameters(tracker_parameters)
168 | tracker.log_parameters(r.data.params)
169 | print("Metric cannot be logged when creating a tracker in this way")
170 | for artifact in artifacts:
171 | tracker.log_output(name=f"MLFlow.{artifact}", value=f"{r.info.artifact_uri}/{artifact}")
172 | tracker.log_output(name="SageMaker.ModelArtifact", value="NA")
--------------------------------------------------------------------------------
/resize-cloud9.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Specify the desired volume size in GiB as a command line argument. If not specified, default to 20 GiB.
4 | SIZE=${1:-20}
5 |
6 | # Get the ID of the environment host Amazon EC2 instance.
7 | INSTANCEID=$(curl http://169.254.169.254/latest/meta-data/instance-id)
8 | REGION=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/\(.*\)[a-z]/\1/')
9 |
10 | # Get the ID of the Amazon EBS volume associated with the instance.
11 | VOLUMEID=$(aws ec2 describe-instances \
12 | --instance-id $INSTANCEID \
13 | --query "Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId" \
14 | --output text \
15 | --region $REGION)
16 |
17 | # Resize the EBS volume.
18 | aws ec2 modify-volume --volume-id $VOLUMEID --size $SIZE
19 |
20 | # Wait for the resize to finish.
21 | while [ \
22 | "$(aws ec2 describe-volumes-modifications \
23 | --volume-id $VOLUMEID \
24 | --filters Name=modification-state,Values="optimizing","completed" \
25 | --query "length(VolumesModifications)"\
26 | --output text)" != "1" ]; do
27 | sleep 1
28 | done
29 |
30 | # Check whether the root device is /dev/xvda (non-NVMe) or an NVMe device
31 | if [[ -e "/dev/xvda" && $(readlink -f /dev/xvda) = "/dev/xvda" ]]
32 | then
33 | # Rewrite the partition table so that the partition takes up all the space that it can.
34 | sudo growpart /dev/xvda 1
35 |
36 | # Expand the size of the file system.
37 | # Check if we're on AL2
38 | STR=$(cat /etc/os-release)
39 | SUB="VERSION_ID=\"2\""
40 | if [[ "$STR" == *"$SUB"* ]]
41 | then
42 | sudo xfs_growfs -d /
43 | else
44 | sudo resize2fs /dev/xvda1
45 | fi
46 |
47 | else
48 | # Rewrite the partition table so that the partition takes up all the space that it can.
49 | sudo growpart /dev/nvme0n1 1
50 |
51 | # Expand the size of the file system.
52 | # Check if we're on AL2
53 | STR=$(cat /etc/os-release)
54 | SUB="VERSION_ID=\"2\""
55 | if [[ "$STR" == *"$SUB"* ]]
56 | then
57 | sudo xfs_growfs -d /
58 | else
59 | sudo resize2fs /dev/nvme0n1p1
60 | fi
61 | fi
62 |
--------------------------------------------------------------------------------
/src/mlflow/.dockerignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | npm-debug.log
--------------------------------------------------------------------------------
/src/mlflow/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11.0
2 |
3 | RUN pip install \
4 | mlflow==2.14.2 \
5 | pymysql==1.0.2 \
6 | boto3 && \
7 | mkdir /mlflow/
8 |
9 | EXPOSE 5000
10 |
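11 | # BUCKET, USERNAME, PASSWORD, HOST, PORT and DATABASE are expected as environment
12 | # variables at runtime (e.g. injected by the ECS task definition)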
11 | CMD mlflow server \
12 | --host 0.0.0.0 \
13 | --port 5000 \
14 | --default-artifact-root ${BUCKET} \
15 | --backend-store-uri mysql+pymysql://${USERNAME}:${PASSWORD}@${HOST}:${PORT}/${DATABASE}
16 |
--------------------------------------------------------------------------------
/src/nginx/basic_auth/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nginx:1.17.6
2 | RUN apt-get update
3 | RUN apt-get install openssl -y
4 | # Remove default Nginx config
5 | RUN rm /etc/nginx/nginx.conf
6 | # Copy the modified Nginx conf
7 | COPY nginx.conf /etc/nginx/nginx.conf
8 | RUN ln -sf /dev/stdout /var/log/nginx/access.log \
9 | && ln -sf /dev/stderr /var/log/nginx/error.log
10 |
11 | COPY script.sh /root/script.sh
12 | RUN chmod +x /root/script.sh
13 |
14 | CMD /root/script.sh && nginx -g 'daemon off;'
--------------------------------------------------------------------------------
/src/nginx/basic_auth/nginx.conf:
--------------------------------------------------------------------------------
1 | # Define the user that will own and run the Nginx server
2 |
3 | worker_processes 1;
4 | #daemon off; # Prevent forking
5 |
6 | pid /tmp/nginx.pid;
7 | error_log /var/log/nginx/error.log;
8 |
9 | events {
10 | # defaults
11 | }
12 |
13 | http {
14 | include /etc/nginx/mime.types;
15 | default_type application/octet-stream;
16 | access_log /var/log/nginx/access.log combined;
17 |
18 | server {
19 | listen 80;
20 | client_max_body_size 5m;
21 |
22 | keepalive_timeout 5;
23 | proxy_read_timeout 1200s;
24 |
25 | location /elb-status {
26 | access_log off;
27 | return 200;
28 | }
29 |
30 | location / {
31 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
32 | proxy_set_header Host $http_host;
33 | proxy_redirect off;
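34 | # Resolve the MLflow service name at request time via the Amazon-provided
35 | # VPC DNS resolver (the service is registered under .http-api.local, e.g. via Cloud Map)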
34 | resolver 169.254.169.253;
35 | set $mlflow mlflowservice.http-api.local;
36 | proxy_pass http://$mlflow:5000;
37 | auth_basic "Administrator’s Area";
38 | auth_basic_user_file /etc/nginx/.htpasswd;
39 | }
40 | }
41 | }
--------------------------------------------------------------------------------
/src/nginx/basic_auth/script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
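2 | # Build the htpasswd file used by Nginx for basic auth: a single "user:hash" line,
3 | # with the password hashed using openssl's MD5-based crypt scheme (-1)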
2 | echo -n "$MLFLOW_USERNAME:" >> /etc/nginx/.htpasswd
3 | openssl passwd -1 "$MLFLOW_PASSWORD" >> /etc/nginx/.htpasswd
4 |
--------------------------------------------------------------------------------