├── .gitmodules ├── AWS-Passport ├── Passport-Custom-Lens.json └── README.md ├── Amazon-ECS-Lens ├── Amazon_ECS_Lens.json └── README.md ├── Amazon-S3-Lens ├── Amazon_S3_Lens.json └── README.md ├── ApiGwLambda ├── README.md └── custom-lens-apigw-lambda-v0.1.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DocumentDB ├── README.md └── custom-lens-documentDB.json ├── DynamoDB ├── README.md └── custom-lensddb-v1.0.json ├── ElastiCache ├── README.md └── custom-lens-elasticache-v2.0 .json ├── Glue ├── README.md └── custom-lens-glue-v2.0.json ├── IDP-custom-lens ├── Amazon_Machine_Learning_IDP_Lens.json └── README.md ├── Iceberg-S3-Lens ├── README.md └── custom-lens-iceberg-amaozn-s3-v1.0.json ├── LICENSE ├── MSFT-Lens ├── .gitkeep ├── Microsoft_On_AWS_Lens.json ├── README.md └── images │ ├── .gitkeep │ ├── import-custom-lens.png │ ├── preview-interface.png │ ├── preview-lens.png │ └── wa-tool-console.png ├── ORR-Lens ├── ORR-Whitepaper-Sample-PUBLISHED.json └── README.md ├── OpenSearch ├── README.md └── custom-lens-OpenSearch.json ├── README.md ├── SaaS-Business-Lens ├── README.md ├── img │ └── WAView.png └── saas_biz_custom_lens.json ├── SageMaker-Flower-Lens ├── README.md └── custom-lens-sagemaker-flower-v1.0.json └── Streaming-Media-Lens ├── README.md └── streaming-media-lens-v1.02.json /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "SAP-Lens"] 2 | path = SAP-Lens 3 | url = https://github.com/aws-samples/aws-sap-lens-well-architected 4 | branch = main 5 | -------------------------------------------------------------------------------- /AWS-Passport/Passport-Custom-Lens.json: -------------------------------------------------------------------------------- 1 | { 2 | "name":"AWS Passport Custom Lens", 3 | "schemaVersion":"2021-11-01", 4 | "description":"ISVs that aspire to expand their business internationally face a unique set of challenges, including operational challenges, cultural adaptation, slow market entry, and failure to realize growth. AWS Passport provides guided access to AWS resources, programs, and partners to help address four key needs: 1/ Strategic: half day in person workshop to define a joint growth strategy, identify gaps, and achieve executive alignment, 2/ Technical: workshop to provide guidance on regulation and compliance, architectural best practices, 3/ Operational: referral to local resources for facilities, customer support, legal, HR, recruitment, sales, logistics, and other services, and 4/ Go-to-market: workshop to define a joint go-to-market strategy for pipeline development in the new target geography. This custom lens focuses on architectural best practices", 5 | "pillars":[ 6 | { 7 | "id":"overview", 8 | "name":"Overview", 9 | "questions":[ 10 | { 11 | "id":"overview01", 12 | "title":"Have you completed a Well Architected Framework Review for all workloads to be deployed to the new region?", 13 | "description":"Carrying technical debt to new regions increases risk. It is recommended to ensure all workloads are well-architected before deployment to new regions.", 14 | "choices":[ 15 | { 16 | "id":"choice1", 17 | "title":"Yes.", 18 | "helpfulResource":{ 19 | "displayText":"All workloads are well architected." 20 | }, 21 | "improvementPlan":{ 22 | "displayText":"A Well Architected Framework Review should be conducted for all workloads to be deployed in new regions. Schedule some time with an AWS Solutions Architect to guide you through a Well Architected Framework Review.", 23 | "url":"https://aws.amazon.com/architecture/" 24 | } 25 | }, 26 | { 27 | "id":"choice2", 28 | "title":"No.", 29 | "helpfulResource":{ 30 | "displayText":"A Well Architected Framework Review has not been completed for all workloads to be deployed to new regions.", 31 | "url":"https://aws.amazon.com/architecture/" 32 | }, 33 | "improvementPlan":{ 34 | "displayText":"A Well Architected Framework Review should be conducted for all workloads to be deployed in new regions. Schedule some time with an AWS Solutions Architect to guide you through a Well Architected Framework Review.", 35 | "url":"https://aws.amazon.com/architecture/" 36 | } 37 | } 38 | ], 39 | "riskRules":[ 40 | { 41 | "condition":"choice1", 42 | "risk":"NO_RISK" 43 | }, 44 | { 45 | "condition":"choice2", 46 | "risk":"HIGH_RISK" 47 | }, 48 | { 49 | "condition":"default", 50 | "risk":"MEDIUM_RISK" 51 | } 52 | ] 53 | }, 54 | { 55 | "id":"overview02", 56 | "title":"Have all risks identified in the Well Architected Framework Review been mitigated?", 57 | "description":"You can use the AWS Well-Architected Tool to identify and remediate risks in your workloads that map to the five pillars of the AWS Well-Architected Framework: operational excellence, security, reliability, performance efficiency, and cost optimization.", 58 | "url":"https://aws.amazon.com/well-architected-tool/", 59 | "choices":[ 60 | { 61 | "id":"choice1", 62 | "title":"Yes.", 63 | "helpfulResource":{ 64 | "displayText":"Risks have been identified and mitigated." 65 | }, 66 | "improvementPlan":{ 67 | "displayText":"Schedule some time with an AWS Solutions Architect to review risks and explore mitigations." 68 | } 69 | }, 70 | { 71 | "id":"choice2", 72 | "title":"No.", 73 | "helpfulResource":{ 74 | "displayText":"Risks have not been identified or mitigated.", 75 | "url":"https://aws.amazon.com/architecture/" 76 | }, 77 | "improvementPlan":{ 78 | "displayText":"Schedule some time with an AWS Solutions Architect to review risks and explore mitigations." 79 | } 80 | } 81 | ], 82 | "riskRules":[ 83 | { 84 | "condition":"choice1", 85 | "risk":"NO_RISK" 86 | }, 87 | { 88 | "condition":"choice2", 89 | "risk":"HIGH_RISK" 90 | }, 91 | { 92 | "condition":"default", 93 | "risk":"MEDIUM_RISK" 94 | } 95 | ] 96 | }, 97 | { 98 | "id":"overview03", 99 | "title":"Is the architecture modernized?", 100 | "description":"Modernizing your applications helps you reduce costs, gain efficiencies, and make the most of your existing investments. It involves a multi-dimensional approach to adopt and use new technology, to deliver portfolio, application, and infrastructure value faster, and to position your organization to scale at an optimal price. After you optimize your applications, you must operate in that new, modernized model without disruption to simplify your business operations, architecture, and overall engineering practices.", 101 | "choices":[ 102 | { 103 | "id":"choice1", 104 | "title":"Yes.", 105 | "helpfulResource":{ 106 | "displayText":"The workload is modernized.", 107 | "url":"https://docs.aws.amazon.com/prescriptive-guidance/latest/strategy-modernizing-applications/welcome.html" 108 | }, 109 | "improvementPlan":{ 110 | "displayText":"Consider employing a strategy for modernizing applications in the AWS Cloud", 111 | "url":"https://docs.aws.amazon.com/prescriptive-guidance/latest/strategy-modernizing-applications/welcome.html" 112 | } 113 | }, 114 | { 115 | "id":"choice2", 116 | "title":"No.", 117 | "helpfulResource":{ 118 | "displayText":"The workload is not modernized." 119 | }, 120 | "improvementPlan":{ 121 | "displayText":"Consider employing a strategy for modernizing applications in the AWS Cloud", 122 | "url":"https://docs.aws.amazon.com/prescriptive-guidance/latest/strategy-modernizing-applications/welcome.html" 123 | } 124 | } 125 | ], 126 | "riskRules":[ 127 | { 128 | "condition":"choice1", 129 | "risk":"NO_RISK" 130 | }, 131 | { 132 | "condition":"choice2", 133 | "risk":"HIGH_RISK" 134 | }, 135 | { 136 | "condition":"default", 137 | "risk":"MEDIUM_RISK" 138 | } 139 | ] 140 | }, 141 | { 142 | "id":"overview04", 143 | "title":"Are you using Infrastructure as Code (IaC) to deploy workloads?", 144 | "description":"Manual infrastructure management is time consuming and prone to errors, especially when you manage applications at scale. Infrastructure as code lets you define your infrastructure desired state without including all the steps to get to that state. It automates infrastructure management so developers can focus on building and improving applications instead of managing environments. Organizations use infrastructure as code to control costs, reduce risks, and respond with speed to new business opportunities.", 145 | "choices":[ 146 | { 147 | "id":"choice1", 148 | "title":"Yes.", 149 | "helpfulResource":{ 150 | "displayText":"Infrastructure as code is used for all production deployments." 151 | }, 152 | "improvementPlan":{ 153 | "displayText":"There are many IAC tools available from AWS and AWS partners. Schedule some time with an AWS Solutions Architect to explore which tools are best for you.", 154 | "url":"https://aws.amazon.com/what-is/iac/" 155 | } 156 | }, 157 | { 158 | "id":"choice2", 159 | "title":"No.", 160 | "helpfulResource":{ 161 | "displayText":"Manual methods are used for deployments.", 162 | "url":"https://aws.amazon.com/what-is/iac/" 163 | }, 164 | "improvementPlan":{ 165 | "displayText":"There are many IAC tools available from AWS and AWS partners. Schedule some time with an AWS Solutions Architect to explore which tools are best for you.", 166 | "url":"https://aws.amazon.com/what-is/iac/" 167 | } 168 | } 169 | ], 170 | "riskRules":[ 171 | { 172 | "condition":"choice1", 173 | "risk":"NO_RISK" 174 | }, 175 | { 176 | "condition":"choice2", 177 | "risk":"HIGH_RISK" 178 | }, 179 | { 180 | "condition":"default", 181 | "risk":"MEDIUM_RISK" 182 | } 183 | ] 184 | }, 185 | { 186 | "id":"overview05", 187 | "title":"Is service and feature parity for target regions known?", 188 | "description":"AWS offers 33 regions to choose from. There may be some variation in services and features from region to region. It is important to understand any disparity before expanding to new regions.", 189 | "choices":[ 190 | { 191 | "id":"choice1", 192 | "title":"Yes.", 193 | "helpfulResource":{ 194 | "displayText":"Service and feature parity is known." 195 | }, 196 | "improvementPlan":{ 197 | "displayText":"Schedule some time with an AWS Solutions Architect or AWS Technical Account Manager to explore service and feature availability in desired regions." 198 | } 199 | }, 200 | { 201 | "id":"choice2", 202 | "title":"No.", 203 | "helpfulResource":{ 204 | "displayText":"Service and feature parity is not known.", 205 | "url":"https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/" 206 | }, 207 | "improvementPlan":{ 208 | "displayText":"Schedule some time with an AWS Solutions Architect or AWS Technical Account Manager to explore service and feature availability in desired regions." 209 | } 210 | } 211 | ], 212 | "riskRules":[ 213 | { 214 | "condition":"choice1", 215 | "risk":"NO_RISK" 216 | }, 217 | { 218 | "condition":"choice2", 219 | "risk":"HIGH_RISK" 220 | }, 221 | { 222 | "condition":"default", 223 | "risk":"MEDIUM_RISK" 224 | } 225 | ] 226 | }, 227 | { 228 | "id":"overview06", 229 | "title":"Is service and feature parity for target regions acceptable?", 230 | "description":"There may be cases where variations in service and feature availability will require changes in workload architecture. Make sure you understand and plan for any changes.", 231 | "choices":[ 232 | { 233 | "id":"choice1", 234 | "title":"Yes.", 235 | "helpfulResource":{ 236 | "displayText":"Service and feature parity is known and has been accounted for in the target region architecture." 237 | }, 238 | "improvementPlan":{ 239 | "displayText":"Schedule some time with an AWS Solutions Architect or AWS Technical Account Manager to explore service and feature availability in desired regions." 240 | } 241 | }, 242 | { 243 | "id":"choice2", 244 | "title":"No.", 245 | "helpfulResource":{ 246 | "displayText":"There remains some ambiguity about how to identify or mitigate service or feature disparity." 247 | }, 248 | "improvementPlan":{ 249 | "displayText":"Schedule some time with an AWS Solutions Architect or AWS Technical Account Manager to explore service and feature availability in desired regions." 250 | } 251 | } 252 | ], 253 | "riskRules":[ 254 | { 255 | "condition":"choice1", 256 | "risk":"NO_RISK" 257 | }, 258 | { 259 | "condition":"choice2", 260 | "risk":"HIGH_RISK" 261 | }, 262 | { 263 | "condition":"default", 264 | "risk":"MEDIUM_RISK" 265 | } 266 | ] 267 | }, 268 | { 269 | "id":"overview07", 270 | "title":"Is service capacity for target regions known?", 271 | "description":"AWS offers 33 regions to choose from. There may be some variation in capacity from region to region. It is important to understand any large increases in capacity before expansing to new regions.", 272 | "choices":[ 273 | { 274 | "id":"choice1", 275 | "title":"Yes.", 276 | "helpfulResource":{ 277 | "displayText":"Capacity is known." 278 | }, 279 | "improvementPlan":{ 280 | "displayText":"Schedule some time with an AWS Technical Account Manager to explore capacity in desired regions. AWS offers a variety of tools including capcity reservation to ensure needed capacity is available.", 281 | "url":"https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/capacity-reservation-overview.html" 282 | } 283 | }, 284 | { 285 | "id":"choice2", 286 | "title":"No.", 287 | "helpfulResource":{ 288 | "displayText":"Capacity is NOT known." 289 | }, 290 | "improvementPlan":{ 291 | "displayText":"Schedule some time with an AWS Technical Account Manager to explore capacity in desired regions. AWS offers a variety of tools including capcity reservation to ensure needed capacity is available.", 292 | "url":"https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/capacity-reservation-overview.html" 293 | } 294 | } 295 | ], 296 | "riskRules":[ 297 | { 298 | "condition":"choice1", 299 | "risk":"NO_RISK" 300 | }, 301 | { 302 | "condition":"choice2", 303 | "risk":"HIGH_RISK" 304 | }, 305 | { 306 | "condition":"default", 307 | "risk":"MEDIUM_RISK" 308 | } 309 | ] 310 | }, 311 | { 312 | "id":"overview08", 313 | "title":"Is service capacity for target regions acceptable?", 314 | "description":"There may be cases where capacity constraints will require changes in workload architecture or service quota incrases. Make sure you understand and plan for any changes.", 315 | "choices":[ 316 | { 317 | "id":"choice1", 318 | "title":"Yes.", 319 | "helpfulResource":{ 320 | "displayText":"Capacity and service quotas are known and have been accounted for in the target region architecture." 321 | }, 322 | "improvementPlan":{ 323 | "displayText":"Schedule some time with an AWS Technical Account Manager to explore capacity desired regions." 324 | } 325 | }, 326 | { 327 | "id":"choice2", 328 | "title":"No.", 329 | "helpfulResource":{ 330 | "displayText":"There remains some ambiguity about how to identify or mitigate capacity or quota constraints." 331 | }, 332 | "improvementPlan":{ 333 | "displayText":"Schedule some time with an AWS Technical Account Manager to explore capacity desired regions." 334 | } 335 | } 336 | ], 337 | "riskRules":[ 338 | { 339 | "condition":"choice1", 340 | "risk":"NO_RISK" 341 | }, 342 | { 343 | "condition":"choice2", 344 | "risk":"HIGH_RISK" 345 | }, 346 | { 347 | "condition":"default", 348 | "risk":"MEDIUM_RISK" 349 | } 350 | ] 351 | }, { 352 | "id":"overview09", 353 | "title":"Is the cold start cost for target regions known?", 354 | "description":"Pricing for AWS services can vary from region to region. It's important to understand cost differences and the cost to initially deploy system architecture.", 355 | "choices":[ 356 | { 357 | "id":"choice1", 358 | "title":"Yes.", 359 | "helpfulResource":{ 360 | "displayText":"Cold start costs are known." 361 | }, 362 | "improvementPlan":{ 363 | "displayText":" Schedule some time with an AWS Technical Account Manager to explore costs in desired regions. AWS offers a variety of tools including the AWS Pricing Calculator to understand cold start cost." 364 | } 365 | }, 366 | { 367 | "id":"choice2", 368 | "title":"No.", 369 | "helpfulResource":{ 370 | "displayText":"Cold start costs are NOT known.", 371 | "url":"https://calculator.aws/#/" 372 | }, 373 | "improvementPlan":{ 374 | "displayText":" Schedule some time with an AWS Technical Account Manager to explore costs desired regions. AWS offers a variety of tools including the AWS Pricing Calculator to understand cold start cost.", 375 | "url":"https://calculator.aws/#/" 376 | } 377 | } 378 | ], 379 | "riskRules":[ 380 | { 381 | "condition":"choice1", 382 | "risk":"NO_RISK" 383 | }, 384 | { 385 | "condition":"choice2", 386 | "risk":"HIGH_RISK" 387 | }, 388 | { 389 | "condition":"default", 390 | "risk":"MEDIUM_RISK" 391 | } 392 | ] 393 | }, 394 | { 395 | "id":"overview10", 396 | "title":"Is the cold start cost for target regions acceptable?", 397 | "description":"There may be cases where cold start costs will require changes in workload architecture or service pricing. Make sure you understand and plan for any changes.", 398 | "choices":[ 399 | { 400 | "id":"choice1", 401 | "title":"Yes.", 402 | "helpfulResource":{ 403 | "displayText":"Cold start costs are known and have been accounted for in the target region." 404 | }, 405 | "improvementPlan":{ 406 | "displayText":"Schedule some time with an AWS Solutions Architect or AWS Technical Account Manager to explore cost optimization strategies.", 407 | "url":"https://aws.amazon.com/aws-cost-management/cost-optimization/" 408 | } 409 | }, 410 | { 411 | "id":"choice2", 412 | "title":"No.", 413 | "helpfulResource":{ 414 | "displayText":"There remains some ambiguity around cold start costs or costs are too high." 415 | }, 416 | "improvementPlan":{ 417 | "displayText":"Schedule some time with an AWS Solutions Architect or AWS Technical Account Manager to explore cost optimization strategies.", 418 | "url":"https://aws.amazon.com/aws-cost-management/cost-optimization/" 419 | } 420 | } 421 | ], 422 | "riskRules":[ 423 | { 424 | "condition":"choice1", 425 | "risk":"NO_RISK" 426 | }, 427 | { 428 | "condition":"choice2", 429 | "risk":"HIGH_RISK" 430 | }, 431 | { 432 | "condition":"default", 433 | "risk":"MEDIUM_RISK" 434 | } 435 | ] 436 | }, 437 | { 438 | "id":"overview11", 439 | "title":"Are the data residency, data sovereignty, and general compliance considerations known?", 440 | "description":"There may be cases where the data residency, data sovereignty, and general compliance considerations may impact your workload's architecture in a new AWS region.", 441 | "choices":[ 442 | { 443 | "id":"choice1", 444 | "title":"Yes.", 445 | "helpfulResource":{ 446 | "displayText":"Data residency, data sovereignty, and general compliance considerations are known and have been accounted for in the target region architecture." 447 | }, 448 | "improvementPlan":{ 449 | "displayText":"Schedule time to work with your internal compliance and regulatory teams to define the high-level requirements in desired regions and reach out to the AWS account team for further assistance." 450 | } 451 | }, 452 | { 453 | "id":"choice2", 454 | "title":"No.", 455 | "helpfulResource":{ 456 | "displayText":"There remains some ambiguity around the data residency, data sovereignty, and general compliance considerations." 457 | }, 458 | "improvementPlan":{ 459 | "displayText":"Schedule time to work with your internal compliance and regulatory teams to define the high-level requirements in desired regions and reach out to the AWS account team for further assistance." 460 | } 461 | } 462 | ], 463 | "riskRules":[ 464 | { 465 | "condition":"choice1", 466 | "risk":"NO_RISK" 467 | }, 468 | { 469 | "condition":"choice2", 470 | "risk":"HIGH_RISK" 471 | }, 472 | { 473 | "condition":"default", 474 | "risk":"MEDIUM_RISK" 475 | } 476 | ] 477 | } 478 | ] 479 | } 480 | ] 481 | } 482 | -------------------------------------------------------------------------------- /AWS-Passport/README.md: -------------------------------------------------------------------------------- 1 | # AWS Passport Well-Architected Custom Lens 2 | 3 | AWS Passport is a program designed to help software companies grow their businesses internationally and lower the risks of global expansion (e.g. operational challenges, cultural adaptation, slow market entry, and failure to realize growth). Software Companies selected into the AWS Passport program will receive guided access to AWS resources/programs and partners to help address strategic planning, operational, technical, and go-to-market challenges. 4 | 5 | AWS Passport Well-Architected Custom Lens specifies best practices to reduce risks associated with expansion to new regions. This custom lens is intended to be supplemental to the AWS Well Architected Framework. It is recommended that a Well Architected Framework Review be conducted across all pillars before employing this custom lens. The pillars include: 6 | * Operational excellence 7 | * Security 8 | * Reliability 9 | * Performance efficiency 10 | * Cost optimization 11 | * Sustainability 12 | 13 | The custom lens surfaces risks associated with: 14 | * Technical Debt 15 | * Mutable Infrastructure 16 | * Region Feature Parity 17 | * Region Capacity 18 | * Cold Start Costs 19 | * Data residency, data sovereignty, and general compliance 20 | 21 | ## Intended Audience 22 | 23 | CTO, Technical Leaders, Architects 24 | 25 | ## Contributing 26 | - [Ray Zaman](mailto:radzez@amazon.com), Principal Solutions Architect, AWS 27 | - [Neela Kulkarni](mailto:kulneel@amazon.com), Solutions Architect, AWS 28 | - [Viswanath Tej Nagabhatla](mailto:tejnagab@amazon.com), Senior Solutions Architect, AWS 29 | -------------------------------------------------------------------------------- /Amazon-ECS-Lens/README.md: -------------------------------------------------------------------------------- 1 | # AWS Well-Architected Amazon ECS Lens 2 | 3 | Amazon Elastic Container Service (Amazon ECS) is a highly scalable and fast container management service that you can use to manage containers on a cluster. 4 | 5 | This guide covers many of the most important operational best practices for Amazon ECS. It also describes several core concepts that are involved in how Amazon ECS based applications work. The goal is to provide a concrete and actionable approach to operating and troubleshooting Amazon ECS based applications. 6 | 7 | The pillars are based on the Amazon ECS best practices guide listed in the resources below. 8 | 9 | ## Contributing 10 | 11 | From the Container TFC 12 | 13 | Author: 14 | - Mai Nishitani, Solutions Architect 15 | 16 | Contributors: 17 | - Dean O'Reilly, Solutions Architect 18 | - Jake Rapinett, Associate Solutions Architect 19 | - Arran Peterson, Solutions Architect 20 | - Joel Skepper, Senior Technical Trainer 21 | 22 | ## Resources 23 | 24 | [Amazon Elastic Container Services Best Practices Guide](https://docs.aws.amazon.com/AmazonECS/latest/bestpracticesguide/intro.html) 25 | -------------------------------------------------------------------------------- /Amazon-S3-Lens/README.md: -------------------------------------------------------------------------------- 1 | # Amazon S3 Lens 2 | Amazon S3 Lens provides best practices for using Amazon S3 storage in 6 pillars of the AWS Well-Architected Framework: Operational Excellence, Security, Reliability, Performance Efficiency, Cost Optimization and Sustainability. 3 | 4 | ## Contributing 5 | - Saadelden Abdelkreem, Senior Solutions Architect, AWS 6 | - Ryan Baker, Principal TAM, AWS 7 | 8 | ## Reviewers: 9 | - Lee Kear, Principle Storage Solutions Architect, AWS 10 | - Rafeal Koik, Principal Solutions Architect, AWS 11 | -------------------------------------------------------------------------------- /ApiGwLambda/README.md: -------------------------------------------------------------------------------- 1 | # API Gateway and Lambda Custom Lens 2 | 3 | API Gateway and Lambda Custom Lens specifies best practices in Application Design, Security, Monitoring and Cost and Performance pillars. The best practices are designed for API Gateway and Lambda workload. 4 | 5 | ## Intended Audience 6 | 7 | CTO, Technical Leader or technical owner for an AWS API Gateway and Lambda workload 8 | 9 | ## Contributing 10 | 11 | - [Paul Lu](mailto:weichil@amazon.com), AWS Senior Solution Architect, API Gateway SME 12 | - [Haipeng Qi](mailto:haipenq@amazon.com), AWS Senior Solution Architect 13 | - [Zhe Zhang](mailto:zzhe@amazon.com), AWS Support Pratice Leader 14 | - [Yunfei Lu](mailto:yunfeilu@amazon.com), AWS Solution Architect 15 | - [Jun-Tin Yeh](mailto:bobyeh@amazon.com), AWS Well-Architected Geo SA 16 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /DocumentDB/README.md: -------------------------------------------------------------------------------- 1 | # AWS DocumentDB Well-Architected Custom Lens 2 | 3 | AWS DocumentDB Well-Architected Custom Lens specifies best practices in Operational Excellence, Security, Performance Efficiency, Reliability and Cost Optimization pillars. The best practices are designed for DocumentDB workload. 4 | 5 | ## Intended Audience 6 | 7 | CTO, Technical Leader or technical owner for an AWS DocumentDB workload 8 | 9 | ## Contributing 10 | 11 | - [Karthik Vijayraghavan](mailto:kvijayra@amazon.com), Sr.Manager NoSQL DB Specialist SA, WWSO Database, AWS 12 | 13 | ## Co-contributing 14 | - [Jack Hsu](mailto:jackhsu@amazon.com), Partner Solutions Architect, AWS 15 | 16 | -------------------------------------------------------------------------------- /DynamoDB/README.md: -------------------------------------------------------------------------------- 1 | # Customize Well-Architected Reviews for DynamoDB 2 | 3 | The AWS Well-Archtected Tool makes it easy to create custom lenses by providing a json template that you can use. The template outlines how the lens content must be defined, and it controls how the lens is presented within the AWS Well-Architected Tool. This repo contains the recommended DynamoDB custom lens json file which can be used to create a DynamoDB Custom Lens in Well-Architected console from json template with the following steps: 4 | 1. [Prepare your custom lens Well-Architected template json file](#prepare-your-custom-lens-well-architected-template-json-file) 5 | 2. [Navigate to the AWS Well-Architected Tool on the AWS console and create custom lens](#navigate-to-the-aws-well-architected-tool-on-the-aws-console-and-create-custom-lens) 6 | 3. [Publish your custom lens and attach a version tag](#publish-your-custom-lens-and-attach-a-version-tag) 7 | 4. [Review workloads using custom lenses](#review-workloads-using-custom-lenses) 8 | 9 | ## Prepare your custom lens Well-Architected template json file 10 | Prepare your custom lens Well-Architected template json file or just download the [provided recommended dynamodb custom lens file](custom-lensddb-v1.0.json). 11 | 12 | ```json 13 | { 14 | "schemaVersion":"2022-11-01", 15 | "name":"DynamoDB Best Practice Lens", 16 | "description":"Best practices for optimization your DynamoDB", 17 | "pillars":[ 18 | { 19 | "id":"DDBOPS", 20 | "name":"Operational Excellence", 21 | "questions":[ 22 | { 23 | "id":"ddbops1", 24 | "title":"How do you backup DynamoDB tables?", 25 | "description":"With proper backup process, you will be able to prevent unexpected data lost.", 26 | "choices":[ 27 | { 28 | "id":"ddbops1_1", 29 | "title":"Use On-Demand Backup for DynamoDB tables", 30 | "description":"Some helpful choice description", 31 | "helpfulResource":{ 32 | "displayText":"A detail description or definition of this best practice, and give a clear scope of this best practice in pillar, also the impact of the risk.", 33 | "url":"https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/BackupRestore.html" 34 | }, 35 | "improvementPlan":{ 36 | "displayText":"Enable On-Demand Backup", 37 | "url":"https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/BackupRestore.html" 38 | } 39 | },... 40 | ] 41 | },... 42 | ] 43 | } 44 | ] 45 | } 46 | 47 | ``` 48 | 49 | ## Navigate to the AWS Well-Architected Tool on the AWS console and create custom lens 50 | 51 | After you prepare your custom lens Well-Architected template json file, you can navigate to the AWS Well-Architected Tool on the AWS console and create custom lens. 52 | ![image 1](https://user-images.githubusercontent.com/17841922/175503831-cf89ff5e-8c6e-42c7-b796-3ff91e9d8470.png) 53 | 54 | Upload the your custom lens Well-Architected template json template, and submit it. 55 | ![image 2](https://user-images.githubusercontent.com/17841922/175503996-9b734d2c-8220-4efb-b5d2-f4ad77ad0ff4.png) 56 | 57 | ## Publish your custom lens and attach a version tag 58 | 59 | And then you will find your Well-Architected custom lens in draft version. 60 | ![image 3](https://user-images.githubusercontent.com/17841922/175504307-f5bd6dec-bab0-4dc1-be1f-c6dc77906483.png) 61 | 62 | Publish your draft version and provide a version tag. 63 | ![image 4](https://user-images.githubusercontent.com/17841922/175504406-4dcff143-00a5-4a7b-9952-2a2075ce95ab.png) 64 | 65 | ![image 5](https://user-images.githubusercontent.com/17841922/175504664-ed77ea17-6595-4e14-9751-6c8060daaea7.png) 66 | 67 | ![image 6](https://user-images.githubusercontent.com/17841922/175504933-b339be90-d99a-4bf9-a5ec-31d46943b3e0.png) 68 | 69 | ## Review workloads using custom lenses 70 | 71 | After your Well-Architected custom lens is published, you can define a workload in your AWS Well-Architected console. 72 | ![image 7](https://user-images.githubusercontent.com/17841922/175505004-1f9026f7-c3f8-415d-92a1-747ab68f6610.png) 73 | 74 | Input your workload information, and select the Well-Architected custom lens you just published. 75 | ![image 8](https://user-images.githubusercontent.com/17841922/175505110-aed421d6-648e-4821-a20c-ae092b48962d.png) 76 | 77 | Choose Continue reviewing for the just published DynamoDB Well-Architected custom lens. 78 | ![image 9](https://user-images.githubusercontent.com/17841922/175506629-f1afdcd8-06d5-4fa2-ae65-8dd991714b9b.png) 79 | 80 | Now you can review your DynamoDB workload with DynamoDB Well-Architected Custom Lens. 81 | ![image 10](https://user-images.githubusercontent.com/17841922/175505647-835e5413-1b65-4a2d-89e3-072f8e695f2d.png) 82 | -------------------------------------------------------------------------------- /ElastiCache/README.md: -------------------------------------------------------------------------------- 1 | # ElastiCache Lens for Well-Architected 2 | 3 | The well-architected lens for ElastiCache will help users understand and implement best practices when using ElastiCache. The lens will provide guidance on how to design and operate highly available and scalable ElastiCache clusters. 4 | 5 | ElastiCache is a popular in-memory caching service that enables users to improve the performance of their applications by caching frequently accessed data in memory. However, designing and operating highly available and scalable ElastiCache clusters can be challenging, especially for users who are new to the service or lack expertise in distributed systems. 6 | This lens covers five pillars: operational excellence, security, reliability, performance efficiency, cost optimization, and sustainability (is to be updated) 7 | 8 | ## Benefits: 9 | The well-architected lens for ElastiCache will provide the following benefits: 10 | 11 | * Help users design and operate highly available and scalable ElastiCache clusters. 12 | * Provide best practices for securing ElastiCache clusters and protecting sensitive data. 13 | * Explain how to monitor and troubleshoot ElastiCache clusters to identify and resolve issues quickly. 14 | * Improve the reliability and performance of applications that use ElastiCache. 15 | 16 | ## Contributors: 17 | * [Ravi Thakur](mailto:rrthakur@amazon.com), Sr Solutions Architect, AWS 18 | * [Steven Hancz](mailto:shancz@amazon.com), Sr ElastiCache/MemDB SA, AWS 19 | * [Lakshmi Peri](mailto:lvperi@amazon.com), Sr ElastiCache/MemDB SA, AWS 20 | 21 | ## Reviewers: 22 | * [Damon LaCaille](mailto:lacdamon@amazon.com), Sr ElastiCache/MemDB SA, AWS 23 | * [Roberto Luna Rojas](mailto:rberoj@amazon.com), Sr ElastiCache/MemDB SA, AWS 24 | -------------------------------------------------------------------------------- /Glue/README.md: -------------------------------------------------------------------------------- 1 | # AWS Glue Well-Architected Custom Lens 2 | 3 | - [About](#about) 4 | - [Prepare the json file](#prepare-the-json-file) 5 | - [Create custom lens on Well-Architected Tool](#create-custom-lens-on-well-architected-tool) 6 | - [Intended Audience](#intended-audience) 7 | - [Contributing](#contributing) 8 | 9 | ## About 10 | AWS Glue Well-Architected Custom Lens specifies best practices in Operational Excellence, Security, Performance Efficiency and Cost Optimization pillars. The best practices are designed for Glue workload. 11 | 12 | ## Prepare the json file 13 | Prepare your custom lens Well-Architected template json file or just download the [provided recommended glue custom lens file](custom-lens-glue-v2.0.json). 14 | ```json 15 | { 16 | "schemaVersion":"2021-11-01", 17 | "name":"Glue Best Practice Lens", 18 | "description":"Best practices for configuring Glue", 19 | "pillars":[ 20 | { 21 | "id":"PERF", 22 | "name":"Performance Efficiency", 23 | "questions":[ 24 | { 25 | "id":"PERF1", 26 | "title":"Do konw how file formats, file size, file layout and compression effect your job performance?", 27 | "description":"To reduce the amount of data loaded into your job when reading from Amazon S3, you need to consider FileSize, Compression, FileFormat and FileLayout (Partitions) for your dataset.", 28 | "choices":[ 29 | { 30 | "id":"PERF1_1", 31 | "title":"Choice the suitable file format for you Glue ETL Job", 32 | "helpfulResource":{ 33 | "displayText":"Have you use a columnar format. Apache Parquet and Apache ORC are popular columnar data formats?\n\nWhen using columnar formats, you can skip blocks of data that correspond to columns you do not plan to use.", 34 | "url":"https://aws.amazon.com/blogs/big-data/top-10-performance-tuning-tips-for-amazon-athena/" 35 | }, 36 | "improvementPlan":{ 37 | "displayText":"Use input/output parquet, orc dataformat", 38 | "url":"https://aws.amazon.com/blogs/big-data/top-10-performance-tuning-tips-for-amazon-athena/" 39 | } 40 | } 41 | ] 42 | } 43 | ] 44 | } 45 | ] 46 | } 47 | 48 | ``` 49 | 50 | ## Create custom lens on Well-Architected Tool 51 | After you prepare your custom lens Well-Architected template json file, you can navigate to the AWS Well-Architected Tool on the AWS console and create custom lens. 52 | ![image 1](https://user-images.githubusercontent.com/17841922/175503831-cf89ff5e-8c6e-42c7-b796-3ff91e9d8470.png) 53 | 54 | Upload the your custom lens Well-Architected template json template, and submit it. 55 | ![image 2](https://user-images.githubusercontent.com/17841922/175503996-9b734d2c-8220-4efb-b5d2-f4ad77ad0ff4.png) 56 | 57 | And then you will find your Well-Architected custom lens in draft version. 58 | ![image 3](https://github.com/aws-samples/custom-lens-wa-hub/assets/17841922/957adf6d-9bde-422c-bf93-1a04ac416473) 59 | 60 | Publish your draft version and provide a version tag. 61 | ![image 4](https://github.com/aws-samples/custom-lens-wa-hub/assets/17841922/a0aeaafb-e3ba-439f-a4e5-eb27faf68a62) 62 | 63 | After your Well-Architected custom lens is published, you can define a workload in your AWS Well-Architected console. 64 | ![image 5](https://user-images.githubusercontent.com/17841922/175505004-1f9026f7-c3f8-415d-92a1-747ab68f6610.png) 65 | 66 | Input your workload information, and select the Well-Architected custom lens you just published. 67 | ![image 6](https://github.com/aws-samples/custom-lens-wa-hub/assets/17841922/aa84a735-15e9-4852-b600-e9d67cf5c9d9) 68 | 69 | Choose Continue reviewing for the just published Glue Well-Architected custom lens. 70 | ![image 7](https://github.com/aws-samples/custom-lens-wa-hub/assets/17841922/69affe5c-9ec6-4f6b-8ab5-b97e43c2ae2c) 71 | 72 | Now you can review your Glue workload with Glue Well-Architected Custom Lens. 73 | ![image 8](https://github.com/aws-samples/custom-lens-wa-hub/assets/17841922/3ebf536b-e453-4688-b15a-fbcfb7bb97d7) 74 | 75 | ## Intended Audience 76 | 77 | CTO, Technical Leader or technical owner for an AWS Glue workload 78 | 79 | ## Contributing 80 | 81 | - Noritaka Sekiyama, Principal Big Data Architect, AWS 82 | - [Ray Wang](mailto:hsiawang@amazon.com), Solutions Architect, AWS 83 | - Hsu, Chia-Wei Cloud Support Engineer (BigData), Glue SME, AWS 84 | 85 | 86 | -------------------------------------------------------------------------------- /IDP-custom-lens/README.md: -------------------------------------------------------------------------------- 1 | # AWS Machine Learning - Intelligent Document Processing Custom Lens 2 | 3 | AWS Machine Learning - Intelligent Document Processing Custom Lens specifies best practices in Security, Reliability and Performance Efficiency pillars. 4 | 5 | The AWS Well-Architected Framework describes key concepts, design principles, and architectural best practices for designing and running workloads in the cloud with six pillars, including operational excellence, security, reliability, performance efficiency, cost optimization, and sustainability. 6 | 7 | A Well-Architected Framework Review(WAFR) can help customers have better outcomes with a production workload AWS using Amazon Well Architected Tool. Usually the WAR run by an account team (as an SA or an Account Manager), as a 1-2 days activity with customer. By answering a few foundational questions, learn how well the customer’s architecture aligns with well-architected pillars and gain guidance for making improvements. Make sure customer focus on the business value they want to achieve. Customers can learn about the common issues, and have an opportunity deepen their knowledge of AWS. 8 | 9 | AWS Well-Architected Custom Lenses is a feature of the AWS Well-Architected Tool. So that customer can have a well-architected address the common pain points of a particular industry or use case. This project is built with customer requests from EMEA and AMER customers across ENT, ISV, DNB segments across the globe. 10 | 11 | With this Custom Lenses, customer can learn how to build an automated document processing solution on Amazon S3, Amazon Textract, Amazon Comprehend, and Amazon SageMaker GroundTruth with AWS Well-Architected Framework. 12 | 13 | 14 | ## Intended Audience 15 | 16 | CTO, Technical Leader or technical owner for an Machine Learning - Intelligent Document Processing workload 17 | 18 | ## Version 19 | 20 | |Date |Version |Details | 21 | |--- |---- |---- | 22 | |2023.12.08.| V1.1| Fix choices with *_no keys & logic | 23 | |2023.11.02.| V1| Six Pillars, with security review. | 24 | |2023.08.16.| V0.9| Security, Reliability and Performance Efficiency pillars.| 25 | 26 | 27 | ## Contributing 28 | 29 | * Mia Chang, EMEA Machine Learning Specialist Solutions Architect, AWS 30 | * Sherry Ding, Sr. AI/ML Specialist Solutions Architect, AWS 31 | * Brijesh Pati, Solutions Architect, AWS 32 | * Rui Cardoso, Partner Solutions Architect, AWS 33 | * Christian Denich, Sr. Customer Solutions Manager, AWS 34 | * Suyin Wang, AI/ML Solutions Architect, AWS 35 | * Tim Condello. Senior AI Services Solutions Architect, AWS 36 | 37 | ## Co-contributing 38 | 39 | * Bob Yeh, Geo Solutions Architect, Well-Architected APAC, AWS 40 | * Bruce Ross, Lens Lead SA, AWS Well-Architected, AWS 41 | * Martin Schade, Principal ML Product SA, Textract Service Team, AWS 42 | 43 | -------------------------------------------------------------------------------- /Iceberg-S3-Lens/README.md: -------------------------------------------------------------------------------- 1 | # Iceberg on Amazon S3 Best Practice Lens 2 | Iceberg on Amazon S3 Best Practice Lens provides best practices for using Apache Iceberg open table format on Amazon S3 includes 3 pillars of the AWS Well-Architected Framework: Operational Excellence, Performance Efficiency, Cost Optimization. 3 | 4 | ## Contributing 5 | - Noritaka Sekiyama, Principal Big Data Architect, AWS 6 | - Tomohiro Tanaka, Sr. Cloud Supp Eng (Big Data), AWS 7 | - Joey Wu, Solutions Architect, AWS 8 | - Ray Wang, Senior Solutions Architect, AWS 9 | -------------------------------------------------------------------------------- /Iceberg-S3-Lens/custom-lens-iceberg-amaozn-s3-v1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2021-11-01", 3 | "name": "Iceberg on Amazon S3 Best Practice Lens", 4 | "description": "Best practices for configuring Iceberg on Amazon S3", 5 | "pillars": [ 6 | { 7 | "id": "PERF", 8 | "name": "Performance Efficiency", 9 | "questions": [ 10 | { 11 | "id": "PERF1", 12 | "title": "How you optimizing read performance", 13 | "description": "Optimizing data layout and file size in Iceberg tables is crucial for data pruning in order to maintain query efficiency, especially at scale with large datasets.", 14 | "choices": [ 15 | { 16 | "id": "PERF1_1", 17 | "title": "Partition your data", 18 | "helpfulResource": { 19 | "displayText": "To reduce the amount of data that's scanned when querying Iceberg tables, choose a balanced partition strategy that aligns with your expected read patterns", 20 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-partitioning" 21 | }, 22 | "improvementPlan": { 23 | "displayText": "Identify columns that are frequently used in queries. These are ideal partitioning candidates. \nChoose a low cardinality partition column to avoid creating an excessive number of partitions. Too many partitions can increase the number of files in the table, which can negatively impact query performance.", 24 | "url": "https://iceberg.apache.org/docs/latest/partitioning/#what-is-partitioning" 25 | } 26 | }, 27 | { 28 | "id": "PERF1_2", 29 | "title": "Use hidden partitioning", 30 | "helpfulResource": { 31 | "displayText": "If you typically query by using filters on a high cardinality column (for example, an id column that can have thousands of values), use Iceberg's hidden partitioning feature with bucket transforms", 32 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-partitioning" 33 | }, 34 | "improvementPlan": { 35 | "displayText": "The most common use cases for hidden partitioning are: \nPartitioning on date or time, when the data has a timestamp column. Iceberg offers multiple transforms to extract the date or time parts of a timestamp.\nPartitioning on a hash function of a column, when the partitioning column has high cardinality and would result in too many partitions. Iceberg's bucket transform groups multiple partition values together into fewer, hidden (bucket) partitions by using hash functions on the partitioning column.", 36 | "url": "https://iceberg.apache.org/docs/latest/partitioning/#icebergs-hidden-partitioning" 37 | } 38 | }, 39 | { 40 | "id": "PERF1_3", 41 | "title": "Use partition evolution", 42 | "helpfulResource": { 43 | "displayText": "Use Iceberg's partition evolution when the existing partition strategy isn't optimal. For example, if you choose hourly partitions that turn out to be too small (just a few megabytes each), consider shifting to daily or monthly partitions.", 44 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-partitioning" 45 | }, 46 | "improvementPlan": { 47 | "displayText": "You can use this approach when the best partition strategy for a table is initially unclear, and you want to refine your partitioning strategy as you gain more insights. Another effective use of partition evolution is when data volumes change and the current partitioning strategy becomes less effective over time.\nFor instructions on how to evolve partitions, see ALTER TABLE SQL extensions in the Iceberg documentation. ", 48 | "url": "https://iceberg.apache.org/docs/latest/spark-ddl/#alter-table-sql-extensions" 49 | } 50 | }, 51 | { 52 | "id": "PERF1_4", 53 | "title": "Set target file and row group size", 54 | "helpfulResource": { 55 | "displayText": "Small tables (up to few gigabytes) Reduce the target file size to 128 MB. Also reduce the row group or stripe size (for example, to 8 or 16 MB).\nMedium to large tables (from a few gigabytes to hundreds of gigabytes) The default values are a good starting point for these tables. If your queries are very selective, adjust the row group or stripe size (for example, to 16 MB).\nVery large tables (hundreds of gigabytes or terabytes) Increase the target file size to 1024 MB or more, and consider increasing the row group or stripe size if your queries usually pull large sets of data.", 56 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-file-size" 57 | }, 58 | "improvementPlan": { 59 | "displayText": "Based on your expected table size, follow these general guidelines.", 60 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html" 61 | } 62 | }, 63 | { 64 | "id": "PERF1_5", 65 | "title": "Run regular compaction", 66 | "helpfulResource": { 67 | "displayText": "Run compaction regularly to combine small files into larger files. Re-cluster the data with desired distribution if you need.", 68 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-file-size" 69 | }, 70 | "improvementPlan": { 71 | "displayText": "Iceberg includes features that enable you to carry out table maintenance operations after writing data to the table. Some maintenance operations focus on streamlining metadata files, while others enhance how the data is clustered in the files so that query engines can efficiently locate the necessary information to respond to user requests. ", 72 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 73 | } 74 | }, 75 | { 76 | "id": "PERF1_6", 77 | "title": "Optimize column statistics", 78 | "helpfulResource": { 79 | "displayText": "Iceberg uses column statistics to perform file pruning. It estimates the number of distinct values in each column of the Iceberg table and and store them in Puffin files. To benefit from column statistics, make sure that Iceberg collects statistics for columns that are frequently used in query filters. ", 80 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-column-statistics" 81 | }, 82 | "improvementPlan": { 83 | "displayText": "Run the column statistics on-demand or in a regualr scheduled basis. You can also configure AWS Glue for the column statistics generation task using AWS Glue console or AWS CLI.", 84 | "url": "https://docs.aws.amazon.com/glue/latest/dg/iceberg-column-statistics.html" 85 | } 86 | }, 87 | { 88 | "id": "PERF1_7", 89 | "title": "Choose the right update/delete strategy", 90 | "helpfulResource": { 91 | "displayText": "Use a copy-on-write strategy to optimize read performance, when slower write operations are acceptable for your use case. ", 92 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-update" 93 | }, 94 | "improvementPlan": { 95 | "displayText": "Copy-on-write results in better read performance, because files are directly written to storage in a read-optimized fashion. However, compared with merge-on-read, each write operation takes longer and consumes more compute resources. This presents a classic trade-off between read and write latency. Typically, copy-on-write is ideal for use cases where most updates are collocated in the same table partitions (for example, for daily batch loads).\nCopy-on-write configurations (write.update.mode, write.delete.mode, and write.merge.mode) can be set at the table level or independently on the application side.", 96 | "url": "https://iceberg.apache.org/docs/latest/configuration/#write-properties" 97 | } 98 | }, 99 | { 100 | "id": "PERF1_8", 101 | "title": "Use ZSTD compression", 102 | "helpfulResource": { 103 | "displayText": "We recommend that you use the ZSTD compression codec to improve overall performance on tables.", 104 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-compression" 105 | }, 106 | "improvementPlan": { 107 | "displayText": "By default, Iceberg versions 1.3 and earlier use GZIP compression, which provides slower read/write performance compared with ZSTD. You can modify the compression codec used by Iceberg by using the table property write..compression-codec. ", 108 | "url": "https://docs.aws.amazon.com/athena/latest/ug/compression-support-iceberg.html" 109 | } 110 | }, 111 | { 112 | "id": "PERF1_9", 113 | "title": "Set the sort order", 114 | "helpfulResource": { 115 | "displayText": "Sorting, combined with Iceberg's column statistics, can make file pruning significantly more efficient, which results in faster read operations. Sorting also reduces the number of Amazon S3 requests for queries that use the sort columns in query filters.", 116 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-read.html#read-sort-order" 117 | }, 118 | "improvementPlan": { 119 | "displayText": "You can set a hierarchical sort order at the table level by running a data definition language (DDL) statement with Spark. For available options, see the Iceberg documentation. After you set the sort order, writers will apply this sorting to subsequent data write operations in the Iceberg table.", 120 | "url": "https://iceberg.apache.org/docs/latest/spark-ddl/#alter-table-write-ordered-by" 121 | } 122 | } 123 | ], 124 | "riskRules": [ 125 | { 126 | "condition": "PERF1_1 && PERF1_2 && PERF1_3 && PERF1_4 && PERF1_5 && PERF1_6 && PERF1_7 && PERF1_8 && PERF1_9", 127 | "risk": "NO_RISK" 128 | }, 129 | { 130 | "condition": "(!PERF1_1) || (!PERF1_3) || (!PERF1_4) || (!PERF1_5) || (!PERF1_7)", 131 | "risk": "HIGH_RISK" 132 | }, 133 | { 134 | "condition": "default", 135 | "risk": "MEDIUM_RISK" 136 | } 137 | ] 138 | }, 139 | { 140 | "id": "PERF2", 141 | "title": "How you optimizing write performance", 142 | "description": "Iceberg's write performance is optimized through strategic configuration choices that eliminate unnecessary processing overhead.", 143 | "choices": [ 144 | { 145 | "id": "PERF2_1", 146 | "title": "Set the table write distribution mode", 147 | "helpfulResource": { 148 | "displayText": "For use cases that prioritize write speed, especially in streaming workloads, set write.distribution-mode to none. This ensures that Iceberg doesn't request additional Spark shuffling and that data is written as it becomes available in Spark tasks. But not suggested for partitioned tables when we have write operations that touch multiple partitions.", 149 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-write.html#write-distribution-mode" 150 | }, 151 | "improvementPlan": { 152 | "displayText": "There are 3 options for write.distribution-mode:\nnone - This is the previous default for Iceberg. This mode does not request any shuffles or sort to be performed automatically by Spark. \nhash - This mode is the new default and requests that Spark uses a hash-based exchange to shuffle the incoming write data before writing.\nrange - This mode is the most expensive one. Requests that Spark perform a range-based exchange to shuffle the data before writing. This is a two stage procedure. The first stage samples the data to be written based on the partition and sort columns. The second stage uses the range information to shuffle the input data into Spark tasks.", 153 | "url": "https://iceberg.apache.org/docs/latest/spark-writes/#writing-distribution-modes" 154 | } 155 | }, 156 | { 157 | "id": "PERF2_2", 158 | "title": "Choose the right update/delete strategy", 159 | "helpfulResource": { 160 | "displayText": "Use a merge-on-read strategy to optimize write performance, when slower read operations on the latest data are acceptable for your use case.", 161 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-write.html#write-update-strategy" 162 | }, 163 | "improvementPlan": { 164 | "displayText": "When you use merge-on-read, Iceberg writes updates and deletes to storage as separate small files. When the table is read, the reader has to merge these changes with the base files to return the latest view of the data. This results in a performance penalty for read operations, but speeds up the writing of updates and deletes. Typically, merge-on-read is ideal for streaming workloads with updates or jobs with few updates that are spread across many table partitions.\nYou can set merge-on-read configurations (write.update.mode, write.delete.mode, and write.merge.mode) at the table level or independently on the application side.", 165 | "url": "https://iceberg.apache.org/docs/latest/configuration/#write-properties" 166 | } 167 | }, 168 | { 169 | "id": "PERF2_3", 170 | "title": "Choose the right file format", 171 | "helpfulResource": { 172 | "displayText": "If write speed is important for your use case, such as in streaming workloads, consider writing in Avro format by setting write-format to Avro in the writer's options. Because Avro is a row-based format, it provides faster write times at the cost of slower read performance. To improve read performance, run regular compaction to merge and transform small Avro files into larger Parquet files. ", 173 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-write.html#write-file-format" 174 | }, 175 | "improvementPlan": { 176 | "displayText": "To improve read performance, run regular compaction to merge and transform small Avro files into larger Parquet files. The outcome of the compaction process is governed by the write.format.default table setting. The default format for Iceberg is Parquet, so if you write in Avro and then run compaction, Iceberg will transform the Avro files into Parquet files. ", 177 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-write.html#write-file-format" 178 | } 179 | }, 180 | { 181 | "id": "PERF2_4", 182 | "title": "Manage concurrent write conflicts", 183 | "helpfulResource": { 184 | "displayText": "Iceberg uses optimistic concurrency control, where multiple writers can proceed with their operations simultaneously, and conflicts are detected at commit time. For append-only use cases, manage catalog commit conflicts for multi-writers.", 185 | "url": "https://aws.amazon.com/blogs/big-data/manage-concurrent-write-conflicts-in-apache-iceberg-on-the-aws-glue-data-catalog/" 186 | }, 187 | "improvementPlan": { 188 | "displayText": "Before diving into specific implementation patterns, it s essential to understand how Iceberg manages concurrent writes through its table architecture and transaction model. Iceberg uses a layered architecture to manage table state and data:\nCatalog layer Maintains a pointer to the current table metadata file, serving as the single source of truth for table state. The Data Catalog provides the functionality as the Iceberg catalog.\nMetadata layer Contains metadata files that track table history, schema evolution, and snapshot information. These files are stored on Amazon Simple Storage Service (Amazon S3).\nData layer Stores the actual data files and delete files (for Merge-on-Read operations). These files are also stored on Amazon S3.\nThe most critical concept to remember is the distinction between catalog commit conflicts and data conflicts. ", 189 | "url": "https://aws.amazon.com/blogs/big-data/manage-concurrent-write-conflicts-in-apache-iceberg-on-the-aws-glue-data-catalog/" 190 | } 191 | }, 192 | { 193 | "id": "PERF2_5", 194 | "title": "Set that granularity to file", 195 | "helpfulResource": { 196 | "displayText": "This configure works for the write performance because this option controls the number of delete files. The default options is the faster one (file) if customers are using Spark.", 197 | "url": "https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/TableProperties.java" 198 | }, 199 | "improvementPlan": { 200 | "displayText": "'write.delete.granularity'='file' (spark default: file since 1.8.0, other defaults to partition)\n", 201 | "url": "https://iceberg.apache.org/docs/nightly/spark-configuration/#write-options" 202 | } 203 | } 204 | ], 205 | "riskRules": [ 206 | { 207 | "condition": "PERF2_1 && PERF2_2 && PERF2_3 && PERF2_4 && PERF2_5", 208 | "risk": "NO_RISK" 209 | }, 210 | { 211 | "condition": "(!PERF2_1) || (!PERF2_2)", 212 | "risk": "HIGH_RISK" 213 | }, 214 | { 215 | "condition": "default", 216 | "risk": "MEDIUM_RISK" 217 | } 218 | ] 219 | }, 220 | { 221 | "id": "PERF3", 222 | "title": "Have you use last version of Icebreg", 223 | "description": "The format version number is incremented when new features are added that will break forward-compatibility. Versions 1 and 2 of the Iceberg spec are complete and adopted by the community. Version 3 is under active development and has not been formally adopted.", 224 | "choices": [ 225 | { 226 | "id": "PERF3_1", 227 | "title": "Use Iceberg format version 2", 228 | "helpfulResource": { 229 | "displayText": "The primary change in version 2 adds delete files to encode rows that are deleted in existing data files. This version can be used to delete or replace individual rows in immutable data files without rewriting the files.", 230 | "url": "https://iceberg.apache.org/spec/#version-1-analytic-data-tables" 231 | }, 232 | "improvementPlan": { 233 | "displayText": "Version 3 of the Iceberg spec extends data types and existing metadata structures to add new capabilities:\nNew data types: nanosecond timestamp(tz), unknown, variant, geometry, geography\nDefault value support for columns\nMulti-argument transforms for partitioning and sorting\nRow Lineage tracking\nBinary deletion vectors", 234 | "url": "https://iceberg.apache.org/spec/#version-1-analytic-data-tables" 235 | } 236 | } 237 | ], 238 | "riskRules": [ 239 | { 240 | "condition": "PERF3_1", 241 | "risk": "NO_RISK" 242 | }, 243 | { 244 | "condition": "default", 245 | "risk": "MEDIUM_RISK" 246 | } 247 | ] 248 | } 249 | ] 250 | }, 251 | { 252 | "id": "OPS", 253 | "name": "Operational Excellence", 254 | "questions": [ 255 | { 256 | "id": "OPS1", 257 | "title": "How you manage your data catalog", 258 | "description": "Iceberg has several catalog back-ends that can be used to track tables, like JDBC, Hive MetaStore and Glue. Each has its characteristics. We recommand you Glue catalog as its cloud-native and better scalability.", 259 | "choices": [ 260 | { 261 | "id": "OPS1_1", 262 | "title": "Use the AWS Glue Data Catalog as your data catalog", 263 | "helpfulResource": { 264 | "displayText": "Regardless of your use case, when you use Apache Iceberg on AWS, we recommend that you use the AWS Glue Data Catalog as your data catalog.", 265 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-general.html" 266 | }, 267 | "improvementPlan": { 268 | "displayText": "Use AWS Glue Data Catalog as your data catalog.", 269 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-general.html" 270 | } 271 | }, 272 | { 273 | "id": "OPS1_2", 274 | "title": "Use the AWS Glue Data Catalog as lock manager", 275 | "helpfulResource": { 276 | "displayText": "AWS Glue 4.0 or later uses optimistic locking by default. Please use AWS SDK version >= 2.17.131 to leverage Glue's Optimistic Locking. If the AWS SDK version is below 2.17.131, only in-memory lock is used. ", 277 | "url": "https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-format-iceberg.html#aws-glue-programming-etl-format-iceberg-enable" 278 | }, 279 | "improvementPlan": { 280 | "displayText": "With optimistic locking, each table has a version id. If users retrieve the table metadata, Iceberg records the version id of that table. Users can update the table as long as the version ID on the server side remains unchanged. Version mismatch occurs if someone else modified the table before you did, causing an update failure. Iceberg then refreshes metadata and checks if there is a conflict. If there is no commit conflict, the operation will be retried. Optimistic locking guarantees atomic transaction of Iceberg tables in Glue. It also prevents others from accidentally overwriting your changes.", 281 | "url": "https://iceberg.apache.org/docs/latest/aws/#optimistic-locking" 282 | } 283 | } 284 | ], 285 | "riskRules": [ 286 | { 287 | "condition": "OPS1_1 && OPS1_2", 288 | "risk": "NO_RISK" 289 | }, 290 | { 291 | "condition": "default", 292 | "risk": "MEDIUM_RISK" 293 | } 294 | ] 295 | }, 296 | { 297 | "id": "OPS2", 298 | "title": "How you maintaining iceberg tables", 299 | "description": "Iceberg requires regular maintenance operations since it uses a multi-layered metadata approach that accumulates state over time. Without maintenance, tables experience degraded query performance from fragmented metadata, excessive storage costs from unreferenced files, and slower scan planning due to metadata bloat, ultimately compromising the performance advantages that make Iceberg valuable.", 300 | "choices": [ 301 | { 302 | "id": "OPS2_1", 303 | "title": "Expire Snapshots", 304 | "helpfulResource": { 305 | "displayText": "Each write to an Iceberg table creates a new snapshot, or version, of a table. Snapshots can be used for time-travel queries, or the table can be rolled back to any valid snapshot.\nSnapshots accumulate until they are expired by the expireSnapshots operation. Regularly expiring snapshots is recommended to delete data files that are no longer needed, and to keep the size of table metadata small.", 306 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#expire-snapshots" 307 | }, 308 | "improvementPlan": { 309 | "displayText": "Data files are not deleted until they are no longer referenced by a snapshot that may be used for time travel or rollback. Regularly expiring snapshots deletes unused data files.", 310 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#expire-snapshots" 311 | } 312 | }, 313 | { 314 | "id": "OPS2_2", 315 | "title": "Remove old metadata files", 316 | "helpfulResource": { 317 | "displayText": "Old metadata files are kept for history by default. Tables with frequent commits, like those written by streaming jobs, may need to regularly clean metadata files.\nTo automatically clean metadata files, set write.metadata.delete-after-commit.enabled=true in table properties. This will keep some metadata files (up to write.metadata.previous-versions-max) and will delete the oldest metadata file after each new one is created.", 318 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#remove-old-metadata-files" 319 | }, 320 | "improvementPlan": { 321 | "displayText": "Example: With write.metadata.delete-after-commit.enabled=false and write.metadata.previous-versions-max=10, one will have 10 tracked metadata files and 90 orphaned metadata files after 100 commits", 322 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#remove-old-metadata-files" 323 | } 324 | }, 325 | { 326 | "id": "OPS2_3", 327 | "title": "Delete orphan files", 328 | "helpfulResource": { 329 | "displayText": "In Spark and other distributed processing engines, task or job failures can leave files that are not referenced by table metadata, and in some cases normal snapshot expiration may not be able to determine a file is no longer needed and delete it.\nTo clean up these \"orphan\" files under a table location, use the deleteOrphanFiles action.", 330 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#delete-orphan-files" 331 | }, 332 | "improvementPlan": { 333 | "displayText": "This action may take a long time to finish if you have lots of files in data and metadata directories. It is recommended to execute this periodically, but you may not need to execute this often. ", 334 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#delete-orphan-files" 335 | } 336 | }, 337 | { 338 | "id": "OPS2_4", 339 | "title": "Compact data files", 340 | "helpfulResource": { 341 | "displayText": "Iceberg tracks each data file in a table. More data files leads to more metadata stored in manifest files, and small data files causes an unnecessary amount of metadata and less efficient queries from file open costs.", 342 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#compact-data-files" 343 | }, 344 | "improvementPlan": { 345 | "displayText": "Use Amazon EMR or AWS Glue. If you use Glue auto compaction and takes too long, you can raise a support case so that we can fine tune the compaction.", 346 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#compact-data-files" 347 | } 348 | }, 349 | { 350 | "id": "OPS2_5", 351 | "title": "Rewrite manifests", 352 | "helpfulResource": { 353 | "displayText": "When a table's write pattern doesn't align with the query pattern, metadata can be rewritten to re-group data files into manifests using rewriteManifests or the rewriteManifests action (for parallel rewrites using Spark).", 354 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#rewrite-manifests" 355 | }, 356 | "improvementPlan": { 357 | "displayText": "Rewrite the manifest can reorganize file metadata according to query-relevant dimensions rather than just creation order. This reduces scan planning time by grouping files with similar partition values together, enabling Iceberg to efficiently prune entire manifests during query planning.", 358 | "url": "https://iceberg.apache.org/docs/latest/maintenance/#rewrite-manifests" 359 | } 360 | } 361 | ], 362 | "riskRules": [ 363 | { 364 | "condition": "OPS2_1 && OPS2_2 && OPS2_3 && OPS2_4 && OPS2_5", 365 | "risk": "NO_RISK" 366 | }, 367 | { 368 | "condition": "(!OPS2_1) || (!OPS2_3) || (!OPS2_4)", 369 | "risk": "HIGH_RISK" 370 | }, 371 | { 372 | "condition": "default", 373 | "risk": "MEDIUM_RISK" 374 | } 375 | ] 376 | }, 377 | { 378 | "id": "OPS3", 379 | "title": "What's your compaction strategy", 380 | "description": "Iceberg supports multiple compaction strategies. This flexibility allows organizations to tailor their maintenance approach to their specific query patterns, data freshness requirements, and computational resource constraints.", 381 | "choices": [ 382 | { 383 | "id": "OPS3_1", 384 | "title": "Running bin packing compaction", 385 | "helpfulResource": { 386 | "displayText": "Bin packing strategy determines which files to be rewritten based on their size. If files are either smaller than the MIN_FILE_SIZE_BYTES threshold or larger than the MAX_FILE_SIZE_BYTES threshold, they are considered targets for being rewritten.", 387 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 388 | }, 389 | "improvementPlan": { 390 | "displayText": "Use Amazon EMR or AWS Glue with dynamic scaling when you expect large volumes of small files to be compacted.", 391 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 392 | } 393 | }, 394 | { 395 | "id": "OPS3_2", 396 | "title": "Running compaction to sort data", 397 | "helpfulResource": { 398 | "displayText": "Sort strategy for data files which aims to reorder data with data files to optimally lay them out in relation to a column. ", 399 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 400 | }, 401 | "improvementPlan": { 402 | "displayText": "Use Amazon EMR or AWS Glue, because sorting is an expensive operation and might need to spill data to disk.", 403 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 404 | } 405 | }, 406 | { 407 | "id": "OPS3_3", 408 | "title": "Running compaction to cluster the data using z-order sorting", 409 | "helpfulResource": { 410 | "displayText": "Z-order sorting organizes data by interleaving the bits of multiple column values, effectively creating a space-filling curve that groups similar values from different dimensions close together in storage. This multi-dimensional clustering dramatically improves query performance when filtering on several columns simultaneously.", 411 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 412 | }, 413 | "improvementPlan": { 414 | "displayText": "Use Amazon EMR or AWS Glue, because z-order sorting is a very expensive operation and might need to spill data to disk.", 415 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 416 | } 417 | }, 418 | { 419 | "id": "OPS3_4", 420 | "title": "Running compaction on partitions that might be updated by other applications because of late-arriving data", 421 | "helpfulResource": { 422 | "displayText": "Enable the Iceberg PARTIAL_PROGRESS_ENABLED property. When you use this option, Iceberg splits the compaction output into multiple commits. If there is a collision (that is, if the data file is updated while compaction is running), this setting reduces the cost of retry by limiting it to the commit that includes the affected file. Otherwise, you might have to recompact all files.", 423 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 424 | }, 425 | "improvementPlan": { 426 | "displayText": "Use Amazon EMR or AWS Glue. If you use Glue auto compaction and takes too long, you can raise a support case so that we can fine tune the compaction.", 427 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-compaction.html" 428 | } 429 | } 430 | ], 431 | "riskRules": [ 432 | { 433 | "condition": "OPS3_1 && OPS3_2 && OPS3_3 && OPS3_4", 434 | "risk": "NO_RISK" 435 | }, 436 | { 437 | "condition": "default", 438 | "risk": "MEDIUM_RISK" 439 | } 440 | ] 441 | }, 442 | { 443 | "id": "OPS4", 444 | "title": "Do you konw to troubleshooting when using Iceberg workloads in Amazon S3", 445 | "description": "This section discusses Iceberg properties that you can use to optimize Iceberg's interaction with Amazon S3.", 446 | "choices": [ 447 | { 448 | "id": "OPS4_1", 449 | "title": "Prevent hot partitioning (HTTP 503 errors)", 450 | "helpfulResource": { 451 | "displayText": " Set write.distribution-mode to hash or range so that Iceberg writes large files which results in fewer Amazon S3 requests. Also set write.object-storage.enabled to true in Iceberg. This instructs Iceberg to hash object names and distribute the load across multiple, randomized Amazon S3 prefixes.", 452 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-workloads.html#workloads-503" 453 | }, 454 | "improvementPlan": { 455 | "displayText": "Some data lake applications that run on Amazon S3 handle millions or billions of objects and process petabytes of data. This can lead to prefixes that receive a high volume of traffic, which are typically detected through HTTP 503 (service unavailable) errors.", 456 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-workloads.html#workloads-503" 457 | } 458 | }, 459 | { 460 | "id": "OPS4_2", 461 | "title": "Use Iceberg maintenance operations to release unused data", 462 | "helpfulResource": { 463 | "displayText": "To delete old or unused files from Amazon S3, we recommend that you only use Iceberg native APIs to remove snapshots, remove old metadata files, and delete orphan files.", 464 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-workloads.html#workloads-unused-data" 465 | }, 466 | "improvementPlan": { 467 | "displayText": "Using Amazon S3 APIs through Boto3, the Amazon S3 SDK, or the AWS Command Line Interface (AWS CLI), or using any other, non-Iceberg methods to overwrite or remove Amazon S3 files for an Iceberg table leads to table corruption and query failures.", 468 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-workloads.html#workloads-unused-data" 469 | } 470 | }, 471 | { 472 | "id": "OPS4_3", 473 | "title": "Replicate data across AWS Regions", 474 | "helpfulResource": { 475 | "displayText": "When you store Iceberg tables in Amazon S3, you can use the built-in features in Amazon S3, such as Cross-Region Replication (CRR) and Multi-Region Access Points (MRAP), to replicate data across multiple AWS Regions. MRAP provides a global endpoint for applications to access S3 buckets that are located in multiple AWS Regions. ", 476 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-workloads.html#workloads-replication" 477 | }, 478 | "improvementPlan": { 479 | "displayText": "Currently, Iceberg integration with MRAP works only with Apache Spark. If you need to fail over to the secondary AWS Region, you have to plan to redirect user queries to a Spark SQL environment (such as Amazon EMR) in the failover Region.", 480 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-workloads.html#workloads-replication" 481 | } 482 | } 483 | ], 484 | "riskRules": [ 485 | { 486 | "condition": "OPS4_1 && OPS4_2 && OPS4_3", 487 | "risk": "NO_RISK" 488 | }, 489 | { 490 | "condition": "(!OPS4_2)", 491 | "risk": "HIGH_RISK" 492 | }, 493 | { 494 | "condition": "default", 495 | "risk": "MEDIUM_RISK" 496 | } 497 | ] 498 | } 499 | ] 500 | }, 501 | { 502 | "id": "COST", 503 | "name": "Cost Optimization", 504 | "questions": [ 505 | { 506 | "id": "COST1", 507 | "title": "Do you know how optimizing storage effect to your storage cost", 508 | "description": "Iceberg designs to keep the historical data and snapshots for time travel. Implementing strategic snapshot retention policies and running regular orphan file cleanup. For compliance requirements, moving the old objects to appropriate S3 storage classes can optimize your storage costs. ", 509 | "choices": [ 510 | { 511 | "id": "COST1_1", 512 | "title": "Enable S3 Intelligent-Tiering", 513 | "helpfulResource": { 514 | "displayText": "Use the Amazon S3 Intelligent-Tiering storage class to automatically move data to the most cost-effective access tier when access patterns change. This option has no operational overhead or impact on performance.", 515 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-storage.html#storage-s3-intelligent-tiering" 516 | }, 517 | "improvementPlan": { 518 | "displayText": "Don't use the optional tiers (such as Archive Access and Deep Archive Access) in S3 Intelligent-Tiering with Iceberg tables. To archive data, see the guidelines in the next section.\nYou can also use Amazon S3 Lifecycle rules to set your own rules for moving objects to another Amazon S3 storage class, such as S3 Standard-IA or S3 One Zone-IA (see Supported transitions and related constraints in the Amazon S3 documentation).", 519 | "url": "https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-lifecycle-mgmt.html" 520 | } 521 | }, 522 | { 523 | "id": "COST1_2", 524 | "title": "Archive or delete historic snapshots", 525 | "helpfulResource": { 526 | "displayText": "Keeping snapshots of a table is required for features such as snapshot isolation, table rollback, and time travel queries. However, storage costs grow with the number of versions that you retain.", 527 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-storage.html#storage-snapshots" 528 | }, 529 | "improvementPlan": { 530 | "displayText": "You can consider:1. Delete old snapshots\n2. Set retention policies for specific snapshots\n3. Archive old snapshots", 531 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-storage.html#storage-snapshots" 532 | } 533 | }, 534 | { 535 | "id": "COST1_3", 536 | "title": "Delete orphan files", 537 | "helpfulResource": { 538 | "displayText": "In certain situations, Iceberg applications can fail before you commit your transactions. This leaves data files in Amazon S3. Because there was no commit, these files won't be associated with any table, so you might have to clean them up asynchronously.", 539 | "url": "https://docs.aws.amazon.com/prescriptive-guidance/latest/apache-iceberg-on-aws/best-practices-storage.html#storage-orphan-files" 540 | }, 541 | "improvementPlan": { 542 | "displayText": "To handle these deletions, you can use the VACUUM statement in Amazon Athena. This statement removes snapshots and also deletes orphaned files. This is very cost-efficient, because Athena doesn't charge for the compute cost of this operation. Also, you don't have to schedule any additional operations when you use the VACUUM statement.\nAlternatively, you can use Spark on Amazon EMR or AWS Glue to run the remove_orphan_files procedure. This operation has a compute cost and has to be scheduled independently. For more information, see the Iceberg documentation.", 543 | "url": "https://iceberg.apache.org/docs/latest/spark-procedures/#remove_orphan_files" 544 | } 545 | } 546 | ], 547 | "riskRules": [ 548 | { 549 | "condition": "COST1_1 && COST1_2 && COST1_3", 550 | "risk": "NO_RISK" 551 | }, 552 | { 553 | "condition": "(!COST1_1) || (!COST1_2) || (!COST1_3)", 554 | "risk": "HIGH_RISK" 555 | }, 556 | { 557 | "condition": "default", 558 | "risk": "MEDIUM_RISK" 559 | } 560 | ] 561 | } 562 | ] 563 | } 564 | ] 565 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /MSFT-Lens/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/custom-lens-wa-hub/29b26f41ba31c25185309b90cbc2661342c8377d/MSFT-Lens/.gitkeep -------------------------------------------------------------------------------- /MSFT-Lens/README.md: -------------------------------------------------------------------------------- 1 | # AWS Well-Architected Microsoft on AWS Lens 2 | 3 | The AWS Well-Architected for Microsoft Workloads Lens offers comprehensive guidance to make sure your migration and modernization strategies are designed in accordance with AWS best practices. It lets you bring your own best practices to complement the existing framework based on your industry, operational plans, and internal processes. Custom Lenses provide a consolidated view and a consistent way to measure and improve your workloads on AWS without relying on external spreadsheets or third-party systems. 4 | 5 | Migrating to the cloud is just the start of the journey for many customers that continue to realize the ongoing benefits of the cloud through progressive modernization of their applications, data, and infrastructure. We've been helping customers modernize their Windows workloads on AWS for over 16 years and have the broadest portfolio of services, programs, and expertise to speed up the transformation of the applications that power your business. 6 | 7 | ## Overview 8 | 9 | This custom lens provides a structured approach to evaluate Microsoft workloads running on AWS across 8 key pillars/phases: 10 | 11 | 1. **Assessment** - Evaluate your current Microsoft workload environment 12 | 2. **Operational Excellence** - Optimize operations and monitoring 13 | 3. **Security** - Implement security best practices for Microsoft workloads 14 | 4. **Reliability** - Ensure high availability and disaster recovery 15 | 5. **Performance Efficiency** - Optimize performance and resource utilization 16 | 6. **Cost Optimization** - Manage and optimize costs effectively 17 | 7. **Sustainability** - Implement sustainable practices 18 | 8. **Modernization** - Plan and execute modernization strategies 19 | 20 | ## Getting Started 21 | 22 | ### Prerequisites 23 | 24 | - Access to AWS Well-Architected Tool 25 | - AWS account with appropriate permissions 26 | - Understanding of your Microsoft workload architecture 27 | 28 | ### How to Import the Custom Lens 29 | 30 | 1. **Download the Lens File** 31 | - Download the `Microsoft_On_AWS_Lens.json` file from this repository 32 | 33 | 2. **Access AWS Well-Architected Tool** 34 | - Navigate to the [AWS Well-Architected Tool](https://console.aws.amazon.com/wellarchitected/) in your AWS Console 35 | 36 | ![AWS Well-Architected Tool Console](images/wa-tool-console.png) 37 | 38 | 3. **Import Custom Lens** 39 | - Go to "Custom lenses" in the left navigation 40 | - Click "Create custom lens" 41 | - Select "Import lens" 42 | - Upload the `Microsoft_On_AWS_Lens.json` file 43 | - Review and confirm the import 44 | 45 | ![Import Custom Lens](images/import-custom-lens.png) 46 | 47 | 4. **Preview the Lens** 48 | - After importing, you can use the "Preview Experience" to explore the lens 49 | - This allows you to see all questions, pillars, and guidance without creating a formal workload 50 | - Perfect for understanding the lens structure and content 51 | 52 | ![Preview Lens Experience](images/preview-lens.png) 53 | 54 | For complete workload reviews and publishing the lens for broader use, refer to the [AWS Well-Architected Custom Lenses documentation](https://docs.aws.amazon.com/wellarchitected/latest/userguide/lenses-custom.html). 55 | 56 | ## How to Use This Lens 57 | 58 | ### Exploring the Lens Content 59 | After importing the lens, use the **Preview Experience** to: 60 | - Navigate through all 8 pillars systematically 61 | - Review questions and understand the guidance provided 62 | - Explore helpful resources and improvement plans for each question 63 | - Get familiar with the lens structure before formal assessments 64 | 65 | ![Preview Lens Interface](images/preview-interface.png) 66 | 67 | ### Key Areas to Focus On 68 | 69 | **Start with Assessment**: The Assessment pillar provides foundational questions about your current Microsoft workload environment. 70 | 71 | **Review Question Structure**: Each question includes multiple choice options, helpful resources, and improvement plans. 72 | 73 | ### Next Steps 74 | For conducting formal workload reviews, publishing the lens organization-wide, and advanced features, consult the [AWS Well-Architected Custom Lenses documentation](https://docs.aws.amazon.com/wellarchitected/latest/userguide/lenses-custom.html). 75 | 76 | ## Example Use Cases 77 | 78 | ### Scenario 1: .NET Application Migration 79 | **Situation**: Migrating a legacy .NET Framework application to AWS 80 | 81 | **Lens Application**: 82 | - **Assessment**: Use OLA to understand current resource utilization 83 | - **Modernization**: Evaluate containerization vs. lift-and-shift approaches 84 | - **Cost Optimization**: Compare licensing models (BYOL vs. License Included) 85 | - **Security**: Implement AWS security best practices for Windows workloads 86 | 87 | ### Scenario 2: SQL Server Database Modernization 88 | **Situation**: Modernizing SQL Server databases on AWS 89 | 90 | **Lens Application**: 91 | - **Assessment**: Analyze current database performance and dependencies 92 | - **Reliability**: Design for high availability with Multi-AZ deployments 93 | - **Performance**: Optimize with Amazon RDS Performance Insights 94 | - **Cost Optimization**: Right-size instances and storage 95 | 96 | ### Scenario 3: Microsoft Active Directory Integration 97 | **Situation**: Integrating on-premises AD with AWS services 98 | 99 | **Lens Application**: 100 | - **Security**: Implement AWS Managed Microsoft AD or AD Connector 101 | - **Operational Excellence**: Set up monitoring and logging 102 | - **Reliability**: Design for cross-region redundancy 103 | - **Performance**: Optimize network connectivity 104 | 105 | ## Lens Structure 106 | 107 | This custom lens contains: 108 | - **8 Pillars** covering the complete Microsoft workload lifecycle 109 | - **50+ Questions** with detailed guidance 110 | - **Multiple Choice Options** for each question 111 | - **Helpful Resources** with links to AWS documentation 112 | - **Improvement Plans** with actionable recommendations 113 | 114 | ## Best Practices 115 | 116 | 1. **Start with Assessment**: Always begin with the Assessment pillar to establish baseline understanding 117 | 2. **Involve Stakeholders**: Include application owners, security teams, and operations staff 118 | 3. **Document Decisions**: Keep track of architectural decisions and rationale 119 | 4. **Regular Reviews**: Schedule periodic reviews to ensure continuous improvement 120 | 5. **Leverage AWS Support**: Engage with AWS Solutions Architects for complex scenarios 121 | 122 | ## Contributing 123 | 124 | * [Bruno Lopes](https://www.linkedin.com/in/blopesinfo/), Sr. Specialist SA, Containers 125 | * [Carlos Felicio](https://www.linkedin.com/in/cafeliciobrz/), Sr. TAM (Partner) 126 | * [Luciano Bernardes](https://www.linkedin.com/in/lucianobernardes/), US, Specialist Sr. SA 127 | * [Vitor Euphrasio](https://www.linkedin.com/in/vitoreuphrasio/), Sr. Specialist SA, Infra Mig & Mod 128 | 129 | ## Resources 130 | 131 | ### AWS Documentation 132 | * [AWS Well-Architected Framework](https://aws.amazon.com/architecture/well-architected/) 133 | * [AWS Well-Architected Tool User Guide](https://docs.aws.amazon.com/wellarchitected/latest/userguide/) 134 | * [Announcing AWS Well-Architected Custom Lenses](https://aws.amazon.com/blogs/aws/well-architected-custom-lenses-internal-best-practices/) 135 | 136 | ### Microsoft Workloads on AWS 137 | * [Modernize Windows Workloads with AWS](https://aws.amazon.com/windows/modernization/) 138 | * [Modernization pathways for a legacy .NET Framework monolithic application on AWS](https://aws.amazon.com/blogs/architecture/modernization-pathways-for-a-legacy-net-framework-monolithic-application-on-aws/) 139 | * [AWS Optimization and Licensing Assessment](https://aws.amazon.com/windows/optimization-and-licensing-assessment/) 140 | 141 | ### Migration and Modernization 142 | * [AWS Migration Hub](https://aws.amazon.com/migration-hub/) 143 | * [AWS Application Discovery Service](https://aws.amazon.com/application-discovery/) 144 | * [AWS Database Migration Service](https://aws.amazon.com/dms/) 145 | 146 | ## Support 147 | 148 | For questions or issues with this custom lens: 149 | 1. Check the [AWS Well-Architected Tool documentation](https://docs.aws.amazon.com/wellarchitected/) 150 | 2. Contact your AWS Solutions Architect 151 | 3. Open an issue in this repository 152 | -------------------------------------------------------------------------------- /MSFT-Lens/images/.gitkeep: -------------------------------------------------------------------------------- 1 | # This file ensures the images directory is tracked by git 2 | # Remove this file once you add actual screenshots 3 | -------------------------------------------------------------------------------- /MSFT-Lens/images/import-custom-lens.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/custom-lens-wa-hub/29b26f41ba31c25185309b90cbc2661342c8377d/MSFT-Lens/images/import-custom-lens.png -------------------------------------------------------------------------------- /MSFT-Lens/images/preview-interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/custom-lens-wa-hub/29b26f41ba31c25185309b90cbc2661342c8377d/MSFT-Lens/images/preview-interface.png -------------------------------------------------------------------------------- /MSFT-Lens/images/preview-lens.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/custom-lens-wa-hub/29b26f41ba31c25185309b90cbc2661342c8377d/MSFT-Lens/images/preview-lens.png -------------------------------------------------------------------------------- /MSFT-Lens/images/wa-tool-console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/custom-lens-wa-hub/29b26f41ba31c25185309b90cbc2661342c8377d/MSFT-Lens/images/wa-tool-console.png -------------------------------------------------------------------------------- /ORR-Lens/ORR-Whitepaper-Sample-PUBLISHED.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2021-11-01", 3 | "name": "AWS Operational Readiness Review Whitepaper Sample", 4 | "description": "This Well Architected Lense is an adaptation of the AWS Operational Readiness Review program-- a set of questions designed to capture and help correct common failure-points. It is adapted for the ORR whitepaper in order to aid customer as a sample reference question set to further customize.", 5 | "pillars": [ 6 | { 7 | "id": "architecture", 8 | "name": "01 - Architecture", 9 | "questions": [ 10 | { 11 | "id": "architecture_architecture_diagram", 12 | "title": "Architecture Diagram (H)", 13 | "description": "Please provide a diagram of your system or application architecture, both at the infrastructure level and at the data/network flow level. Note the locations in the notes section below.", 14 | "choices": [ 15 | { 16 | "id": "architecture_diagram", 17 | "title": "Architecture design to reduce the blast radius of failures provided", 18 | "helpfulResource": { 19 | "displayText": "An architecture diagram shows the multi-(AZ/regional) setup of the underlying infrastructure, relevant ELBs, ASGs, how they are split across AZ's, etc." 20 | }, 21 | "improvementPlan": { 22 | "displayText": "A review of the architecture diagram is highly recommended prior to go-live in order to sanity check there are no visible single points of failures. " 23 | } 24 | }, 25 | { 26 | "id": "data_flow_diagram", 27 | "title": "Data/network flow diagram provided.", 28 | "helpfulResource": { 29 | "displayText": "A data or network flow diagram shows the flow of data through the system in order to identify external dependencies or internal single points of failure/bottle necks to performance." 30 | }, 31 | "improvementPlan": { 32 | "displayText": "A review of the network/data flow diagram is highly recommended prior to go-live in order to ensure there are no noticeable bottlenecks or external dependencies that could impact service operation." 33 | } 34 | } 35 | ], 36 | "riskRules": [ 37 | { 38 | "condition": "data_flow_diagram && architecture_diagram", 39 | "risk": "NO_RISK" 40 | }, 41 | { 42 | "condition": "(!data_flow_diagram && architecture_diagram) || (!architecture_diagram && data_flow_diagram)", 43 | "risk": "MEDIUM_RISK" 44 | }, 45 | { 46 | "condition": "default", 47 | "risk": "HIGH_RISK" 48 | } 49 | ] 50 | }, 51 | { 52 | "id": "architecture_api_matrix", 53 | "title": "Impacted API Matrix (H)", 54 | "description": "Please provide a table enumerating all customer-facing APIs, an explanation of what each does, and the components and dependencies of your service that it touches. Include all APIs whether they are public or private from the customer's perspective.", 55 | "choices": [ 56 | { 57 | "id": "table_provided", 58 | "title": "Matrix (or wiki link) has been provided in the notes section.", 59 | "helpfulResource": { 60 | "displayText": "The API Matrix, or a link to it, has been noted below." 61 | }, 62 | "improvementPlan": { 63 | "displayText": "Key Customer-facing APIs that are expecting high traffic should be documented along with its component pieces and dependencies, and expected traffic load if known." 64 | } 65 | } 66 | ], 67 | "riskRules": [ 68 | { 69 | "condition": "table_provided", 70 | "risk": "NO_RISK" 71 | }, 72 | { 73 | "condition": "default", 74 | "risk": "HIGH_RISK" 75 | } 76 | ] 77 | }, 78 | { 79 | "id": "architecture_failure_models", 80 | "title": "Failure Models (H)", 81 | "description": "Please construct a failure model listing soft and hard failure modes for each of your system's components and dependencies.", 82 | "choices": [ 83 | { 84 | "id": "failure_model_documented", 85 | "title": "Failure model documented.", 86 | "helpfulResource": { 87 | "displayText": "Your failure model should include columns for Component/Dependency, Failure Type, Service Impact, and Customer Impact" 88 | }, 89 | "improvementPlan": { 90 | "displayText": "Please address an outage of your service in its largest blast radius unit (a cell, an AZ, a or region) plus a total infrastructure outage in its largest blast radius (an AZ)." 91 | } 92 | }, 93 | { 94 | "id": "soft_failures_known", 95 | "title": "Soft failures known and documented.", 96 | "helpfulResource": { 97 | "displayText": "Soft failures are failures where an application is partially operating; for example high latency rendering a high percentage of response errors." 98 | }, 99 | "improvementPlan": { 100 | "displayText": "Known soft failure scenarios for the application or workload should be documented and discussed in order to identify mitigations." 101 | } 102 | }, 103 | { 104 | "id": "soft_failures_detection", 105 | "title": "Soft failures conditions are actively monitored for occurrence.", 106 | "helpfulResource": { 107 | "displayText": "Soft failures such as high latency can be monitored through p90 / p99 transaction monitoring, response error rates can be emitted by the application. Alarms should be set for known soft-failure conditions." 108 | }, 109 | "improvementPlan": { 110 | "displayText": "Known soft failure scenarios should be discussed and documented and alarms set to detect such scenarios, if possible." 111 | } 112 | }, 113 | { 114 | "id": "soft_failures_runbooks", 115 | "title": "Known soft failure conditions have documented mitigations or playbooks", 116 | "helpfulResource": { 117 | "displayText": "On-call engineers should ideally not be scrambling to 'figure out' what to do. If a failure condition is known ahead of time, playbooks should be written that can be followed in the event of occurrence." 118 | }, 119 | "improvementPlan": { 120 | "displayText": "Known soft failure scenarios should have runbooks built that on-call engineers can leverage in the event of occurrence. Links to the relevant runbooks should be added to the detection alarms." 121 | } 122 | }, 123 | { 124 | "id": "hard_failures_known", 125 | "title": "Hard failures known and documented", 126 | "helpfulResource": { 127 | "displayText": "Hard failures are failures where an application is entirely non-operational." 128 | }, 129 | "improvementPlan": { 130 | "displayText": "Known hard failure scenarios should be discussed and documented and alarms set to detect such scenarios, if possible." 131 | } 132 | }, 133 | { 134 | "id": "hard_failures_detection", 135 | "title": "Known hard failure conditions are actively monitored for occurrence.", 136 | "helpfulResource": { 137 | "displayText": "Hard failure conditions can be monitored through things like health checks, or instance status checks." 138 | }, 139 | "improvementPlan": { 140 | "displayText": "Known hard failure scenarios should be discussed and documented and alarms set to detect such scenarios, if possible." 141 | } 142 | }, 143 | { 144 | "id": "hard_failures_runbooks", 145 | "title": "Known hard failure conditions have documented mitigations or playbooks", 146 | "helpfulResource": { 147 | "displayText": "On-call engineers should ideally not be scrambling to 'figure out' what to do. If a failure condition is known ahead of time, playbooks should be written that can be followed in the event of occurrence." 148 | }, 149 | "improvementPlan": { 150 | "displayText": "Known hard failure scenarios should have playbooks built that on-call engineers can leverage in the event of occurrence. Links to the relevant playbooks should be added to the detection alarms." 151 | } 152 | } 153 | ], 154 | "riskRules": [ 155 | { 156 | "condition": "failure_model_documented && hard_failures_runbooks && hard_failures_detection && hard_failures_known && soft_failures_runbooks && soft_failures_detection && soft_failures_known", 157 | "risk": "NO_RISK" 158 | }, 159 | { 160 | "condition": "failure_model_documented && hard_failures_known && soft_failures_known && (!hard_failures_detection || !soft_failures_detection || !hard_failures_runbooks || !soft_failures_runbooks)", 161 | "risk": "MEDIUM_RISK" 162 | }, 163 | { 164 | "condition": "default", 165 | "risk": "HIGH_RISK" 166 | } 167 | ] 168 | }, 169 | { 170 | "id": "architecture_dependency_retry", 171 | "title": "Dependency Retry/Backoff (M)", 172 | "description": "What is the retry/back-off strategy for each of your service's dependencies?", 173 | "choices": [ 174 | { 175 | "id": "sync_API", 176 | "title": "Safety for code calling out to dependencies within the context of a Sync API established", 177 | "helpfulResource": { 178 | "displayText": "For dependency calls made within the context of a sync API call, you should generally retry once immediately, then give up." 179 | }, 180 | "improvementPlan": { 181 | "displayText": "For dependency calls made within the context of a sync API call, you should generally retry once immediately, then give up." 182 | } 183 | }, 184 | { 185 | "id": "aws_services", 186 | "title": "Safety for code calling out to dependencies within the context of an async API call established", 187 | "helpfulResource": { 188 | "displayText": "It is an AWS best practice, and a required practice for large applications, to properly catch ThrottlingExceptions and implement retry, backoff, and jitter strategies.", 189 | "Url": "https://aws.amazon.com/builders-library/timeouts-retries-and-backoff-with-jitter/" 190 | }, 191 | "improvementPlan": { 192 | "displayText": "While some of the AWS SDKs will properly capture ThrottlingExceptions and automatically handle retries and backoff conditions, those retries are limited to a set number of attempts and those errors could still be raised to the application. Those errors should be caught and handled appropriately with fail-safe code paths." 193 | } 194 | }, 195 | { 196 | "id": "third_parties", 197 | "title": "Throttling techniques to defensively protect your service from customers established", 198 | "helpfulResource": { 199 | "displayText": "Are you using distributed throttling on your front-end? Do you have pre-authentication throttles? Are limits on request size enforced before authentication?", 200 | "url": "https://aws.amazon.com/builders-library/fairness-in-multi-tenant-systems" 201 | }, 202 | "improvementPlan": { 203 | "displayText": "Throttling techniques to defensively protect your service from customers established." 204 | } 205 | } 206 | ], 207 | "riskRules": [ 208 | { 209 | "condition": "sync_API && aws_services && third_parties", 210 | "risk": "NO_RISK" 211 | }, 212 | { 213 | "condition": "default", 214 | "risk": "HIGH_RISK" 215 | } 216 | ] 217 | }, 218 | { 219 | "id": "architecture_retry_timeouts", 220 | "title": "Retries & Socket Timeouts (H)", 221 | "description": "Have you intentionally set appropriate retry and socket timeout configuration for all SDK usage?", 222 | "choices": [ 223 | { 224 | "id": "reviewed", 225 | "title": "Retry count and socket timeouts reviewed.", 226 | "helpfulResource": { 227 | "displayText": "Not setting the appropriate retry and timeout logic for your AWS SDK clients can lead to a thread pool with all threads engaged in dependency operations.", 228 | "url": "https://docs.aws.amazon.com/sdkref/latest/guide/feature-retry-behavior.html" 229 | }, 230 | "improvementPlan": { 231 | "displayText": "It's better to fail fast and a return a response to the client for dependency calls made within the context of sync calls from customers to let the client decide how and when to retry then timeout customer requests." 232 | } 233 | } 234 | ], 235 | "riskRules": [ 236 | { 237 | "condition": "reviewed", 238 | "risk": "NO_RISK" 239 | }, 240 | { 241 | "condition": "default", 242 | "risk": "HIGH_RISK" 243 | } 244 | ] 245 | }, 246 | { 247 | "id": "architecture_rpo_rto", 248 | "title": "Recovery Objectives (M)", 249 | "description": "If your service was temporarily deactivated or shut down, what is your RTO for restarting your service?", 250 | "choices": [ 251 | { 252 | "id": "rto_defined", 253 | "title": "RTO has been defined", 254 | "helpfulResource": { 255 | "displayText": "Recovery Time Objective is a measure of the amount of acceptable downtime per incident, for example five minutes, 30 minutes, an hour, a day, etc." 256 | }, 257 | "improvementPlan": { 258 | "displayText": "The application and operations teams should work together with the business team in order to identify a supportable recovery time objective based upon design and end customer agreements." 259 | } 260 | }, 261 | { 262 | "id": "rto_verified", 263 | "title": "RTO has been verified through a dry-run or game-day exercise", 264 | "helpfulResource": { 265 | "displayText": "A theoretical RTO is a good starting point, but until the teams have verified their ability to support it, it is difficult to rely upon." 266 | }, 267 | "improvementPlan": { 268 | "displayText": "A game-day, dry-run or similar exercise should be used to ensure that all relevant teams know what actions need to be taken in the event of an outage. Drafting a written runbook may be useful for documentation purposes. A hot or cold stand-by environment may also be useful in order to achieve faster RTO by evacuating the primary environment. Also consider if there are any off-box dependencies that are mandatory for a restart. Confirm that there are no circular dependencies." 269 | } 270 | } 271 | ], 272 | "riskRules": [ 273 | { 274 | "condition": "rto_defined && rto_verified", 275 | "risk": "NO_RISK" 276 | }, 277 | { 278 | "condition": "(rto_defined) && (!rto_defined)", 279 | "risk": "MEDIUM_RISK" 280 | }, 281 | { 282 | "condition": "default", 283 | "risk": "HIGH_RISK" 284 | } 285 | ] 286 | } 287 | ] 288 | }, 289 | { 290 | "id": "release_quality", 291 | "name": "02 - Release Quality", 292 | "questions": [ 293 | { 294 | "id": "releases_deployment_rollback", 295 | "title": "Automated Deployment Rollback (M)", 296 | "description": "Do your customer impacting deployments automatically rollback incorrect deployments before they breach internal SLAs?", 297 | "choices": [ 298 | { 299 | "id": "manual_rollback", 300 | "title": "Manual rollbacks can be initiated by operators", 301 | "helpfulResource": { 302 | "displayText": "Simple rollback mechanisms allow for operators to make the call on whether a given deployment is going to succeed or not after problems arise." 303 | }, 304 | "improvementPlan": { 305 | "displayText": "If manual rollback is not being used currently because it is not supported, then the fail-forward plan should be clearly documented " 306 | } 307 | }, 308 | { 309 | "id": "auto_rollback", 310 | "title": "Automatic rollbacks are initiated by monitoring systems.", 311 | "helpfulResource": { 312 | "displayText": "Automated rollback mechanisms free up operator time and allows for faster response to a deployment that is not going as planned. This requires deployment metrics to be configured, such as canary alarms. See example of automatic rollback on CloudWatch alarm.", 313 | "url": "http://docs.aws.amazon.com/codedeploy/latest/userguide/deployments-rollback-and-redeploy.html#deployments-rollback-and-redeploy-automatic-rollbacks" 314 | }, 315 | "improvementPlan": { 316 | "displayText": "In blue/green deployment environments, automatically reverting back to the previous environment can be a safe choice in order to minimize user-impact. Assuming that the app supports rolling back to a previous version, discuss as a team what parts of the manual steps can be turned into automation and how that automation can be triggered." 317 | } 318 | } 319 | ], 320 | "riskRules": [ 321 | { 322 | "condition": "auto_rollback", 323 | "risk": "NO_RISK" 324 | }, 325 | { 326 | "condition": "manual_rollback", 327 | "risk": "MEDIUM_RISK" 328 | }, 329 | { 330 | "condition": "default", 331 | "risk": "HIGH_RISK" 332 | } 333 | ] 334 | }, 335 | { 336 | "id": "releases_validation", 337 | "title": "Deployment Validation (L)", 338 | "description": "Do your customer impacting deployments run on-host validation tests?", 339 | "choices": [ 340 | { 341 | "id": "risk_mitigated", 342 | "title": "On-host validations run", 343 | "helpfulResource": { 344 | "displayText": "Post-deployment validation is critical to ensure that the software which was deployed correctly executes and functions as intended. Such as certain files being in place, services in running states, configuration as expected. For example, verify that the software has started successfully and is responding correctly to health checks on local host before re-registering with the load balancer. See example of CodeDeploy lifecycle hook to perform post deploy validations.", 345 | "url": "http://docs.aws.amazon.com/codedeploy/latest/userguide/reference-appspec-file-structure-hooks.html#reference-appspec-file-structure-hooks-list" 346 | }, 347 | "improvementPlan": { 348 | "displayText": "Depending on deployment methodology (updating existing instances vs blue/green), look to adding a validation step to existing deployment mechanisms to validate seemingly successful deployments. In the event an error is found, that error needs to be propagated to operators and the deployment failed." 349 | } 350 | } 351 | ], 352 | "riskRules": [ 353 | { 354 | "condition": "risk_mitigated", 355 | "risk": "NO_RISK" 356 | }, 357 | { 358 | "condition": "default", 359 | "risk": "HIGH_RISK" 360 | } 361 | ] 362 | }, 363 | { 364 | "id": "releases_change_management", 365 | "title": "Change Management (H)", 366 | "description": "Do you have a mechanism to ensure all code changes (software, configuration, infrastructure, and operational tooling) to production systems are reviewed and approved by someone other than the code author?", 367 | "choices": [ 368 | { 369 | "id": "risk_mitigated", 370 | "title": "The risk presented here has been fully mitigated with no lingering questions or concerns that need to be followed up on.", 371 | "helpfulResource": { 372 | "displayText": "Add a manual approval action into your CodePipeline pipelines and limit permissions to a set of approvers. See example.", 373 | "url": "https://docs.aws.amazon.com/codepipeline/latest/userguide/approvals-action-add.html" 374 | }, 375 | "improvementPlan": { 376 | "displayText": "Add a manual approval action into the CICD pipeline if appropriate." 377 | } 378 | } 379 | ], 380 | "riskRules": [ 381 | { 382 | "condition": "risk_mitigated", 383 | "risk": "NO_RISK" 384 | }, 385 | { 386 | "condition": "default", 387 | "risk": "HIGH_RISK" 388 | } 389 | ] 390 | }, 391 | { 392 | "id": "architecture_load_testing", 393 | "title": "Load Testing (H)", 394 | "description": "Have you performed multiple rounds of load testing to discover and address any unexpected performance bottlenecks and establish known breaking points?", 395 | "choices": [ 396 | { 397 | "id": "lb_1x", 398 | "title": "Tested to expected (1X) load / capacity requirement.", 399 | "helpfulResource": { 400 | "displayText": "Load test is done to level of or beyond 1x the expected traffic load / traffic requirements" 401 | }, 402 | "improvementPlan": { 403 | "displayText": "Assuming that load levels are known, the application stack should be tested to the expected load in order to ensure stability." 404 | } 405 | }, 406 | { 407 | "id": "lb_2x", 408 | "title": "Test to two times (2X) expected load / capacity requirement.", 409 | "helpfulResource": { 410 | "displayText": "Load test is done to level of or beyond 2x the expected traffic load / traffic requirements" 411 | }, 412 | "improvementPlan": { 413 | "displayText": "Assuming that load levels are known, the application stack should be tested to 2X the expected load in order to ensure stability." 414 | } 415 | }, 416 | { 417 | "id": "lb_xx", 418 | "title": "Load test performed to break-point.", 419 | "helpfulResource": { 420 | "displayText": "You should assume that you will find the breaking point of your service multiple times, iteratively addressing uncovered performance bottlenecks and repeating the load test. It should consider a small number of very large customers, a large number of very small customers, and sinusoidal load." 421 | }, 422 | "improvementPlan": { 423 | "displayText": "Conduct a load test that simulates a surge of traffic from a single customer to validate behavior under this kind of load. Conduct one large scale load test against your production environment (a) before you launch, and (b) subsequently per quarter, to validate proper scaling as you grow (or for any potential peak usage)." 424 | } 425 | } 426 | ], 427 | "riskRules": [ 428 | { 429 | "condition": "lb_1x && lb_2x && lb_xx", 430 | "risk": "NO_RISK" 431 | }, 432 | { 433 | "condition": "lb_1x && (!lb_2x || !lb_xx)", 434 | "risk": "MEDIUM_RISK" 435 | }, 436 | { 437 | "condition": "default", 438 | "risk": "HIGH_RISK" 439 | } 440 | ] 441 | }, 442 | { 443 | "id": "releases_canary_errors", 444 | "title": "Independent Canary Errors (L)", 445 | "description": "Do you publish your canary synthetics errors to an independent metric? Subsequently, do you alarm on this metric?", 446 | "choices": [ 447 | { 448 | "id": "canaries_exist", 449 | "title": "Canary alarms have been configured", 450 | "helpfulResource": { 451 | "displayText": "Ensure that your canary synthetics errors are published to their own metrics, as opposed to being combined with all errors. This allows your service to alarm on an individual canary error rate." 452 | }, 453 | "improvementPlan": { 454 | "displayText": "Canary alarms should include things such as heartbeat checks, broken link detection." 455 | } 456 | }, 457 | { 458 | "id": "canary_ops", 459 | "title": "Canary Metrics have alarms tied to them that engage the operations team.", 460 | "helpfulResource": { 461 | "displayText": "Canary alarms primary purpose is to proactively catch downward trends in application or service health. As such, those alarms should notify operator teams so that they can investigate potential issues." 462 | }, 463 | "improvementPlan": { 464 | "displayText": "Canaries failing should be set to engage relevant teams (content teams for broken links, operators for heartbeats / API failures, etc) at an appropriate severity level. Such as tickets for broken links, or perhaps paging for failed heartbeats. " 465 | } 466 | } 467 | ], 468 | "riskRules": [ 469 | { 470 | "condition": "canaries_exist && canary_ops", 471 | "risk": "NO_RISK" 472 | }, 473 | { 474 | "condition": "canaries_exist && !canary_ops", 475 | "risk": "MEDIUM_RISK" 476 | }, 477 | { 478 | "condition": "default", 479 | "risk": "HIGH_RISK" 480 | } 481 | ] 482 | } 483 | ] 484 | }, 485 | { 486 | "id": "event_management", 487 | "name": "03 - Event Management", 488 | "questions": [ 489 | { 490 | "id": "event_gameday", 491 | "title": "Event preparedness through game days (H)", 492 | "description": "Have you performed a gameday to verify that your service's monitoring and alarming function as expected and your on-call engineers are engaged and able to rapidly diagnose and remediate failures?", 493 | "choices": [ 494 | { 495 | "id": "gameday_practiced", 496 | "title": "Gameday conducted and lessons learnt documented/procedures updated.", 497 | "helpfulResource": { 498 | "displayText": "While incidents are opportunities to measure, report and learn from the effectiveness of the established practices, failure modes and how personnel and systems will respond are hard to predict. Test your failure scenarios and validate your understanding of their impact. " 499 | }, 500 | "improvementPlan": { 501 | "displayText": "Test your response procedures to ensure that they are effective, and that teams are familiar with their execution." 502 | } 503 | } 504 | ], 505 | "riskRules": [ 506 | { 507 | "condition": "gameday_practiced", 508 | "risk": "NO_RISK" 509 | }, 510 | { 511 | "condition": "default", 512 | "risk": "HIGH_RISK" 513 | } 514 | ] 515 | }, 516 | { 517 | "id": "event_canary_alarms", 518 | "title": "Independent Canary Alarms (L)", 519 | "description": "Does your canary synthetic tests detect and alarm on shallow API test failures in under five minutes?", 520 | "choices": [ 521 | { 522 | "id": "canaries_exist", 523 | "title": "Performance synthetics measure P50, P99 and P99.9s to track variability (including tail latency)", 524 | "helpfulResource": { 525 | "displayText": "Performance variability should be measured along with median performance since there are edge cases which can affect both overall performance as well as the customer perception. Understanding this variability will allow your service to improve customer experience." 526 | }, 527 | "improvementPlan": { 528 | "displayText": "Integrate business KPIs with continuous synthetic transaction testing (canaries). Canaries help verify your customer experience and discover issues before your customers do.", 529 | "url": "https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch_Synthetics_Canaries.html" 530 | } 531 | }, 532 | { 533 | "id": "canary_ops", 534 | "title": "Canary Metrics have alarms tied to them that engage the operations team.", 535 | "helpfulResource": { 536 | "displayText": "Canary alarms primary purpose is to proactively catch downward trends in application or service health. As such, those alarms should notify operator teams so that they can investigate potential issues." 537 | }, 538 | "improvementPlan": { 539 | "displayText": "Canaries failing should be set to engage relevant teams (content teams for broken links, operators for heartbeats / API failures, etc) at an appropriate severity level. Such as tickets for broken links, or perhaps paging for failed heartbeats. " 540 | } 541 | } 542 | ], 543 | "riskRules": [ 544 | { 545 | "condition": "canaries_exist && canary_ops", 546 | "risk": "NO_RISK" 547 | }, 548 | { 549 | "condition": "canaries_exist && !canary_ops", 550 | "risk": "MEDIUM_RISK" 551 | }, 552 | { 553 | "condition": "default", 554 | "risk": "HIGH_RISK" 555 | } 556 | ] 557 | }, 558 | { 559 | "id": "event_jvm_metrics", 560 | "title": "JVM Metrics & Alarms", 561 | "description": "Do you monitor (and alarm) on your JVM metrics? Do you monitor (and alarm on) your hosts for file system, inode, and file descriptor utilization? Do you monitor (and alarm on) your hosts for CPU and memory utilization?", 562 | "choices": [ 563 | { 564 | "id": "jvm_metrics", 565 | "title": "JVM metrics monitored and alarmed on as needed.", 566 | "helpfulResource": { 567 | "displayText": "Applicable JVM metrics could include, but are not limited to: memory utilization, garbage collection, heap usage, thread summary." 568 | }, 569 | "improvementPlan": { 570 | "displayText": "Relevant JVM metrics (thread summary, memory utilization, garbage collection, heap usage) should be metriced and alarmed upon. The CloudWatch Agent can collect the JVM metrics via it's collectd plugin." 571 | } 572 | }, 573 | { 574 | "id": "fs_capacity", 575 | "title": "Filesystem space capacity", 576 | "helpfulResource": { 577 | "displayText": "For non-autoscaled systems, monitoring and alarming the level free capacity on the filesystem can be critical for catching problems early and preventing a crash." 578 | }, 579 | "improvementPlan": { 580 | "displayText": "Non-autoscaled systems that ingest data should have filesystem capacity alarmed on in order to avoid a situation where the drive is filled to capacity, resulting in a crash." 581 | } 582 | }, 583 | { 584 | "id": "inode_capacity", 585 | "title": "Filesystem inode capacity", 586 | "helpfulResource": { 587 | "displayText": "Particularly EXT4 based systems may want to monitor inode capacity given that EXT4 inode capacity is hard-coded at creation time and cannot be expanded. Inode exhaustion is a concern on systems that deal with many small files rather than fewer medium to large files." 588 | }, 589 | "improvementPlan": { 590 | "displayText": "On systems which primarily deal with many small files (logging servers, for example), monitoring inode capacity can be critical for ensuring application availability. " 591 | } 592 | }, 593 | { 594 | "id": "fd_utilization", 595 | "title": "File descriptor utilization.", 596 | "helpfulResource": { 597 | "displayText": "File descriptors are a measurement of how many files can be opened at a single time. " 598 | }, 599 | "improvementPlan": { 600 | "displayText": "File descriptor utilization should be metriced and alarmed on if the server in question handles many services or if one of those services primarily opens and closes many files simultaneously." 601 | } 602 | }, 603 | { 604 | "id": "cpu_utilization", 605 | "title": "CPU utilization", 606 | "helpfulResource": { 607 | "displayText": "CPU Utilization is a default metric that is available within EC2, and its monitoring can be critical to detecting problems such as processing running out of control. This is a common scale-out metric for autoscaling groups." 608 | }, 609 | "improvementPlan": { 610 | "displayText": "Non-autoscaled systems should alarm on CPU utilization around 75-80%, or use CW Synthetics for dynamic alarms, in order to detect a server that is being overwhelmed." 611 | } 612 | }, 613 | { 614 | "id": "mem_utilization", 615 | "title": "Memory utilization", 616 | "helpfulResource": { 617 | "displayText": "Memory utilization is not a default metric that is available within EC2. Its monitoring can be critical to detecting problems such as processing running out of control. This is a common scale-out metric for autoscaling groups." 618 | }, 619 | "improvementPlan": { 620 | "displayText": "Non-autoscaled systems should alarm on memory utilization, either statically defined thresholds or CloudWatch synthetics, in order to detect memory leaks or being under-provisioned." 621 | } 622 | } 623 | ], 624 | "riskRules": [ 625 | { 626 | "condition": "jvm_metrics && mem_utilization && cpu_utilization && fd_utilization && inode_capacity && fs_capacity", 627 | "risk": "NO_RISK" 628 | }, 629 | { 630 | "condition": "jvm_metrics && (mem_utilization && cpu_utilization) && (!inode_capacity || !fs_capacity)", 631 | "risk": "MEDIUM_RISK" 632 | }, 633 | { 634 | "condition": "default", 635 | "risk": "HIGH_RISK" 636 | } 637 | ] 638 | }, 639 | { 640 | "id": "event_kpis", 641 | "title": "Operational KPIs (H)", 642 | "description": "When do you look at your weekly and operator dashboards? What operational goals or KPIs (latency, throughput/TPS, etc.) have you identified for your service?", 643 | "choices": [ 644 | { 645 | "id": "kpis_reviewed", 646 | "title": "Key metrics reviewed in Ops meeting at cadence (weekly/bi-weekly).", 647 | "helpfulResource": { 648 | "displayText": "During a regular schedule of Ops meeting review the following: 1/ Review outstanding action items from the previous week. 2/ Review last week\u2019s high severity tickets. 3/ Review pipelines for things like rollbacks or blocks. 4/ Review open customer support tickets. 5/ Review open high severity tickets. 6/ What new runbook entries were added this week? 7/ Review the detailed metrics dashboard for one of your components. 8/ Discuss on-call rotation." 649 | }, 650 | "improvementPlan": { 651 | "displayText": "Work with business leadership to understand the KPIs that will determine whether the operational goals are being achieved or at risk thereby showing how operations is contributing towards business outcomes." 652 | } 653 | }, 654 | { 655 | "id": "kpis_documented", 656 | "title": "KPIs are known and documented in the notes section.", 657 | "helpfulResource": { 658 | "displayText": "Knowing your KPIs are an important piece in understanding if you are meeting the needs of the users during an event. KPIs could include uptime/availability, number of active users or session, number of transactions per second, amount of time each transaction takes or the amount of latency a user is experiencing." 659 | }, 660 | "improvementPlan": { 661 | "displayText": "Work with business leadership to understand the KPIs that will determine whether the event is a success, and then work to implement those KPIs as metrics and alarms." 662 | } 663 | } 664 | ], 665 | "riskRules": [ 666 | { 667 | "condition": "kpis_reviewed && kpis_documented", 668 | "risk": "NO_RISK" 669 | }, 670 | { 671 | "condition": "default", 672 | "risk": "HIGH_RISK" 673 | } 674 | ] 675 | }, 676 | { 677 | "id": "event_resilience_recoveries", 678 | "title": "Withstand failures & fast recoveries (M)", 679 | "description": "What resilience measures implemented when dependencies are impaired? What procedures exist for faster recoveries?", 680 | "choices": [ 681 | { 682 | "id": "runbooks_exist", 683 | "title": "Have the ability to weight workload out of an AZ within 15 minutes.", 684 | "helpfulResource": { 685 | "displayText": "Create Runbook that clearly documents the process to weight workload out of an AZ at a minimum. Ideally the steps should be automated." 686 | }, 687 | "improvementPlan": { 688 | "displayText": "Create Runbook that clearly documents the process to weight workload out of an AZ at a minimum. Ideally the steps should be automated." 689 | } 690 | }, 691 | { 692 | "id": "withstand_failure", 693 | "title": "Workload can withstand loss of AZ without customer impact", 694 | "helpfulResource": { 695 | "displayText": "Architect your workload to be statically stable during an AZ failure thereby avoiding the need to make changes or deploy new capacity in response. See statically stable article for details.", 696 | "url": "https://aws.amazon.com/builders-library/static-stability-using-availability-zones/" 697 | }, 698 | "improvementPlan": { 699 | "displayText": "Architect your workload to be statically stable during an AZ failure thereby avoiding the need to make changes or deploy new capacity in response. See statically stable article for details.", 700 | "url": "https://aws.amazon.com/builders-library/static-stability-using-availability-zones/" 701 | } 702 | } 703 | ], 704 | "riskRules": [ 705 | { 706 | "condition": "runbooks_exist && withstand_failure", 707 | "risk": "NO_RISK" 708 | }, 709 | { 710 | "condition": "default", 711 | "risk": "MEDIUM_RISK" 712 | } 713 | ] 714 | } 715 | ] 716 | } 717 | ] 718 | } -------------------------------------------------------------------------------- /ORR-Lens/README.md: -------------------------------------------------------------------------------- 1 | # Operational Readiness Review (ORR) Lens 2 | 3 | Amazon Web Services (AWS) created the Operational Readiness Review (ORR) to distill the learnings from AWS operational incidents into curated questions with best practice guidance. Refer to the [ORR Whitepaper](https://docs.aws.amazon.com/wellarchitected/latest/operational-readiness-reviews/). Within the Whitepaper [Appendix B](https://docs.aws.amazon.com/wellarchitected/latest/operational-readiness-reviews/appendix-b-example-orr-questions.html) provides example ORR questions. The following ORR lens models those questions as a template to get started quickly building an ORR practice using WA custom lens as the tool. 4 | 5 | 6 | # Customize Well-Architected Reviews using Custom Lenses Sample Questionnaire 7 | 8 | The AWS Well-Architected Tool makes it easy to create custom lenses by providing a json template that you can use. The template outlines how the lens content must be defined, and it controls how the lens is presented within the AWS WA Tool. This sample used the example questions from the ORR Whitepaper (see above) to show how to create Custom Lens in WA console from json template with the following several steps: 9 | 1. [Prepare your custom lens WA template json file](#prepare-your-custom-lens-WA-template-json-file) 10 | 2. [Navigate to the AWS WA Tool on the AWS console and create custom lens](#navigate-to-the-aws-wa-tool-on-the-aws-console-and-create-custom-lens) 11 | 3. [Publish your custom lens and attached with version tag](#publish-your-custom-lens-and-attached-with-version-tag) 12 | 4. [Review workloads using custom lenses](#review-workloads-using-custom-lenses) 13 | 14 | ## Prepare your custom lens WA template json file 15 | Feel free to download the [ORR sample custom lens](https://github.com/aws-samples/custom-lens-wa-sample/blob/main/ORR-Lens/ORR-Whitepaper-Sample-PUBLISHED.json) and edit is as needed. 16 | 17 | ## Navigate to the AWS WA Tool on the AWS console and create custom lens 18 | After you prepare your custom lens WA template json file, you can navigate to the AWS WA Tool on the AWS console and a create custom lens. 19 | ![orr-create-custom-lens](https://user-images.githubusercontent.com/3434790/187253712-ca472b7b-30bb-41f9-8cbd-db622ecea5c9.jpg) 20 | 21 | Upload your custom lens WA template json template, and submit it. 22 | ![orr-upload-json-file](https://user-images.githubusercontent.com/3434790/187254567-fa9e9fd6-4625-4dd9-98d4-89e477172b2b.jpg) 23 | 24 | ## Publish your custom lens and attached with version tag 25 | You will find your custom lens WA in draft version. 26 | ![orr-publish-custom-lens](https://user-images.githubusercontent.com/3434790/187256107-3f3db861-bd6c-4ce1-b3ae-a924cd3607bd.jpg) 27 | 28 | Publish your draft version attached with version tag. After which the status will change from DRAFT to PUBLISHED 29 | 30 | ## Review workloads using custom lenses 31 | After your custom lens WA is published, you can define a workload in your WA console to use the custom lens. 32 | ![image 7](https://user-images.githubusercontent.com/17841922/175505004-1f9026f7-c3f8-415d-92a1-747ab68f6610.png) 33 | 34 | Input your workload information, and select the newly published 'AWS Operational Readiness Review Whtiepaper Sample' custom lens. 35 | ![orr-define-workload](https://user-images.githubusercontent.com/3434790/187257574-a0cb454a-5108-42cb-b590-9e0f6a1e71ee.jpg) 36 | 37 | You can continue reviewing and answer the questions for custom lens. 38 | ![orr-review-workload-using-custom-lens](https://user-images.githubusercontent.com/3434790/187259743-689d717e-cc64-4f57-899d-2ba05b556100.jpg) 39 | 40 | Now you can review your 'Operational Readiness Review' (ORR) with customized questionnaire and practice. 41 | ![orr-workload-review-custom-lens](https://user-images.githubusercontent.com/3434790/187260519-53a034c2-7663-448b-9011-09c7d53cf0d4.jpg) 42 | -------------------------------------------------------------------------------- /OpenSearch/README.md: -------------------------------------------------------------------------------- 1 | # AWS OpenSearch Well-Architected Custom Lens 2 | 3 | - [About](#about) 4 | - [Prepare the json file](#prepare-the-json-file) 5 | - [Create custom lens on Well-Architected Tool](#create-custom-lens-on-well-architected-tool) 6 | - [Intended Audience](#intended-audience) 7 | - [Contributing](#contributing) 8 | 9 | ## About 10 | 11 | AWS OpenSearch Well-Architected Custom Lens specifies best practices in Operational Excellence, Security, Performance Efficiency, Reliability and Cost Optimization pillars. The best practices are designed for OpenSearch workload. 12 | 13 | ## Prepare the json file 14 | Prepare your custom lens Well-Architected template json file or just download the [provided recommended glue custom lens file](custom-lens-OpenSearch.json). 15 | ```json 16 | { 17 | "schemaVersion": "2021-11-01", 18 | "name": "OpenSearch Best Practice - For WA Custom Lens", 19 | "description": "Best practices for configuring OpenSearch", 20 | "pillars": [ 21 | { 22 | "id": "AOSSOPS", 23 | "name": "Operational Excellence", 24 | "questions": [ 25 | { 26 | "id": "aossops1", 27 | "title": "How do you monitor and analyze your Amazon OpenSearch domains performance?", 28 | "description": "Amazon OpenSearch Service emits performance metrics to Amazon CloudWatch. Regularly review your cluster and instance metrics and configure recommended CloudWatch alarms based on your workload performance.", 29 | "helpfulResource": { 30 | "displayText": "Amazon OpenSearch Service exposes OpenSearch error logs, search slow logs, indexing slow logs, and audit logs in Amazon CloudWatch Logs. You can enable log publishing through AWS Console, CLI, SDK, or CloudFormation.", 31 | "url": "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/createdomain-configure-slow-logs.html" 32 | }, 33 | "choices": [ 34 | { 35 | "id": "aossops1_1", 36 | "title": "Setup the recommended CloudWatch alarms for your Amazon OpenSearch Service.", 37 | "helpfulResource": { 38 | "displayText": "Understand what recommended CloudWatch alarms are for your Amazon OpenSearch Service.", 39 | "url": "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cloudwatch-alarms.html" 40 | }, 41 | "improvementPlan": { 42 | "displayText": "Setup CloudWatch alarms for Amazon OpenSearch Service.", 43 | "url": "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cloudwatch-alarms.html" 44 | } 45 | } 46 | ] 47 | } 48 | ] 49 | } 50 | ] 51 | } 52 | 53 | ``` 54 | 55 | ## Create custom lens on Well-Architected Tool 56 | After you prepare your custom lens Well-Architected template json file, you can navigate to the AWS Well-Architected Tool on the AWS console and create custom lens. 57 | ![image 1](https://user-images.githubusercontent.com/17841922/175503831-cf89ff5e-8c6e-42c7-b796-3ff91e9d8470.png) 58 | 59 | Upload the your custom lens Well-Architected template json template, and submit it. 60 | ![image 2](https://user-images.githubusercontent.com/17841922/175503996-9b734d2c-8220-4efb-b5d2-f4ad77ad0ff4.png) 61 | 62 | And then you will find your Well-Architected custom lens in draft version. 63 | ![image 3](https://github.com/user-attachments/assets/28a0a3c0-2bd0-4d2f-b652-bc7ce3e04d9e) 64 | 65 | Publish your draft version and provide a version tag. 66 | ![image 4](https://github.com/user-attachments/assets/177e88d4-d05e-40d0-9aa9-cafa6d2b317f) 67 | 68 | After your Well-Architected custom lens is published, you can define a workload in your AWS Well-Architected console. 69 | ![image 5](https://user-images.githubusercontent.com/17841922/175505004-1f9026f7-c3f8-415d-92a1-747ab68f6610.png) 70 | 71 | Input your workload information, and select the Well-Architected custom lens you just published. 72 | ![image 6](https://github.com/user-attachments/assets/065eeac4-9030-4bec-9463-be2ed81e8a35) 73 | 74 | Choose Continue reviewing for the just published OpenSearch Well-Architected custom lens. 75 | ![image 7](https://github.com/user-attachments/assets/4cc84630-ac7b-4bad-b602-de9c804d1a54) 76 | 77 | Now you can review your OpenSearch workload with OpenSearch Well-Architected Custom Lens. 78 | ![image 8](https://github.com/user-attachments/assets/788a66ed-56be-4f64-982e-108588a0d2b5) 79 | 80 | ## Intended Audience 81 | 82 | CTO, Technical Leader or technical owner for an AWS OpenSearch workload 83 | 84 | ## Contributing 85 | - Shih-Yong Wang, Manager, Solutions Architect, AWS 86 | - [Ray Wang](mailto:hsiawang@amazon.com), Solutions Architect, AWS 87 | - Ankush Agarwal, OpenSearch SME, APAC, AWS 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Custom Lens Hub 2 | 3 | ![GitHub](https://img.shields.io/github/license/aws-samples/custom-lens-wa-hub?style=flat-square) 4 | ![GitHub commit activity (branch)](https://img.shields.io/github/commit-activity/t/aws-samples/custom-lens-wa-hub?style=flat-square) 5 | ![GitHub Repo stars](https://img.shields.io/github/stars/aws-samples/custom-lens-wa-hub?style=flat-square) 6 | ![GitHub watchers](https://img.shields.io/github/watchers/aws-samples/custom-lens-wa-hub?style=flat-square) 7 | ![GitHub forks](https://img.shields.io/github/forks/aws-samples/custom-lens-wa-hub?style=flat-square) 8 | ![GitHub contributors](https://img.shields.io/github/contributors-anon/aws-samples/custom-lens-wa-hub?style=flat-square) 9 | 10 | Provide JSON file template that demonstrate how to create customize Well-Architected reviews using Custom lenses. 11 | 12 | Note: 13 | > This repository is maintained by internal volunteer, community members and domain service experts. 14 | 15 | Disclaimer: 16 | > All the sample artifacts (JSON) were contributed by domain expert for pilot preview purpose. All the pilot preview with custom lens have to be conducted by certified domain expert after customer enablement. If you are looking for official AWS Well-Architected Lenses material for specific topics, please go to https://aws.amazon.com/architecture/well-architected/. 17 | 18 | ## :books: Background 19 | [AWS Well-Architected](https://aws.amazon.com/architecture/well-architected/) helps cloud architects build secure, high-performing, resilient, and efficient infrastructure for a variety of applications and workloads. 20 | 21 | The AWS Well-Archtected Tool makes it easy to create [Custom Lenses](https://aws.amazon.com/blogs/mt/customize-well-architected-reviews-using-custom-lenses-and-the-aws-well-architected-tool/) by providing a json template that you can use. The template outlines how the lens content must be defined, and it controls how the lens is presented within the AWS WA Tool. 22 | 23 | ## :hammer_and_wrench: Setup 24 | The quickest setup to run Customs Lens need setup includes: 25 | - An [AWS account](https://portal.aws.amazon.com/gp/aws/developer/registration/index.html) 26 | - Proper [IAM User and Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id.html) to access the AWS Well-Architected Tool with the [WellArchitectedConsoleFullAccess](https://docs.aws.amazon.com/wellarchitected/latest/userguide/iam-auth-access.html) managed policy. 27 | 28 | ## :computer: Usage 29 | This repository used DynamoDB as an example to show how to create Custom Lens in WA console from json template with several step as following: 30 | 1. [Prepare your custom lens WA template json file](/DynamoDB/README.md#prepare-your-custom-lens-WA-template-json-file) 31 | 2. [Navigate to the AWS WA Tool on the AWS console and create custom lens](/DynamoDB/README.md#navigate-to-the-aws-wa-tool-on-the-aws-console-and-create-custom-lens) 32 | 3. [Publish your custom lens and attached with version tag](/DynamoDB/README.md#publish-your-custom-lens-and-attached-with-version-tag) 33 | 4. [Review workloads using custom lenses](/DynamoDB/README.md#review-workloads-using-custom-lenses) 34 | 35 | You can also reference the [Well-Architected Lab](https://wellarchitectedlabs.com/well-architectedtool/100_labs/100_custom_lenses_on_watool/) which provide step by step guideline. 36 | 37 | ## :clipboard: Repository 38 | ### Custom Lens for AWS Key Services 39 | * [DynamoDB](/DynamoDB/) 40 | * [DocumentDB](/DocumentDB/) 41 | * [Glue](/Glue/) 42 | * [OpenSearch](/OpenSearch/) 43 | * [API Gateway and Lambda](/ApiGwLambda/) 44 | * [ElastiCache](/ElastiCache/) 45 | * [ECS](/Amazon-ECS-Lens/) 46 | * [S3](/Amazon-S3-Lens/) 47 | * [Federated Learning (Flower) on SageMaker AI](/SageMaker-Flower-Lens/) 48 | * [Apache Iceberg on Amazon S3](/Iceberg-S3-Lens) 49 | 50 | ### Custom Lens for AWS Solutions 51 | * [Streaming Media Lens](/Streaming-Media-Lens/) 52 | * [Operational Readiness Review](/ORR-Lens/) 53 | * [Intelligent Document Processing Lens](/IDP-custom-lens/) 54 | * [AWS Passport Lens](/AWS-Passport/) 55 | * [SaaS Business Lens](/SaaS-Business-Lens) 56 | * [Microsoft on AWS Lens](/MSFT-Lens) 57 | 58 | ## :bookmark: Whitepaper 59 | Whitepaper to describe candidate new Custom Lenses, which you can reference as following: 60 | * [Container Build Lens](https://docs.aws.amazon.com/wellarchitected/latest/container-build-lens/container-build-lens.html) 61 | * [Amazon ECS Lens](https://docs.aws.amazon.com/AmazonECS/latest/bestpracticesguide/intro.html) 62 | * [SAP Lens](https://docs.aws.amazon.com/wellarchitected/latest/sap-lens/sap-lens.html) 63 | * [Serverless Applications Lens](https://docs.aws.amazon.com/wellarchitected/latest/serverless-applications-lens/welcome.html) 64 | * [Hybrid Networking Lens](https://docs.aws.amazon.com/wellarchitected/latest/hybrid-networking-lens/hybrid-networking-lens.html) 65 | * [Games Industry Lens](https://docs.aws.amazon.com/wellarchitected/latest/games-industry-lens/games-industry-lens.html) 66 | * [Data Analytics Lens](https://docs.aws.amazon.com/wellarchitected/latest/analytics-lens/analytics-lens.html) 67 | * [Machine Learning Lens](https://docs.aws.amazon.com/wellarchitected/latest/machine-learning-lens/machine-learning-lens.html) 68 | * [Streaming Media Lens](https://docs.aws.amazon.com/wellarchitected/latest/streaming-media-lens/streaming-media-lens.html) 69 | * [SaaS Lens](https://docs.aws.amazon.com/wellarchitected/latest/saas-lens/saas-lens.html) 70 | * [Financial Services Industry Lens](https://docs.aws.amazon.com/wellarchitected/latest/financial-services-industry-lens/welcome.html) 71 | * [IoT Lens](https://docs.aws.amazon.com/wellarchitected/latest/iot-lens/abstract-and-introduction.html) 72 | * [IoT Lens Checklist](https://docs.aws.amazon.com/wellarchitected/latest/iot-lens-checklist/overview.html) 73 | * [High Performance Computing Lens](https://docs.aws.amazon.com/wellarchitected/latest/high-performance-computing-lens/welcome.html) 74 | 75 | ## :balance_scale: License 76 | 77 | This library is licensed under the MIT-0 License. For more details, please take a look at the [LICENSE](LICENSE) file. 78 | 79 | ## :handshake: Contributing 80 | Please read our [contributing guidelines](CONTRIBUTING.md) 81 | -------------------------------------------------------------------------------- /SaaS-Business-Lens/README.md: -------------------------------------------------------------------------------- 1 | # SaaS Business Journey Custom Lens 2 | 3 | Many companies are motivated to transform into SaaS (Software as a Service) businesses, but they often struggle to navigate this journey with confidence. The mindset required for operating a SaaS enterprise differs from that of managing a traditional software company, particularly in terms of finance, organizational structure, culture, and sales approaches. A transformation of this nature often requires a fundamental shift in strategic and operational mindset. 4 | 5 | This tool is designed to help you profile your company and examine all moving parts of your business along your SaaS journey. After careful consideration, it enables you to make informed decisions among various trade-offs. The aim is to ensure that you can think comprehensively not only about the technical aspects but also about the business facets of this transformation. 6 | 7 | By using this tool, you'll be better equipped to consider all angles thoroughly, allowing for a more holistic approach to your company's SaaS transition. This custom lens can help you: 8 | 9 | * Generating concrete strategies and forecasting for near-term plan. 10 | * Making informed decisions after thorough consideration. 11 | * Rethinking the trade-offs and the type of one-way or two-way door for decisions. 12 | * Deepening cross-functions communication and enhancing mutual understanding. 13 | 14 | 15 | ## Getting started 16 | 17 | This is a custom lens for [AWS Well-Architected Tool](https://aws.amazon.com/well-architected-tool/) to assess your SaaS business journey and implement both the strategic plans and product designs that will scale over time. This lens summarizes the AWS whitepaper - [SaaS Journey Framework: Building a New SaaS Solution on AWS](https://docs.aws.amazon.com/whitepapers/latest/saas-journey-framework/saas-journey-framework.html), addresses the challenges of the SaaS movements, and guides you to accelerate delivery of the SaaS solutions. 18 | 19 | 20 | ## Upload the JSON file 21 | 22 | - Sign in to the AWS Management Console and open the [AWS Well-Architected Tool console](https://console.aws.amazon.com/wellarchitected) 23 | - Choose **Custom lenses** 24 | - Choose **Create custom lens** 25 | - Upload the [saas_biz_custom_lens.json](./saas_biz_custom_lens.json) from this repo 26 | - Choose **Submit & Preview** to preview the custom lens, or **Submit** to submit the custom lens without previewing. 27 | 28 | For detailed instructions, see [Creating a custom lens for a workload in AWS WA Tool](https://docs.aws.amazon.com/wellarchitected/latest/userguide/lenses-create.html) 29 | 30 | You should see similar view as below: 31 | ![WAView](./img/WAView.png) 32 | 33 | ## Collaborate with your team 34 | Walk through the questions with different functional roles: 35 | - Phase 1 Business Planning : Executives, product managers and financial leaders. Technical and operational teams. 36 | - Phase 2 Product Strategy and Roadmap Development : Product managers will be at the center of the effort. Technical and operational teams. Sales and marketing. 37 | - Phase 3 Minimum Viable Service (MVS) : Product teams with product managers. Leadership team plays a key role to ensure all teams are delivering to the promise of the defined state. **As you define MVS, it will determine who should participate. E.g. If you are launching a new onboarding experience, the product, sales and marketing teams should be involved.** 38 | - Phase 4 Launch / Go-To-Market : Leaders from sales, marketing, customer success and finance. Align with investor and board-level revenue expectations. 39 | 40 | The framework is a dynamic working process that is not necessarily linear. While working on the product strategy, for example, you might revisit your business case and update it. Some activities might take place simultaneously. 41 | 42 | ## Contributing 43 | Please read our [contributing guidelines](./CONTRIBUTING.md) 44 | 45 | 46 | ## License 47 | This library is licensed under the MIT-0 License. For more details, please take a look at the [LICENSE](./LICENSE) file. 48 | 49 | ## Disclaimer 50 | This document is provided for informational purposes only. It represents the current product offerings and practices from Amazon Web Services (AWS) as of the date of issue of this document, which are subject to change without notice. Customers are responsible for making their own independent assessment of the information in this document and any use of AWS products or services, each of which is provided “as is” without warranty of any kind, whether express or implied. This document does not create any warranties, representations, contractual commitments, conditions, or assurances from AWS, its affiliates, suppliers, or licensors. The responsibilities and liabilities of AWS to its customers are controlled by AWS agreements, and this document is not part of, nor does it modify, any agreement between AWS and its customers. 51 | -------------------------------------------------------------------------------- /SaaS-Business-Lens/img/WAView.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/custom-lens-wa-hub/29b26f41ba31c25185309b90cbc2661342c8377d/SaaS-Business-Lens/img/WAView.png -------------------------------------------------------------------------------- /SageMaker-Flower-Lens/README.md: -------------------------------------------------------------------------------- 1 | # Federated Learning (Flower) on SageMaker AI Best Practice Lens 2 | Federated Learning (Flower) on SageMaker AI Best Practice Lens provides best practices for federated learning using flower framework on SageMaker AI includes 4 pillars of the AWS Well-Architected Framework: Operational Excellence, Security, Performance Efficiency, Cost Optimization. 3 | 4 | ## Contributing 5 | - Ray Wang, Senior Solutions Architect, AWS 6 | 7 | ## Reviewers 8 | - Sasi Kumar, Machine Learning Engineer, Flower Labs 9 | - Dimitris Stripelis, Research Engineer, Flower Labs 10 | -------------------------------------------------------------------------------- /SageMaker-Flower-Lens/custom-lens-sagemaker-flower-v1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "schemaVersion": "2021-11-01", 3 | "name": "Federated Learning (Flower) on SageMaker AI Best Practice Lens", 4 | "description": "Best practices for configuring Federated Learning (Flower) on SageMaker AI", 5 | "pillars": [ 6 | { 7 | "id": "PERF", 8 | "name": "Performance Efficiency", 9 | "questions": [ 10 | { 11 | "id": "PERF1", 12 | "title": "Are you aware of how data quality affects model performance in federated learning?", 13 | "description": "Ensure balanced data distribution across nodes to prevent skewed training and improve model accuracy.", 14 | "choices": [ 15 | { 16 | "id": "PERF1_1", 17 | "title": "Anomaly detection in AWS Glue Data Quality", 18 | "helpfulResource": { 19 | "displayText": "Anomaly detection employs machine learning to automatically identify unusual patterns in data", 20 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-anomaly-detection.html" 21 | }, 22 | "improvementPlan": { 23 | "displayText": "AWS Glue Data Quality helping to detect issues that traditional methods might miss.", 24 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-anomaly-detection.html" 25 | } 26 | }, 27 | { 28 | "id": "PERF1_2", 29 | "title": "Automates the creation of data quality rules", 30 | "helpfulResource": { 31 | "displayText": "Improving data quality by analyzing data and suggesting useful rules without manual code writing ", 32 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-getting-started.html" 33 | }, 34 | "improvementPlan": { 35 | "displayText": "AWS Glue Data Quality allows users to generate data quality rules automatically. ", 36 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-getting-started.html" 37 | } 38 | }, 39 | { 40 | "id": "PERF1_3", 41 | "title": "Evaluate and enhance data quality in workflow", 42 | "helpfulResource": { 43 | "displayText": "Simplifying the process of identifying and addressing data issues in workflow.", 44 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-gs-studio.html" 45 | }, 46 | "improvementPlan": { 47 | "displayText": "AWS Glue Studio enables users to evaluate and monitor data quality by adding data quality nodes to visual jobs.", 48 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-gs-studio.html" 49 | } 50 | }, 51 | { 52 | "id": "PERF1_4", 53 | "title": "Integration of data quality assessments directly into ETL job", 54 | "helpfulResource": { 55 | "displayText": "AWS Glue Studio notebooks support the integration of data quality assessments directly into ETL job scripts.", 56 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-gs-studio-notebooks.html" 57 | }, 58 | "improvementPlan": { 59 | "displayText": "Using AWS Glue 3.0 and configure your notebooks to leverage the EvaluateDataQuality class for real-time data validation.", 60 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-gs-studio-notebooks.html" 61 | } 62 | }, 63 | { 64 | "id": "PERF1_5", 65 | "title": "Implementing Custom Data Quality Rules with DQDL in AWS Glue", 66 | "helpfulResource": { 67 | "displayText": "AWS Glue DQDL supports various rule types like completeness, uniqueness, and custom SQL.", 68 | "url": "https://docs.aws.amazon.com/glue/latest/dg/dqdl.html" 69 | }, 70 | "improvementPlan": { 71 | "displayText": "The Data Quality Definition Language (DQDL) in AWS Glue enables users to define custom data quality rules.", 72 | "url": "https://docs.aws.amazon.com/glue/latest/dg/dqdl.html" 73 | } 74 | }, 75 | { 76 | "id": "PERF1_6", 77 | "title": "Setting up alerts, deployments, and scheduling", 78 | "helpfulResource": { 79 | "displayText": "Setting up alerts and notifications in Amazon EventBridge integration", 80 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-alerts.html" 81 | }, 82 | "improvementPlan": { 83 | "displayText": "AWS Glue Data Quality supports the publishing of EventBridge events", 84 | "url": "https://docs.aws.amazon.com/glue/latest/dg/data-quality-alerts.html" 85 | } 86 | } 87 | ], 88 | "riskRules": [ 89 | { 90 | "condition": "PERF1_1 && PERF1_2 && PERF1_3 && PERF1_4 && PERF1_5 && PERF1_6", 91 | "risk": "NO_RISK" 92 | }, 93 | { 94 | "condition": "(!PERF1_1) || (!PERF1_2)", 95 | "risk": "HIGH_RISK" 96 | }, 97 | { 98 | "condition": "default", 99 | "risk": "MEDIUM_RISK" 100 | } 101 | ] 102 | }, 103 | { 104 | "id": "PERF2", 105 | "title": "Do you optimize resource allocation for efficient training and inference?", 106 | "description": "Chose the right compute resource for model training and inference", 107 | "choices": [ 108 | { 109 | "id": "PERF2_1", 110 | "title": "Choose instances based on model complexity and dataset size for training", 111 | "helpfulResource": { 112 | "displayText": "Regularly evaluate performance to ensure cost-effectiveness.", 113 | "url": "https://pages.awscloud.com/rs/112-TZM-766/images/AL-ML%20for%20Startups%20-%20Select%20the%20Right%20ML%20Instance.pdf" 114 | }, 115 | "improvementPlan": { 116 | "displayText": "Opt for GPU instances for deep learning, and high-memory instances for large datasets.", 117 | "url": "https://pages.awscloud.com/rs/112-TZM-766/images/AL-ML%20for%20Startups%20-%20Select%20the%20Right%20ML%20Instance.pdf" 118 | } 119 | }, 120 | { 121 | "id": "PERF2_2", 122 | "title": "Select instances that meet cost and performance needs for inference", 123 | "helpfulResource": { 124 | "displayText": "For machine learning inference tasks, differentiate based on complexity and urgency.", 125 | "url": "https://aws.amazon.com/cn/blogs/awsmarketplace/rightsizing-amazon-sagemaker-endpoints/" 126 | }, 127 | "improvementPlan": { 128 | "displayText": "Deploy specialized hardware like AWS Inferentia for scenarios demanding low-latency", 129 | "url": "https://aws.amazon.com/cn/blogs/awsmarketplace/rightsizing-amazon-sagemaker-endpoints/" 130 | } 131 | }, 132 | { 133 | "id": "PERF2_3", 134 | "title": "Consider Auto scaling in your model inference", 135 | "helpfulResource": { 136 | "displayText": "Auto scaling dynamically adjusts the number of instances provisioned for a model in response to changes in your workload", 137 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/endpoint-auto-scaling.html" 138 | }, 139 | "improvementPlan": { 140 | "displayText": "Amazon SageMaker AI supports automatic scaling (auto scaling) for your hosted models.", 141 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/endpoint-auto-scaling.html" 142 | } 143 | } 144 | ], 145 | "riskRules": [ 146 | { 147 | "condition": "PERF2_1 && PERF2_2 && PERF2_3", 148 | "risk": "NO_RISK" 149 | }, 150 | { 151 | "condition": "(!PERF2_1) || (!PERF2_2) || (!PERF2_3)", 152 | "risk": "HIGH_RISK" 153 | }, 154 | { 155 | "condition": "default", 156 | "risk": "MEDIUM_RISK" 157 | } 158 | ] 159 | }, 160 | { 161 | "id": "PERF3", 162 | "title": "What aggregation methods ensure consistent learning across nodes?", 163 | "description": "Use methods like FedAvg for convergence and variance reduction.", 164 | "choices": [ 165 | { 166 | "id": "PERF3_1", 167 | "title": "Use FedAvg to compute a weighted average of client model updates before", 168 | "helpfulResource": { 169 | "displayText": "A common approach when clients have similar data distributions and computational power, making it suitable for general federated learning applications.", 170 | "url": "https://flower.ai/docs/framework/tutorial-series-use-a-federated-learning-strategy-pytorch.html" 171 | }, 172 | "improvementPlan": { 173 | "displayText": "FedAvg is natively supported in Flower and can be implemented by setting strategy = FedAvg() when starting the server.", 174 | "url": "https://github.com/adap/flower/tree/main/src/py/flwr/server/strategy" 175 | } 176 | }, 177 | { 178 | "id": "PERF3_2", 179 | "title": "Use FedBN to reduce variance in local model updates, leading to more stable training.", 180 | "helpfulResource": { 181 | "displayText": "Works well when data across clients is highly non-IID and datasets have varying sizes, requiring variance reduction to improve convergence.", 182 | "url": "https://flower.ai/docs/framework/how-to-implement-fedbn.html" 183 | }, 184 | "improvementPlan": { 185 | "displayText": "Flower does not provide FedBN natively, but you can modify the client logic slightly and use a network that performs batch normalisation. In general, it is easy to implement any aggregation algorithm at the server by implementing a custom strategy class.", 186 | "url": "https://flower.ai/docs/framework/tutorial-series-build-a-strategy-from-scratch-pytorch.html" 187 | } 188 | }, 189 | { 190 | "id": "PERF3_3", 191 | "title": "Use FedProx to add a proximal term that regularizes local training, preventing drastic deviations.", 192 | "helpfulResource": { 193 | "displayText": "Useful when client devices have varying computational power or when data heterogeneity causes unstable training.", 194 | "url": "https://github.com/adap/flower/blob/main/src/py/flwr/server/strategy/fedprox.py" 195 | }, 196 | "improvementPlan": { 197 | "displayText": "FedProx is available out-of-the-box in the Flower Framework, which also has multiple other aggregation algorithms", 198 | "url": "https://github.com/adap/flower/blob/main/src/py/flwr/server/strategy/fedprox.py" 199 | } 200 | }, 201 | { 202 | "id": "PERF3_4", 203 | "title": "Use adaptive optimization techniques such as Adam or SGD with momentum at the server to improve global model stability.", 204 | "helpfulResource": { 205 | "displayText": "Designed for large-scale federated learning where clients have unstable convergence patterns, requiring server-side optimization.", 206 | "url": "https://github.com/adap/flower/blob/main/src/py/flwr/server/strategy/fedopt.py" 207 | }, 208 | "improvementPlan": { 209 | "displayText": "Flower supports FedOpt via FedOpt(), which allows configuring server-side optimizers.", 210 | "url": "https://github.com/adap/flower/blob/main/src/py/flwr/server/strategy/fedopt.py" 211 | } 212 | }, 213 | { 214 | "id": "PERF3_5", 215 | "title": "Use secure aggregation to protect local model updates before transmission to the server.", 216 | "helpfulResource": { 217 | "displayText": "Critical for privacy-sensitive federated learning applications, such as fraud detection and financial risk modeling, where client updates must remain confidential.", 218 | "url": "https://flower.ai/docs/framework/contributor-ref-secure-aggregation-protocols.html" 219 | }, 220 | "improvementPlan": { 221 | "displayText": "Secure Aggregation is available out-of-the-box in Flower", 222 | "url": "https://flower.ai/docs/framework/contributor-ref-secure-aggregation-protocols.html" 223 | } 224 | } 225 | ], 226 | "riskRules": [ 227 | { 228 | "condition": "PERF3_1 && PERF3_2 && PERF3_3 && PERF3_4 && PERF3_5", 229 | "risk": "NO_RISK" 230 | }, 231 | { 232 | "condition": "(!PERF3_1) || (!PERF3_4)", 233 | "risk": "HIGH_RISK" 234 | }, 235 | { 236 | "condition": "default", 237 | "risk": "MEDIUM_RISK" 238 | } 239 | ] 240 | }, 241 | { 242 | "id": "PERF4", 243 | "title": "How to communicate efficiently during the federated training process?", 244 | "description": "Use model compression techniques along with other optimisations to reduce the communication overhead in federated learning", 245 | "choices": [ 246 | { 247 | "id": "PERF4_1", 248 | "title": "Use model compression techniques along with adaptive client sampling and asynchronous updates", 249 | "helpfulResource": { 250 | "displayText": "It is recommended to optimise the communications between the server and the clients when sharing updates after each round of training. ", 251 | "url": "https://flower.ai/docs/baselines/tamuna.html" 252 | }, 253 | "improvementPlan": { 254 | "displayText": "Use techniques such as TAMUNA (available as a baseline in the Flower framework) to improve communication efficiency during the federated training process", 255 | "url": "https://flower.ai/docs/baselines/tamuna.html" 256 | } 257 | }, 258 | { 259 | "id": "PERF4_2", 260 | "title": "If finetuning large language models, use methods like parameter efficient finetuning (PEFT) to keep update size in limit", 261 | "helpfulResource": { 262 | "displayText": "Improves efficiency when the models to the trained/finetuned are very large", 263 | "url": "https://flower.ai/blog/2024-03-14-llm-flowertune-federated-llm-finetuning-with-flower/" 264 | }, 265 | "improvementPlan": { 266 | "displayText": "Finetune LLMs with efficiency improving techniques such as PEFT and LoRA", 267 | "url": "https://flower.ai/docs/examples/flowertune-llm.html" 268 | } 269 | } 270 | ], 271 | "riskRules": [ 272 | { 273 | "condition": "PERF4_1 && PERF4_2", 274 | "risk": "NO_RISK" 275 | }, 276 | { 277 | "condition": "default", 278 | "risk": "MEDIUM_RISK" 279 | } 280 | ] 281 | } 282 | ] 283 | }, 284 | { 285 | "id": "OPS", 286 | "name": "Operational Excellence", 287 | "questions": [ 288 | { 289 | "id": "OPS1", 290 | "title": "How do you collect and monitoring metric and log for your model training and inference?", 291 | "description": "You can monitor your model training and inference with SageMaker AI and Cloud Watch native integration", 292 | "choices": [ 293 | { 294 | "id": "OPS1_1", 295 | "title": "Metrics for monitoring Amazon SageMaker AI with Amazon CloudWatch", 296 | "helpfulResource": { 297 | "displayText": "You can access historical information and gain a better perspective on how your web application or service is performing.", 298 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html" 299 | }, 300 | "improvementPlan": { 301 | "displayText": "You can monitor Amazon SageMaker AI using Amazon CloudWatch, which collects raw data and processes it into readable, near real-time metrics.", 302 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html" 303 | } 304 | }, 305 | { 306 | "id": "OPS1_2", 307 | "title": "Log groups and streams that Amazon SageMaker AI sends to Amazon CloudWatch Logs", 308 | "helpfulResource": { 309 | "displayText": "You can debug your training job and model inference with CloudWatch Log", 310 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/logging-cloudwatch.html" 311 | }, 312 | "improvementPlan": { 313 | "displayText": "By default, log data is stored in CloudWatch Logs indefinitely. You can configure how long to store log data in a log group.", 314 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/logging-cloudwatch.html" 315 | } 316 | } 317 | ], 318 | "riskRules": [ 319 | { 320 | "condition": "OPS1_1 && OPS1_2", 321 | "risk": "NO_RISK" 322 | }, 323 | { 324 | "condition": "(!OPS1_1) || (!OPS1_2)", 325 | "risk": "HIGH_RISK" 326 | }, 327 | { 328 | "condition": "default", 329 | "risk": "MEDIUM_RISK" 330 | } 331 | ] 332 | }, 333 | { 334 | "id": "OPS2", 335 | "title": "How do you implement MLOps in federated learning?", 336 | "description": "One should consider all the steps in a Federated learning process for operations", 337 | "choices": [ 338 | { 339 | "id": "OPS2_1", 340 | "title": "Flower Server starts and notifies the Client to start the process", 341 | "helpfulResource": { 342 | "displayText": "Initialize and trigger a Federated Learning task and start Flower Server is the init step", 343 | "url": "https://aws.amazon.com/blogs/compute/cross-account-integration-with-amazon-sns/" 344 | }, 345 | "improvementPlan": { 346 | "displayText": "SNS can be used to send messages within a single account or to resources in different accounts to create administrative isolation.", 347 | "url": "https://aws.amazon.com/blogs/compute/cross-account-integration-with-amazon-sns/" 348 | } 349 | }, 350 | { 351 | "id": "OPS2_2", 352 | "title": "Flower Client can send notification to client team", 353 | "helpfulResource": { 354 | "displayText": "When federated learning is finished or you get some error in Training job, you can send an email to the ML team in your client side account", 355 | "url": "https://repost.aws/questions/QUM-mTUMVsRHqKNO0elOvyYA/how-to-trigger-an-event-bridge-rule-for-sagemaker-training-job-when-failed" 356 | }, 357 | "improvementPlan": { 358 | "displayText": "You can trigger an event bridge rule for sagemaker training job completed or failed with error and send notification to your team", 359 | "url": "https://repost.aws/questions/QUM-mTUMVsRHqKNO0elOvyYA/how-to-trigger-an-event-bridge-rule-for-sagemaker-training-job-when-failed" 360 | } 361 | }, 362 | { 363 | "id": "OPS2_3", 364 | "title": "Deploy and monitor global model", 365 | "helpfulResource": { 366 | "displayText": "You can ask your ML team evaluate the model performance or deploy to SageMaker AI endpoint when model training complete", 367 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-deployment.html" 368 | }, 369 | "improvementPlan": { 370 | "displayText": "You can automate model deployment to SageMaker AI when finish federated training.", 371 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-deployment.html" 372 | } 373 | } 374 | ], 375 | "riskRules": [ 376 | { 377 | "condition": "OPS2_1 && OPS2_2 && OPS2_3", 378 | "risk": "NO_RISK" 379 | }, 380 | { 381 | "condition": "(!OPS2_1)", 382 | "risk": "HIGH_RISK" 383 | }, 384 | { 385 | "condition": "default", 386 | "risk": "MEDIUM_RISK" 387 | } 388 | ] 389 | } 390 | ] 391 | }, 392 | { 393 | "id": "SEC", 394 | "name": "Security", 395 | "questions": [ 396 | { 397 | "id": "SEC1", 398 | "title": "How will you design network infrastructure for multi-environment federated learning?", 399 | "description": "Use Site to site VPN or Dedicated network connection for secure, scalable connectivity.", 400 | "choices": [ 401 | { 402 | "id": "SEC1_1", 403 | "title": "Build a secure VPN connection between AWS and Customer Network", 404 | "helpfulResource": { 405 | "displayText": "Consider a secure and quick network setup for connecting on-premise infrastructure to the cloud.", 406 | "url": "https://docs.aws.amazon.com/vpn/latest/s2svpn/VPC_VPN.html" 407 | }, 408 | "improvementPlan": { 409 | "displayText": "Use AWS Site-to-Site VPN to establish an encrypted VPN connection between on-premise infrastructure and AWS, enabling secure federated learning model updates.", 410 | "url": "https://docs.aws.amazon.com/whitepapers/latest/building-scalable-secure-multi-vpc-network-infrastructure/transit-gateway.html" 411 | } 412 | }, 413 | { 414 | "id": "SEC1_2", 415 | "title": "Build internal and cross-organization networks", 416 | "helpfulResource": { 417 | "displayText": "Build a centralized network for internal or cross-region connectivity within large enterprises.", 418 | "url": "https://docs.aws.amazon.com/whitepapers/latest/building-scalable-secure-multi-vpc-network-infrastructure/transit-gateway.html" 419 | }, 420 | "improvementPlan": { 421 | "displayText": "Use AWS Transit Gateway to efficiently manage and route traffic between multiple AWS VPCs, branch offices, and federated learning nodes across regions.", 422 | "url": "https://docs.aws.amazon.com/whitepapers/latest/building-scalable-secure-multi-vpc-network-infrastructure/transit-gateway.html" 423 | } 424 | }, 425 | { 426 | "id": "SEC1_3", 427 | "title": "Build a high-performance network connection for AWS and Customer Network", 428 | "helpfulResource": { 429 | "displayText": "Consider a high-performance hybrid cloud setup for secure, low-latency data transfer between on-premise and cloud environments.", 430 | "url": "https://docs.aws.amazon.com/whitepapers/latest/aws-vpc-connectivity-options/aws-direct-connect.html" 431 | }, 432 | "improvementPlan": { 433 | "displayText": "Use AWS Direct Connect to establish a private, low-latency, high-bandwidth connection for stable and secure model training and updates in a federated learning architecture.", 434 | "url": "https://docs.aws.amazon.com/whitepapers/latest/aws-vpc-connectivity-options/aws-direct-connect.html" 435 | } 436 | }, 437 | { 438 | "id": "SEC1_4", 439 | "title": "Ensure TLS connections are enabled for Flower internal communications", 440 | "helpfulResource": { 441 | "displayText": "Make the communications between the analyst and the server + the server and the client secure ", 442 | "url": "https://flower.ai/docs/framework/how-to-enable-tls-connections.html" 443 | }, 444 | "improvementPlan": { 445 | "displayText": "Use the TLS-enabled Flower CLI and ensure client to server communication happens with TLS.", 446 | "url": "https://flower.ai/docs/framework/how-to-enable-tls-connections.html" 447 | } 448 | } 449 | ], 450 | "riskRules": [ 451 | { 452 | "condition": "SEC1_1 && SEC1_2 && SEC1_3 && SEC1_4", 453 | "risk": "NO_RISK" 454 | }, 455 | { 456 | "condition": "(!SEC1_1) || (!SEC1_3) || (!SEC1_4)", 457 | "risk": "HIGH_RISK" 458 | }, 459 | { 460 | "condition": "default", 461 | "risk": "MEDIUM_RISK" 462 | } 463 | ] 464 | }, 465 | { 466 | "id": "SEC2", 467 | "title": "How do security and privacy measures influence the reliability of federated learning systems?", 468 | "description": "Implement robust security measures to protect data during transmission, and use privacy-preserving techniques like differential privacy.", 469 | "choices": [ 470 | { 471 | "id": "SEC2_1", 472 | "title": "Use Partial Homomorphic Encryption (PHE) for faster operations or Fully Homomorphic Encryption (FHE) for complex computations.", 473 | "helpfulResource": { 474 | "displayText": "Optimize computational overhead by selecting the appropriate HE scheme.", 475 | "url": "https://www.youtube.com/watch?v=rgDx_o0YWR8" 476 | }, 477 | "improvementPlan": { 478 | "displayText": "Integrate HE within the Flower framework to manage encrypted computations across clients.", 479 | "url": "https://www.youtube.com/watch?v=rgDx_o0YWR8" 480 | } 481 | }, 482 | { 483 | "id": "SEC2_2", 484 | "title": "Consider Secure Multi-Party Computation (SMPC) in your federated learning process.", 485 | "helpfulResource": { 486 | "displayText": "Prioritize network security and careful handling of computation values." 487 | }, 488 | "improvementPlan": { 489 | "displayText": "Incorporate SMPC into the Flower framework to handle secure computations in a distributed environment." 490 | } 491 | }, 492 | { 493 | "id": "SEC2_3", 494 | "title": "Implement cryptographic techniques for secure model update aggregation.", 495 | "helpfulResource": { 496 | "displayText": "Enhance privacy with differential privacy by adding noise to the model updates.", 497 | "url": "https://arxiv.org/pdf/2205.06117" 498 | }, 499 | "improvementPlan": { 500 | "displayText": "Use TensorFlow Federated or PyTorch for built-in secure aggregation support.", 501 | "url": "https://arxiv.org/pdf/2205.06117" 502 | } 503 | } 504 | ], 505 | "riskRules": [ 506 | { 507 | "condition": "SEC2_1 && SEC2_2 && SEC2_3", 508 | "risk": "NO_RISK" 509 | }, 510 | { 511 | "condition": "(!SEC2_1) || (!SEC2_2) || (!SEC2_3)", 512 | "risk": "HIGH_RISK" 513 | }, 514 | { 515 | "condition": "default", 516 | "risk": "MEDIUM_RISK" 517 | } 518 | ] 519 | } 520 | ] 521 | }, 522 | { 523 | "id": "COST", 524 | "name": "Cost Optimization", 525 | "questions": [ 526 | { 527 | "id": "COST1", 528 | "title": "Have you considered EC2 Spot Instances for model training, and what are your concerns about interruptions?", 529 | "description": "Utilize EC2 Spot Instances for training models to reduce costs.", 530 | "choices": [ 531 | { 532 | "id": "COST1_1", 533 | "title": "Consider Spot instance for your Federated Learning Training job", 534 | "helpfulResource": { 535 | "displayText": "Managed spot training can optimize the cost of training models up to 90% over on-demand instances. SageMaker AI manages the Spot interruptions on your behalf.", 536 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html" 537 | }, 538 | "improvementPlan": { 539 | "displayText": "Amazon SageMaker AI makes it easy to train machine learning models using managed Amazon EC2 Spot instances.", 540 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html" 541 | } 542 | }, 543 | { 544 | "id": "COST1_2", 545 | "title": "Handle your training job interruption with checkpoint", 546 | "helpfulResource": { 547 | "displayText": "Checkpoints are snapshots of the model and can be configured by the callback functions of ML frameworks.", 548 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html" 549 | }, 550 | "improvementPlan": { 551 | "displayText": "Use checkpoints in Amazon SageMaker AI to save the state of machine learning (ML) models during training.", 552 | "url": "https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html" 553 | } 554 | } 555 | ], 556 | "riskRules": [ 557 | { 558 | "condition": "COST1_1 && COST1_2", 559 | "risk": "NO_RISK" 560 | }, 561 | { 562 | "condition": "(!COST1_1) || (!COST1_2)", 563 | "risk": "HIGH_RISK" 564 | }, 565 | { 566 | "condition": "default", 567 | "risk": "MEDIUM_RISK" 568 | } 569 | ] 570 | }, 571 | { 572 | "id": "COST2", 573 | "title": "Do you fully understand the pricing model for Amazon SageMaker AI?", 574 | "description": "SageMaker pricing depends on instance type, training time, inference, storage, and services.", 575 | "choices": [ 576 | { 577 | "id": "COST2_1", 578 | "title": "Understand the SageMaker AI pricing model", 579 | "helpfulResource": { 580 | "displayText": "SageMaker AI have different pricing strategy for different feature you engage in model training and inference.", 581 | "url": "https://aws.amazon.com/sagemaker/pricing/" 582 | }, 583 | "improvementPlan": { 584 | "displayText": "SageMaker AI pricing page provide more information. ", 585 | "url": "https://aws.amazon.com/sagemaker/pricing/" 586 | } 587 | }, 588 | { 589 | "id": "COST2_2", 590 | "title": "Consider SageMaker Saving Plans for your model training and inference", 591 | "helpfulResource": { 592 | "displayText": "These plans automatically apply to the usage of eligible to SageMaker training and real-time infernece", 593 | "url": "https://aws.amazon.com/savingsplans/ml-pricing/" 594 | }, 595 | "improvementPlan": { 596 | "displayText": "Amazon SageMaker Savings Plans provide the most flexibility and help to reduce your costs by up to 64%.", 597 | "url": "https://aws.amazon.com/savingsplans/ml-pricing/" 598 | } 599 | } 600 | ], 601 | "riskRules": [ 602 | { 603 | "condition": "COST2_1 && COST2_2", 604 | "risk": "NO_RISK" 605 | }, 606 | { 607 | "condition": "(!COST2_1)", 608 | "risk": "HIGH_RISK" 609 | }, 610 | { 611 | "condition": "default", 612 | "risk": "MEDIUM_RISK" 613 | } 614 | ] 615 | } 616 | ] 617 | } 618 | ] 619 | } -------------------------------------------------------------------------------- /Streaming-Media-Lens/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 |

5 | 6 |

AWS Streaming Media Well-Architected Custom Lens

7 | 8 |
9 | 10 | [![Status](https://img.shields.io/badge/status-active-success.svg?style=flat-square)]() 11 | ![GitHub](https://img.shields.io/github/license/aws-samples/custom-lens-wa-hub?style=flat-square) 12 | 13 |
14 | 15 | --- 16 | ## 📝 Table of Contents 17 | 18 | - [About](#about) 19 | - [How to create custom lens on Well-Architected Tool](#getting_started) 20 | - [Report Demo](#demo) 21 | - [Contributing](#authors) 22 | - [Acknowledgments](#acknowledgement) 23 | 24 | ## 🧐 About
25 | 26 | AWS Streaming Media Well-Architected Custom Lens specifies best practices in Operational Excellence, Security, Performance Efficiency, Reliability and Cost Optimization pillars. The best practices are designed for Media Streaming workload. 27 | 28 | The Streaming Media Lens five pillars questionnaire is based on the [Streaming Media Lens whitepaper](https://docs.aws.amazon.com/wellarchitected/latest/streaming-media-lens/streaming-media-lens.html), more detailed information please reference back to the AWS official whitepaper. 29 | 30 | ### Intended Audience 31 | 32 | CTO, Technical Leader or technical owner for an AWS Media Streaming workload 33 | 34 | 35 | 36 | ## 🏁 How to create custom lens on Well-Architected Tool 37 | 38 | ### Step 1 39 | 40 | Download the latest custom lens from *aws-samples* > *custom-lens-wa-hub* > [Streaming-Media-Lens](https://github.com/aws-samples/custom-lens-wa-hub/tree/main/Streaming-Media-Lens), download the latest custom lens json file. 41 | 42 | ![圖片1](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/12beeb72-9e91-4ab9-ac06-e197fcca4ef9) 43 | 44 | 45 | ### Step 2 46 | 47 | Go to *AWS Well-Architected Tool console* > *Custom lenses* > Create custom lens. 48 | 49 | ![圖片2](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/165cc83f-6725-49b6-8359-1bb1402695c6) 50 | 51 | 52 | ### Step 3 53 | 54 | Upload JSON file you just download in *Step 1* and Submit. 55 | 56 | ![圖片3](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/c4949cd1-8127-40c4-a3d5-091bb0c6db5a) 57 | 58 | 59 | ### Step 4 60 | 61 | Choose the Lens you just created (DRAFT) and go to *Actions* > *Publish lens*, and input the Version name you want to publish. 62 | 63 | ![圖片4](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/a4513bbb-6fb2-4817-93e0-dab993a63f59) 64 | 65 | ![圖片5](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/36f7834b-2be6-4609-98a3-d7a5e08ad4b8) 66 | 67 | ![圖片6](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/11cafe44-1b87-425a-97f9-7f1804631099) 68 | 69 | Once the custom lens published, you will see it shows up on Custom lenses dashboard and show it Status as *“PUBLISHED”*. 70 | 71 | 72 | ### Step 5 73 | 74 | Create or choose the workload you want to apply Custom lens. Choose or edit the workload Lenses you preferred to apply > *Save*. 75 | 76 | ![圖片7](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/741b52b6-a41b-42e0-9dcc-cb6c167cb369) 77 | 78 | ![圖片8](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/ed86c679-800a-4242-a520-fc910dd6f7b4) 79 | 80 | 81 | ### Step 6 82 | 83 | Once you add the custom lens to the workload, you can choose *Continue reviewing* to get start the WA review. 84 | 85 | ![圖片9](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/3ff6b0a0-6a47-4a07-8d41-64c53e51db4a) 86 | 87 | ![圖片10](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/53a145d6-1156-4ca1-9fdc-d06bd7dc0e2d) 88 | 89 | 90 | ### Step 7 91 | 92 | After you finish all question review with customer’s workload, you can *Generate report* for your customers. 93 | 94 | ![圖片11](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/5f5c3e85-1e89-4f09-983d-ee9e7a65e82d) 95 | 96 | 97 | 98 | ## 📋 Well-Architected Report Demo 99 | 100 | After you finish the Well-Architected with the customers, you can generate a WA report as below: 101 | 102 | ![圖片12](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/c90a7bcb-14a0-4823-a817-3f9a1ffc4df2) 103 | 104 | ![圖片13](https://github.com/alegriaw/custom-lens-wa-hub/assets/10775909/4b3a83a3-cb5b-44f3-ac6a-e36908013d96) 105 | 106 | 107 | 108 | ## ✍️ Contributors 109 | 110 | - [Jill Wang](mailto:jiwanwg@amazon.com), Sr. Technical Account Manager, GCR, AWS 111 | - [Ray Wang](mailto:hsiawang@amazon.com), Solutions Architect, GCR, AWS 112 | 113 | ## 🤝 Co-Contributors 114 | 115 | - [Bruce Ross](mailto:rosbruc@amazon.com) - Lead Solutions Architect, Well-Architected Lens 116 | - [Ally Yong](mailto:allyyong@amazon.com) - Sr. Media Edge Services Specialist SA 117 | 118 | 119 | ## 🎉 Acknowledgements 120 | 121 | - Hat tip to anyone whose code was used 122 | - Inspiration 123 | - References 124 | --------------------------------------------------------------------------------