├── .github └── workflows │ └── main.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── Templates ├── AWS-HPC-Cluster.yaml ├── HPC-AD.yaml ├── HPC-Networking.yaml └── HPC-Storage.yaml ├── docs ├── CF-2VPC.FSx.md ├── CF-2VPC.md ├── CF-3VPC.FSx.md ├── CF-3VPC.md ├── EnginFrame-1Click-Arch.png ├── README.md ├── step2.png ├── step3.png ├── step4.png ├── step5.png ├── step6.png └── step7.png ├── enginframe ├── alb.session.closing.hook.sh ├── alb.session.starting.hook.sh ├── efinstall.config ├── fm.browse.ui ├── mysql │ ├── ef.mysql │ └── efdb.config └── services │ └── ef-services.Linux Desktop.2022-11-22T10-22-47.zip ├── modules ├── 04.configure.disable.anacron.compute.sh ├── 04.configure.slurm.AllOrNothing.headnode.sh ├── 07.configure.slurm.tagging.headnode.sh ├── 10.install.enginframe.headnode.sh ├── 12.configure.enginframe.alb.headnode.sh ├── 15.install.dcv.broker.headnode.sh ├── 20.install.dcv.slurm.headnode.sh ├── 25.install.dcv-server.compute.sh ├── 25.install.dcv-server.gpu.sh ├── 26.configure.dcv.alb.compute.sh ├── 27.configure.dcv.nat.compute.sh ├── 30.install.dcv-sm-agent.compute.sh ├── 40.install.monitoring.compute.sh └── 40.install.monitoring.headnode.sh ├── monitoring ├── custom-metrics │ ├── 1h-cost-metrics.sh │ ├── 1m-cost-metrics.sh │ └── aws-region.py ├── docker-compose │ ├── docker-compose.compute.gpu.yml │ ├── docker-compose.compute.yml │ └── docker-compose.headnode.yml ├── grafana │ ├── dashboards │ │ ├── ParallelCluster.json │ │ ├── compute-node-details.json │ │ ├── compute-node-list.json │ │ ├── costs.json.OLD │ │ ├── dashboards.yml │ │ ├── gpu.json │ │ ├── headnode-details.json │ │ └── logs.json │ └── datasources │ │ └── datasource.yml ├── nginx │ └── conf.d │ │ └── nginx.conf ├── prometheus-slurm-exporter │ └── slurm_exporter.service ├── prometheus │ └── prometheus.yml └── www │ ├── aws-logo.svg │ ├── background.png │ └── index.html ├── parallelcluster ├── config.ap-east-1.sample.yaml ├── config.ap-northeast-1.sample.yaml ├── config.ap-northeast-2.sample.yaml ├── config.ap-south-1.sample.yaml ├── config.ca-central-1.sample.yaml ├── config.eu-central-1.sample.yaml ├── config.eu-north-1.sample.yaml ├── config.eu-south-1.sample.yaml ├── config.eu-west-1.sample.yaml ├── config.us-east-1.sample.yaml ├── config.us-east-2.sample.yaml ├── config.us-west-1.sample.yaml └── config.us-west-2.sample.yaml └── scripts ├── Cloud9-Bootstrap.sh ├── motd ├── post.install.sh └── prolog.sh /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CloudFormation Template S3 upload 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@master 13 | - uses: jakejarvis/s3-sync-action@master 14 | with: 15 | args: --acl public-read 16 | env: 17 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 18 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 19 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 20 | AWS_REGION: ${{ secrets.AWS_REGION }} 21 | SOURCE_DIR: ${{ secrets.SOURCE_DIR }} -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. &#13;
51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 1Click-HPC 2 | This project aims to speed up the deployment of an HPC cluster on AWS. 3 | Following the instructions below, a fully functional, ready-to-use HPC cluster will be created with just 1-Click. 4 | 5 | # Get Started 6 | 7 | ## Step 1 8 | Click the link below corresponding to your preferred [AWS Region](https://aws.amazon.com/about-aws/global-infrastructure/regions_az/). 9 | You will be asked a few questions about services like VPC, FSx, etc.; if you are not sure how to answer or what these services are, just leave the default values. 10 | 1Click-HPC will take care of creating everything needed for your HPC cluster to run. 11 | 12 | | Region | Launch | 13 | |--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 14 | | US | --- | 15 | | N. Virginia (us-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 16 | | Ohio (us-east-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 17 | | N. &#13;
California (us-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 18 | | Oregon (us-west-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 19 | | Canada | --- | 20 | | Central (ca-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ca-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ca-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 21 | | EU |---| 22 | | Frankfurt (eu-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 23 | | Ireland (eu-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 24 | | Stockholm (eu-north-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-north-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-north-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 25 | | Milan (eu-south-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-south-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-south-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 26 | | APJ |---| 27 | | Tokyo (ap-northeast-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-northeast-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 28 | | Seoul (ap-northeast-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-northeast-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 29 | | Hong Kong (ap-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 30 | | Mumbai (ap-south-1) | 
[![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-south-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-south-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.yaml&stackName=hpc-cluster) | 31 | 32 | 33 | ## Step 2 34 | 1. Change the "Stack Name" as you like. 35 | 2. Enter the password for the Admin user "ec2-user". 36 | 3. Check the checkbox to acknowledge the IAM resource creation. 37 | 4. Click the "Create Stack" button. 38 | &#13;
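If you prefer the command line over the console, the same launch can be scripted with the AWS CLI. The sketch below (for us-east-1) uses the template URL from the links above; the password parameter key is an assumption, so check the template's Parameters section before running it.
```bash
# A sketch: launch the 1Click-HPC stack from the CLI (us-east-1 shown).
# CAPABILITY_IAM acknowledges the IAM resource creation, like the console checkbox;
# the AdminPassword parameter name is hypothetical, verify it in the template.
aws cloudformation create-stack \
  --region us-east-1 \
  --stack-name hpc-cluster \
  --template-url https://enginframe.s3.amazonaws.com/AWS-HPC-Cluster.yaml \
  --capabilities CAPABILITY_IAM CAPABILITY_AUTO_EXPAND \
  --parameters ParameterKey=AdminPassword,ParameterValue='YourPasswordHere'
```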
39 | 40 | ![Step2](docs/step2.png?raw=true "Step 2") 41 | 42 | ## Step 3 43 | 1. Click on the "Stack Name" to monitor the cluster creation steps. 44 | 2. Wait until all the resources are created. 45 | &#13;
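You can also watch the creation from a terminal; a small sketch of checking the stack status with the AWS CLI (stack name as chosen in Step 2):
```bash
# A sketch: block until the stack is created, then print its final status
aws cloudformation wait stack-create-complete --stack-name hpc-cluster
aws cloudformation describe-stacks --stack-name hpc-cluster \
  --query 'Stacks[0].StackStatus' --output text
```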
46 | 47 | ![Step3](docs/step3.png?raw=true "Step 3") 48 | 49 | ## Step 4 50 | 1. When the cluster creation is complete, go to the "Outputs" tab. 51 | 2. Click the "EnginFrameURL" to access your HPC Cluster using the EnginFrame portal. 52 | 3. Alternatively, click the "Cloud9URL" if you wish to connect to your Cloud9 instance and then ssh into your cluster from there. 53 | &#13;
54 | 55 | ![Step4](docs/step4.png?raw=true "Step 4") 56 | 57 | ## Step 5 58 | You can log in to EnginFrame using "ec2-user" as the username and the password you chose. 59 | ```Username: ec2-user``` 60 | &#13;
61 | ```Password: *********``` 62 |
63 | 64 | ![Step5](docs/step5.png?raw=true "Step 5") 65 | 66 | ## Step 6 67 | After you log in, you are redirected to the "list Spoolers" page. 68 | Spoolers are scratch areas located in the /fsx file system; they are managed by EnginFrame and used as the execution directories for your HPC jobs. 69 | &#13;
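If you are curious, you can also inspect the spoolers from a shell on the headnode; the path below assumes the default `nice_root` layout described in [docs/README.md](docs/README.md):
```bash
# A sketch: list the EnginFrame spooler directories on the shared FSx file system
ls -l /fsx/nice/enginframe/spoolers
```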
70 | 71 | ![Step6](docs/step6.png?raw=true "Step 6") 72 | 73 | ## Step 7 74 | We recommend immediately changing your password by using the service shown below. 75 | &#13;
76 | 77 | ![Step7](docs/step7.png?raw=true "Step 7") 78 | 79 | # Architecture 80 | ![Architecture](docs/EnginFrame-1Click-Arch.png?raw=true "Architecture") 81 | 82 | # Additional Docs 83 | 84 | https://github.com/aws-samples/1click-hpc/tree/main/docs 85 | 86 | # License 87 | 88 | This software is licensed under the MIT-0 License. See the LICENSE file. -------------------------------------------------------------------------------- /Templates/HPC-Networking.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Description: HPC-Networking 3 | 4 | Parameters: 5 | CidrBlock: 6 | AllowedPattern: '((\d{1,3})\.){3}\d{1,3}/\d{1,2}' 7 | Default: 10.3.0.0/16 8 | Description: VPC CIDR Block (eg 10.3.0.0/16) 9 | Type: String 10 | CidrPublicSubnetA: 11 | AllowedPattern: '((\d{1,3})\.){3}\d{1,3}/\d{1,2}' 12 | Default: 10.3.128.0/20 13 | Description: VPC CIDR Block for the Public Subnet A (eg 10.3.128.0/20) 14 | Type: String 15 | CidrPublicSubnetB: 16 | AllowedPattern: '((\d{1,3})\.){3}\d{1,3}/\d{1,2}' 17 | Default: 10.3.144.0/20 18 | Description: VPC CIDR Block for the Public Subnet B (eg 10.3.144.0/20) 19 | Type: String 20 | CidrPrivateSubnetA: 21 | AllowedPattern: '((\d{1,3})\.){3}\d{1,3}/\d{1,2}' 22 | Default: 10.3.0.0/18 23 | Description: VPC CIDR Block for the Private Subnet A (eg 10.3.1.0/18) 24 | Type: String 25 | CidrPrivateSubnetB: 26 | AllowedPattern: '((\d{1,3})\.){3}\d{1,3}/\d{1,2}' 27 | Default: 10.3.64.0/18 28 | Description: VPC CIDR Block for the Private Subnet B (eg 10.3.64.0/18) 29 | Type: String 30 | 31 | Mappings: 32 | RegionMap: 33 | us-east-1: 34 | ZoneId1: use1-az6 35 | ZoneId2: use1-az4 36 | us-east-2: 37 | ZoneId1: use2-az2 38 | ZoneId2: use2-az3 39 | us-west-1: 40 | ZoneId1: usw1-az1 41 | ZoneId2: usw1-az3 42 | us-west-2: 43 | ZoneId1: usw2-az1 44 | ZoneId2: usw2-az2 45 | eu-central-1: 46 | ZoneId1: euc1-az3 47 | ZoneId2: euc1-az2 48 | eu-west-1: 49 | ZoneId1: euw1-az1 50 | ZoneId2: euw1-az2 51 | eu-north-1: 52 | ZoneId1: eun1-az2 53 | ZoneId2: eun1-az1 54 | ca-central-1: 55 | ZoneId1: cac1-az2 56 | ZoneId2: cac1-az1 57 | eu-south-1: 58 | ZoneId1: eus1-az2 59 | ZoneId2: eus1-az1 60 | ap-east-1: 61 | ZoneId1: ape1-az3 62 | ZoneId2: ape1-az2 63 | ap-northeast-1: 64 | ZoneId1: apne1-az4 65 | ZoneId2: apne1-az1 66 | ap-northeast-2: 67 | ZoneId1: apne2-az1 68 | ZoneId2: apne2-az3 69 | ap-south-1: 70 | ZoneId1: aps1-az2 71 | ZoneId2: aps1-az3 72 | 73 | Resources: 74 | 75 | VPC: 76 | Type: AWS::EC2::VPC 77 | Properties: 78 | CidrBlock: !Ref CidrBlock 79 | EnableDnsHostnames: true 80 | EnableDnsSupport: true 81 | Tags: 82 | - Key: "Name" 83 | Value: !Sub '${AWS::StackName}-HPC-VPC' 84 | 85 | PublicSubnetA: 86 | Type: AWS::EC2::Subnet 87 | Properties: 88 | VpcId: !Ref VPC 89 | CidrBlock: !Ref CidrPublicSubnetA 90 | AvailabilityZone: !GetAtt AvailabiltyZone1.ZoneName 91 | MapPublicIpOnLaunch: true 92 | Tags: 93 | - Key: Name 94 | Value: !Sub '${AWS::StackName}-Public-SubnetA' 95 | 96 | PublicSubnetB: 97 | Type: AWS::EC2::Subnet 98 | Properties: 99 | VpcId: !Ref VPC 100 | CidrBlock: !Ref CidrPublicSubnetB 101 | AvailabilityZone: !GetAtt AvailabiltyZone2.ZoneName 102 | MapPublicIpOnLaunch: true 103 | Tags: 104 | - Key: Name 105 | Value: !Sub '${AWS::StackName}-Public-SubnetB' 106 | 107 | InternetGateway: 108 | Type: AWS::EC2::InternetGateway 109 | 110 | AttachGateway: 111 | Type: AWS::EC2::VPCGatewayAttachment 112 | Properties: 113 | VpcId: !Ref VPC 114 | InternetGatewayId: !Ref InternetGateway 115 | 116 | 
PublicRouteTable: 117 | Type: AWS::EC2::RouteTable 118 | Properties: 119 | VpcId: !Ref VPC 120 | Tags: 121 | - Key: Name 122 | Value: !Sub '${AWS::StackName}-Public-Route' 123 | PublicRoute1: 124 | Type: AWS::EC2::Route 125 | Properties: 126 | RouteTableId: !Ref PublicRouteTable 127 | DestinationCidrBlock: 0.0.0.0/0 128 | GatewayId: !Ref InternetGateway 129 | 130 | PublicSubnetARouteTableAssociation: 131 | Type: AWS::EC2::SubnetRouteTableAssociation 132 | Properties: 133 | SubnetId: !Ref PublicSubnetA 134 | RouteTableId: !Ref PublicRouteTable 135 | 136 | PublicSubnetBRouteTableAssociation: 137 | Type: AWS::EC2::SubnetRouteTableAssociation 138 | Properties: 139 | SubnetId: !Ref PublicSubnetB 140 | RouteTableId: !Ref PublicRouteTable 141 | 142 | PrivateSubnetA: 143 | Type: AWS::EC2::Subnet 144 | Properties: 145 | VpcId: !Ref VPC 146 | AvailabilityZone: !GetAtt AvailabiltyZone1.ZoneName 147 | CidrBlock: !Ref CidrPrivateSubnetA 148 | MapPublicIpOnLaunch: false 149 | Tags: 150 | - Key: Name 151 | Value: !Sub '${AWS::StackName}-Private-SubnetA' 152 | 153 | PrivateSubnetB: 154 | Type: AWS::EC2::Subnet 155 | Properties: 156 | VpcId: !Ref VPC 157 | AvailabilityZone: !GetAtt AvailabiltyZone2.ZoneName 158 | CidrBlock: !Ref CidrPrivateSubnetB 159 | MapPublicIpOnLaunch: false 160 | Tags: 161 | - Key: Name 162 | Value: !Sub '${AWS::StackName}-Private-SubnetB' 163 | 164 | NatGatewayAEIP: 165 | Type: AWS::EC2::EIP 166 | DependsOn: AttachGateway 167 | Properties: 168 | Domain: vpc 169 | 170 | NatGatewayBEIP: 171 | Type: AWS::EC2::EIP 172 | DependsOn: AttachGateway 173 | Properties: 174 | Domain: vpc 175 | 176 | NatGatewayA: 177 | Type: AWS::EC2::NatGateway 178 | Properties: 179 | AllocationId: !GetAtt NatGatewayAEIP.AllocationId 180 | SubnetId: !Ref PublicSubnetA 181 | 182 | NatGatewayB: 183 | Type: AWS::EC2::NatGateway 184 | Properties: 185 | AllocationId: !GetAtt NatGatewayBEIP.AllocationId 186 | SubnetId: !Ref PublicSubnetB 187 | 188 | PrivateRouteTableA: 189 | Type: AWS::EC2::RouteTable 190 | Properties: 191 | VpcId: !Ref VPC 192 | Tags: 193 | - Key: Name 194 | Value: !Sub '${AWS::StackName}-Private-Route-A' 195 | 196 | PrivateRouteTableB: 197 | Type: AWS::EC2::RouteTable 198 | Properties: 199 | VpcId: !Ref VPC 200 | Tags: 201 | - Key: Name 202 | Value: !Sub '${AWS::StackName}-Private-Route-B' 203 | 204 | DefaultPrivateRouteA: 205 | Type: AWS::EC2::Route 206 | Properties: 207 | RouteTableId: !Ref PrivateRouteTableA 208 | DestinationCidrBlock: 0.0.0.0/0 209 | NatGatewayId: !Ref NatGatewayA 210 | 211 | DefaultPrivateRouteB: 212 | Type: AWS::EC2::Route 213 | Properties: 214 | RouteTableId: !Ref PrivateRouteTableB 215 | DestinationCidrBlock: 0.0.0.0/0 216 | NatGatewayId: !Ref NatGatewayB 217 | 218 | PrivateSubnetARouteTableAssociation: 219 | Type: AWS::EC2::SubnetRouteTableAssociation 220 | Properties: 221 | RouteTableId: !Ref PrivateRouteTableA 222 | SubnetId: !Ref PrivateSubnetA 223 | 224 | PrivateSubnetBRouteTableAssociation: 225 | Type: AWS::EC2::SubnetRouteTableAssociation 226 | Properties: 227 | RouteTableId: !Ref PrivateRouteTableB 228 | SubnetId: !Ref PrivateSubnetB 229 | 230 | AvailabiltyZone1: 231 | Type: Custom::AvailabiltyZone 232 | DependsOn: LogGroupGetAZLambdaFunction 233 | Properties: 234 | ServiceToken: !GetAtt GetAZLambdaFunction.Arn 235 | ZoneId: !FindInMap [RegionMap, !Ref "AWS::Region", ZoneId1] 236 | 237 | AvailabiltyZone2: 238 | Type: Custom::AvailabiltyZone 239 | DependsOn: LogGroupGetAZLambdaFunction 240 | Properties: 241 | ServiceToken: !GetAtt GetAZLambdaFunction.Arn 242 | 
ZoneId: !FindInMap [RegionMap, !Ref "AWS::Region", ZoneId2] 243 | 244 | LogGroupGetAZLambdaFunction: 245 | Type: AWS::Logs::LogGroup 246 | DeletionPolicy: Delete 247 | Properties: 248 | LogGroupName: !Sub /aws/lambda/${GetAZLambdaFunction} 249 | RetentionInDays: 7 250 | 251 | GetAZLambdaFunction: 252 | Type: AWS::Lambda::Function 253 | Properties: 254 | Description: GetAZLambdaFunction 255 | Timeout: 60 256 | Runtime: python3.7 257 | Handler: index.handler 258 | Role: !GetAtt GetAZLambdaRole.Arn 259 | Code: 260 | ZipFile: | 261 | import cfnresponse 262 | from json import dumps 263 | from boto3 import client 264 | EC2 = client('ec2') 265 | def handler(event, context): 266 | if event['RequestType'] in ('Create', 'Update'): 267 | print(dumps(event, default=str)) 268 | data = {} 269 | try: 270 | response = EC2.describe_availability_zones( 271 | Filters=[{'Name': 'zone-id', 'Values': [event['ResourceProperties']['ZoneId']]}] 272 | ) 273 | print(dumps(response, default=str)) 274 | data['ZoneName'] = response['AvailabilityZones'][0]['ZoneName'] 275 | except Exception as error: 276 | cfnresponse.send(event, context, cfnresponse.FAILED, {}, reason=error) 277 | finally: 278 | cfnresponse.send(event, context, cfnresponse.SUCCESS, data) 279 | else: 280 | cfnresponse.send(event, context, cfnresponse.SUCCESS, {}) 281 | Tags: 282 | - Key: Name 283 | Value: !Sub ${AWS::StackName}-GetAZLambdaFunction 284 | 285 | GetAZLambdaRole: 286 | Type: AWS::IAM::Role 287 | Properties: 288 | Path: / 289 | Description: GetAZLambdaFunction 290 | AssumeRolePolicyDocument: 291 | Version: '2012-10-17' 292 | Statement: 293 | - Effect: Allow 294 | Action: 295 | - sts:AssumeRole 296 | Principal: 297 | Service: 298 | - !Sub 'lambda.${AWS::URLSuffix}' 299 | ManagedPolicyArns: 300 | - !Sub 'arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole' 301 | Policies: 302 | - PolicyName: GetAZLambdaFunction 303 | PolicyDocument: 304 | Version: '2012-10-17' 305 | Statement: 306 | - Sid: ec2 307 | Effect: Allow 308 | Action: 309 | - ec2:DescribeAvailabilityZones 310 | Resource: 311 | - '*' 312 | Tags: 313 | - Key: Name 314 | Value: !Sub ${AWS::StackName}-GetAZLambdaFunction 315 | 316 | S3Endpoint: 317 | Type: 'AWS::EC2::VPCEndpoint' 318 | Properties: 319 | VpcEndpointType: 'Gateway' 320 | ServiceName: !Sub 'com.amazonaws.${AWS::Region}.s3' 321 | RouteTableIds: 322 | - !Ref PublicRouteTable 323 | - !Ref PrivateRouteTableA 324 | - !Ref PrivateRouteTableB 325 | VpcId: !Ref VPC 326 | 327 | localSG: 328 | Type: AWS::EC2::SecurityGroup 329 | Properties: 330 | GroupDescription: Allow all traffic from resources in VPC 331 | VpcId: 332 | Ref: VPC 333 | SecurityGroupIngress: 334 | - IpProtocol: -1 335 | CidrIp: !Ref CidrBlock 336 | SecurityGroupEgress: 337 | - IpProtocol: -1 338 | CidrIp: !Ref CidrBlock 339 | 340 | Outputs: 341 | VPC: 342 | Description: The ID of the VPC 343 | Value: !Ref VPC 344 | Export: 345 | Name: !Sub "${AWS::StackName}-VPC" 346 | PrivateSubnetA: 347 | Description: The ID of the PrivateSubnetA 348 | Value: !Ref PrivateSubnetA 349 | Export: 350 | Name: !Sub "${AWS::StackName}-PrivateSubnetA" 351 | PrivateSubnetB: 352 | Description: The ID of the PrivateSubnetB 353 | Value: !Ref PrivateSubnetB 354 | Export: 355 | Name: !Sub "${AWS::StackName}-PrivateSubnetB" 356 | PublicSubnetA: 357 | Description: The ID of the PublicSubnetA 358 | Value: !Ref PublicSubnetA 359 | Export: 360 | Name: !Sub "${AWS::StackName}-PublicSubnetA" 361 | PublicSubnetB: 362 | Description: The ID of the PublicSubnetB 363 | Value: !Ref 
PublicSubnetB 364 | Export: 365 | Name: !Sub "${AWS::StackName}-PublicSubnetB" 366 | localSG: 367 | Description: The ID of the localSG 368 | Value: !Ref localSG 369 | Export: 370 | Name: !Sub "${AWS::StackName}-localSG" -------------------------------------------------------------------------------- /Templates/HPC-Storage.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Description: HPC-FSx-Lustre 3 | 4 | Parameters: 5 | PrivateSubnet: 6 | Description: The ID of your private subnet. 7 | Type: String 8 | AllowedPattern: ^(subnet-[0-9a-z]+)$ 9 | SecurityGroup: 10 | Description: The ID of the SecurityGroup you want to attach. 11 | Type: String 12 | AllowedPattern: ^(sg-[0-9a-z]+)$ 13 | 14 | Resources: 15 | 16 | FSx: 17 | DeletionPolicy: Retain 18 | Type: AWS::FSx::FileSystem 19 | Properties: 20 | FileSystemType: LUSTRE 21 | StorageCapacity: 1200 22 | StorageType: SSD 23 | SubnetIds: 24 | - !Ref PrivateSubnet 25 | SecurityGroupIds: 26 | - !Ref SecurityGroup 27 | LustreConfiguration: 28 | WeeklyMaintenanceStartTime: "4:00:00" 29 | DeploymentType: PERSISTENT_2 30 | PerUnitStorageThroughput: 1000 31 | DataCompressionType: LZ4 32 | FileSystemTypeVersion: "2.12" -------------------------------------------------------------------------------- /docs/CF-2VPC.FSx.md: -------------------------------------------------------------------------------- 1 | # 1Click-HPC with existing FSx and VPC (Public only) 2 | This CloudFormation Template allows you to deploy 1Click-HPC using your existing FSx for Lustre FS, within an existing VPC with 2 Public subnets (No private subnet). 3 |
4 | Click the link below corresponding to your preferred [AWS Region](https://aws.amazon.com/about-aws/global-infrastructure/regions_az/) . 5 | 6 | | Region | Launch | 7 | |--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 8 | | US | --- | 9 | | N. Virginia (us-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 10 | | Ohio (us-east-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 11 | | N. California (us-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 12 | | Oregon (us-west-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 13 | | Canada | --- | 14 | | Central (ca-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ca-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ca-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 15 | | EU |---| 16 | | Frankfurt (eu-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 17 | | Ireland (eu-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 18 | | Stockholm (eu-north-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-north-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-north-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 19 | | APJ |---| 20 | | Tokyo (ap-northeast-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-northeast-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 21 | | Hong Kong 
(ap-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 22 | | Mumbai (ap-south-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-south-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-south-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.FSx.yaml&stackName=hpc-cluster) | 23 | -------------------------------------------------------------------------------- /docs/CF-2VPC.md: -------------------------------------------------------------------------------- 1 | # 1Click-HPC within existing VPC (Public only) 2 | This CloudFormation Template allows you to deploy 1Click-HPC within an existing VPC with 2 Public subnets (No private subnet). 3 |
4 | Click the link below corresponding to your preferred [AWS Region](https://aws.amazon.com/about-aws/global-infrastructure/regions_az/) . 5 | 6 | | Region | Launch | 7 | |--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 8 | | US | --- | 9 | | N. Virginia (us-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 10 | | Ohio (us-east-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 11 | | N. California (us-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 12 | | Oregon (us-west-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 13 | | Canada | --- | 14 | | Central (ca-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ca-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ca-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 15 | | EU |---| 16 | | Frankfurt (eu-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 17 | | Ireland (eu-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 18 | | Stockholm (eu-north-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-north-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-north-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 19 | | APJ |---| 20 | | Tokyo (ap-northeast-1) | 
[![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-northeast-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 21 | | Hong Kong (ap-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 22 | | Mumbai (ap-south-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-south-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-south-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PublicVPC.yaml&stackName=hpc-cluster) | 23 | -------------------------------------------------------------------------------- /docs/CF-3VPC.FSx.md: -------------------------------------------------------------------------------- 1 | # 1Click-HPC with existing FSx and VPC (Public & Private) 2 | This CloudFormation Template allows you to deploy 1Click-HPC using your existing FSx for Lustre FS, within an existing VPC with 2 Public and 1 Private subnets. 3 |
4 | Please note that the cluster is deployed into the Private subnet. The Public subnets are used to host Cloud9 and the Application Load Balancer. 5 | &#13;
6 | Click the link below corresponding to your preferred [AWS Region](https://aws.amazon.com/about-aws/global-infrastructure/regions_az/) . 7 | 8 | | Region | Launch | 9 | |--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 10 | | US | --- | 11 | | N. Virginia (us-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 12 | | Ohio (us-east-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 13 | | N. California (us-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 14 | | Oregon (us-west-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 15 | | Canada | --- | 16 | | Central (ca-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ca-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ca-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 17 | | EU |---| 18 | | Frankfurt (eu-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 19 | | Ireland (eu-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 20 | | Stockholm (eu-north-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-north-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-north-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 21 | | APJ |---| 22 | | Tokyo (ap-northeast-1) | 
[![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-northeast-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 23 | | Hong Kong (ap-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 24 | | Mumbai (ap-south-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-south-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-south-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.FSx.yaml&stackName=hpc-cluster) | 25 | -------------------------------------------------------------------------------- /docs/CF-3VPC.md: -------------------------------------------------------------------------------- 1 | # 1Click-HPC within existing VPC (Public & Private) 2 | This CloudFormation Template allows you to deploy 1Click-HPC within an existing VPC with 2 Public and 1 Private subnets. 3 |
4 | Please note that the cluster is deployed into the Private subnet. The Public subnets are used to host Cloud9 and the Application Load Balancer. 5 | &#13;
6 | Click the link below corresponding to your preferred [AWS Region](https://aws.amazon.com/about-aws/global-infrastructure/regions_az/) . 7 | 8 | | Region | Launch | 9 | |--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 10 | | US | --- | 11 | | N. Virginia (us-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 12 | | Ohio (us-east-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-east-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 13 | | N. California (us-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 14 | | Oregon (us-west-2) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/us-west-2.svg)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 15 | | Canada | --- | 16 | | Central (ca-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ca-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ca-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 17 | | EU |---| 18 | | Frankfurt (eu-central-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-central-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-central-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 19 | | Ireland (eu-west-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-west-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 20 | | Stockholm (eu-north-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/eu-north-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=eu-north-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 21 | | APJ |---| 22 | | Tokyo (ap-northeast-1) | 
[![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-northeast-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 23 | | Hong Kong (ap-east-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-east-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-east-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 24 | | Mumbai (ap-south-1) | [![Launch](https://samdengler.github.io/cloudformation-launch-stack-button-svg/images/ap-south-1.svg)](https://console.aws.amazon.com/cloudformation/home?region=ap-south-1#/stacks/quickcreate?templateUrl=https%3A%2F%2Fenginframe.s3.amazonaws.com%2FAWS-HPC-Cluster.PrivateVPC.yaml&stackName=hpc-cluster) | 25 | -------------------------------------------------------------------------------- /docs/EnginFrame-1Click-Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/docs/EnginFrame-1Click-Arch.png -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # QuickStart 2 | In case you do not want to use our 1Click-HPC CloudFormation template, but you still want to build your cluster with all the components and modules available in this repository, you can follow the instructions below to configure your ParallelCluster configuration file. 3 | You can create a new cluster using your existing configuration file and just add the following parameters; everything will be installed and configured automatically.&#13;
4 | If this is your first approach to AWS ParallelCluster, either go back to the section above or follow all the steps of our [Workshop](https://www.hpcworkshops.com/03-hpc-aws-parallelcluster-workshop.html) and include the following configuration: 5 | ```ini 6 | [cluster yourcluster] 7 | ... 8 | post_install = https://raw.githubusercontent.com/aws-samples/1click-hpc/main/scripts/post.install.sh 9 | post_install_args = "05.install.ldap.server.headnode.sh 06.install.ldap.client.compute.sh 06.install.ldap.client.headnode.sh 10.install.enginframe.headnode.sh 11.install.ldap.enginframe.headnode.sh 20.install.dcv.slurm.headnode.sh 25.install.dcv-server.compute.sh 35.install.dcv.slurm.compute.sh" 10 | extra_json = {"post_install":{"enginframe":{"ef_admin_pass":"Put_Your_Password_HERE"}}} 11 | tags = {"EnginFrame" : "true"} 12 | ... 13 | ``` 14 |
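With those lines in place, creating the cluster is the usual ParallelCluster 2.x command; a minimal sketch, with an illustrative cluster name and the default config location:
```bash
# A sketch: create the cluster defined by the [cluster yourcluster] section above
pcluster create yourcluster --config ~/.parallelcluster/config
```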
Note: You need to specify a custom Security Group (one that allows inbound connections on port 8443), defined via the `additional_sg` parameter in the `[VPC]` section of your AWS ParallelCluster config file.&#13;
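A minimal sketch of creating such a security group with the AWS CLI (the VPC ID is a placeholder, and you may want a narrower source CIDR than 0.0.0.0/0):
```bash
# A sketch: security group allowing inbound 8443 for the EnginFrame portal
SG_ID=$(aws ec2 create-security-group \
  --group-name enginframe-8443 \
  --description "Inbound 8443 for EnginFrame" \
  --vpc-id vpc-0123456789abcdef0 \
  --query 'GroupId' --output text)
aws ec2 authorize-security-group-ingress \
  --group-id "$SG_ID" --protocol tcp --port 8443 --cidr 0.0.0.0/0
```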
15 | 16 | # (Optional) QuickStart parameters customization 17 | In addition to the Quickstart deployment, there are a few parameters that you can optionally define to customize the components installed.
18 | These parameters are defined as part of the `extra_json` [parameter](https://docs.aws.amazon.com/parallelcluster/latest/ug/cluster-definition.html#extra-json) in the [cluster section](https://docs.aws.amazon.com/parallelcluster/latest/ug/cluster-definition.html) of the AWS ParallelCluster configuration file. 19 | If the `extra_json` is not specified, all the components will be installed using the default values.
20 | See the example below: 21 | ```json 22 | { 23 | "post_install": { 24 | "enginframe": { 25 | "nice_root": "/fsx/nice", 26 | "ef_admin": "ec2-user", 27 | "ef_conf_root": "/fsx/nice/enginframe/conf", 28 | "ef_data_root": "/fsx/nice/enginframe/data", 29 | "ef_spooler": "/fsx/nice/enginframe/spoolers", 30 | "ef_repository": "/fsx/nice/enginframe/repository", 31 | "ef_admin_pass": "Change_this!" 32 | }, 33 | "dcvsm": { 34 | "agent_broker_port": 8445, 35 | "broker_ca": "/home/ec2-user/dcvsmbroker_ca.pem", 36 | "client_broker_port": 8446 37 | }, 38 | "dcv": { 39 | "dcv_queue_keyword": "dcv" 40 | } 41 | } 42 | } 43 | ``` 44 | * `nice_root`, by default `${SHARED_FS_DIR}/nice`, is the base directory where EnginFrame is installed. 45 | * `ef_admin`, by default `ec2-user`, is the EnginFrame user with administrative rights. 46 | * `ef_conf_root`, by default `${NICE_ROOT}/enginframe/conf`, is the path of the EnginFrame configuration directory. 47 | * `ef_data_root`, by default `${NICE_ROOT}/enginframe/data`, is the path of the EnginFrame data directory. 48 | * `ef_spooler`, by default `${NICE_ROOT}/enginframe/spoolers`, is the path of the EnginFrame Spoolers. Please consider that the Spoolers are the locations where your jobs are executed. 49 | * `ef_repository`, by default `${NICE_ROOT}/enginframe/repository`, is the EnginFrame repository directory path. 50 | * `ef_admin_pass`, by default `Change_this!`, is the EnginFrame admin password. Use this username and password for your first login to EnginFrame. 51 | * `agent_broker_port`, by default `8445`, is the DCV Session Manager Broker port. 52 | * `broker_ca`, by default `/home/ec2-user/dcvsmbroker_ca.pem`, is the location of the DCV Session Manager Broker certificate. 53 | * `client_broker_port`, by default `8446`, is the DCV Session Manager Broker port used by the client. 54 | * `dcv_queue_keyword`, by default `dcv`, is a keyword that identifies the queues of your cluster where you want to enable DCV. 55 | 56 | **Note:** Because `extra_json` is a parameter in a `.ini` file, you need to put your custom JSON on a single line. 57 | You can use the following command to convert your JSON into a one-line JSON: 58 | ```bash 59 | tr -d '\n' < your_extra.json 60 | ``` 61 | See an example output below. 62 | ```json 63 | { "post_install": { "enginframe": { "nice_root": "/fsx/nice", "ef_admin": "ec2-user", "ef_conf_root": "/fsx/nice/enginframe/conf", "ef_data_root": "/fsx/nice/enginframe/data", "ef_spooler": "/fsx/nice/enginframe/spoolers", "ef_repository": "/fsx/nice/enginframe/repository", "ef_admin_pass": "Change_this!" }, "dcvsm": { "agent_broker_port": 8445, "broker_ca": "/home/ec2-user/dcvsmbroker_ca.pem", "client_broker_port": 8446 }, "dcv": { "dcv_queue_keyword": "dcv" }}} 64 | ``` 65 | 66 | # (Optional) Launch script customization 67 | An additional way to further customize the installation and configuration of your components is to download the scripts locally, modify them, and put them back onto S3.&#13;
68 | ```bash 69 | export S3_BUCKET= 70 | 71 | aws s3 cp --quiet --recursive 1click-hpc/scripts/ s3://$S3_BUCKET/scripts/ 72 | aws s3 cp --quiet --recursive 1click-hpc/packages/ s3://$S3_BUCKET/packages/ 73 | aws s3 cp --quiet --recursive 1click-hpc/parallelcluster/ s3://$S3_BUCKET/parallelcluster/ 74 | aws s3 cp --quiet --recursive 1click-hpc/enginframe/ s3://$S3_BUCKET/enginframe/ 75 | ``` 76 | 77 | In this case, your AWS ParallelCluster configuration file has the following parameters: 78 | ```ini 79 | post_install = s3:///scripts/post.install.sh 80 | post_install_args = "01.install.enginframe.headnode.sh 03.install.dcv.slurm.headnode.sh 04.install.dcv-server.compute.sh 06.install.dcv.slurm.compute.sh" 81 | ``` 82 | 83 | The first one, `post_install`, specifies the S3 location of your post_install bash script. 84 | This is the main script that will run all the secondary scripts for installing EnginFrame, DCV Session Manager, DCV Server, and other components.&#13;
85 | The second parameter, `post_install_args`, lists the scripts launched to install the selected components.&#13;
86 | The EnginFrame and DCV Session Manager Broker scripts, like all the other secondary scripts, are built independently, so you can potentially install just one of them.&#13;
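For example, if you only wanted EnginFrame, a trimmed setup could look like the sketch below. The bucket name is a placeholder and the script name comes from this repository's `modules` directory; appending like this assumes your `[cluster]` section is the last block in the file, otherwise edit the section in place.
```bash
# A sketch: point ParallelCluster at your copies of the scripts and install only EnginFrame
cat >> ~/.parallelcluster/config <<'EOF'
post_install = s3://my-bucket/scripts/post.install.sh
post_install_args = "10.install.enginframe.headnode.sh"
EOF
```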
87 |
88 | 89 |
Note: This procedure has been tested with EnginFrame version 2020.0 and DCV Session Manager Broker version 2020.2. With minor modifications, though, it can work with previous versions; just remember to add the license management.&#13;
90 |

## Requirements&#13;

91 | To perform a successful installation of EnginFrame and DCV Session Manager Broker, you'll need:&#13;
92 |
  • An S3 bucket, made accessible to ParallelCluster via its s3_read_resource or s3_read_write_resource [cluster] settings. Refer to ParallelCluster configuration for details. 93 |
  • An EnginFrame efinstall.config file, containing the desired settings for EnginFrame installation. This enables the post-install script to install EnginFrame in unattended mode. An example efinstall.config is provided with this post's code: you can review and modify it according to your preferences.&#13;
    Alternatively, you can generate your own by performing an EnginFrame installation; in this case, an efinstall.config containing all your choices will be generated in the folder where you ran the installation. 94 | &#13;
  • A security group allowing inbound traffic on the EnginFrame port. By default ParallelCluster creates a new security group with just port 22 publicly opened, so you can either use a replacement (via the ParallelCluster vpc_security_group_id setting) or add an additional security group (additional_sg setting). In this post I'll specify an additional security group. 95 | &#13;
  • A ParallelCluster configuration including post_install and post_install_args, as mentioned above and described later in more detail. 96 | &#13;
  • (optionally) EnginFrame and DCV Session Manager packages, available online from https://download.enginframe.com. Having them in the bucket avoids the need for outgoing internet access for your ParallelCluster headnode to download them. In this article I'll instead have them copied into my target S3 bucket. My scripts will copy them from S3 to the headnode (see the staging sketch below). 97 | &#13;
Note: neither EnginFrame 2020 nor DCV Session Manager Broker needs a license when running on EC2 instances. For more details please refer to their documentation.&#13;
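As an illustration of that optional packages point, staging the installers could look like this sketch (the file names and bucket are placeholders; use the versions you actually downloaded and the bucket referenced by your s3_read_resource setting):
```bash
# A sketch: copy the EnginFrame and DCV Session Manager Broker installers into the bucket
aws s3 cp enginframe-2020.0-r58.jar s3://my-bucket/packages/
aws s3 cp nice-dcv-session-manager-broker-2020.2.el7.noarch.rpm s3://my-bucket/packages/
```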
98 | 99 |

## Troubleshooting&#13;

100 | A detailed output log is available on the headnode, in:&#13;
101 |
  • /var/log/cfn-init.log 102 |
  • /var/log/cfn-init-cmd.log 103 |
You can reach it via ssh, after getting the headnode IP address from AWS Console → EC2 → Instances and looking for an instance named HeadNode.&#13;
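A sketch of doing that from a shell (the Name tag value comes from the instance naming above; if your headnode sits in a private subnet, run this from Cloud9 or a bastion and query PrivateIpAddress instead):
```bash
# A sketch: look up the headnode IP by its Name tag, then tail the post-install log
HEADNODE_IP=$(aws ec2 describe-instances \
  --filters "Name=tag:Name,Values=HeadNode" "Name=instance-state-name,Values=running" \
  --query 'Reservations[0].Instances[0].PublicIpAddress' --output text)
ssh ec2-user@"$HEADNODE_IP" sudo tail -n 100 /var/log/cfn-init.log
```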
104 | 105 | ## Security 106 | 107 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 108 | 109 | ## License 110 | 111 | This library is licensed under the MIT-0 License. See the LICENSE file. 112 | 113 | -------------------------------------------------------------------------------- /docs/step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/docs/step2.png -------------------------------------------------------------------------------- /docs/step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/docs/step3.png -------------------------------------------------------------------------------- /docs/step4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/docs/step4.png -------------------------------------------------------------------------------- /docs/step5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/docs/step5.png -------------------------------------------------------------------------------- /docs/step6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/docs/step6.png -------------------------------------------------------------------------------- /docs/step7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/docs/step7.png -------------------------------------------------------------------------------- /enginframe/alb.session.closing.hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 1999-2021 by Nice, srl., 4 | # Via Milliavacca, 9 5 | # 14100 Asti - ITALY 6 | # All rights reserved. 7 | # 8 | # This software is the confidential and proprietary information 9 | # of Nice, srl. ("Confidential Information"). 10 | # You shall not disclose such Confidential Information 11 | # and shall use it only in accordance with the terms of 12 | # the license agreement you entered into with Nice. 13 | 14 | # This script configures an AWS Application Load Balancer (ALB) to disable a connection to a host 15 | # where an Interactive Session was running. 16 | # This script is meant to be used with DCV 2017 (and later) interactive sessions only. 17 | 18 | # This script deletes the Target Group containing the instance where the Session was running 19 | # and deletes the previously created Listener Rule. 20 | 21 | # The Listener Rule associates the input URL path with the Target Group. This path 22 | # must be the web url path of the DCV server running on the execution node. 23 | # Since it is not possible to do URL path translation with ALB, every DCV server must have a unique 24 | # web url path configured. It is suggested to use the hostname of the node as the web url path 25 | # for the DCV server running on that node. &#13;
26 | 27 | # The maximum number of Listener Rules per ALB is 100, hence a single ALB can handle at most 28 | # 100 Interactive Sessions running concurrently. To increase this limit, consider adding more ALBs 29 | # to the infrastructure. 30 | 31 | # Prerequisites for: 32 | # EnginFrame node: 33 | # - AWS Command Line Interface (CLI) must be installed 34 | # - Since this script is going to be executed by the user running the EnginFrame Server, i.e. the Apache Tomcat user, 35 | # an AWS CLI profile must be configured for that user, having the permissions to list instances and to manage load balancers. 36 | # (see https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) 37 | # Alternatively, if EnginFrame is installed on an EC2 instance, configure the correct AWS role for this instance. 38 | # 39 | # AWS account: 40 | # - AWS Application Load Balancer (ALB) and an HTTPS listener with a Default Target Group must be already configured and running. 41 | # 42 | # DCV server node: 43 | # - configure each DCV server node with a unique web url path (see dcv.conf) 44 | 45 | # Configuration parameters: 46 | 47 | # ALB public DNS name 48 | ALB_PUBLIC_DNS_NAME= 49 | # ALB port 50 | ALB_PORT=443 51 | # AWS default region 52 | export AWS_DEFAULT_REGION= 53 | 54 | _die() { 55 | echo "ERROR: $@" 56 | exit 1 57 | } 58 | 59 | _help() { 60 | _cmd=$(basename "$0") 61 | echo "${_cmd}" 62 | echo "Usage:" 63 | echo " ${_cmd} \"<session-id>\" \"<alb-host>\" \"<alb-port>\" \"<target-host>\" \"<target-port>\" \"<target-web-url-path>\"" 64 | echo " ${_cmd} \"tmp3569402005256372176\" \"alb-enginframe-xxx.eu-west-1.elb.amazonaws.com\" 443 \"10.0.0.10\" 8443 \"/dcv-server1\"" 65 | } 66 | 67 | # Input parameters: 68 | # - $1 session-id 69 | # - $2 alb-host (alb public dnsname) 70 | # - $3 alb-port 71 | # - $4 target-host (private dnsname) 72 | # - $5 target-port 73 | # - $6 target-web-url-path (it must start with the "/" character) 74 | main() { 75 | # parse input parameters 76 | if [[ $# -lt 3 ]] ; then 77 | _help 78 | exit 0 79 | fi 80 | local -- _session_id=$1 81 | local -- _alb_host=$2 82 | local -- _alb_port=$3 83 | 84 | [ -z "${_session_id}" ] && _die "Missing input Session Id parameter." 85 | [ -z "${_alb_host}" ] && _die "Missing input ALB Host parameter." 86 | [ -z "${_alb_port}" ] && _die "Missing input ALB Port parameter." 87 | 88 | # check if the AWS CLI is in the path 89 | aws help >/dev/null || _die "AWS CLI is not installed." 90 | 91 | # get ALB Amazon Resource Name (ARN) by dns-name 92 | local -- _alb_arn=$(aws elbv2 describe-load-balancers --query "LoadBalancers[? DNSName == '${_alb_host}'].LoadBalancerArn" --output text) 93 | [ -n "${_alb_arn}" ] || _die "Unable to get ALB identifier for the ALB (${_alb_host})." 94 | 95 | # get Listener arn 96 | local -- _listener_arn=$(aws elbv2 describe-listeners --load-balancer-arn "${_alb_arn}" \ 97 | --query 'Listeners[? Port == `'${_alb_port}'`].ListenerArn' --output text) 98 | [ -n "${_listener_arn}" ] || _die "Listener for port (${_alb_port}) does not exist in the ALB (${_alb_host})." 99 | 100 | # get Target Group arn 101 | local -- _target_group_name=$(printf "%s" "${_session_id}" | tr -c 'a-zA-Z0-9' -) 102 | local -- _target_group_arn=$(aws elbv2 describe-target-groups --load-balancer-arn "${_alb_arn}" \ 103 | --query "TargetGroups[? 
TargetGroupName == '${_target_group_name}'].TargetGroupArn" --output text) 104 | [ -n "${_target_group_arn}" ] || _die "Unable to get Target Group (${_target_group_name})" 105 | 106 | # get Rule arn 107 | local -- _rule_arn=$(aws elbv2 describe-rules --listener-arn "${_listener_arn}" \ 108 | --query "Rules[? Actions[? TargetGroupArn == '${_target_group_arn}']].RuleArn" --output text) 109 | [ -n "${_rule_arn}" ] || _die "Unable to get Rule for Target Group (${_target_group_arn}) in the Listener (${_listener_arn})." 110 | 111 | # delete Rule 112 | aws elbv2 delete-rule --rule-arn "${_rule_arn}" >/dev/null 113 | [ $? -eq 0 ] || _die "Unable to delete Listener Rule (${_rule_arn})." 114 | 115 | # delete Target Group 116 | aws elbv2 delete-target-group --target-group-arn "${_target_group_arn}" >/dev/null 117 | [ $? -eq 0 ] || _die "Unable to delete Target Group (${_target_group_arn})." 118 | } 119 | 120 | # Check it's a DCV 2017 interactive session. 121 | if [ "${INTERACTIVE_SESSION_REMOTE}" = "dcv2" ]; then 122 | main "${INTERACTIVE_SESSION_REMOTE_SESSION_ID}" "${ALB_PUBLIC_DNS_NAME}" "${ALB_PORT}" 123 | fi 124 | 125 | # ex:ts=4:sw=4:et:ft=sh: -------------------------------------------------------------------------------- /enginframe/alb.session.starting.hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 1999-2021 by Nice, srl., 4 | # Via Milliavacca, 9 5 | # 14100 Asti - ITALY 6 | # All rights reserved. 7 | # 8 | # This software is the confidential and proprietary information 9 | # of Nice, srl. ("Confidential Information"). 10 | # You shall not disclose such Confidential Information 11 | # and shall use it only in accordance with the terms of 12 | # the license agreement you entered into with Nice. 13 | 14 | # This script configures an AWS Application Load Balancer (ALB) to enable a connection to a host 15 | # where an Interactive Session is running. 16 | # This script is meant to be used with DCV 2017 (and later) interactive sessions only. 17 | 18 | # This script creates a new Target Group containing the instance where the Session is running 19 | # and adds a new Listener Rule for the HTTPS listener of the ALB. 20 | 21 | # The Listener Rule has the role of associating the input URL path with the Target Group. This path 22 | # must be the web url path of the DCV server running on the execution node. 23 | # Since it is not possible to do URL path translation with ALB, every DCV server must have a unique 24 | # web url path configured. It is suggested to use the hostname of the node as the web url path 25 | # for the DCV server running on that node. 26 | 27 | # The maximum number of Listener Rules per ALB is 100, hence a single ALB can handle at most 28 | # 100 Interactive Sessions running concurrently. To increase this limit, consider adding more ALBs 29 | # to the infrastructure. 30 | 31 | # Prerequisites for: 32 | # EnginFrame node: 33 | # - AWS Command Line Interface (CLI) must be installed 34 | # - Since this script is going to be executed by the user running the EnginFrame Server, i.e. the Apache Tomcat user, 35 | # an AWS CLI profile must be configured for that user, having the permissions to list instances and to manage load balancers. 36 | # (see https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) 37 | # Alternatively, if EnginFrame is installed on an EC2 instance, configure the correct AWS role for this instance.
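# A quick way to check which AWS credentials the Tomcat user actually sees
# (a sketch; efnobody is the Tomcat user configured in enginframe/efinstall.config):
#   sudo -u efnobody aws sts get-caller-identity
#   sudo -u efnobody aws elbv2 describe-load-balancers --query "LoadBalancers[].DNSName" --output text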
38 | # 39 | # AWS account: 40 | # - AWS Application Load Balancer (ALB) and an HTTPS listener with a Default Target Group must be already configured and running. 41 | # 42 | # DCV server node: 43 | # - configure each DCV server node with a unique web url path (see dcv.conf) 44 | 45 | # Configuration parameters: 46 | 47 | # ALB public DNS name 48 | ALB_PUBLIC_DNS_NAME= 49 | # ALB port 50 | ALB_PORT=443 51 | # AWS default region 52 | export AWS_DEFAULT_REGION= 53 | 54 | _die() { 55 | echo "ERROR: $@" 56 | exit 1 57 | } 58 | 59 | _help() { 60 | _cmd=$(basename "$0") 61 | echo "${_cmd}" 62 | echo "Usage:" 63 | echo " ${_cmd} \"<session-id>\" \"<alb-host>\" \"<alb-port>\" \"<target-host>\" \"<target-port>\" \"<target-web-url-path>\"" 64 | echo " ${_cmd} \"tmp3569402005256372176\" \"alb-enginframe-xxx.eu-west-1.elb.amazonaws.com\" 443 \"10.0.0.10\" 8443 \"/dcv-server1\"" 65 | } 66 | 67 | # Input parameters: 68 | # - $1 session-id 69 | # - $2 alb-host (alb public dnsname) 70 | # - $3 alb-port 71 | # - $4 target-host (private dnsname) 72 | # - $5 target-port 73 | # - $6 target-web-url-path (it must start with the "/" character) 74 | main() { 75 | # parse input parameters 76 | if [[ $# -lt 6 ]] ; then 77 | _help 78 | exit 0 79 | fi 80 | local -- _session_id=$1 81 | local -- _alb_host=$2 82 | local -- _alb_port=$3 83 | local -- _instance_id=$4 84 | local -- _target_port=$5 85 | local -- _target_web_url_path=$6 86 | 87 | [ -z "${_session_id}" ] && _die "Missing input Session Id parameter." 88 | [ -z "${_alb_host}" ] && _die "Missing input ALB Host parameter." 89 | [ -z "${_alb_port}" ] && _die "Missing input ALB Port parameter." 90 | [ -z "${_instance_id}" ] && _die "Missing input InstanceID." 91 | [ -z "${_target_port}" ] && _die "Missing input Target Port parameter." 92 | [ -z "${_target_web_url_path}" ] && _die "Missing input Target Web Url Path parameter." 93 | 94 | # check if the AWS CLI is in the path 95 | aws help >/dev/null || _die "AWS CLI is not installed." 96 | 97 | # get ALB Amazon Resource Name (ARN) by dns-name 98 | local -- _alb_arn=$(aws elbv2 describe-load-balancers --query "LoadBalancers[? DNSName == '${_alb_host}'].LoadBalancerArn" --output text) 99 | [ -n "${_alb_arn}" ] || _die "Unable to get ALB identifier for the ALB (${_alb_host})." 100 | 101 | # detect VPC of the ALB 102 | local -- _vpc_id=$(aws elbv2 describe-load-balancers --load-balancer-arns "${_alb_arn}" \ 103 | --query "LoadBalancers[].VpcId" --output text) 104 | [ -n "${_vpc_id}" ] || _die "Unable to detect VPC of the ALB (${_alb_host})." 105 | 106 | # check if the Listener exists 107 | local -- _listener_arn=$(aws elbv2 describe-listeners --load-balancer-arn "${_alb_arn}" \ 108 | --query 'Listeners[? Port == `'${_alb_port}'`].ListenerArn' --output text) 109 | [ -n "${_listener_arn}" ] || _die "Listener for port (${_alb_port}) does not exist in the ALB (${_alb_host})." 110 | 111 | # check if a Target Group for the given session already exists 112 | local -- _target_group_name=$(printf "%s" "${_session_id}" | tr -c 'a-zA-Z0-9' -) 113 | local -- _target_group_arn=$(aws elbv2 describe-target-groups --load-balancer-arn "${_alb_arn}" \ 114 | --query "TargetGroups[? 
TargetGroupName == '${_target_group_name}'].TargetGroupArn" --output text) 115 | if [ -z "${_target_group_arn}" ]; then 116 | 117 | # create a new target group for the given instance (a 404 Healthy Check response is expected from the DCV Server) 118 | _target_group_arn=$(aws elbv2 create-target-group --name "${_target_group_name}" --protocol HTTPS --port "${_target_port}" --matcher "HttpCode=404" --vpc-id "${_vpc_id}" \ 119 | --query "TargetGroups[0].TargetGroupArn" --output text) 120 | [ -n "${_target_group_arn}" ] || _die "Unable to create Target Group (${_target_group_name}) in the VPC (${_vpc_id})" 121 | 122 | # enable sticky session 123 | #aws elbv2 modify-target-group-attributes --target-group-arn "${_target_group_arn}" --attributes "Key=stickiness.enabled,Value=true" >/dev/null 124 | #[ $? -eq 0 ] || _die "Unable to set sticky session for the Target Group (${_target_group_arn})." 125 | 126 | # register instance in the new target group 127 | aws elbv2 register-targets --target-group-arn "${_target_group_arn}" --targets "Id=${_instance_id}" >/dev/null 128 | [ $? -eq 0 ] || _die "Unable to register Instance (${_instance_id}) in the Target Group (${_target_group_arn})." 129 | 130 | # get current max priority 131 | local -- _current_priority=$(aws elbv2 describe-rules --listener-arn "${_listener_arn}" \ 132 | --query "max(Rules[? Priority != 'default'].Priority.to_number(@))" --output text) 133 | [ -n "${_current_priority}" ] || _current_priority=0 134 | 135 | # add target rule to the selected listener 136 | local -- _priority=$((_current_priority+1)) 137 | local -- _target_path="${_target_web_url_path}*" 138 | 139 | local -- _rule_arn=$(aws elbv2 create-rule --listener-arn "${_listener_arn}" --priority "${_priority}" \ 140 | --conditions Field=path-pattern,Values="${_target_path}" --actions Type=forward,TargetGroupArn=${_target_group_arn} \ 141 | --query "Rules[0].RuleArn" --output text) 142 | [ -n "${_rule_arn}" ] || _die "Unable to create Rule for the Listener (${_listener_arn}), Target Group (${_target_group_arn}) and target path (${_target_path})." 143 | fi 144 | 145 | # wait briefly so the new rule and target become active, avoiding a 404 error from the ALB 146 | sleep 10 147 | 148 | # set output variables 149 | export INTERACTIVE_SESSION_TARGET_HOST="${_alb_host}" 150 | export INTERACTIVE_SESSION_TARGET_PORT="${_alb_port}" 151 | export INTERACTIVE_SESSION_TARGET_WEBURLPATH="${_target_web_url_path}" 152 | } 153 | 154 | # Check it's a DCV 2017 interactive session. 
155 | if [ "${INTERACTIVE_SESSION_REMOTE}" = "dcv2" ]; then 156 | main "${INTERACTIVE_SESSION_REMOTE_SESSION_ID}" "${ALB_PUBLIC_DNS_NAME}" "${ALB_PORT}" "${INTERACTIVE_SESSION_DCV2_WEBURLPATH:1}" "${INTERACTIVE_DEFAULT_DCV2_WEB_PORT}" "${INTERACTIVE_SESSION_DCV2_WEBURLPATH}" 157 | fi 158 | 159 | # ex:ts=4:sw=4:et:ft=sh: -------------------------------------------------------------------------------- /enginframe/efinstall.config: -------------------------------------------------------------------------------- 1 | efinstall.config.version = 1.0 2 | ef.accept.eula = true 3 | kernel.agent.on.same.machine = true 4 | kernel.agent.rmi.port = 9999 5 | kernel.agent.rmi.bind.port = 9998 6 | kernel.ef.admin.user = ec2-user 7 | kernel.server.tomcat.https = true 8 | kernel.ef.tomcat.user = efnobody 9 | kernel.ef.root.context = enginframe 10 | kernel.tomcat.https.port = 8443 11 | kernel.tomcat.shutdown.port = 8005 12 | kernel.start_enginframe_at_boot = true 13 | demo.install = true 14 | default.auth.mgr = pam 15 | pam.service = system-auth 16 | ef.jobmanager = slurm 17 | slurm.binaries.path = /opt/slurm/bin 18 | ef.delegate.dcvsm = true 19 | dcvsm.oauth2.url = https\://sm-hostname\:sm-port/oauth2/token 20 | dcvsm.oauth2.id = 21 | dcvsm.broker.url = https\://sm-hostname\:sm-port 22 | dcvsm.no.strict.tls = false 23 | intro-targets = component_enginframe,component_kernel,component_applets,component_parser,component_http,component_pam,component_ldap,component_activedirectory,component_rss,component_lsf,component_pbs,component_torque,component_sge,component_slurm,component_awsbatch,component_dcvsm,component_demo,component_neutro,component_vdi,component_applications,component_service-manager,component_user-group-manager,component_hpc,component_enginframe_finalizer, 24 | progress-targets = cleanuptarget, 25 | kernel.ef.db = other-db 26 | kernel.ef.derby.db.port = 3306 27 | kernel.ef.db.admin.name = admin 28 | kernel.ef.db.savePasswordInKeystore = true 29 | kernel.ef.db.url = jdbc\:mysql\://admin@${SLURM_DB_ENDPOINT}\:3306/EnginFrameDB 30 | -------------------------------------------------------------------------------- /enginframe/fm.browse.ui: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . "${EF_ROOT}/plugins/ef/lib/utils" 4 | 5 | ef_source_conf hydrogen "ui.hydrogen.conf" 6 | 7 | if [ -n "${HY_FM_BROWSE_SORT_BY}" ]; then 8 | sortBy="${HY_FM_BROWSE_SORT_BY}" 9 | else 10 | sortBy="${HY_FM_BROWSE_DEFAULT_SORT_BY}" 11 | fi 12 | 13 | _ui="hydrogen" 14 | 15 | _widget_id="fm-browse" 16 | 17 | # Create File Manager /fsx anchor 18 | _fsx_vroot=$("${EF_ROOT}/plugins/fm/bin/fm.vroot.create" "${FM_BROWSE_SPOOLER}" "fm" "file:///fsx") 19 | if [ $? -ne 0 ]; then 20 | echo "Problem creating vroot for /fsx location. Please check your permissions" >&2 21 | exit 1 22 | fi 23 | 24 | _s3_vroot=$("${EF_ROOT}/plugins/fm/bin/fm.vroot.create" "${FM_BROWSE_SPOOLER}" "fm" "s3://@${S3_BUCKET}/") 25 | if [ $? -ne 0 ]; then 26 | echo "Problem creating vroot for S3. 
Please check your permissions" >&2 27 | exit 1 28 | fi 29 | 30 | cat << EOF 31 | 32 | EOF 33 | 34 | if [ -n "${_fsx_vroot}" ]; then 35 | cat << EOF 36 | 37 | FSx for Lustre (/fsx) 38 | 39 | EOF 40 | fi 41 | 42 | if [ -n "${_s3_vroot}" ]; then 43 | cat << EOF 44 | 45 | S3 (${S3_BUCKET}) 46 | 47 | EOF 48 | fi 49 | 50 | cat << EOF 51 | 52 | EOF -------------------------------------------------------------------------------- /enginframe/mysql/ef.mysql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE EnginFrameDB DEFAULT CHARACTER SET latin1; 2 | GRANT ALL ON `%`.* TO admin@`%`; 3 | flush privileges; -------------------------------------------------------------------------------- /enginframe/mysql/efdb.config: -------------------------------------------------------------------------------- 1 | [client] 2 | user=admin 3 | password=${EF_DB_PASS} 4 | 5 | [mysql] 6 | no-auto-rehash 7 | host=${SLURM_DB_ENDPOINT} 8 | port=3306 -------------------------------------------------------------------------------- /enginframe/services/ef-services.Linux Desktop.2022-11-22T10-22-47.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/enginframe/services/ef-services.Linux Desktop.2022-11-22T10-22-47.zip -------------------------------------------------------------------------------- /modules/04.configure.disable.anacron.compute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | set -x 20 | set -e 21 | 22 | #temporary fix to manually disable Anacron, up until PC handles this. 
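# Illustrative effect of disableAnacron below: every entry in /etc/anacrontab
# is commented out, e.g. the standard line
#   1   5   cron.daily    nice run-parts /etc/cron.daily
# becomes
#   #1   5   cron.daily    nice run-parts /etc/cron.daily
# so no anacron job can periodically wake up the compute nodes.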
23 | disableAnacron() { 24 | sudo sed 's/^/#/' /etc/anacrontab | sudo tee /etc/anacrontab.tmp 25 | sudo mv -f --backup /etc/anacrontab.tmp /etc/anacrontab 26 | } 27 | 28 | # main 29 | # ---------------------------------------------------------------------------- 30 | main() { 31 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 04.configure.disable.anacron.compute.sh: START" >&2 32 | disableAnacron 33 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 04.configure.disable.anacron.compute.sh: STOP" >&2 34 | } 35 | 36 | main "$@" -------------------------------------------------------------------------------- /modules/04.configure.slurm.AllOrNothing.headnode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | SLURM_RESUME_CONF_FILE="/etc/parallelcluster/slurm_plugin/parallelcluster_slurm_resume.conf" 20 | 21 | set -x 22 | set -e 23 | 24 | #ADD All or Nothing to the Slurm conf 25 | addAllOrNothingtoSlurmConf() { 26 | echo "all_or_nothing_batch = True" >> "${SLURM_RESUME_CONF_FILE}" 27 | } 28 | 29 | restartSlurmDaemon() { 30 | systemctl restart slurmctld 31 | } 32 | 33 | # main 34 | # ---------------------------------------------------------------------------- 35 | main() { 36 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 04.configure.slurm.AllOrNothing.headnode.sh: START" >&2 37 | addAllOrNothingtoSlurmConf 38 | restartSlurmDaemon 39 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 04.configure.slurm.AllOrNothing.headnode.sh: STOP" >&2 40 | } 41 | 42 | main "$@" -------------------------------------------------------------------------------- /modules/07.configure.slurm.tagging.headnode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 
11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | set -x 20 | set -e 21 | 22 | configureSACCT() { 23 | aws s3 cp --quiet "${post_install_base}/scripts/prolog.sh" "${SLURM_ETC}/" --region "${cfn_region}" || exit 1 24 | chmod +x "${SLURM_ETC}/prolog.sh" 25 | echo "Prolog=/opt/slurm/etc/prolog.sh" >> "${SLURM_ETC}/slurm.conf" 26 | } 27 | 28 | restartSlurmDaemons() { 29 | systemctl restart slurmctld 30 | } 31 | 32 | # main 33 | # ---------------------------------------------------------------------------- 34 | main() { 35 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 07.configure.slurm.tagging.headnode.sh: START" >&2 36 | configureSACCT 37 | restartSlurmDaemons 38 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 07.configure.slurm.tagging.headnode.sh: STOP" >&2 39 | } 40 | 41 | main "$@" -------------------------------------------------------------------------------- /modules/10.install.enginframe.headnode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | 20 | # Installs EnginFrame on the headnode 21 | 22 | set -x 23 | set -e 24 | 25 | # install EnginFrame 26 | # ---------------------------------------------------------------------------- 27 | installEnginFrame() { 28 | 29 | amazon-linux-extras install -y java-openjdk11 30 | 31 | wget -nv -P /tmp/packages https://dn3uclhgxk1jt.cloudfront.net/enginframe/packages/enginframe-latest.jar || exit 1 32 | 33 | aws s3 cp --quiet "${post_install_base}/enginframe/efinstall.config" /tmp/packages/ --region "${cfn_region}" || exit 1 34 | 35 | # set permissions and uncompress 36 | chmod 755 -R /tmp/packages/* 37 | enginframe_jar=$(find /tmp/packages -type f -name 'enginframe-*.jar') 38 | # some checks 39 | [[ -z ${enginframe_jar} ]] && \ 40 | echo "[ERROR] missing enginframe jar" && return 1 41 | [[ ! 
-f /tmp/packages/efinstall.config ]] && \ 42 | echo "[ERROR] missing efinstall.config" && return 1 43 | 44 | cat <<-EOF >> /tmp/packages/efinstall.config 45 | kernel.java.home = /usr/lib/jvm/jre-11/ 46 | nice.root.dir.ui = ${NICE_ROOT} 47 | ef.spooler.dir = ${NICE_ROOT}/enginframe/spoolers/ 48 | ef.repository.dir = ${NICE_ROOT}/enginframe/repository/ 49 | ef.sessions.dir = ${NICE_ROOT}/enginframe/sessions/ 50 | ef.data.root.dir = ${NICE_ROOT}/enginframe/data/ 51 | ef.logs.root.dir = ${NICE_ROOT}/enginframe/logs/ 52 | ef.temp.root.dir = ${NICE_ROOT}/enginframe/tmp/ 53 | kernel.server.tomcat.https.ef.hostname = ${head_node_hostname} 54 | kernel.ef.db.admin.password = ${ec2user_pass} 55 | EOF 56 | 57 | 58 | # add the EnginFrame user if it does not already exist 59 | id -u efnobody &>/dev/null || adduser efnobody 60 | 61 | echo "${ec2user_pass}" | passwd ec2-user --stdin 62 | 63 | if [[ -d "${SHARED_FS_DIR}/nice" ]]; then 64 | mv -f "${SHARED_FS_DIR}/nice" "${SHARED_FS_DIR}/nice.$(date "+%d-%m-%Y-%H-%M").BAK" 65 | fi 66 | 67 | # finally, launch EnginFrame installer 68 | ( cd /tmp/packages 69 | /usr/lib/jvm/jre-11/bin/java -jar "${enginframe_jar}" --text --batch ) 70 | } 71 | 72 | configureEnginFrameDB(){ 73 | 74 | #FIXME: use latest link 75 | wget -nv -P "${EF_ROOT}/WEBAPP/WEB-INF/lib/" https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.28/mysql-connector-java-8.0.28.jar 76 | chown ec2-user:efnobody "${EF_ROOT}/WEBAPP/WEB-INF/lib/mysql-connector-java-8.0.28.jar" 77 | 78 | aws s3 cp --quiet "${post_install_base}/enginframe/mysql/efdb.config" /tmp/ --region "${cfn_region}" || exit 1 79 | aws s3 cp --quiet "${post_install_base}/enginframe/mysql/ef.mysql" /tmp/ --region "${cfn_region}" || exit 1 80 | aws s3 cp --quiet "${post_install_base}/enginframe/mysql/mysql" /tmp/ --region "${cfn_region}" || exit 1 81 | 82 | chown ec2-user:efnobody "/tmp/mysql" 83 | chmod +x "/tmp/mysql" 84 | 85 | export EF_DB_PASS="${ec2user_pass}" 86 | cd /tmp && /usr/bin/envsubst < efdb.config > efdb.pass.config # the config files were staged in /tmp above 87 | 88 | /tmp/mysql --defaults-extra-file="efdb.pass.config" < "ef.mysql" 89 | rm efdb.pass.config efdb.config ef.mysql mysql 90 | } 91 | 92 | customizeEnginFrame() { 93 | aws s3 cp --quiet "${post_install_base}/enginframe/fm.browse.ui" "${EF_ROOT}/plugins/applications/bin/" --region "${cfn_region}" || exit 1 94 | chown ec2-user:efnobody "${EF_ROOT}/plugins/applications/bin/fm.browse.ui" 95 | chmod 755 "${EF_ROOT}/plugins/applications/bin/fm.browse.ui" 96 | 97 | sed -i \ 98 | "s/^HY_CONNECT_SESSION_MAX_WAIT=.*$/HY_CONNECT_SESSION_MAX_WAIT='600'/" \ 99 | "${EF_ROOT}/plugins/hydrogen/conf/ui.hydrogen.conf" 100 | 101 | # Fix DCV sessions not working with AD users 102 | sed '2 i id "${USER}"' -i "${EF_ROOT}/plugins/interactive/lib/remote/linux.jobscript.functions" 103 | } 104 | 105 | startEnginFrame() { 106 | systemctl start enginframe 107 | } 108 | 109 | 110 | # main 111 | # ---------------------------------------------------------------------------- 112 | main() { 113 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 10.install.enginframe.headnode.sh: START" >&2 114 | export ec2user_pass="$(aws secretsmanager get-secret-value --secret-id "${stack_name}" --query SecretString --output text --region "${cfn_region}")" 115 | installEnginFrame 116 | EF_TOP="${NICE_ROOT}/enginframe" 117 | unset EF_VERSION 118 | source "${EF_TOP}/current-version" 119 | export EF_ROOT="${EF_TOP}/${EF_VERSION}/enginframe" 120 | customizeEnginFrame 121 | configureEnginFrameDB 122 | startEnginFrame 123 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 
10.install.enginframe.headnode.sh: STOP" >&2 124 | } 125 | 126 | main "$@" -------------------------------------------------------------------------------- /modules/12.configure.enginframe.alb.headnode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | set -x 20 | set -e 21 | 22 | configureEF4ALB() { 23 | 24 | cat <<-EOF >> ${EF_CONF_ROOT}/plugins/interactive/interactive.efconf 25 | INTERACTIVE_SESSION_STARTING_HOOK=${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.starting.hook.sh 26 | INTERACTIVE_SESSION_CLOSING_HOOK=${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.closing.hook.sh 27 | EOF 28 | 29 | cat <<-EOF >> ${EF_CONF_ROOT}/enginframe/agent.conf 30 | ef.download.server.url=https://127.0.0.1:8443/enginframe/download 31 | EOF 32 | 33 | alb_name="$(echo $stack_name | sed 's/hpc-1click-//')" 34 | ALB_PUBLIC_DNS_NAME=$(aws elbv2 describe-load-balancers --names "${alb_name}" --query "LoadBalancers[? 
LoadBalancerName == '${alb_name}'].DNSName" --output text --region "${cfn_region}") 35 | 36 | pattern='^ALB_PUBLIC_DNS_NAME=.*$' 37 | replace="ALB_PUBLIC_DNS_NAME=${ALB_PUBLIC_DNS_NAME}" 38 | sed -i -e "s|${pattern}|${replace}|" "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.starting.hook.sh" 39 | sed -i -e "s|${pattern}|${replace}|" "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.closing.hook.sh" 40 | 41 | pattern='^export AWS_DEFAULT_REGION=.*$' 42 | replace="export AWS_DEFAULT_REGION=${cfn_region}" 43 | sed -i -e "s|${pattern}|${replace}|" "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.starting.hook.sh" 44 | sed -i -e "s|${pattern}|${replace}|" "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.closing.hook.sh" 45 | 46 | } 47 | 48 | 49 | downloadALBhooks() { 50 | 51 | aws s3 cp --quiet "${post_install_base}/enginframe/alb.session.closing.hook.sh" "${EF_DATA_ROOT}/plugins/interactive/bin/" --region "${cfn_region}" || exit 1 52 | aws s3 cp --quiet "${post_install_base}/enginframe/alb.session.starting.hook.sh" "${EF_DATA_ROOT}/plugins/interactive/bin/" --region "${cfn_region}" || exit 1 53 | 54 | ### FIXME: do not hardcode usernames 55 | chown ec2-user:efnobody "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.closing.hook.sh" 56 | chmod +x "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.closing.hook.sh" 57 | chown ec2-user:efnobody "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.starting.hook.sh" 58 | chmod +x "${EF_DATA_ROOT}/plugins/interactive/bin/alb.session.starting.hook.sh" 59 | } 60 | 61 | 62 | # main 63 | # ---------------------------------------------------------------------------- 64 | main() { 65 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 12.configure.enginframe.alb.headnode.sh: START" >&2 66 | downloadALBhooks 67 | configureEF4ALB 68 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 12.configure.enginframe.alb.headnode.sh: STOP" >&2 69 | 70 | } 71 | 72 | main "$@" 73 | -------------------------------------------------------------------------------- /modules/15.install.dcv.broker.headnode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
18 | 19 | # Installs DCV Session Broker on headnode 20 | 21 | set -x 22 | set -e 23 | 24 | # install DCV Session Broker 25 | installDCVSessionBroker() { 26 | 27 | rpm --import "${NICE_GPG_KEY_URL}" 28 | yum install -y -q https://d1uj6qtbmh3dt5.cloudfront.net/nice-dcv-session-manager-broker.el7.noarch.rpm || exit 1 29 | 30 | # switch broker to 8446 since 8443 is used by EnginFrame 31 | pattern='^ *client-to-broker-connector-https-port *=.*$' 32 | replace="client-to-broker-connector-https-port = ${CLIENT_BROKER_PORT}" 33 | sed -i -e "s|${pattern}|${replace}|" '/etc/dcv-session-manager-broker/session-manager-broker.properties' 34 | 35 | pattern='^ *agent-to-broker-connector-https-port *=.*$' 36 | replace="agent-to-broker-connector-https-port = ${AGENT_BROKER_PORT}" 37 | sed -i -e "s|${pattern}|${replace}|" '/etc/dcv-session-manager-broker/session-manager-broker.properties' 38 | 39 | # switch broker discovery port to 45001 since in the boot phase it can be busy 40 | #sed -i 's/broker-to-broker-discovery-port = .*$/broker-to-broker-discovery-port = 47501/' \ 41 | # /etc/dcv-session-manager-broker/session-manager-broker.properties 42 | #sed -i 's/broker-to-broker-discovery-addresses = .*$/broker-to-broker-discovery-addresses = 127.0.0.1:47501/' \ 43 | # /etc/dcv-session-manager-broker/session-manager-broker.properties 44 | } 45 | 46 | 47 | # start DCV session broker 48 | startDCVSessionBroker() { 49 | local -i attempts=10 wait=1 50 | systemctl enable dcv-session-manager-broker 51 | systemctl start dcv-session-manager-broker 52 | sleep 10 # wait for a correct ignite initialization 53 | 54 | # wait for the certificate to be available, and copy it to efadmin's home 55 | while [[ $((attempts--)) -gt 0 ]]; do 56 | if [[ -r /var/lib/dcvsmbroker/security/dcvsmbroker_ca.pem ]]; then 57 | cp /var/lib/dcvsmbroker/security/dcvsmbroker_ca.pem "${BROKER_CA}" 58 | break 59 | else sleep $((wait++)) 60 | fi 61 | done 62 | [[ ${attempts} -gt 0 ]] || return 1 63 | } 64 | 65 | 66 | # sets DCV session broker in EnginFrame 67 | # avoid this function if you don't install EnginFrame 68 | setupEFSessionManager() { 69 | local -i attempts=10 wait=1 70 | source "${NICE_ROOT}/enginframe/conf/enginframe.conf" 71 | 72 | # register and set EnginFrame as API client 73 | while [[ $((attempts--)) -gt 0 ]]; do 74 | systemctl is-active --quiet dcv-session-manager-broker 75 | if [[ $? == 0 ]]; then 76 | dcv-session-manager-broker register-api-client --client-name EnginFrame > /tmp/packages/ef_client_reg 77 | [[ $? 
== 0 ]] || return 1 78 | break 79 | else sleep $((wait++)) 80 | fi 81 | done 82 | [[ ${attempts} -gt 0 ]] || return 1 83 | 84 | client_id=$(cat /tmp/packages/ef_client_reg | sed -n 's/^[ \t]*client-id:[ \t]*//p') 85 | client_pw=$(cat /tmp/packages/ef_client_reg | sed -n 's/^[ \t]*client-password:[ \t]*//p') 86 | sed -i "s/^DCVSM_CLUSTER_dcvsm_cluster1_AUTH_ID=.*$/DCVSM_CLUSTER_dcvsm_cluster1_AUTH_ID=${client_id//\//\\/}/" \ 87 | "${NICE_ROOT}/enginframe/conf/plugins/dcvsm/clusters.props" 88 | sed -i \ 89 | "s/^DCVSM_CLUSTER_dcvsm_cluster1_AUTH_PASSWORD=.*$/DCVSM_CLUSTER_dcvsm_cluster1_AUTH_PASSWORD=${client_pw//\//\\/}/" \ 90 | "${NICE_ROOT}/enginframe/conf/plugins/dcvsm/clusters.props" 91 | sed -i \ 92 | "s/^DCVSM_CLUSTER_dcvsm_cluster1_AUTH_ENDPOINT=.*$/DCVSM_CLUSTER_dcvsm_cluster1_AUTH_ENDPOINT=https:\/\/${host_name}:${CLIENT_BROKER_PORT}\/oauth2\/token/" \ 93 | "${NICE_ROOT}/enginframe/conf/plugins/dcvsm/clusters.props" 94 | sed -i \ 95 | "s/^DCVSM_CLUSTER_dcvsm_cluster1_SESSION_MANAGER_ENDPOINT=.*$/DCVSM_CLUSTER_dcvsm_cluster1_SESSION_MANAGER_ENDPOINT=https:\/\/${host_name}:${CLIENT_BROKER_PORT}/" \ 96 | "${NICE_ROOT}/enginframe/conf/plugins/dcvsm/clusters.props" 97 | 98 | # add dcvsm certificate to Java keystore 99 | openssl x509 -in /var/lib/dcvsmbroker/security/dcvsmbroker_ca.pem -inform pem \ 100 | -out /tmp/packages/dcvsmbroker_ca.der -outform der 101 | keytool -importcert -alias dcvsm \ 102 | -keystore "${JAVA_HOME}/lib/security/cacerts" \ 103 | -storepass changeit \ 104 | -noprompt \ 105 | -file /tmp/packages/dcvsmbroker_ca.der 106 | systemctl restart enginframe 107 | } 108 | 109 | 110 | # main 111 | # ---------------------------------------------------------------------------- 112 | main() { 113 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 15.install.dcv.broker.headnode.sh: START" >&2 114 | installDCVSessionBroker 115 | startDCVSessionBroker 116 | setupEFSessionManager 117 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 15.install.dcv.broker.headnode.sh: STOP" >&2 118 | } 119 | 120 | main "$@" -------------------------------------------------------------------------------- /modules/20.install.dcv.slurm.headnode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | 20 | # Add "dcv2" requirements on the DCV nodes. 
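# Illustrative effect of addDCVtoSlurmPartitions below (example line, not from
# a real config): a Slurm node entry such as
#   NodeName=dcv-g4dn-[1-4] ... Feature=static,gpu
# becomes
#   NodeName=dcv-g4dn-[1-4] ... Feature=dcv2,static,gpu
# so EnginFrame can target DCV-capable partitions via the "dcv2" feature.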
21 | 22 | set -x 23 | set -e 24 | 25 | DCV_KEY_WORD="dcv" 26 | 27 | #ADD DCV as a features to Slurm Partitions 28 | addDCVtoSlurmPartitions() { 29 | for conf_file in $(ls ${SLURM_CONF_FILE} | grep "${DCV_KEY_WORD}"); do 30 | sed -i 's/Feature=/Feature=dcv2,/g' "${conf_file}" 31 | done 32 | } 33 | 34 | restartSlurmDaemon() { 35 | systemctl restart slurmctld 36 | } 37 | 38 | # main 39 | # ---------------------------------------------------------------------------- 40 | main() { 41 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 20.install.dcv.slurm.headnode.sh: START" >&2 42 | addDCVtoSlurmPartitions 43 | restartSlurmDaemon 44 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 20.install.dcv.slurm.headnode.sh: STOP" >&2 45 | } 46 | 47 | main "$@" 48 | -------------------------------------------------------------------------------- /modules/25.install.dcv-server.compute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
18 | 19 | set -x 20 | set -e 21 | 22 | installSimpleExternalAuth() { 23 | 24 | yum -y -q install nice-dcv-*/nice-dcv-simple-external-authenticator-*.rpm 25 | 26 | systemctl start dcvsimpleextauth.service 27 | 28 | } 29 | 30 | installMissingLib() { 31 | yum -y -q install ImageMagick 32 | } 33 | 34 | configureDCVexternalAuth() { 35 | 36 | pattern='\[security\]' 37 | replace='&\n' 38 | replace+="auth-token-verifier=\"http://localhost:8444\"" 39 | cp '/etc/dcv/dcv.conf' "/etc/dcv/dcv.conf.$(date --iso=s --utc)" 40 | # remove duplicates if any 41 | #sed -i -e '/^ *\(administrators\|ca-file\|auth-token-verifier\) *=.*$/d' '/etc/dcv/dcv.conf' 42 | sed -i -e "s|${pattern}|${replace}|" '/etc/dcv/dcv.conf' 43 | 44 | } 45 | 46 | restartDCV() { 47 | 48 | systemctl restart dcvserver.service 49 | 50 | } 51 | 52 | # main 53 | # ---------------------------------------------------------------------------- 54 | main() { 55 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] install.dcv-server.compute.sh: START" >&2 56 | 57 | wget -nv https://d1uj6qtbmh3dt5.cloudfront.net/nice-dcv-el7-x86_64.tgz 58 | tar zxvf nice-dcv-el7-x86_64.tgz 59 | installSimpleExternalAuth 60 | dcvusbdriverinstaller --quiet 61 | 62 | installMissingLib 63 | configureDCVexternalAuth 64 | restartDCV 65 | 66 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] install.dcv-server.compute.sh: STOP" >&2 67 | } 68 | 69 | main "$@" -------------------------------------------------------------------------------- /modules/25.install.dcv-server.gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
18 | 19 | set -x 20 | set -e 21 | 22 | installSimpleExternalAuth() { 23 | 24 | yum -y -q install nice-dcv-*/nice-dcv-simple-external-authenticator-*.rpm 25 | systemctl start dcvsimpleextauth.service 26 | 27 | } 28 | 29 | installDCVGLonG4() { 30 | 31 | systemctl stop dcvserver.service 32 | systemctl disable slurmd 33 | systemctl isolate multi-user.target 34 | 35 | nvidia-xconfig --enable-all-gpus --preserve-busid --connected-monitor=DFP-0,DFP-1,DFP-2,DFP-3 36 | nvidia-persistenced 37 | nvidia-smi -ac 5001,1590 38 | 39 | yum -y -q install nice-dcv-*/nice-dcv-gl*.rpm nice-dcv-*/nice-dcv-server*.rpm nice-dcv-*/nice-xdcv*.rpm nice-dcv-*/nice-dcv-web-viewer*.rpm 40 | 41 | systemctl isolate graphical.target 42 | systemctl start dcvserver.service 43 | systemctl enable slurmd 44 | } 45 | 46 | installMissingLib() { 47 | yum -y -q install ImageMagick 48 | } 49 | 50 | configureDCVexternalAuth() { 51 | 52 | pattern='\[security\]' 53 | replace='&\n' 54 | replace+="auth-token-verifier=\"http://localhost:8444\"" 55 | cp '/etc/dcv/dcv.conf' "/etc/dcv/dcv.conf.$(date --iso=s --utc)" 56 | # remove duplicates if any 57 | #sed -i -e '/^ *\(administrators\|ca-file\|auth-token-verifier\) *=.*$/d' '/etc/dcv/dcv.conf' 58 | sed -i -e "s|${pattern}|${replace}|" '/etc/dcv/dcv.conf' 59 | 60 | } 61 | 62 | restartDCV() { 63 | 64 | systemctl restart dcvserver.service 65 | 66 | } 67 | 68 | # main 69 | # ---------------------------------------------------------------------------- 70 | main() { 71 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] install.dcv-server.gpu.sh: START" >&2 72 | 73 | wget -nv https://d1uj6qtbmh3dt5.cloudfront.net/nice-dcv-el7-x86_64.tgz 74 | tar zxvf nice-dcv-el7-x86_64.tgz 75 | installDCVGLonG4 76 | installSimpleExternalAuth 77 | dcvusbdriverinstaller --quiet 78 | 79 | installMissingLib 80 | configureDCVexternalAuth 81 | restartDCV 82 | 83 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] install.dcv-server.gpu.sh: STOP" >&2 84 | } 85 | 86 | main "$@" -------------------------------------------------------------------------------- /modules/26.configure.dcv.alb.compute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | # Configure the DCV web url path on the compute nodes for the ALB. 
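# Illustrative result of configureDCVforALB below (hypothetical instance id):
# with instance id i-0123456789abcdef0, the commented-out default
#   #web-url-path=
# in /etc/dcv/dcv.conf becomes
#   web-url-path="/i-0123456789abcdef0"
# which is the unique per-node path expected by the ALB session hooks.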
20 | 21 | set -x 22 | set -e 23 | 24 | configureDCVforALB() { 25 | cp '/etc/dcv/dcv.conf' "/etc/dcv/dcv.conf.$(date --iso=s --utc)" 26 | WEB_URL_PATH="$(ec2-metadata -i| awk '{print $2}')" 27 | pattern='^ *#web-url-path*=.*$' 28 | replace="web-url-path=\"/${WEB_URL_PATH}\"" 29 | sed -i -e "s|${pattern}|${replace}|" "/etc/dcv/dcv.conf" 30 | } 31 | 32 | restartDCV() { 33 | 34 | systemctl restart dcvserver.service 35 | 36 | } 37 | 38 | # main 39 | # ---------------------------------------------------------------------------- 40 | main() { 41 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] configure.dcv.alb.compute.sh: START" >&2 42 | configureDCVforALB 43 | restartDCV 44 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] configure.dcv.alb.compute.sh: STOP" >&2 45 | } 46 | 47 | main "$@" -------------------------------------------------------------------------------- /modules/27.configure.dcv.nat.compute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | 20 | # Configure the EnginFrame NAT mapping for the DCV compute nodes. 21 | EF_NAT_CONF="${NICE_ROOT}/enginframe/conf/plugins/interactive/nat.conf" 22 | 23 | 24 | set -x 25 | set -e 26 | 27 | fixNat() { 28 | 29 | # refresh this node's entry in the NAT mapping 30 | h2="${host_name//./\\.}" 31 | sed -i "/^${h2} .*$/d" "${EF_NAT_CONF}" 32 | echo "$host_name $(ec2-metadata -p| awk '{print $2}')" >> "${EF_NAT_CONF}" 33 | } 34 | 35 | 36 | # main 37 | # ---------------------------------------------------------------------------- 38 | main() { 39 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 27.configure.dcv.nat.compute.sh: START" >&2 40 | for conf_file in $(ls ${SLURM_CONF_FILE} | grep "${DCV_KEY_WORD}"); do 41 | if [[ ! -z $(grep "${compute_instance_type}" "${conf_file}") ]]; then 42 | fixNat 43 | fi 44 | done 45 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 27.configure.dcv.nat.compute.sh: STOP" >&2 46 | } 47 | 48 | main "$@" -------------------------------------------------------------------------------- /modules/30.install.dcv-sm-agent.compute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | DCV_SM_ROOT="/etc/dcv-session-manager-agent" 20 | 21 | set -x 22 | set -e 23 | 24 | configureDCVforSMAgent() { 25 | 26 | pattern='\[security\]' 27 | replace='&\n' 28 | replace+='administrators=["dcvsmagent"]\n' 29 | replace+='ca-file="/etc/dcv-session-manager-agent/dcvsmbroker_ca.pem"\n' 30 | replace+="auth-token-verifier=\"https://${head_node_hostname}:${AGENT_BROKER_PORT}/agent/validate-authentication-token\"" 31 | cp '/etc/dcv/dcv.conf' "/etc/dcv/dcv.conf.$(date --iso=s --utc)" 32 | # remove duplicates if any 33 | #sed -i -e '/^ *\(administrators\|ca-file\|auth-token-verifier\) *=.*$/d' '/etc/dcv/dcv.conf' 34 | sed -i -e "s|${pattern}|${replace}|" '/etc/dcv/dcv.conf' 35 | 36 | } 37 | 38 | installDCVSMAgent() { 39 | 40 | BROKER_CA_NEW="${DCV_SM_ROOT}/dcvsmbroker_ca.pem" 41 | DCV_SM_AGENT_CONF="${DCV_SM_ROOT}/agent.conf" 42 | 43 | rpm --import "${NICE_GPG_KEY_URL}" 44 | yum -y -q install https://d1uj6qtbmh3dt5.cloudfront.net/nice-dcv-session-manager-agent.el7.x86_64.rpm || exit 1 45 | 46 | pattern='^ *broker_host *=.*$' 47 | replace="broker_host = \'${head_node_hostname}\'" 48 | sed -i -e "s|${pattern}|${replace}|" "${DCV_SM_AGENT_CONF}" 49 | 50 | pattern='^ *#broker_port *=.*$' 51 | replace="broker_port = ${AGENT_BROKER_PORT}" 52 | sed -i -e "s|${pattern}|${replace}|" "${DCV_SM_AGENT_CONF}" 53 | 54 | pattern='^ *#ca_file *=.*$' 55 | replace="ca_file = \'${BROKER_CA_NEW}\'" 56 | sed -i -e "s|${pattern}|${replace}|" "${DCV_SM_AGENT_CONF}" 57 | cp "${BROKER_CA}" "${BROKER_CA_NEW}" 58 | 59 | } 60 | 61 | 62 | configureAgentTags() { 63 | mkdir -p "${DCV_SM_ROOT}/tags" 64 | echo "AWS_EC2_PUBLIC_HOSTNAME=\"$(ec2-metadata -p| awk '{print $2}')\"" >> "${DCV_SM_ROOT}/tags/agent_tags.toml" 65 | echo "INSTANCE_TYPE=\"$(ec2-metadata -t| awk '{print $2}')\"" >> "${DCV_SM_ROOT}/tags/agent_tags.toml" 66 | echo "AWS_EC2_INSTANCE_ID=\"$(ec2-metadata -i| awk '{print $2}')\"" >> "${DCV_SM_ROOT}/tags/agent_tags.toml" 67 | } 68 | 69 | startServices() { 70 | 71 | systemctl start dcv-session-manager-agent 72 | systemctl restart dcvserver.service 73 | 74 | } 75 | 76 | # main 77 | # ---------------------------------------------------------------------------- 78 | main() { 79 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] install.dcv-sm-agent.compute.sh: START" >&2 80 | if [[ ! 
-z $(grep "${compute_instance_type}" "${conf_file}") ]]; then 81 | configureDCVforSMAgent 82 | installDCVSMAgent 83 | configureAgentTags 84 | startServices 85 | fi 86 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] install.dcv-sm-agent.compute.sh: STOP" >&2 87 | } 88 | 89 | main "$@" -------------------------------------------------------------------------------- /modules/40.install.monitoring.compute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | 20 | 21 | set -x 22 | set -e 23 | 24 | installPreReq() { 25 | yum -y -q install docker golang-bin 26 | service docker start 27 | chkconfig docker on 28 | usermod -a -G docker $cfn_cluster_user 29 | 30 | # to be replaced with "yum -y install docker-compose" once the repository problem is fixed 31 | curl -s -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose 32 | chmod +x /usr/local/bin/docker-compose 33 | } 34 | 35 | installMonitoring() { 36 | 37 | gpu_instances="[pg][2-9].*\.[0-9]*[x]*large" 38 | 39 | if [[ $compute_instance_type =~ $gpu_instances ]]; then 40 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) 41 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo 42 | yum -y -q clean expire-cache 43 | yum -y -q install nvidia-docker2 44 | systemctl restart docker 45 | /usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.gpu.yml" -p monitoring-compute up -d 46 | else 47 | /usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.yml" -p monitoring-compute up -d 48 | fi 49 | } 50 | 51 | # main 52 | # ---------------------------------------------------------------------------- 53 | main() { 54 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: START" >&2 55 | 56 | job_id=$($SLURM_ROOT/bin/squeue -h -w "${host_name}" | awk '{print $1}') 57 | job_comment=$($SLURM_ROOT/bin/scontrol show job $job_id | grep Comment | sed 's/Comment=//' | sed 's/^ *//g') 58 | 59 | if [[ $job_comment == *"Key=Monitoring,Value=ON"* ]]; then 60 | installPreReq 61 | installMonitoring 62 | fi 63 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: STOP" >&2 64 | } 65 | main "$@" -------------------------------------------------------------------------------- /modules/40.install.monitoring.headnode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
18 |
19 |
20 | #max_queue_size=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue')
21 |
22 | s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
23 |
24 | set -x
25 | set -e
26 |
27 | installPreReq() {
28 | yum -y -q install docker golang-bin
29 | service docker start
30 | chkconfig docker on
31 | usermod -a -G docker $cfn_cluster_user
32 |
33 | # TODO: replace with "yum -y install docker-compose" once the repository problem is fixed
34 | curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
35 | chmod +x /usr/local/bin/docker-compose
36 | }
37 |
38 | saveClusterConfigLocally(){
39 |
40 | cluster_s3_bucket=$(jq -r '.cluster.cluster_s3_bucket' "${dna_json}")
41 | cluster_config_s3_key=$(jq -r '.cluster.cluster_config_s3_key' "${dna_json}")
42 | cluster_config_version=$(jq -r '.cluster.cluster_config_version' "${dna_json}")
43 | log_group_names=$(jq -r '.cluster.log_group_name' "${dna_json}")
44 |
45 | mkdir -p "${monitoring_home}/parallelcluster"
46 | aws s3api get-object --bucket $cluster_s3_bucket --key $cluster_config_s3_key --region $cfn_region --version-id $cluster_config_version "${monitoring_home}/parallelcluster/cluster-config.json"
47 | }
48 |
49 | installMonitoring(){
50 |
51 | aws s3 cp --quiet --recursive "${post_install_base}/monitoring" "${monitoring_home}" --region "${cfn_region}" || exit 1
52 | chown $cfn_cluster_user:$cfn_cluster_user -R "${monitoring_home}"
53 | chmod +x ${monitoring_home}/custom-metrics/*
54 |
55 | cp -rp ${monitoring_home}/custom-metrics/* /usr/local/bin/
56 | mv -f "${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service" /etc/systemd/system/
57 |
58 | cp -rp ${monitoring_home}/www/* "${NICE_ROOT}/enginframe/conf/tomcat/webapps/ROOT/"
59 | }
60 |
61 |
62 |
63 | configureMonitoring() {
64 |
65 | fsx_fs_id=$(jq -r '.cluster.fsx_fs_id' "${dna_json}")
66 | headnode_instance_id=$(ec2-metadata -i | awk '{print $2}')
67 |
68 | #FIXME: the cost dashboard needs to be re-designed.
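# Both cost collectors push their samples to the local Prometheus Pushgateway
# (http://127.0.0.1:9091/metrics/job/cost, started by docker-compose.headnode.yml);
# once the dashboard is re-designed, they can be re-enabled by uncommenting the two
# crontab entries below.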
69 | #(crontab -l -u $cfn_cluster_user; echo "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh") | crontab -u $cfn_cluster_user -
70 | #(crontab -l -u $cfn_cluster_user; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u $cfn_cluster_user -
71 |
72 | # replace tokens
73 | sed -i "s/_S3_BUCKET_/${s3_bucket}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
74 | sed -i "s/__INSTANCE_ID__/${headnode_instance_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
75 | sed -i "s/__FSX_ID__/${fsx_fs_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
76 | sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
77 |
78 | sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/logs.json"
79 | sed -i "s~__LOG_GROUP__NAMES__~${log_group_names}~g" "${monitoring_home}/grafana/dashboards/logs.json"
80 |
81 | sed -i "s/__Application__/${stack_name}/g" "${monitoring_home}/prometheus/prometheus.yml"
82 | sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/prometheus/prometheus.yml"
83 |
84 | sed -i "s/__INSTANCE_ID__/${headnode_instance_id}/g" "${monitoring_home}/grafana/dashboards/headnode-details.json"
85 | sed -i "s/__INSTANCE_ID__/${headnode_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-list.json"
86 | sed -i "s/__INSTANCE_ID__/${headnode_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-details.json"
87 |
88 | sed -i "s~__MONITORING_DIR__~${monitoring_home}~g" "${monitoring_home}/docker-compose/docker-compose.headnode.yml"
89 | sed -i "s~__GRAFANA_PASSWORD__~${ec2user_pass}~g" "${monitoring_home}/docker-compose/docker-compose.headnode.yml"
90 |
91 | # Download and build prometheus-slurm-exporter
92 | ##### Please note this software package is under GPLv3 License #####
93 | # More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
94 | cd "${monitoring_home}"
95 | #FIXME: temporary, always rebuild from a fresh clone
96 | rm -rf prometheus-slurm-exporter
97 | git clone https://github.com/vpenso/prometheus-slurm-exporter.git
98 | cd prometheus-slurm-exporter
99 | sed -i 's/NodeList,AllocMem,Memory,CPUsState,StateLong/NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:/' node.go
100 | GOPATH=/root/go-modules-cache HOME=/root go mod download
101 | GOPATH=/root/go-modules-cache HOME=/root go build
102 | mv -f "${monitoring_home}/prometheus-slurm-exporter/prometheus-slurm-exporter" /usr/bin/prometheus-slurm-exporter
103 | }
104 |
105 |
106 | startMonitoringDaemons() {
107 |
108 | /usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig -f "${monitoring_home}/docker-compose/docker-compose.headnode.yml" -p monitoring-1click-hpc up -d
109 | systemctl daemon-reload
110 | systemctl enable slurm_exporter
111 | systemctl start slurm_exporter
112 |
113 | }
114 |
115 | # main
116 | # ----------------------------------------------------------------------------
117 | main() {
118 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.headnode.sh: START" >&2
119 | if [[ -d "${monitoring_home}" ]]; then
120 | mv -f "${monitoring_home}" "${monitoring_home}.$(date "+%d-%m-%Y-%H-%M").BAK"
121 | fi
122 | installPreReq
123 | saveClusterConfigLocally
124 | installMonitoring
125 | configureMonitoring
126 | startMonitoringDaemons
127 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.headnode.sh: STOP" >&2
128 | }
129 |
130 | main "$@"
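A quick smoke test for the monitoring stack configured above (a sketch, assuming the default ports wired into docker-compose.headnode.yml, nginx.conf, and slurm_exporter.service: Grafana on 3000, Prometheus on 9090, the Pushgateway on 9091, and the Slurm exporter on 8081):

# containers started by docker-compose (grafana, prometheus, pushgateway, node-exporter, nginx)
docker ps --format '{{.Names}}: {{.Status}}'
# Prometheus liveness probe
curl -s http://localhost:9090/-/healthy
# Slurm exporter metrics served by the systemd unit installed above
curl -s http://localhost:8081/metrics | head -n 5
# Grafana proxied by nginx under /grafana/
curl -s -o /dev/null -w '%{http_code}\n' http://localhost/grafana/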
-------------------------------------------------------------------------------- /monitoring/custom-metrics/1h-cost-metrics.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #
4 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
5 | # SPDX-License-Identifier: MIT-0
6 | #
7 | #
8 |
9 | #source the AWS ParallelCluster profile
10 | . /etc/parallelcluster/cfnconfig
11 |
12 | export AWS_DEFAULT_REGION=$cfn_region
13 | aws_region_long_name=$(python /usr/local/bin/aws-region.py $cfn_region)
14 | aws_region_long_name=${aws_region_long_name/Europe/EU}
15 |
16 | headnodeInstanceType=$(ec2-metadata -t | awk '{print $2}')
17 | headnodeInstanceId=$(ec2-metadata -i | awk '{print $2}')
18 | s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
19 | s3_size_gb=$(echo "$(aws s3api list-objects --bucket $s3_bucket --output json --query "[sum(Contents[].Size)]"| sed -n 2p | tr -d ' ') / 1024 / 1024 / 1024" | bc)
20 |
21 |
22 | # pick the S3 pricing tier: the price breaks at 50 TB (51200 GB) and 500 TB (512000 GB)
23 | if [[ $s3_size_gb -le 51200 ]]; then
24 | s3_range=51200
25 | elif [[ $s3_size_gb -le 512000 ]]; then
26 | s3_range=512000
27 | else
28 | s3_range="Inf"
29 | fi
30 |
31 | ####################### S3 #########################
32 |
33 | s3_cost_gb_month=$(aws --region us-east-1 pricing get-products \
34 | --service-code AmazonS3 \
35 | --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
36 | 'Type=TERM_MATCH,Field=storageClass,Value=General Purpose' \
37 | --query 'PriceList[0]' --output text \
38 | | jq -r --arg endRange $s3_range '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[].value | select(.endRange==$endRange).pricePerUnit.USD')
39 | # convert the GB-month price into an hourly cost (720 hours per month)
40 | s3=$(echo "scale=2; $s3_cost_gb_month * $s3_size_gb / 720" | bc)
41 | echo "s3_cost $s3" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
42 |
43 |
44 | ####################### headnode #########################
45 | headnode_node_h_price=$(aws pricing get-products \
46 | --region us-east-1 \
47 | --service-code AmazonEC2 \
48 | --filters 'Type=TERM_MATCH,Field=instanceType,Value='$headnodeInstanceType \
49 | 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
50 | 'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \
51 | 'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \
52 | 'Type=TERM_MATCH,Field=tenancy,Value=Shared' \
53 | 'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \
54 | --output text \
55 | --query 'PriceList' \
56 | | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
57 |
58 | echo "headnode_cost $headnode_node_h_price" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
59 |
60 |
61 | fsx_id=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region \
62 | | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "FSXOptions"))[0].ParameterValue' \
63 | | awk -F "," '{print $2}')
64 | fsx_summary=$(aws fsx describe-file-systems --region $cfn_region --file-system-ids $fsx_id)
65 | fsx_size_gb=$(echo $fsx_summary | jq -r '.FileSystems[0].StorageCapacity')
66 | fsx_type=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.DeploymentType')
67 | fsx_throughput=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.PerUnitStorageThroughput')
68 |
69 | if [[ $fsx_type = "SCRATCH_2" ]] || [[ $fsx_type = "SCRATCH_1" ]]; then
70 | fsx_cost_gb_month=$(aws pricing get-products \
71 | --region us-east-1 \
72 | --service-code
AmazonFSx \
73 | --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
74 | 'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
75 | 'Type=TERM_MATCH,Field=throughputCapacity,Value=N/A' \
76 | --output text \
77 | --query 'PriceList' \
78 | | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
79 |
80 | elif [[ $fsx_type = "PERSISTENT_1" ]]; then
81 | fsx_cost_gb_month=$(aws pricing get-products \
82 | --region us-east-1 \
83 | --service-code AmazonFSx \
84 | --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
85 | 'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
86 | 'Type=TERM_MATCH,Field=throughputCapacity,Value='$fsx_throughput \
87 | --output text \
88 | --query 'PriceList' \
89 | | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
90 |
91 | else
92 | fsx_cost_gb_month=0
93 | fi
94 |
95 | fsx=$(echo "scale=2; $fsx_cost_gb_month * $fsx_size_gb / 720" | bc)
96 | echo "fsx_cost $fsx" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
97 |
98 |
99 | # TODO: parametrize
100 | ebs_volume_total_cost=0
101 | ebs_volume_ids=$(aws ec2 describe-instances --instance-ids $headnodeInstanceId \
102 | | jq -r '.Reservations | to_entries[].value | .Instances | to_entries[].value | .BlockDeviceMappings | to_entries[].value | .Ebs.VolumeId')
103 |
104 | for ebs_volume_id in $ebs_volume_ids
105 | do
106 | ebs_volume_type=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.VolumeType')
107 | #ebs_volume_iops=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Iops')
108 | ebs_volume_size=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Size')
109 |
110 | ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \
111 | --service-code AmazonEC2 \
112 | --query 'PriceList' \
113 | --output text \
114 | --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
115 | 'Type=TERM_MATCH,Field=productFamily,Value=Storage' \
116 | 'Type=TERM_MATCH,Field=volumeApiName,Value='$ebs_volume_type \
117 | | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
118 |
119 | ebs_volume_cost=$(echo "scale=2; $ebs_cost_gb_month * $ebs_volume_size / 720" | bc)
120 | ebs_volume_total_cost=$(echo "scale=2; $ebs_volume_total_cost + $ebs_volume_cost" | bc)
121 | done
122 |
123 | echo "ebs_headnode_cost $ebs_volume_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost -------------------------------------------------------------------------------- /monitoring/custom-metrics/1m-cost-metrics.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #
4 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
5 | # SPDX-License-Identifier: MIT-0
6 | #
7 | #
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | # NOTE: intentionally disabled for now; the cost dashboard needs to be re-designed (see the FIXME in 40.install.monitoring.headnode.sh)
17 | exit 0
18 |
19 | #source the AWS ParallelCluster profile
20 | .
/etc/parallelcluster/cfnconfig
21 |
22 | export AWS_DEFAULT_REGION=$cfn_region
23 | aws_region_long_name=$(python /usr/local/bin/aws-region.py $cfn_region)
24 | aws_region_long_name=${aws_region_long_name/Europe/EU}
25 |
26 | #FIXME: do not hardcode the monitoring dir
27 | monitoring_dir_name="monitoring"
28 | monitoring_home="/fsx/${monitoring_dir_name}"
29 |
30 | queues=$(/opt/slurm/bin/sinfo --noheader -O partition | sed 's/\*//g')
31 | cluster_config_file="${monitoring_home}/parallelcluster/cluster-config.json"
32 |
33 | compute_nodes_total_cost=0
34 | compute_ebs_total_cost=0
35 | for queue in $queues; do
36 |
37 | instance_type=$(cat "${cluster_config_file}" | jq -r --arg queue $queue '.cluster.queue_settings | to_entries[] | select(.key==$queue).value.compute_resource_settings | to_entries[]| .value.instance_type')
38 |
39 | compute_node_h_price=$(aws pricing get-products \
40 | --region us-east-1 \
41 | --service-code AmazonEC2 \
42 | --filters 'Type=TERM_MATCH,Field=instanceType,Value='$instance_type \
43 | 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
44 | 'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \
45 | 'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \
46 | 'Type=TERM_MATCH,Field=tenancy,Value=Shared' \
47 | 'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \
48 | --output text \
49 | --query 'PriceList' \
50 | | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
51 |
52 | ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \
53 | --service-code AmazonEC2 \
54 | --query 'PriceList' \
55 | --output text \
56 | --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
57 | 'Type=TERM_MATCH,Field=productFamily,Value=Storage' \
58 | 'Type=TERM_MATCH,Field=volumeApiName,Value=gp2' \
59 | | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
60 |
61 | total_num_compute_nodes=$(/opt/slurm/bin/sinfo --noheader --partition=$queue | egrep -v "idle~" | awk '{sum += $4} END {if (sum) print sum; else print 0; }')
62 |
63 | ebs_volume_size=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "ComputeRootVolumeSize"))[0].ParameterValue')
64 | compute_ebs_volume_cost=$(echo "scale=2; $ebs_cost_gb_month * $total_num_compute_nodes * $ebs_volume_size / 720" | bc)
65 | compute_nodes_cost=$(echo "scale=2; $total_num_compute_nodes * $compute_node_h_price" | bc)
66 | compute_ebs_total_cost=$(echo "scale=2; $compute_ebs_total_cost + $compute_ebs_volume_cost" | bc)
67 | compute_nodes_total_cost=$(echo "scale=2; $compute_nodes_total_cost + $compute_nodes_cost" | bc)
68 |
69 | done
70 |
71 | echo "ebs_compute_cost $compute_ebs_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
72 | echo "compute_nodes_cost $compute_nodes_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost -------------------------------------------------------------------------------- /monitoring/custom-metrics/aws-region.py: --------------------------------------------------------------------------------
1 | #
2 | #
3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # 7 | import json 8 | import sys 9 | 10 | from pkg_resources import resource_filename 11 | 12 | region = str(sys.argv[1]) 13 | 14 | name = None 15 | endpoint_file = resource_filename('botocore', 'data/endpoints.json') 16 | with open(endpoint_file, 'r') as ep_file: 17 | data = json.load(ep_file) 18 | for partition in data['partitions']: 19 | if region in partition['regions']: 20 | name = partition['regions'][region]['description'] 21 | break 22 | 23 | print(name) -------------------------------------------------------------------------------- /monitoring/docker-compose/docker-compose.compute.gpu.yml: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # 7 | version: '3.8' 8 | services: 9 | prometheus-node-exporter: 10 | container_name: node-exporter 11 | network_mode: host 12 | pid: host 13 | restart: unless-stopped 14 | volumes: 15 | - '/:/host:ro,rslave' 16 | image: quay.io/prometheus/node-exporter 17 | command: 18 | - '--path.rootfs=/host' 19 | dcgm-exporter: 20 | container_name: nvidia-dcgm 21 | network_mode: host 22 | pid: host 23 | restart: unless-stopped 24 | image: nvidia/dcgm-exporter 25 | runtime: nvidia 26 | environment: 27 | - NVIDIA_VISIBLE_DEVICES=all 28 | - NVIDIA_DRIVER_CAPABILITIES=all -------------------------------------------------------------------------------- /monitoring/docker-compose/docker-compose.compute.yml: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # 7 | version: '3.8' 8 | services: 9 | prometheus-node-exporter: 10 | container_name: node-exporter 11 | network_mode: host 12 | pid: host 13 | restart: unless-stopped 14 | volumes: 15 | - '/:/host:ro,rslave' 16 | image: quay.io/prometheus/node-exporter 17 | command: 18 | - '--path.rootfs=/host' -------------------------------------------------------------------------------- /monitoring/docker-compose/docker-compose.headnode.yml: -------------------------------------------------------------------------------- 1 | # 2 | # 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # 7 | version: '3.8' 8 | services: 9 | pushgateway: 10 | container_name: pushgateway 11 | network_mode: host 12 | pid: host 13 | restart: unless-stopped 14 | image: prom/pushgateway 15 | prometheus: 16 | container_name: prometheus 17 | network_mode: host 18 | pid: host 19 | restart: unless-stopped 20 | volumes: 21 | - '__MONITORING_DIR__/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml' 22 | - 'prometheus-data:/prometheus' 23 | image: prom/prometheus 24 | command: 25 | - '--config.file=/etc/prometheus/prometheus.yml' 26 | - '--storage.tsdb.path=/prometheus' 27 | - '--web.console.libraries=/usr/share/prometheus/console_libraries' 28 | - '--web.console.templates=/usr/share/prometheus/consoles' 29 | - '--web.external-url=/prometheus/' 30 | - '--web.route-prefix=/' 31 | grafana: 32 | container_name: grafana 33 | network_mode: host 34 | pid: host 35 | restart: unless-stopped 36 | environment: 37 | - 'GF_SECURITY_ADMIN_PASSWORD=__GRAFANA_PASSWORD__' 38 | - 'GF_SERVER_ROOT_URL=http://%(domain)s/grafana/' 39 | - 'GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/etc/grafana/provisioning/dashboards/ParallelCluster.json' 40 | volumes: 41 | - '__MONITORING_DIR__/grafana:/etc/grafana/provisioning' 42 | - 'grafana-data:/var/lib/grafana' 43 | image: grafana/grafana 44 | prometheus-node-exporter: 45 | container_name: node-exporter 46 | network_mode: host 47 | pid: host 48 | restart: unless-stopped 49 | volumes: 50 | - '/:/host:ro,rslave' 51 | image: quay.io/prometheus/node-exporter 52 | command: 53 | - '--path.rootfs=/host' 54 | nginx: 55 | container_name: nginx 56 | network_mode: host 57 | pid: host 58 | restart: unless-stopped 59 | volumes: 60 | - '__MONITORING_DIR__/nginx/conf.d:/etc/nginx/conf.d/' 61 | - '__MONITORING_DIR__/nginx/ssl:/etc/ssl/' 62 | - '__MONITORING_DIR__/www:/usr/share/nginx/html' 63 | image: nginx 64 | volumes: 65 | prometheus-data: 66 | grafana-data: -------------------------------------------------------------------------------- /monitoring/grafana/dashboards/compute-node-list.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "iteration": 1592242343557, 19 | "links": [ 20 | { 21 | "$$hashKey": "object:53", 22 | "icon": "external link", 23 | "tags": [], 24 | "type": "dashboards" 25 | } 26 | ], 27 | "panels": [ 28 | { 29 | "columns": [], 30 | "datasource": null, 31 | "fieldConfig": { 32 | "defaults": { 33 | "custom": {} 34 | }, 35 | "overrides": [] 36 | }, 37 | "fontSize": "100%", 38 | "gridPos": { 39 | "h": 24, 40 | "w": 9, 41 | "x": 0, 42 | "y": 0 43 | }, 44 | "id": 2, 45 | "pageSize": null, 46 | "showHeader": true, 47 | "sort": { 48 | "col": 2, 49 | "desc": true 50 | }, 51 | "styles": [ 52 | { 53 | "alias": "Time", 54 | "align": "auto", 55 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 56 | "pattern": "Time", 57 | "type": "hidden" 58 | }, 59 | { 60 | "alias": "Availability Zone", 61 | "align": "left", 62 | "colorMode": null, 63 | "colors": [ 64 | "rgba(245, 54, 54, 0.9)", 65 | "rgba(237, 129, 40, 0.89)", 66 | "rgba(50, 172, 45, 0.97)" 67 | ], 68 | "decimals": 2, 69 | "pattern": "instance_az", 70 | "thresholds": [], 71 | "type": "number", 72 | "unit": "short" 73 | }, 74 | { 
75 | "alias": "Instance Id", 76 | "align": "auto", 77 | "colorMode": null, 78 | "colors": [ 79 | "rgba(245, 54, 54, 0.9)", 80 | "rgba(237, 129, 40, 0.89)", 81 | "rgba(50, 172, 45, 0.97)" 82 | ], 83 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 84 | "decimals": 2, 85 | "link": true, 86 | "linkTargetBlank": true, 87 | "linkTooltip": "Go To Node Details", 88 | "linkUrl": "/grafana/d/qI8VfvXZz/node-details-copy?var-instance_id=${__cell}", 89 | "mappingType": 1, 90 | "pattern": "instance_id", 91 | "thresholds": [], 92 | "type": "number", 93 | "unit": "short" 94 | }, 95 | { 96 | "alias": "Instance Type", 97 | "align": "auto", 98 | "colorMode": null, 99 | "colors": [ 100 | "rgba(245, 54, 54, 0.9)", 101 | "rgba(237, 129, 40, 0.89)", 102 | "rgba(50, 172, 45, 0.97)" 103 | ], 104 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 105 | "decimals": 2, 106 | "mappingType": 1, 107 | "pattern": "instance_type", 108 | "thresholds": [], 109 | "type": "number", 110 | "unit": "short" 111 | }, 112 | { 113 | "alias": "CPU load", 114 | "align": "auto", 115 | "colorMode": null, 116 | "colors": [ 117 | "rgba(245, 54, 54, 0.9)", 118 | "rgba(237, 129, 40, 0.89)", 119 | "rgba(50, 172, 45, 0.97)" 120 | ], 121 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 122 | "decimals": 2, 123 | "mappingType": 1, 124 | "pattern": "Value #A", 125 | "thresholds": [], 126 | "type": "number", 127 | "unit": "short" 128 | }, 129 | { 130 | "alias": "Transmit Rate", 131 | "align": "auto", 132 | "colorMode": null, 133 | "colors": [ 134 | "rgba(245, 54, 54, 0.9)", 135 | "rgba(237, 129, 40, 0.89)", 136 | "rgba(50, 172, 45, 0.97)" 137 | ], 138 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 139 | "decimals": 2, 140 | "mappingType": 1, 141 | "pattern": "Value #B", 142 | "thresholds": [], 143 | "type": "number", 144 | "unit": "Bps" 145 | }, 146 | { 147 | "alias": "Receive Rate", 148 | "align": "auto", 149 | "colorMode": null, 150 | "colors": [ 151 | "rgba(245, 54, 54, 0.9)", 152 | "rgba(237, 129, 40, 0.89)", 153 | "rgba(50, 172, 45, 0.97)" 154 | ], 155 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 156 | "decimals": 2, 157 | "mappingType": 1, 158 | "pattern": "Value #C", 159 | "thresholds": [], 160 | "type": "number", 161 | "unit": "Bps" 162 | } 163 | ], 164 | "targets": [ 165 | { 166 | "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[1m])) by (instance_id, instance_type, instance_az)", 167 | "format": "table", 168 | "instant": true, 169 | "legendFormat": "", 170 | "refId": "A" 171 | }, 172 | { 173 | "expr": "sum(rate(node_network_transmit_bytes_total[1m])) by (instance_id, instance_type, instance_az)", 174 | "format": "table", 175 | "instant": true, 176 | "refId": "B" 177 | }, 178 | { 179 | "expr": "sum(rate(node_network_receive_bytes_total[1m])) by (instance_id, instance_type, instance_az)", 180 | "format": "table", 181 | "instant": true, 182 | "refId": "C" 183 | } 184 | ], 185 | "timeFrom": null, 186 | "timeShift": null, 187 | "title": "All Available Nodes", 188 | "transform": "table", 189 | "type": "table-old" 190 | } 191 | ], 192 | "schemaVersion": 25, 193 | "style": "dark", 194 | "tags": [], 195 | "templating": { 196 | "list": [ 197 | { 198 | "datasource": null, 199 | "filters": [ 200 | { 201 | "condition": "", 202 | "key": "instance_id", 203 | "operator": "!=", 204 | "value": "__INSTANCE_ID__" 205 | } 206 | ], 207 | "hide": 2, 208 | "label": "", 209 | "name": "Filters", 210 | "skipUrlSync": false, 211 | "type": "adhoc" 212 | } 213 | ] 214 | }, 215 | "time": { 216 | "from": "now-15m", 217 | "to": "now" 218 | }, 219 | "timepicker": { 220 | "refresh_intervals": [ 221 | "10s", 
222 | "30s",
223 | "1m",
224 | "5m",
225 | "15m",
226 | "30m",
227 | "1h",
228 | "2h",
229 | "1d"
230 | ]
231 | },
232 | "timezone": "",
233 | "title": "Compute Node List",
234 | "uid": "SugNQvuWk",
235 | "version": 1
236 | } -------------------------------------------------------------------------------- /monitoring/grafana/dashboards/dashboards.yml: --------------------------------------------------------------------------------
1 | apiVersion: 1
2 | providers:
3 | - name: "Dashboards"
4 | orgId: 1
5 | folder: ""
6 | type: file
7 | disableDeletion: false
8 | editable: true
9 | options:
10 | path: /etc/grafana/provisioning/dashboards
11 | -------------------------------------------------------------------------------- /monitoring/grafana/datasources/datasource.yml: --------------------------------------------------------------------------------
1 | #
2 | #
3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 | # SPDX-License-Identifier: MIT-0
5 | #
6 | #
7 | apiVersion: 1
8 | datasources:
9 | - name: prometheus
10 | type: prometheus
11 | access: proxy
12 | orgId: 1
13 | version: 1
14 | url: http://localhost:9090
15 | isDefault: true
16 | editable: true
17 | jsonData:
18 | timeInterval: 10s
19 | - name: cloudwatch
20 | type: cloudwatch
21 | orgId: 1
22 | version: 1
23 | editable: true
24 | jsonData:
25 | authType: default
26 | defaultRegion: us-east-1
27 | -------------------------------------------------------------------------------- /monitoring/nginx/conf.d/nginx.conf: --------------------------------------------------------------------------------
1 | server {
2 | listen 80 default_server;
3 | listen [::]:80 default_server;
4 | server_name _;
5 | server_tokens off;
6 |
7 | location /grafana/ {
8 | proxy_set_header Host $http_host;
9 | proxy_pass http://localhost:3000/;
10 | }
11 |
12 | location /prometheus/ {
13 | proxy_pass http://localhost:9090/;
14 | }
15 |
16 | location /pushgateway/ {
17 | proxy_pass http://localhost:9091/;
18 | }
19 |
20 | location /slurmexporter/ {
21 | proxy_pass http://localhost:8081/;
22 | }
23 | } -------------------------------------------------------------------------------- /monitoring/prometheus-slurm-exporter/slurm_exporter.service: --------------------------------------------------------------------------------
1 | [Unit]
2 | Description=Prometheus SLURM Exporter
3 |
4 | [Service]
5 | Environment=PATH=/opt/slurm/bin:/usr/local/bin:/usr/bin:/bin
6 | ExecStart=/usr/bin/prometheus-slurm-exporter -listen-address 0.0.0.0:8081
7 | Restart=on-failure
8 | RestartSec=15
9 | Type=simple
10 |
11 |
12 | [Install]
13 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /monitoring/prometheus/prometheus.yml: --------------------------------------------------------------------------------
1 | #
2 | #
3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # 7 | global: 8 | scrape_interval: 15s 9 | evaluation_interval: 15s 10 | scrape_timeout: 15s 11 | 12 | scrape_configs: 13 | - job_name: 'slurm_exporter' 14 | scrape_interval: 30s 15 | scrape_timeout: 30s 16 | static_configs: 17 | - targets: ['localhost:8081'] 18 | - job_name: 'pushgateway' 19 | honor_labels: true 20 | static_configs: 21 | - targets: ['localhost:9091'] 22 | - job_name: 'prometheus_server' 23 | scrape_interval: 5s 24 | static_configs: 25 | - targets: ['localhost:9090'] 26 | - job_name: 'ec2_instances' 27 | scrape_interval: 5s 28 | ec2_sd_configs: 29 | - port: 9100 30 | region: __AWS_REGION__ 31 | refresh_interval: 10s 32 | - port: 9400 33 | region: __AWS_REGION__ 34 | refresh_interval: 10s 35 | filters: 36 | - name: instance-type 37 | values: 38 | - p2.xlarge 39 | - p2.8xlarge 40 | - p2.16xlarge 41 | - p3.2xlarge 42 | - p3.8xlarge 43 | - p3.16xlarge 44 | - p3dn.24xlarge 45 | - p4d.24xlarge 46 | - g3s.xlarge 47 | - g3.4xlarge 48 | - g3.8xlarge 49 | - g3.16xlarge 50 | - g4dn.xlarge 51 | - g4dn.2xlarge 52 | - g4dn.4xlarge 53 | - g4dn.8xlarge 54 | - g4dn.16xlarge 55 | - g4dn.12xlarge 56 | - g4dn.metal 57 | relabel_configs: 58 | - source_labels: [__meta_ec2_tag_Name] 59 | target_label: instance_name 60 | - source_labels: [__meta_ec2_tag_parallelcluster_cluster_name] 61 | target_label: instance_grafana 62 | regex: __Application__ 63 | action: keep 64 | - source_labels: [__meta_ec2_instance_id] 65 | target_label: instance_id 66 | - source_labels: [__meta_ec2_availability_zone] 67 | target_label: instance_az 68 | - source_labels: [__meta_ec2_instance_state] 69 | regex: running 70 | action: keep 71 | target_label: instance_state 72 | - source_labels: [__meta_ec2_instance_type] 73 | target_label: instance_type 74 | - source_labels: [__meta_ec2_vpc_id] 75 | target_label: instance_vpc -------------------------------------------------------------------------------- /monitoring/www/aws-logo.svg: -------------------------------------------------------------------------------- 1 | AWS-Logo_White-Color -------------------------------------------------------------------------------- /monitoring/www/background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/1click-hpc/390dfe1e4c131f135cdd8b5f853371e763cd8877/monitoring/www/background.png -------------------------------------------------------------------------------- /monitoring/www/index.html: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | AWS ParallelCluster 9 | 73 | 74 | 75 |
1Click-HPC

1Click-HPC is an open-source project that aims at speeding up the deployment of an HPC cluster on AWS. You can have a fully functional and ready-to-use HPC cluster in minutes, with just one click.

The 1Click-HPC source code and get-started guide can be found on GitHub.

It leverages AWS-supported services and projects, like:

  • AWS ParallelCluster: an AWS-supported open source cluster management tool that makes it easy for you to deploy and manage High Performance Computing (HPC) clusters on AWS.
  • NICE DCV: a high-performance remote display protocol that provides customers with a secure way to deliver remote desktops and application streaming from any cloud or data center to any device, over varying network conditions.
  • NICE EnginFrame: an advanced web front-end for accessing technical and scientific applications running on an HPC cluster.
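The region-specific ParallelCluster sample configs that follow are templates rather than ready-to-use files: tokens such as ${S3_BUCKET}, ${KEY_PAIR}, ${PRIVATE_SUBNET_ID}, ${ADDITIONAL_SG}, ${DB_SG}, ${SECRET_ARN}, ${SLURM_DB_ENDPOINT}, ${NLB_PUBLIC_DNS_NAME}, ${AWS_REGION_NAME}, and ${FSX} are filled in at deployment time. A minimal sketch of rendering one by hand (assuming shell-style substitution with envsubst; the exported values are placeholders, and the actual deployment automation performs this substitution itself):

# export one variable per ${TOKEN} referenced by the template (sample values only)
export AWS_REGION_NAME=us-east-1 S3_BUCKET=my-sample-bucket KEY_PAIR=my-key \
       PRIVATE_SUBNET_ID=subnet-0123456789abcdef0 ADDITIONAL_SG=sg-0123456789abcdef0
envsubst < parallelcluster/config.us-east-1.sample.yaml > cluster-config.yaml
pcluster create-cluster --cluster-name my-hpc-cluster --cluster-configuration cluster-config.yaml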
96 | 97 | -------------------------------------------------------------------------------- /parallelcluster/config.ap-east-1.sample.yaml: -------------------------------------------------------------------------------- 1 | HeadNode: 2 | LocalStorage: 3 | RootVolume: 4 | Size: 100 5 | Encrypted: false 6 | VolumeType: gp3 7 | Iops: 3000 8 | Throughput: 250 9 | CustomActions: 10 | OnNodeConfigured: 11 | Args: 12 | - 04.configure.slurm.AllOrNothing.headnode.sh 13 | - 07.configure.slurm.tagging.headnode.sh 14 | - 10.install.enginframe.headnode.sh 15 | - 12.configure.enginframe.alb.headnode.sh 16 | - 20.install.dcv.slurm.headnode.sh 17 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 18 | Iam: 19 | AdditionalIamPolicies: 20 | - Policy: arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess 21 | - Policy: arn:aws:iam::aws:policy/SecretsManagerReadWrite 22 | S3Access: 23 | - BucketName: '*' 24 | EnableWriteAccess: true 25 | InstanceType: c5.4xlarge 26 | Networking: 27 | AdditionalSecurityGroups: 28 | - ${ADDITIONAL_SG} 29 | - ${DB_SG} 30 | ElasticIp: true 31 | SubnetId: ${PRIVATE_SUBNET_ID} 32 | Ssh: 33 | KeyName: ${KEY_PAIR} 34 | Imds: 35 | Secured: false 36 | Image: 37 | Os: alinux2 38 | DirectoryService: 39 | DomainName: dc=corp,dc=pcluster,dc=com 40 | DomainAddr: ${NLB_PUBLIC_DNS_NAME} 41 | PasswordSecretArn: ${SECRET_ARN} 42 | DomainReadOnlyUser: cn=ReadOnlyUser,ou=Users,ou=CORP,dc=corp,dc=pcluster,dc=com 43 | LdapTlsReqCert: never 44 | Region: ${AWS_REGION_NAME} 45 | Scheduling: 46 | Scheduler: slurm 47 | SlurmSettings: 48 | ScaledownIdletime: 10 49 | EnableMemoryBasedScheduling: true 50 | Database: 51 | Uri: ${SLURM_DB_ENDPOINT} 52 | UserName: 'admin' 53 | PasswordSecretArn: ${SECRET_ARN} 54 | SlurmQueues: 55 | - Name: compute-od-1 56 | CapacityType: ONDEMAND 57 | ComputeResources: 58 | - Name: c5n-18xlarge 59 | DisableSimultaneousMultithreading: true 60 | Efa: 61 | Enabled: true 62 | InstanceType: c5n.18xlarge 63 | MaxCount: 300 64 | MinCount: 0 65 | - Name: r5n-24xlarge 66 | DisableSimultaneousMultithreading: true 67 | Efa: 68 | Enabled: true 69 | InstanceType: r5n.24xlarge 70 | MaxCount: 300 71 | MinCount: 0 72 | CustomActions: 73 | OnNodeConfigured: 74 | Args: 75 | - 04.configure.disable.anacron.compute.sh 76 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 77 | Iam: 78 | AdditionalIamPolicies: 79 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 80 | S3Access: 81 | - BucketName: '*' 82 | EnableWriteAccess: true 83 | Networking: 84 | AdditionalSecurityGroups: 85 | - ${ADDITIONAL_SG} 86 | PlacementGroup: 87 | Enabled: true 88 | SubnetIds: 89 | - ${PRIVATE_SUBNET_ID} 90 | AssignPublicIp: false 91 | - Name: compute-od-2 92 | CapacityType: ONDEMAND 93 | ComputeResources: 94 | - Name: c5-24xlarge 95 | DisableSimultaneousMultithreading: true 96 | InstanceType: c5.24xlarge 97 | MaxCount: 300 98 | MinCount: 0 99 | - Name: m5-24xlarge 100 | DisableSimultaneousMultithreading: true 101 | InstanceType: m5.24xlarge 102 | MaxCount: 300 103 | MinCount: 0 104 | - Name: r5-24xlarge 105 | DisableSimultaneousMultithreading: true 106 | InstanceType: r5.24xlarge 107 | MaxCount: 300 108 | MinCount: 0 109 | CustomActions: 110 | OnNodeConfigured: 111 | Args: 112 | - 04.configure.disable.anacron.compute.sh 113 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 114 | Iam: 115 | AdditionalIamPolicies: 116 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 117 | S3Access: 118 | - BucketName: '*' 119 | 
EnableWriteAccess: true 120 | Networking: 121 | AdditionalSecurityGroups: 122 | - ${ADDITIONAL_SG} 123 | PlacementGroup: 124 | Enabled: true 125 | SubnetIds: 126 | - ${PRIVATE_SUBNET_ID} 127 | AssignPublicIp: false 128 | - Name: compute-od-3 129 | CapacityType: ONDEMAND 130 | ComputeResources: 131 | - Name: m5d-24xlarge 132 | DisableSimultaneousMultithreading: true 133 | InstanceType: m5d.24xlarge 134 | MaxCount: 300 135 | MinCount: 0 136 | - Name: r5d-24xlarge 137 | DisableSimultaneousMultithreading: true 138 | InstanceType: r5d.24xlarge 139 | MaxCount: 300 140 | MinCount: 0 141 | CustomActions: 142 | OnNodeConfigured: 143 | Args: 144 | - 04.configure.disable.anacron.compute.sh 145 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 146 | Iam: 147 | AdditionalIamPolicies: 148 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 149 | S3Access: 150 | - BucketName: '*' 151 | EnableWriteAccess: true 152 | Networking: 153 | AdditionalSecurityGroups: 154 | - ${ADDITIONAL_SG} 155 | PlacementGroup: 156 | Enabled: true 157 | SubnetIds: 158 | - ${PRIVATE_SUBNET_ID} 159 | AssignPublicIp: false 160 | - Name: compute-spot-1 161 | CapacityType: SPOT 162 | ComputeResources: 163 | - Name: c5n-18xlarge 164 | DisableSimultaneousMultithreading: true 165 | Efa: 166 | Enabled: true 167 | InstanceType: c5n.18xlarge 168 | MaxCount: 300 169 | MinCount: 0 170 | - Name: c5-24xlarge 171 | DisableSimultaneousMultithreading: true 172 | InstanceType: c5.24xlarge 173 | MaxCount: 300 174 | MinCount: 0 175 | - Name: m5-24xlarge 176 | DisableSimultaneousMultithreading: true 177 | InstanceType: m5.24xlarge 178 | MaxCount: 300 179 | MinCount: 0 180 | - Name: r5-24xlarge 181 | DisableSimultaneousMultithreading: true 182 | InstanceType: r5.24xlarge 183 | MaxCount: 300 184 | MinCount: 0 185 | CustomActions: 186 | OnNodeConfigured: 187 | Args: 188 | - 04.configure.disable.anacron.compute.sh 189 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 190 | Iam: 191 | AdditionalIamPolicies: 192 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 193 | S3Access: 194 | - BucketName: '*' 195 | EnableWriteAccess: true 196 | Networking: 197 | AdditionalSecurityGroups: 198 | - ${ADDITIONAL_SG} 199 | PlacementGroup: 200 | Enabled: true 201 | SubnetIds: 202 | - ${PRIVATE_SUBNET_ID} 203 | AssignPublicIp: false 204 | - Name: dcv-gpu 205 | CapacityType: ONDEMAND 206 | ComputeResources: 207 | - Name: g4dn-xlarge 208 | DisableSimultaneousMultithreading: true 209 | InstanceType: g4dn.xlarge 210 | MaxCount: 100 211 | MinCount: 0 212 | - Name: g4dn-2xlarge 213 | DisableSimultaneousMultithreading: true 214 | InstanceType: g4dn.2xlarge 215 | MaxCount: 100 216 | MinCount: 0 217 | - Name: g4dn-4xlarge 218 | DisableSimultaneousMultithreading: true 219 | InstanceType: g4dn.4xlarge 220 | MaxCount: 100 221 | MinCount: 0 222 | - Name: g4dn-8xlarge 223 | DisableSimultaneousMultithreading: true 224 | InstanceType: g4dn.8xlarge 225 | MaxCount: 100 226 | MinCount: 0 227 | - Name: g4dn-16xlarge 228 | DisableSimultaneousMultithreading: true 229 | InstanceType: g4dn.16xlarge 230 | MaxCount: 100 231 | MinCount: 0 232 | CustomActions: 233 | OnNodeConfigured: 234 | Args: 235 | - 04.configure.disable.anacron.compute.sh 236 | - 25.install.dcv-server.gpu.sh 237 | - 26.configure.dcv.alb.compute.sh 238 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 239 | Iam: 240 | AdditionalIamPolicies: 241 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 242 | 
S3Access: 243 | - BucketName: '*' 244 | EnableWriteAccess: true 245 | Networking: 246 | AdditionalSecurityGroups: 247 | - ${ADDITIONAL_SG} 248 | PlacementGroup: 249 | Enabled: true 250 | SubnetIds: 251 | - ${PRIVATE_SUBNET_ID} 252 | AssignPublicIp: false 253 | - Name: dcv 254 | CapacityType: ONDEMAND 255 | ComputeResources: 256 | - Name: m5-24xlarge 257 | DisableSimultaneousMultithreading: true 258 | InstanceType: m5.24xlarge 259 | MaxCount: 200 260 | MinCount: 0 261 | - Name: m5-2xlarge 262 | DisableSimultaneousMultithreading: true 263 | InstanceType: m5.2xlarge 264 | MaxCount: 200 265 | MinCount: 0 266 | - Name: m5-8xlarge 267 | DisableSimultaneousMultithreading: true 268 | InstanceType: m5.8xlarge 269 | MaxCount: 200 270 | MinCount: 0 271 | - Name: m5-16xlarge 272 | DisableSimultaneousMultithreading: true 273 | InstanceType: m5.16xlarge 274 | MaxCount: 200 275 | MinCount: 0 276 | - Name: m5-xlarge 277 | DisableSimultaneousMultithreading: true 278 | InstanceType: m5.xlarge 279 | MaxCount: 200 280 | MinCount: 0 281 | CustomActions: 282 | OnNodeConfigured: 283 | Args: 284 | - 04.configure.disable.anacron.compute.sh 285 | - 25.install.dcv-server.compute.sh 286 | - 26.configure.dcv.alb.compute.sh 287 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 288 | Iam: 289 | AdditionalIamPolicies: 290 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 291 | S3Access: 292 | - BucketName: '*' 293 | EnableWriteAccess: true 294 | Networking: 295 | AdditionalSecurityGroups: 296 | - ${ADDITIONAL_SG} 297 | PlacementGroup: 298 | Enabled: true 299 | SubnetIds: 300 | - ${PRIVATE_SUBNET_ID} 301 | AssignPublicIp: false 302 | SharedStorage: 303 | ${FSX} 304 | Tags: 305 | - Key: EnginFrame 306 | Value: 'true' 307 | - Key: 1Click-HPC 308 | Value: 'true' 309 | - Key: 1Click-HPC-version 310 | Value: '0.4' -------------------------------------------------------------------------------- /parallelcluster/config.ap-south-1.sample.yaml: -------------------------------------------------------------------------------- 1 | HeadNode: 2 | LocalStorage: 3 | RootVolume: 4 | Size: 100 5 | Encrypted: false 6 | VolumeType: gp3 7 | Iops: 3000 8 | Throughput: 250 9 | CustomActions: 10 | OnNodeConfigured: 11 | Args: 12 | - 04.configure.slurm.AllOrNothing.headnode.sh 13 | - 07.configure.slurm.tagging.headnode.sh 14 | - 10.install.enginframe.headnode.sh 15 | - 12.configure.enginframe.alb.headnode.sh 16 | - 20.install.dcv.slurm.headnode.sh 17 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 18 | Iam: 19 | AdditionalIamPolicies: 20 | - Policy: arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess 21 | - Policy: arn:aws:iam::aws:policy/SecretsManagerReadWrite 22 | S3Access: 23 | - BucketName: '*' 24 | EnableWriteAccess: true 25 | InstanceType: c5.4xlarge 26 | Networking: 27 | AdditionalSecurityGroups: 28 | - ${ADDITIONAL_SG} 29 | - ${DB_SG} 30 | ElasticIp: true 31 | SubnetId: ${PRIVATE_SUBNET_ID} 32 | Ssh: 33 | KeyName: ${KEY_PAIR} 34 | Imds: 35 | Secured: false 36 | Image: 37 | Os: alinux2 38 | DirectoryService: 39 | DomainName: dc=corp,dc=pcluster,dc=com 40 | DomainAddr: ${NLB_PUBLIC_DNS_NAME} 41 | PasswordSecretArn: ${SECRET_ARN} 42 | DomainReadOnlyUser: cn=ReadOnlyUser,ou=Users,ou=CORP,dc=corp,dc=pcluster,dc=com 43 | LdapTlsReqCert: never 44 | Region: ${AWS_REGION_NAME} 45 | Scheduling: 46 | Scheduler: slurm 47 | SlurmSettings: 48 | ScaledownIdletime: 10 49 | EnableMemoryBasedScheduling: true 50 | Database: 51 | Uri: ${SLURM_DB_ENDPOINT} 52 | UserName: 'admin' 53 | 
PasswordSecretArn: ${SECRET_ARN} 54 | SlurmQueues: 55 | - Name: compute-od-1 56 | CapacityType: ONDEMAND 57 | ComputeResources: 58 | - Name: c6i-32xlarge 59 | DisableSimultaneousMultithreading: true 60 | Efa: 61 | Enabled: true 62 | InstanceType: c6i.32xlarge 63 | MaxCount: 300 64 | MinCount: 0 65 | - Name: m6i-32xlarge 66 | DisableSimultaneousMultithreading: true 67 | Efa: 68 | Enabled: true 69 | InstanceType: m6i.32xlarge 70 | MaxCount: 300 71 | MinCount: 0 72 | - Name: r6i-32xlarge 73 | DisableSimultaneousMultithreading: true 74 | Efa: 75 | Enabled: true 76 | InstanceType: r6i.32xlarge 77 | MaxCount: 300 78 | MinCount: 0 79 | CustomActions: 80 | OnNodeConfigured: 81 | Args: 82 | - 04.configure.disable.anacron.compute.sh 83 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 84 | Iam: 85 | AdditionalIamPolicies: 86 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 87 | S3Access: 88 | - BucketName: '*' 89 | EnableWriteAccess: true 90 | Networking: 91 | AdditionalSecurityGroups: 92 | - ${ADDITIONAL_SG} 93 | PlacementGroup: 94 | Enabled: true 95 | SubnetIds: 96 | - ${PRIVATE_SUBNET_ID} 97 | AssignPublicIp: false 98 | - Name: compute-od-2 99 | CapacityType: ONDEMAND 100 | ComputeResources: 101 | - Name: c5n-18xlarge 102 | DisableSimultaneousMultithreading: true 103 | Efa: 104 | Enabled: true 105 | InstanceType: c5n.18xlarge 106 | MaxCount: 300 107 | MinCount: 0 108 | - Name: r5n-24xlarge 109 | DisableSimultaneousMultithreading: true 110 | Efa: 111 | Enabled: true 112 | InstanceType: r5n.24xlarge 113 | MaxCount: 300 114 | MinCount: 0 115 | CustomActions: 116 | OnNodeConfigured: 117 | Args: 118 | - 04.configure.disable.anacron.compute.sh 119 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 120 | Iam: 121 | AdditionalIamPolicies: 122 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 123 | S3Access: 124 | - BucketName: '*' 125 | EnableWriteAccess: true 126 | Networking: 127 | AdditionalSecurityGroups: 128 | - ${ADDITIONAL_SG} 129 | PlacementGroup: 130 | Enabled: true 131 | SubnetIds: 132 | - ${PRIVATE_SUBNET_ID} 133 | AssignPublicIp: false 134 | - Name: compute-od-3 135 | CapacityType: ONDEMAND 136 | ComputeResources: 137 | - Name: c5-24xlarge 138 | DisableSimultaneousMultithreading: true 139 | InstanceType: c5.24xlarge 140 | MaxCount: 300 141 | MinCount: 0 142 | - Name: m5-24xlarge 143 | DisableSimultaneousMultithreading: true 144 | InstanceType: m5.24xlarge 145 | MaxCount: 300 146 | MinCount: 0 147 | - Name: r5-24xlarge 148 | DisableSimultaneousMultithreading: true 149 | InstanceType: r5.24xlarge 150 | MaxCount: 300 151 | MinCount: 0 152 | CustomActions: 153 | OnNodeConfigured: 154 | Args: 155 | - 04.configure.disable.anacron.compute.sh 156 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 157 | Iam: 158 | AdditionalIamPolicies: 159 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 160 | S3Access: 161 | - BucketName: '*' 162 | EnableWriteAccess: true 163 | Networking: 164 | AdditionalSecurityGroups: 165 | - ${ADDITIONAL_SG} 166 | PlacementGroup: 167 | Enabled: true 168 | SubnetIds: 169 | - ${PRIVATE_SUBNET_ID} 170 | AssignPublicIp: false 171 | - Name: compute-od-4 172 | CapacityType: ONDEMAND 173 | ComputeResources: 174 | - Name: c5d-24xlarge 175 | DisableSimultaneousMultithreading: true 176 | InstanceType: c5d.24xlarge 177 | MaxCount: 300 178 | MinCount: 0 179 | - Name: m5d-24xlarge 180 | DisableSimultaneousMultithreading: true 181 | InstanceType: m5d.24xlarge 182 | 
MaxCount: 300 183 | MinCount: 0 184 | - Name: r5d-24xlarge 185 | DisableSimultaneousMultithreading: true 186 | InstanceType: r5d.24xlarge 187 | MaxCount: 300 188 | MinCount: 0 189 | CustomActions: 190 | OnNodeConfigured: 191 | Args: 192 | - 04.configure.disable.anacron.compute.sh 193 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 194 | Iam: 195 | AdditionalIamPolicies: 196 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 197 | S3Access: 198 | - BucketName: '*' 199 | EnableWriteAccess: true 200 | Networking: 201 | AdditionalSecurityGroups: 202 | - ${ADDITIONAL_SG} 203 | PlacementGroup: 204 | Enabled: true 205 | SubnetIds: 206 | - ${PRIVATE_SUBNET_ID} 207 | AssignPublicIp: false 208 | - Name: compute-spot-1 209 | CapacityType: SPOT 210 | ComputeResources: 211 | - Name: c6i-32xlarge 212 | DisableSimultaneousMultithreading: true 213 | Efa: 214 | Enabled: true 215 | InstanceType: c6i.32xlarge 216 | MaxCount: 300 217 | MinCount: 0 218 | - Name: m6i-32xlarge 219 | DisableSimultaneousMultithreading: true 220 | Efa: 221 | Enabled: true 222 | InstanceType: m6i.32xlarge 223 | MaxCount: 300 224 | MinCount: 0 225 | - Name: r6i-32xlarge 226 | DisableSimultaneousMultithreading: true 227 | Efa: 228 | Enabled: true 229 | InstanceType: r6i.32xlarge 230 | MaxCount: 300 231 | MinCount: 0 232 | CustomActions: 233 | OnNodeConfigured: 234 | Args: 235 | - 04.configure.disable.anacron.compute.sh 236 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 237 | Iam: 238 | AdditionalIamPolicies: 239 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 240 | S3Access: 241 | - BucketName: '*' 242 | EnableWriteAccess: true 243 | Networking: 244 | AdditionalSecurityGroups: 245 | - ${ADDITIONAL_SG} 246 | PlacementGroup: 247 | Enabled: true 248 | SubnetIds: 249 | - ${PRIVATE_SUBNET_ID} 250 | AssignPublicIp: false 251 | - Name: compute-spot-2 252 | CapacityType: SPOT 253 | ComputeResources: 254 | - Name: c5n-18xlarge 255 | DisableSimultaneousMultithreading: true 256 | Efa: 257 | Enabled: true 258 | InstanceType: c5n.18xlarge 259 | MaxCount: 300 260 | MinCount: 0 261 | - Name: c5-24xlarge 262 | DisableSimultaneousMultithreading: true 263 | InstanceType: c5.24xlarge 264 | MaxCount: 300 265 | MinCount: 0 266 | - Name: m5-24xlarge 267 | DisableSimultaneousMultithreading: true 268 | InstanceType: m5.24xlarge 269 | MaxCount: 300 270 | MinCount: 0 271 | - Name: r5-24xlarge 272 | DisableSimultaneousMultithreading: true 273 | InstanceType: r5.24xlarge 274 | MaxCount: 300 275 | MinCount: 0 276 | - Name: z1d-12xlarge 277 | DisableSimultaneousMultithreading: true 278 | InstanceType: z1d.12xlarge 279 | MaxCount: 100 280 | MinCount: 0 281 | CustomActions: 282 | OnNodeConfigured: 283 | Args: 284 | - 04.configure.disable.anacron.compute.sh 285 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 286 | Iam: 287 | AdditionalIamPolicies: 288 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 289 | S3Access: 290 | - BucketName: '*' 291 | EnableWriteAccess: true 292 | Networking: 293 | AdditionalSecurityGroups: 294 | - ${ADDITIONAL_SG} 295 | PlacementGroup: 296 | Enabled: true 297 | SubnetIds: 298 | - ${PRIVATE_SUBNET_ID} 299 | AssignPublicIp: false 300 | - Name: dcv-gpu 301 | CapacityType: ONDEMAND 302 | ComputeResources: 303 | - Name: g4dn-xlarge 304 | DisableSimultaneousMultithreading: true 305 | InstanceType: g4dn.xlarge 306 | MaxCount: 100 307 | MinCount: 0 308 | - Name: g4dn-2xlarge 309 | DisableSimultaneousMultithreading: 
true 310 | InstanceType: g4dn.2xlarge 311 | MaxCount: 100 312 | MinCount: 0 313 | - Name: g4dn-4xlarge 314 | DisableSimultaneousMultithreading: true 315 | InstanceType: g4dn.4xlarge 316 | MaxCount: 100 317 | MinCount: 0 318 | - Name: g4dn-8xlarge 319 | DisableSimultaneousMultithreading: true 320 | InstanceType: g4dn.8xlarge 321 | MaxCount: 100 322 | MinCount: 0 323 | - Name: g4dn-16xlarge 324 | DisableSimultaneousMultithreading: true 325 | InstanceType: g4dn.16xlarge 326 | MaxCount: 100 327 | MinCount: 0 328 | CustomActions: 329 | OnNodeConfigured: 330 | Args: 331 | - 04.configure.disable.anacron.compute.sh 332 | - 25.install.dcv-server.gpu.sh 333 | - 26.configure.dcv.alb.compute.sh 334 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 335 | Iam: 336 | AdditionalIamPolicies: 337 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 338 | S3Access: 339 | - BucketName: '*' 340 | EnableWriteAccess: true 341 | Networking: 342 | AdditionalSecurityGroups: 343 | - ${ADDITIONAL_SG} 344 | PlacementGroup: 345 | Enabled: true 346 | SubnetIds: 347 | - ${PRIVATE_SUBNET_ID} 348 | AssignPublicIp: false 349 | - Name: dcv 350 | CapacityType: ONDEMAND 351 | ComputeResources: 352 | - Name: m6i-32xlarge 353 | DisableSimultaneousMultithreading: true 354 | InstanceType: m6i.32xlarge 355 | Efa: 356 | Enabled: true 357 | MaxCount: 200 358 | MinCount: 0 359 | - Name: m6i-2xlarge 360 | DisableSimultaneousMultithreading: true 361 | InstanceType: m6i.2xlarge 362 | MaxCount: 200 363 | MinCount: 0 364 | - Name: m6i-8xlarge 365 | DisableSimultaneousMultithreading: true 366 | InstanceType: m6i.8xlarge 367 | MaxCount: 200 368 | MinCount: 0 369 | - Name: m6i-16xlarge 370 | DisableSimultaneousMultithreading: true 371 | InstanceType: m6i.16xlarge 372 | MaxCount: 200 373 | MinCount: 0 374 | - Name: m6i-xlarge 375 | DisableSimultaneousMultithreading: true 376 | InstanceType: m6i.xlarge 377 | MaxCount: 200 378 | MinCount: 0 379 | CustomActions: 380 | OnNodeConfigured: 381 | Args: 382 | - 04.configure.disable.anacron.compute.sh 383 | - 25.install.dcv-server.compute.sh 384 | - 26.configure.dcv.alb.compute.sh 385 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 386 | Iam: 387 | AdditionalIamPolicies: 388 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 389 | S3Access: 390 | - BucketName: '*' 391 | EnableWriteAccess: true 392 | Networking: 393 | AdditionalSecurityGroups: 394 | - ${ADDITIONAL_SG} 395 | PlacementGroup: 396 | Enabled: true 397 | SubnetIds: 398 | - ${PRIVATE_SUBNET_ID} 399 | AssignPublicIp: false 400 | SharedStorage: 401 | ${FSX} 402 | Tags: 403 | - Key: EnginFrame 404 | Value: 'true' 405 | - Key: 1Click-HPC 406 | Value: 'true' 407 | - Key: 1Click-HPC-version 408 | Value: '0.4' -------------------------------------------------------------------------------- /parallelcluster/config.ca-central-1.sample.yaml: -------------------------------------------------------------------------------- 1 | HeadNode: 2 | LocalStorage: 3 | RootVolume: 4 | Size: 100 5 | Encrypted: false 6 | VolumeType: gp3 7 | Iops: 3000 8 | Throughput: 250 9 | CustomActions: 10 | OnNodeConfigured: 11 | Args: 12 | - 04.configure.slurm.AllOrNothing.headnode.sh 13 | - 07.configure.slurm.tagging.headnode.sh 14 | - 10.install.enginframe.headnode.sh 15 | - 12.configure.enginframe.alb.headnode.sh 16 | - 20.install.dcv.slurm.headnode.sh 17 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 18 | Iam: 19 | AdditionalIamPolicies: 20 | - Policy: 
arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess 21 | - Policy: arn:aws:iam::aws:policy/SecretsManagerReadWrite 22 | S3Access: 23 | - BucketName: '*' 24 | EnableWriteAccess: true 25 | InstanceType: c5.4xlarge 26 | Networking: 27 | AdditionalSecurityGroups: 28 | - ${ADDITIONAL_SG} 29 | - ${DB_SG} 30 | ElasticIp: true 31 | SubnetId: ${PRIVATE_SUBNET_ID} 32 | Ssh: 33 | KeyName: ${KEY_PAIR} 34 | Imds: 35 | Secured: false 36 | Image: 37 | Os: alinux2 38 | DirectoryService: 39 | DomainName: dc=corp,dc=pcluster,dc=com 40 | DomainAddr: ${NLB_PUBLIC_DNS_NAME} 41 | PasswordSecretArn: ${SECRET_ARN} 42 | DomainReadOnlyUser: cn=ReadOnlyUser,ou=Users,ou=CORP,dc=corp,dc=pcluster,dc=com 43 | LdapTlsReqCert: never 44 | Region: ${AWS_REGION_NAME} 45 | Scheduling: 46 | Scheduler: slurm 47 | SlurmSettings: 48 | ScaledownIdletime: 10 49 | EnableMemoryBasedScheduling: true 50 | Database: 51 | Uri: ${SLURM_DB_ENDPOINT} 52 | UserName: 'admin' 53 | PasswordSecretArn: ${SECRET_ARN} 54 | SlurmQueues: 55 | - Name: compute-od-1 56 | CapacityType: ONDEMAND 57 | ComputeResources: 58 | - Name: c6i-32xlarge 59 | DisableSimultaneousMultithreading: true 60 | Efa: 61 | Enabled: true 62 | InstanceType: c6i.32xlarge 63 | MaxCount: 300 64 | MinCount: 0 65 | CustomActions: 66 | OnNodeConfigured: 67 | Args: 68 | - 04.configure.disable.anacron.compute.sh 69 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 70 | Iam: 71 | AdditionalIamPolicies: 72 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 73 | S3Access: 74 | - BucketName: '*' 75 | EnableWriteAccess: true 76 | Networking: 77 | AdditionalSecurityGroups: 78 | - ${ADDITIONAL_SG} 79 | PlacementGroup: 80 | Enabled: true 81 | SubnetIds: 82 | - ${PRIVATE_SUBNET_ID} 83 | AssignPublicIp: false 84 | - Name: compute-od-2 85 | CapacityType: ONDEMAND 86 | ComputeResources: 87 | - Name: c5n-18xlarge 88 | DisableSimultaneousMultithreading: true 89 | Efa: 90 | Enabled: true 91 | InstanceType: c5n.18xlarge 92 | MaxCount: 300 93 | MinCount: 0 94 | - Name: r5n-24xlarge 95 | DisableSimultaneousMultithreading: true 96 | Efa: 97 | Enabled: true 98 | InstanceType: r5n.24xlarge 99 | MaxCount: 300 100 | MinCount: 0 101 | CustomActions: 102 | OnNodeConfigured: 103 | Args: 104 | - 04.configure.disable.anacron.compute.sh 105 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 106 | Iam: 107 | AdditionalIamPolicies: 108 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 109 | S3Access: 110 | - BucketName: '*' 111 | EnableWriteAccess: true 112 | Networking: 113 | AdditionalSecurityGroups: 114 | - ${ADDITIONAL_SG} 115 | PlacementGroup: 116 | Enabled: true 117 | SubnetIds: 118 | - ${PRIVATE_SUBNET_ID} 119 | AssignPublicIp: false 120 | - Name: compute-od-3 121 | CapacityType: ONDEMAND 122 | ComputeResources: 123 | - Name: c5-24xlarge 124 | DisableSimultaneousMultithreading: true 125 | InstanceType: c5.24xlarge 126 | MaxCount: 300 127 | MinCount: 0 128 | - Name: m5-24xlarge 129 | DisableSimultaneousMultithreading: true 130 | InstanceType: m5.24xlarge 131 | MaxCount: 300 132 | MinCount: 0 133 | - Name: r5-24xlarge 134 | DisableSimultaneousMultithreading: true 135 | InstanceType: r5.24xlarge 136 | MaxCount: 300 137 | MinCount: 0 138 | CustomActions: 139 | OnNodeConfigured: 140 | Args: 141 | - 04.configure.disable.anacron.compute.sh 142 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 143 | Iam: 144 | AdditionalIamPolicies: 145 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 146 | 
S3Access: 147 | - BucketName: '*' 148 | EnableWriteAccess: true 149 | Networking: 150 | AdditionalSecurityGroups: 151 | - ${ADDITIONAL_SG} 152 | PlacementGroup: 153 | Enabled: true 154 | SubnetIds: 155 | - ${PRIVATE_SUBNET_ID} 156 | AssignPublicIp: false 157 | - Name: compute-od-4 158 | CapacityType: ONDEMAND 159 | ComputeResources: 160 | - Name: c5d-24xlarge 161 | DisableSimultaneousMultithreading: true 162 | InstanceType: c5d.24xlarge 163 | MaxCount: 300 164 | MinCount: 0 165 | - Name: m5d-24xlarge 166 | DisableSimultaneousMultithreading: true 167 | InstanceType: m5d.24xlarge 168 | MaxCount: 300 169 | MinCount: 0 170 | - Name: r5d-24xlarge 171 | DisableSimultaneousMultithreading: true 172 | InstanceType: r5d.24xlarge 173 | MaxCount: 300 174 | MinCount: 0 175 | CustomActions: 176 | OnNodeConfigured: 177 | Args: 178 | - 04.configure.disable.anacron.compute.sh 179 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 180 | Iam: 181 | AdditionalIamPolicies: 182 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 183 | S3Access: 184 | - BucketName: '*' 185 | EnableWriteAccess: true 186 | Networking: 187 | AdditionalSecurityGroups: 188 | - ${ADDITIONAL_SG} 189 | PlacementGroup: 190 | Enabled: true 191 | SubnetIds: 192 | - ${PRIVATE_SUBNET_ID} 193 | AssignPublicIp: false 194 | - Name: compute-spot-1 195 | CapacityType: SPOT 196 | ComputeResources: 197 | - Name: c6i-32xlarge 198 | DisableSimultaneousMultithreading: true 199 | Efa: 200 | Enabled: true 201 | InstanceType: c6i.32xlarge 202 | MaxCount: 300 203 | MinCount: 0 204 | CustomActions: 205 | OnNodeConfigured: 206 | Args: 207 | - 04.configure.disable.anacron.compute.sh 208 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 209 | Iam: 210 | AdditionalIamPolicies: 211 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 212 | S3Access: 213 | - BucketName: '*' 214 | EnableWriteAccess: true 215 | Networking: 216 | AdditionalSecurityGroups: 217 | - ${ADDITIONAL_SG} 218 | PlacementGroup: 219 | Enabled: true 220 | SubnetIds: 221 | - ${PRIVATE_SUBNET_ID} 222 | AssignPublicIp: false 223 | - Name: compute-spot-2 224 | CapacityType: SPOT 225 | ComputeResources: 226 | - Name: c5n-18xlarge 227 | DisableSimultaneousMultithreading: true 228 | Efa: 229 | Enabled: true 230 | InstanceType: c5n.18xlarge 231 | MaxCount: 300 232 | MinCount: 0 233 | - Name: c5-24xlarge 234 | DisableSimultaneousMultithreading: true 235 | InstanceType: c5.24xlarge 236 | MaxCount: 300 237 | MinCount: 0 238 | - Name: m5-24xlarge 239 | DisableSimultaneousMultithreading: true 240 | InstanceType: m5.24xlarge 241 | MaxCount: 300 242 | MinCount: 0 243 | - Name: r5-24xlarge 244 | DisableSimultaneousMultithreading: true 245 | InstanceType: r5.24xlarge 246 | MaxCount: 300 247 | MinCount: 0 248 | CustomActions: 249 | OnNodeConfigured: 250 | Args: 251 | - 04.configure.disable.anacron.compute.sh 252 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 253 | Iam: 254 | AdditionalIamPolicies: 255 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 256 | S3Access: 257 | - BucketName: '*' 258 | EnableWriteAccess: true 259 | Networking: 260 | AdditionalSecurityGroups: 261 | - ${ADDITIONAL_SG} 262 | PlacementGroup: 263 | Enabled: true 264 | SubnetIds: 265 | - ${PRIVATE_SUBNET_ID} 266 | AssignPublicIp: false 267 | - Name: dcv-gpu 268 | CapacityType: ONDEMAND 269 | ComputeResources: 270 | - Name: g4dn-xlarge 271 | DisableSimultaneousMultithreading: true 272 | InstanceType: g4dn.xlarge 273 | 
MaxCount: 100 274 | MinCount: 0 275 | - Name: g4dn-2xlarge 276 | DisableSimultaneousMultithreading: true 277 | InstanceType: g4dn.2xlarge 278 | MaxCount: 100 279 | MinCount: 0 280 | - Name: g4dn-4xlarge 281 | DisableSimultaneousMultithreading: true 282 | InstanceType: g4dn.4xlarge 283 | MaxCount: 100 284 | MinCount: 0 285 | - Name: g4dn-8xlarge 286 | DisableSimultaneousMultithreading: true 287 | InstanceType: g4dn.8xlarge 288 | MaxCount: 100 289 | MinCount: 0 290 | - Name: g4dn-16xlarge 291 | DisableSimultaneousMultithreading: true 292 | InstanceType: g4dn.16xlarge 293 | MaxCount: 100 294 | MinCount: 0 295 | CustomActions: 296 | OnNodeConfigured: 297 | Args: 298 | - 04.configure.disable.anacron.compute.sh 299 | - 25.install.dcv-server.gpu.sh 300 | - 26.configure.dcv.alb.compute.sh 301 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 302 | Iam: 303 | AdditionalIamPolicies: 304 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 305 | S3Access: 306 | - BucketName: '*' 307 | EnableWriteAccess: true 308 | Networking: 309 | AdditionalSecurityGroups: 310 | - ${ADDITIONAL_SG} 311 | PlacementGroup: 312 | Enabled: true 313 | SubnetIds: 314 | - ${PRIVATE_SUBNET_ID} 315 | AssignPublicIp: false 316 | - Name: dcv 317 | CapacityType: ONDEMAND 318 | ComputeResources: 319 | - Name: c6i-32xlarge 320 | DisableSimultaneousMultithreading: true 321 | InstanceType: c6i.32xlarge 322 | Efa: 323 | Enabled: true 324 | MaxCount: 200 325 | MinCount: 0 326 | - Name: c6i-2xlarge 327 | DisableSimultaneousMultithreading: true 328 | InstanceType: c6i.2xlarge 329 | MaxCount: 200 330 | MinCount: 0 331 | - Name: c6i-8xlarge 332 | DisableSimultaneousMultithreading: true 333 | InstanceType: c6i.8xlarge 334 | MaxCount: 200 335 | MinCount: 0 336 | - Name: c6i-16xlarge 337 | DisableSimultaneousMultithreading: true 338 | InstanceType: c6i.16xlarge 339 | MaxCount: 200 340 | MinCount: 0 341 | - Name: c6i-xlarge 342 | DisableSimultaneousMultithreading: true 343 | InstanceType: c6i.xlarge 344 | MaxCount: 200 345 | MinCount: 0 346 | CustomActions: 347 | OnNodeConfigured: 348 | Args: 349 | - 04.configure.disable.anacron.compute.sh 350 | - 25.install.dcv-server.compute.sh 351 | - 26.configure.dcv.alb.compute.sh 352 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 353 | Iam: 354 | AdditionalIamPolicies: 355 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 356 | S3Access: 357 | - BucketName: '*' 358 | EnableWriteAccess: true 359 | Networking: 360 | AdditionalSecurityGroups: 361 | - ${ADDITIONAL_SG} 362 | PlacementGroup: 363 | Enabled: true 364 | SubnetIds: 365 | - ${PRIVATE_SUBNET_ID} 366 | AssignPublicIp: false 367 | SharedStorage: 368 | ${FSX} 369 | Tags: 370 | - Key: EnginFrame 371 | Value: 'true' 372 | - Key: 1Click-HPC 373 | Value: 'true' 374 | - Key: 1Click-HPC-version 375 | Value: '0.4' -------------------------------------------------------------------------------- /parallelcluster/config.eu-north-1.sample.yaml: -------------------------------------------------------------------------------- 1 | HeadNode: 2 | LocalStorage: 3 | RootVolume: 4 | Size: 100 5 | Encrypted: false 6 | VolumeType: gp3 7 | Iops: 3000 8 | Throughput: 250 9 | CustomActions: 10 | OnNodeConfigured: 11 | Args: 12 | - 04.configure.slurm.AllOrNothing.headnode.sh 13 | - 07.configure.slurm.tagging.headnode.sh 14 | - 10.install.enginframe.headnode.sh 15 | - 12.configure.enginframe.alb.headnode.sh 16 | - 20.install.dcv.slurm.headnode.sh 17 | Script: 
s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 18 | Iam: 19 | AdditionalIamPolicies: 20 | - Policy: arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess 21 | - Policy: arn:aws:iam::aws:policy/SecretsManagerReadWrite 22 | S3Access: 23 | - BucketName: '*' 24 | EnableWriteAccess: true 25 | InstanceType: c5.4xlarge 26 | Networking: 27 | AdditionalSecurityGroups: 28 | - ${ADDITIONAL_SG} 29 | - ${DB_SG} 30 | ElasticIp: true 31 | SubnetId: ${PRIVATE_SUBNET_ID} 32 | Ssh: 33 | KeyName: ${KEY_PAIR} 34 | Imds: 35 | Secured: false 36 | Image: 37 | Os: alinux2 38 | DirectoryService: 39 | DomainName: dc=corp,dc=pcluster,dc=com 40 | DomainAddr: ${NLB_PUBLIC_DNS_NAME} 41 | PasswordSecretArn: ${SECRET_ARN} 42 | DomainReadOnlyUser: cn=ReadOnlyUser,ou=Users,ou=CORP,dc=corp,dc=pcluster,dc=com 43 | LdapTlsReqCert: never 44 | Region: ${AWS_REGION_NAME} 45 | Scheduling: 46 | Scheduler: slurm 47 | SlurmSettings: 48 | ScaledownIdletime: 10 49 | EnableMemoryBasedScheduling: true 50 | Database: 51 | Uri: ${SLURM_DB_ENDPOINT} 52 | UserName: 'admin' 53 | PasswordSecretArn: ${SECRET_ARN} 54 | SlurmQueues: 55 | - Name: compute-od-1 56 | CapacityType: ONDEMAND 57 | ComputeResources: 58 | - Name: hpc6a-48xlarge 59 | DisableSimultaneousMultithreading: true 60 | Efa: 61 | Enabled: true 62 | InstanceType: hpc6a.48xlarge 63 | MaxCount: 400 64 | MinCount: 0 65 | - Name: hpc6id-32xlarge 66 | DisableSimultaneousMultithreading: true 67 | Efa: 68 | Enabled: true 69 | InstanceType: hpc6id.32xlarge 70 | MaxCount: 300 71 | MinCount: 0 72 | - Name: c5n-18xlarge 73 | DisableSimultaneousMultithreading: true 74 | Efa: 75 | Enabled: true 76 | InstanceType: c5n.18xlarge 77 | MaxCount: 300 78 | MinCount: 0 79 | - Name: r5n-24xlarge 80 | DisableSimultaneousMultithreading: true 81 | Efa: 82 | Enabled: true 83 | InstanceType: r5n.24xlarge 84 | MaxCount: 300 85 | MinCount: 0 86 | CustomActions: 87 | OnNodeConfigured: 88 | Args: 89 | - 04.configure.disable.anacron.compute.sh 90 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 91 | Iam: 92 | AdditionalIamPolicies: 93 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 94 | S3Access: 95 | - BucketName: '*' 96 | EnableWriteAccess: true 97 | Networking: 98 | AdditionalSecurityGroups: 99 | - ${ADDITIONAL_SG} 100 | PlacementGroup: 101 | Enabled: true 102 | SubnetIds: 103 | - ${PRIVATE_SUBNET_ID} 104 | AssignPublicIp: false 105 | - Name: compute-od-2 106 | CapacityType: ONDEMAND 107 | ComputeResources: 108 | - Name: c5-24xlarge 109 | DisableSimultaneousMultithreading: true 110 | InstanceType: c5.24xlarge 111 | MaxCount: 300 112 | MinCount: 0 113 | - Name: m5-24xlarge 114 | DisableSimultaneousMultithreading: true 115 | InstanceType: m5.24xlarge 116 | MaxCount: 300 117 | MinCount: 0 118 | - Name: r5-24xlarge 119 | DisableSimultaneousMultithreading: true 120 | InstanceType: r5.24xlarge 121 | MaxCount: 300 122 | MinCount: 0 123 | CustomActions: 124 | OnNodeConfigured: 125 | Args: 126 | - 04.configure.disable.anacron.compute.sh 127 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 128 | Iam: 129 | AdditionalIamPolicies: 130 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 131 | S3Access: 132 | - BucketName: '*' 133 | EnableWriteAccess: true 134 | Networking: 135 | AdditionalSecurityGroups: 136 | - ${ADDITIONAL_SG} 137 | PlacementGroup: 138 | Enabled: true 139 | SubnetIds: 140 | - ${PRIVATE_SUBNET_ID} 141 | AssignPublicIp: false 142 | - Name: compute-od-3 143 | CapacityType: ONDEMAND 144 | ComputeResources: 145 | - 
Name: c5d-24xlarge 146 | DisableSimultaneousMultithreading: true 147 | InstanceType: c5d.24xlarge 148 | MaxCount: 300 149 | MinCount: 0 150 | - Name: m5d-24xlarge 151 | DisableSimultaneousMultithreading: true 152 | InstanceType: m5d.24xlarge 153 | MaxCount: 300 154 | MinCount: 0 155 | - Name: r5d-24xlarge 156 | DisableSimultaneousMultithreading: true 157 | InstanceType: r5d.24xlarge 158 | MaxCount: 300 159 | MinCount: 0 160 | CustomActions: 161 | OnNodeConfigured: 162 | Args: 163 | - 04.configure.disable.anacron.compute.sh 164 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 165 | Iam: 166 | AdditionalIamPolicies: 167 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 168 | S3Access: 169 | - BucketName: '*' 170 | EnableWriteAccess: true 171 | Networking: 172 | AdditionalSecurityGroups: 173 | - ${ADDITIONAL_SG} 174 | PlacementGroup: 175 | Enabled: true 176 | SubnetIds: 177 | - ${PRIVATE_SUBNET_ID} 178 | AssignPublicIp: false 179 | - Name: compute-spot-1 180 | CapacityType: SPOT 181 | ComputeResources: 182 | - Name: c5n-18xlarge 183 | DisableSimultaneousMultithreading: true 184 | Efa: 185 | Enabled: true 186 | InstanceType: c5n.18xlarge 187 | MaxCount: 300 188 | MinCount: 0 189 | - Name: c5-24xlarge 190 | DisableSimultaneousMultithreading: true 191 | InstanceType: c5.24xlarge 192 | MaxCount: 300 193 | MinCount: 0 194 | - Name: m5-24xlarge 195 | DisableSimultaneousMultithreading: true 196 | InstanceType: m5.24xlarge 197 | MaxCount: 300 198 | MinCount: 0 199 | - Name: r5-24xlarge 200 | DisableSimultaneousMultithreading: true 201 | InstanceType: r5.24xlarge 202 | MaxCount: 300 203 | MinCount: 0 204 | CustomActions: 205 | OnNodeConfigured: 206 | Args: 207 | - 04.configure.disable.anacron.compute.sh 208 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 209 | Iam: 210 | AdditionalIamPolicies: 211 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 212 | S3Access: 213 | - BucketName: '*' 214 | EnableWriteAccess: true 215 | Networking: 216 | AdditionalSecurityGroups: 217 | - ${ADDITIONAL_SG} 218 | PlacementGroup: 219 | Enabled: true 220 | SubnetIds: 221 | - ${PRIVATE_SUBNET_ID} 222 | AssignPublicIp: false 223 | - Name: dcv-gpu 224 | CapacityType: ONDEMAND 225 | ComputeResources: 226 | - Name: g4dn-xlarge 227 | DisableSimultaneousMultithreading: true 228 | InstanceType: g4dn.xlarge 229 | MaxCount: 100 230 | MinCount: 0 231 | - Name: g4dn-2xlarge 232 | DisableSimultaneousMultithreading: true 233 | InstanceType: g4dn.2xlarge 234 | MaxCount: 100 235 | MinCount: 0 236 | - Name: g4dn-4xlarge 237 | DisableSimultaneousMultithreading: true 238 | InstanceType: g4dn.4xlarge 239 | MaxCount: 100 240 | MinCount: 0 241 | - Name: g4dn-8xlarge 242 | DisableSimultaneousMultithreading: true 243 | InstanceType: g4dn.8xlarge 244 | MaxCount: 100 245 | MinCount: 0 246 | - Name: g4dn-16xlarge 247 | DisableSimultaneousMultithreading: true 248 | InstanceType: g4dn.16xlarge 249 | MaxCount: 100 250 | MinCount: 0 251 | CustomActions: 252 | OnNodeConfigured: 253 | Args: 254 | - 04.configure.disable.anacron.compute.sh 255 | - 25.install.dcv-server.gpu.sh 256 | - 26.configure.dcv.alb.compute.sh 257 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 258 | Iam: 259 | AdditionalIamPolicies: 260 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 261 | S3Access: 262 | - BucketName: '*' 263 | EnableWriteAccess: true 264 | Networking: 265 | AdditionalSecurityGroups: 266 | - ${ADDITIONAL_SG} 267 | PlacementGroup: 268 | 
Enabled: true 269 | SubnetIds: 270 | - ${PRIVATE_SUBNET_ID} 271 | AssignPublicIp: false 272 | - Name: dcv 273 | CapacityType: ONDEMAND 274 | ComputeResources: 275 | - Name: m5-24xlarge 276 | DisableSimultaneousMultithreading: true 277 | InstanceType: m5.24xlarge 278 | MaxCount: 200 279 | MinCount: 0 280 | - Name: m5-2xlarge 281 | DisableSimultaneousMultithreading: true 282 | InstanceType: m5.2xlarge 283 | MaxCount: 200 284 | MinCount: 0 285 | - Name: m5-8xlarge 286 | DisableSimultaneousMultithreading: true 287 | InstanceType: m5.8xlarge 288 | MaxCount: 200 289 | MinCount: 0 290 | - Name: m5-16xlarge 291 | DisableSimultaneousMultithreading: true 292 | InstanceType: m5.16xlarge 293 | MaxCount: 200 294 | MinCount: 0 295 | - Name: m5-xlarge 296 | DisableSimultaneousMultithreading: true 297 | InstanceType: m5.xlarge 298 | MaxCount: 200 299 | MinCount: 0 300 | CustomActions: 301 | OnNodeConfigured: 302 | Args: 303 | - 04.configure.disable.anacron.compute.sh 304 | - 25.install.dcv-server.compute.sh 305 | - 26.configure.dcv.alb.compute.sh 306 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 307 | Iam: 308 | AdditionalIamPolicies: 309 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 310 | S3Access: 311 | - BucketName: '*' 312 | EnableWriteAccess: true 313 | Networking: 314 | AdditionalSecurityGroups: 315 | - ${ADDITIONAL_SG} 316 | PlacementGroup: 317 | Enabled: true 318 | SubnetIds: 319 | - ${PRIVATE_SUBNET_ID} 320 | AssignPublicIp: false 321 | SharedStorage: 322 | ${FSX} 323 | Tags: 324 | - Key: EnginFrame 325 | Value: 'true' 326 | - Key: 1Click-HPC 327 | Value: 'true' 328 | - Key: 1Click-HPC-version 329 | Value: '0.4' -------------------------------------------------------------------------------- /parallelcluster/config.eu-south-1.sample.yaml: -------------------------------------------------------------------------------- 1 | HeadNode: 2 | LocalStorage: 3 | RootVolume: 4 | Size: 100 5 | Encrypted: false 6 | VolumeType: gp3 7 | Iops: 3000 8 | Throughput: 250 9 | CustomActions: 10 | OnNodeConfigured: 11 | Args: 12 | - 04.configure.slurm.AllOrNothing.headnode.sh 13 | - 07.configure.slurm.tagging.headnode.sh 14 | - 10.install.enginframe.headnode.sh 15 | - 12.configure.enginframe.alb.headnode.sh 16 | - 20.install.dcv.slurm.headnode.sh 17 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 18 | Iam: 19 | AdditionalIamPolicies: 20 | - Policy: arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess 21 | - Policy: arn:aws:iam::aws:policy/SecretsManagerReadWrite 22 | S3Access: 23 | - BucketName: '*' 24 | EnableWriteAccess: true 25 | InstanceType: c5.4xlarge 26 | Networking: 27 | AdditionalSecurityGroups: 28 | - ${ADDITIONAL_SG} 29 | - ${DB_SG} 30 | ElasticIp: true 31 | SubnetId: ${PRIVATE_SUBNET_ID} 32 | Ssh: 33 | KeyName: ${KEY_PAIR} 34 | Imds: 35 | Secured: false 36 | Image: 37 | Os: alinux2 38 | DirectoryService: 39 | DomainName: dc=corp,dc=pcluster,dc=com 40 | DomainAddr: ${NLB_PUBLIC_DNS_NAME} 41 | PasswordSecretArn: ${SECRET_ARN} 42 | DomainReadOnlyUser: cn=ReadOnlyUser,ou=Users,ou=CORP,dc=corp,dc=pcluster,dc=com 43 | LdapTlsReqCert: never 44 | Region: ${AWS_REGION_NAME} 45 | Scheduling: 46 | Scheduler: slurm 47 | SlurmSettings: 48 | ScaledownIdletime: 10 49 | EnableMemoryBasedScheduling: true 50 | Database: 51 | Uri: ${SLURM_DB_ENDPOINT} 52 | UserName: 'admin' 53 | PasswordSecretArn: ${SECRET_ARN} 54 | SlurmQueues: 55 | - Name: compute-od-1 56 | CapacityType: ONDEMAND 57 | ComputeResources: 58 | - Name: c5n-18xlarge 59 | 
DisableSimultaneousMultithreading: true 60 | Efa: 61 | Enabled: true 62 | InstanceType: c5n.18xlarge 63 | MaxCount: 300 64 | MinCount: 0 65 | CustomActions: 66 | OnNodeConfigured: 67 | Args: 68 | - 04.configure.disable.anacron.compute.sh 69 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 70 | Iam: 71 | AdditionalIamPolicies: 72 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 73 | S3Access: 74 | - BucketName: '*' 75 | EnableWriteAccess: true 76 | Networking: 77 | AdditionalSecurityGroups: 78 | - ${ADDITIONAL_SG} 79 | PlacementGroup: 80 | Enabled: true 81 | SubnetIds: 82 | - ${PRIVATE_SUBNET_ID} 83 | AssignPublicIp: false 84 | - Name: compute-od-2 85 | CapacityType: ONDEMAND 86 | ComputeResources: 87 | - Name: c5-24xlarge 88 | DisableSimultaneousMultithreading: true 89 | InstanceType: c5.24xlarge 90 | MaxCount: 300 91 | MinCount: 0 92 | - Name: m5-24xlarge 93 | DisableSimultaneousMultithreading: true 94 | InstanceType: m5.24xlarge 95 | MaxCount: 300 96 | MinCount: 0 97 | - Name: r5-24xlarge 98 | DisableSimultaneousMultithreading: true 99 | InstanceType: r5.24xlarge 100 | MaxCount: 300 101 | MinCount: 0 102 | CustomActions: 103 | OnNodeConfigured: 104 | Args: 105 | - 04.configure.disable.anacron.compute.sh 106 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 107 | Iam: 108 | AdditionalIamPolicies: 109 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 110 | S3Access: 111 | - BucketName: '*' 112 | EnableWriteAccess: true 113 | Networking: 114 | AdditionalSecurityGroups: 115 | - ${ADDITIONAL_SG} 116 | PlacementGroup: 117 | Enabled: true 118 | SubnetIds: 119 | - ${PRIVATE_SUBNET_ID} 120 | AssignPublicIp: false 121 | - Name: compute-od-3 122 | CapacityType: ONDEMAND 123 | ComputeResources: 124 | - Name: c5d-24xlarge 125 | DisableSimultaneousMultithreading: true 126 | InstanceType: c5d.24xlarge 127 | MaxCount: 300 128 | MinCount: 0 129 | - Name: m5d-24xlarge 130 | DisableSimultaneousMultithreading: true 131 | InstanceType: m5d.24xlarge 132 | MaxCount: 300 133 | MinCount: 0 134 | - Name: r5d-24xlarge 135 | DisableSimultaneousMultithreading: true 136 | InstanceType: r5d.24xlarge 137 | MaxCount: 300 138 | MinCount: 0 139 | CustomActions: 140 | OnNodeConfigured: 141 | Args: 142 | - 04.configure.disable.anacron.compute.sh 143 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 144 | Iam: 145 | AdditionalIamPolicies: 146 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 147 | S3Access: 148 | - BucketName: '*' 149 | EnableWriteAccess: true 150 | Networking: 151 | AdditionalSecurityGroups: 152 | - ${ADDITIONAL_SG} 153 | PlacementGroup: 154 | Enabled: true 155 | SubnetIds: 156 | - ${PRIVATE_SUBNET_ID} 157 | AssignPublicIp: false 158 | - Name: compute-spot-1 159 | CapacityType: SPOT 160 | ComputeResources: 161 | - Name: c5n-18xlarge 162 | DisableSimultaneousMultithreading: true 163 | Efa: 164 | Enabled: true 165 | InstanceType: c5n.18xlarge 166 | MaxCount: 300 167 | MinCount: 0 168 | - Name: c5-24xlarge 169 | DisableSimultaneousMultithreading: true 170 | InstanceType: c5.24xlarge 171 | MaxCount: 300 172 | MinCount: 0 173 | - Name: m5-24xlarge 174 | DisableSimultaneousMultithreading: true 175 | InstanceType: m5.24xlarge 176 | MaxCount: 300 177 | MinCount: 0 178 | - Name: r5-24xlarge 179 | DisableSimultaneousMultithreading: true 180 | InstanceType: r5.24xlarge 181 | MaxCount: 300 182 | MinCount: 0 183 | CustomActions: 184 | OnNodeConfigured: 185 | Args: 186 | - 
04.configure.disable.anacron.compute.sh 187 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 188 | Iam: 189 | AdditionalIamPolicies: 190 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 191 | S3Access: 192 | - BucketName: '*' 193 | EnableWriteAccess: true 194 | Networking: 195 | AdditionalSecurityGroups: 196 | - ${ADDITIONAL_SG} 197 | PlacementGroup: 198 | Enabled: true 199 | SubnetIds: 200 | - ${PRIVATE_SUBNET_ID} 201 | AssignPublicIp: false 202 | - Name: dcv-gpu 203 | CapacityType: ONDEMAND 204 | ComputeResources: 205 | - Name: g4dn-xlarge 206 | DisableSimultaneousMultithreading: true 207 | InstanceType: g4dn.xlarge 208 | MaxCount: 100 209 | MinCount: 0 210 | - Name: g4dn-2xlarge 211 | DisableSimultaneousMultithreading: true 212 | InstanceType: g4dn.2xlarge 213 | MaxCount: 100 214 | MinCount: 0 215 | - Name: g4dn-4xlarge 216 | DisableSimultaneousMultithreading: true 217 | InstanceType: g4dn.4xlarge 218 | MaxCount: 100 219 | MinCount: 0 220 | - Name: g4dn-8xlarge 221 | DisableSimultaneousMultithreading: true 222 | InstanceType: g4dn.8xlarge 223 | MaxCount: 100 224 | MinCount: 0 225 | - Name: g4dn-16xlarge 226 | DisableSimultaneousMultithreading: true 227 | InstanceType: g4dn.16xlarge 228 | MaxCount: 100 229 | MinCount: 0 230 | CustomActions: 231 | OnNodeConfigured: 232 | Args: 233 | - 04.configure.disable.anacron.compute.sh 234 | - 25.install.dcv-server.gpu.sh 235 | - 26.configure.dcv.alb.compute.sh 236 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 237 | Iam: 238 | AdditionalIamPolicies: 239 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 240 | S3Access: 241 | - BucketName: '*' 242 | EnableWriteAccess: true 243 | Networking: 244 | AdditionalSecurityGroups: 245 | - ${ADDITIONAL_SG} 246 | PlacementGroup: 247 | Enabled: true 248 | SubnetIds: 249 | - ${PRIVATE_SUBNET_ID} 250 | AssignPublicIp: false 251 | - Name: dcv 252 | CapacityType: ONDEMAND 253 | ComputeResources: 254 | - Name: m5-24xlarge 255 | DisableSimultaneousMultithreading: true 256 | InstanceType: m5.24xlarge 257 | MaxCount: 200 258 | MinCount: 0 259 | - Name: m5-2xlarge 260 | DisableSimultaneousMultithreading: true 261 | InstanceType: m5.2xlarge 262 | MaxCount: 200 263 | MinCount: 0 264 | - Name: m5-8xlarge 265 | DisableSimultaneousMultithreading: true 266 | InstanceType: m5.8xlarge 267 | MaxCount: 200 268 | MinCount: 0 269 | - Name: m5-16xlarge 270 | DisableSimultaneousMultithreading: true 271 | InstanceType: m5.16xlarge 272 | MaxCount: 200 273 | MinCount: 0 274 | - Name: m5-xlarge 275 | DisableSimultaneousMultithreading: true 276 | InstanceType: m5.xlarge 277 | MaxCount: 200 278 | MinCount: 0 279 | CustomActions: 280 | OnNodeConfigured: 281 | Args: 282 | - 04.configure.disable.anacron.compute.sh 283 | - 25.install.dcv-server.compute.sh 284 | - 26.configure.dcv.alb.compute.sh 285 | Script: s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh 286 | Iam: 287 | AdditionalIamPolicies: 288 | - Policy: arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole 289 | S3Access: 290 | - BucketName: '*' 291 | EnableWriteAccess: true 292 | Networking: 293 | AdditionalSecurityGroups: 294 | - ${ADDITIONAL_SG} 295 | PlacementGroup: 296 | Enabled: true 297 | SubnetIds: 298 | - ${PRIVATE_SUBNET_ID} 299 | AssignPublicIp: false 300 | SharedStorage: 301 | ${FSX} 302 | Tags: 303 | - Key: EnginFrame 304 | Value: 'true' 305 | - Key: 1Click-HPC 306 | Value: 'true' 307 | - Key: 1Click-HPC-version 308 | Value: '0.4' 
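A note on how the per-region samples above are consumed: they are templates, not directly deployable files. Every ${...} token (S3_BUCKET, KEY_PAIR, AWS_REGION_NAME, PRIVATE_SUBNET_ID, ADDITIONAL_SG, DB_SG, NLB_PUBLIC_DNS_NAME, SECRET_ARN, SLURM_DB_ENDPOINT, FSX) is a shell-style placeholder. A minimal sketch of rendering and launching one, assuming all of those variables are already exported in the calling shell (the Cloud9 bootstrap script below does the equivalent as part of its flow):

export AWS_REGION_NAME=eu-south-1   # pick the region matching the sample you want
envsubst < parallelcluster/config.${AWS_REGION_NAME}.sample.yaml > config.${AWS_REGION_NAME}.yaml
pcluster create-cluster --cluster-name my-hpc --cluster-configuration config.${AWS_REGION_NAME}.yaml --rollback-on-failure false

The cluster name "my-hpc" is illustrative only; the bootstrap script derives the real name from ${CLUSTER_NAME}.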
-------------------------------------------------------------------------------- /scripts/Cloud9-Bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -f /home/ec2-user/environment/bootstrap.log ]]; then 4 | exit 1 5 | fi 6 | 7 | set -x 8 | exec >/home/ec2-user/environment/bootstrap.log; exec 2>&1 9 | 10 | sudo yum -y -q install jq 11 | sudo chown -R ec2-user:ec2-user /home/ec2-user/ 12 | #source cluster profile and move to the home dir 13 | cd /home/ec2-user/environment 14 | . cluster_env 15 | 16 | #install Lustre client 17 | sudo amazon-linux-extras install -y lustre2.10 > /dev/null 2>&1 18 | 19 | python3 -m pip install "aws-parallelcluster" --upgrade --user --quiet 20 | curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.38.0/install.sh | bash 21 | chmod ug+x ~/.nvm/nvm.sh 22 | source ~/.nvm/nvm.sh > /dev/null 2>&1 23 | nvm install --lts=Gallium > /dev/null 2>&1 24 | node --version 25 | 26 | if [[ $FSX_ID == "AUTO" ]];then 27 | FSX=$(cat < config.${AWS_REGION_NAME}.yaml 54 | /usr/bin/envsubst '${SLURM_DB_ENDPOINT}' < "1click-hpc/enginframe/mysql/efdb.config" > efdb.config 55 | /usr/bin/envsubst '${SLURM_DB_ENDPOINT}' < "1click-hpc/enginframe/efinstall.config" > efinstall.config 56 | /usr/bin/envsubst '${S3_BUCKET}' < "1click-hpc/enginframe/fm.browse.ui" > fm.browse.ui 57 | /usr/bin/envsubst '${POST_INSTALL}' < "1click-hpc/scripts/post.install.sh" > post.install.sh 58 | 59 | aws s3 cp --quiet efinstall.config "s3://${S3_BUCKET}/1click-hpc/enginframe/efinstall.config" --region "${AWS_REGION_NAME}" 60 | aws s3 cp --quiet fm.browse.ui "s3://${S3_BUCKET}/1click-hpc/enginframe/fm.browse.ui" --region "${AWS_REGION_NAME}" 61 | aws s3 cp --quiet efdb.config "s3://${S3_BUCKET}/1click-hpc/enginframe/mysql/efdb.config" --region "${AWS_REGION_NAME}" 62 | aws s3 cp --quiet post.install.sh "s3://${S3_BUCKET}/1click-hpc/scripts/post.install.sh" --region "${AWS_REGION_NAME}" 63 | aws s3 cp --quiet /usr/bin/mysql "s3://${S3_BUCKET}/1click-hpc/enginframe/mysql/mysql" --region "${AWS_REGION_NAME}" 64 | rm -f fm.browse.ui efinstall.config 65 | 66 | #Create the key pair (remove the existing one if it has the same name) 67 | # FIX this: create the key on the CF and store on SecretManager 68 | aws ec2 create-key-pair --key-name ${KEY_PAIR} --query KeyMaterial --output text > /home/ec2-user/.ssh/id_rsa 69 | if [ $? 
-ne 0 ]; then 70 | aws ec2 delete-key-pair --key-name ${KEY_PAIR} 71 | aws ec2 create-key-pair --key-name ${KEY_PAIR} --query KeyMaterial --output text > /home/ec2-user/.ssh/id_rsa 72 | fi 73 | sudo chmod 400 /home/ec2-user/.ssh/id_rsa 74 | 75 | #Create the cluster and wait 76 | CLUSTER_FULLNAME="hpc-1click-${CLUSTER_NAME}" 77 | /home/ec2-user/.local/bin/pcluster create-cluster --cluster-name "${CLUSTER_FULLNAME}" --cluster-configuration config.${AWS_REGION_NAME}.yaml --rollback-on-failure false 78 | cstatus='"CREATE_IN_PROGRESS"'; until [ ${cstatus} != '"CREATE_IN_PROGRESS"' ]; do sleep 10; cstatus=$(/home/ec2-user/.local/bin/pcluster describe-cluster -n "${CLUSTER_FULLNAME}" --query clusterStatus); done; 79 | 80 | 81 | HEADNODE_PRIVATE_IP=$(/home/ec2-user/.local/bin/pcluster describe-cluster --cluster-name "${CLUSTER_FULLNAME}" | jq -r '.headNode.privateIpAddress') 82 | echo "export HEADNODE_PRIVATE_IP='${HEADNODE_PRIVATE_IP}'" >> cluster_env 83 | 84 | # Modify the Message Of The Day 85 | sudo rm -f /etc/update-motd.d/* 86 | sudo aws s3 cp --quiet "s3://${S3_BUCKET}/1click-hpc/scripts/motd" /etc/update-motd.d/10-HPC --region "${AWS_REGION_NAME}" || exit 1 87 | sudo chmod +x /etc/update-motd.d/10-HPC 88 | echo 'run-parts /etc/update-motd.d' >> /home/ec2-user/.bash_profile 89 | 90 | #attach the ParallelCluster SG to the Cloud9 instance (for FSx or NFS) 91 | INSTANCE_ID=$(curl http://169.254.169.254/latest/meta-data/instance-id) 92 | SG_CLOUD9=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query Reservations[*].Instances[*].SecurityGroups[*].GroupId --output text) 93 | SG_HEADNODE=$(aws cloudformation describe-stack-resources --stack-name "${CLUSTER_FULLNAME}" --logical-resource-id ComputeSecurityGroup --query "StackResources[*].PhysicalResourceId" --output text) 94 | aws ec2 modify-instance-attribute --instance-id $INSTANCE_ID --groups $SG_CLOUD9 $SG_HEADNODE 95 | 96 | #increase the maximum number of files that can be handled by the file watcher 97 | sudo bash -c 'echo "fs.inotify.max_user_watches=524288" >> /etc/sysctl.conf' && sudo sysctl -p 98 | 99 | if [[ $FSX_ID == "AUTO" ]];then 100 | FSX_ID=$(aws cloudformation describe-stack-resources --stack-name "${CLUSTER_FULLNAME}" --logical-resource-id FSX0 --query "StackResources[*].PhysicalResourceId" --output text) 101 | fi 102 | 103 | FSX_DNS_NAME=$(aws fsx describe-file-systems --file-system-ids $FSX_ID --query "FileSystems[*].DNSName" --output text) 104 | FSX_MOUNT_NAME=$(aws fsx describe-file-systems --file-system-ids $FSX_ID --query "FileSystems[*].LustreConfiguration.MountName" --output text) 105 | 106 | #mount the same FSx created for the HPC Cluster 107 | mkdir fsx 108 | sudo mount -t lustre -o noatime,flock $FSX_DNS_NAME@tcp:/$FSX_MOUNT_NAME fsx 109 | sudo bash -c "echo \"$FSX_DNS_NAME@tcp:/$FSX_MOUNT_NAME /home/ec2-user/environment/fsx lustre defaults,noatime,flock,_netdev 0 0\" >> /etc/fstab" 110 | sudo chmod 755 fsx 111 | sudo chown ec2-user:ec2-user fsx 112 | 113 | # send SUCCESS to the CloudFormation wait handle 114 | curl -X PUT -H 'Content-Type:' \ 115 | --data-binary "{\"Status\" : \"SUCCESS\",\"Reason\" : \"Configuration Complete\",\"UniqueId\" : \"$HEADNODE_PRIVATE_IP\",\"Data\" : \"$HEADNODE_PRIVATE_IP\"}" \ 116 | "${WAIT_HANDLE}" -------------------------------------------------------------------------------- /scripts/motd: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . 
/home/ec2-user/environment/cluster_env 3 | cat << EOF 4 | 5 | ██╗ ██╗██████╗ ██████╗ ██████╗██╗ ██╗ ██╗███████╗████████╗███████╗██████╗ 6 | ██║ ██║██╔══██╗██╔════╝ ██╔════╝██║ ██║ ██║██╔════╝╚══██╔══╝██╔════╝██╔══██╗ 7 | ███████║██████╔╝██║ ██║ ██║ ██║ ██║███████╗ ██║ █████╗ ██████╔╝ 8 | ██╔══██║██╔═══╝ ██║ ██║ ██║ ██║ ██║╚════██║ ██║ ██╔══╝ ██╔══██╗ 9 | ██║ ██║██║ ╚██████╗ ╚██████╗███████╗╚██████╔╝███████║ ██║ ███████╗██║ ██║ 10 | ╚═╝ ╚═╝╚═╝ ╚═════╝ ╚═════╝╚══════╝ ╚═════╝ ╚══════╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ 11 | 12 | ██████╗ ███╗ ██╗ █████╗ ██╗ ██╗███████╗ 13 | ██╔═══██╗████╗ ██║ ██╔══██╗██║ ██║██╔════╝ 14 | ██║ ██║██╔██╗ ██║ ███████║██║ █╗ ██║███████╗ 15 | ██║ ██║██║╚██╗██║ ██╔══██║██║███╗██║╚════██║ 16 | ╚██████╔╝██║ ╚████║ ██║ ██║╚███╔███╔╝███████║ 17 | ╚═════╝ ╚═╝ ╚═══╝ ╚═╝ ╚═╝ ╚══╝╚══╝ ╚══════╝ 18 | 19 | 20 | You can connect to your HPC cluster using the EnginFrame web portal: 21 | https://${ALB_PUBLIC_DNS_NAME}/enginframe 22 | 23 | Or, You can ssh into the Head-node: 24 | $ ssh ${HEADNODE_PRIVATE_IP} 25 | 26 | EOF -------------------------------------------------------------------------------- /scripts/post.install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
18 | 19 | 20 | # Top level post install script 21 | set -a 22 | source '/etc/parallelcluster/cfnconfig' 23 | set +a 24 | 25 | # run scripts 26 | # ---------------------------------------------------------------------------- 27 | # runs secondary scripts according to the node type 28 | runScripts() { 29 | 30 | echo "Getting packages from ${post_install_url}" 31 | for script in "${@}"; do 32 | aws s3 cp --quiet ${post_install_base}/modules/${script} "${TMP_MODULES_DIR}" --region "${cfn_region}" || exit 1 33 | done 34 | 35 | chmod 755 -R "${TMP_MODULES_DIR}"* 36 | find "${TMP_MODULES_DIR}" -type f -name '[0-9][0-9]*.sh' -print0 | sort -z -n | xargs -0 -I '{}' /bin/bash -c '{}' 37 | } 38 | 39 | # main 40 | # ---------------------------------------------------------------------------- 41 | main() { 42 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] post.install.sh START" >&2 43 | runScripts "${@}" 44 | echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] post.install.sh: STOP" >&2 45 | } 46 | TMP_MODULES_DIR="/tmp/modules/" 47 | export host_name=$(hostname -s) 48 | export SLURM_CONF_FILE="/opt/slurm/etc/pcluster/slurm_parallelcluster_*_partition.conf" 49 | post_install_url=$(dirname ${POST_INSTALL}) 50 | export post_install_base=$(dirname "${post_install_url}") 51 | SLURM_ROOT="/opt/slurm" 52 | export SLURM_ETC="${SLURM_ROOT}/etc" 53 | export SHARED_FS_DIR="$(cat /etc/parallelcluster/shared_storages_data.yaml | grep mount_dir | awk '{print $2}')" 54 | export NICE_ROOT="${SHARED_FS_DIR}/nice" 55 | export EF_CONF_ROOT="${NICE_ROOT}/enginframe/conf" 56 | export EF_DATA_ROOT="${NICE_ROOT}/enginframe/data" 57 | 58 | if [[ ${cfn_node_type} == HeadNode ]]; then 59 | export head_node_hostname=${host_name} 60 | elif [[ ${cfn_node_type} == ComputeFleet ]]; then 61 | export head_node_hostname=${cfn_head_node} 62 | else 63 | exit 1 64 | fi 65 | 66 | main "${@}" -------------------------------------------------------------------------------- /scripts/prolog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # SPDX-License-Identifier: MIT-0 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | # software and associated documentation files (the "Software"), to deal in the Software 8 | # without restriction, including without limitation the rights to use, copy, modify, 9 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so. 11 | # 12 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | 19 | 20 | # The script assigns the required tags to the EC2 instances of the jobs. 
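# Illustrative usage (an assumption added for clarity, not part of the original script):
# a user can attach extra tags at submission time through the job comment, e.g.
#   sbatch --comment "Key=aws-tag1,Value=tag-value1 Key=aws-tag2,Value=tag-value2" job.sh
# Each compute node allocated to the job then runs this prolog and tags its own EC2
# instance with those pairs, in addition to the username/jobid/partition tags below.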
21 | source /etc/parallelcluster/cfnconfig 22 | 23 | # Slurm installation directory 24 | SLURM_ROOT="/opt/slurm" 25 | 26 | # function used to resolve a hostname to its IP address 27 | function nametoip() 28 | { 29 | instance_ip=$(nslookup $1) 30 | echo "${instance_ip}" | awk '/^Address: / { print $2 }' 31 | } 32 | 33 | # Fetch the comments attached to the job from Slurm. 34 | # This is the supported format for transporting Slurm job comments to EC2 tags: Key=aws-tag1,Value=tag-value1 Key=aws-tag2,Value=tag-value2 35 | tags=$($SLURM_ROOT/bin/scontrol show job ${SLURM_JOB_ID} | grep Comment | sed 's/Comment=//' | sed 's/^ *//g') 36 | 37 | # The current compute node tags itself; this prolog script runs on every compute node allocated to the job 38 | private_ip=$(nametoip $HOSTNAME) 39 | instance_id=$(aws ec2 --region $cfn_region describe-instances --filters "Name=network-interface.addresses.private-ip-address,Values=${private_ip}" --query Reservations[*].Instances[*].InstanceId --output text) 40 | 41 | aws ec2 create-tags --region $cfn_region --resources ${instance_id} --tags Key=aws-parallelcluster-username,Value=${SLURM_JOB_USER} Key=aws-parallelcluster-jobid,Value=${SLURM_JOB_ID} Key=aws-parallelcluster-partition,Value=${SLURM_JOB_PARTITION} ${tags} 42 | exit 0 --------------------------------------------------------------------------------
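For completeness, a Slurm prolog only runs if the scheduler is configured to call it; in this repo that wiring is presumably handled by one of the headnode modules (likely 07.configure.slurm.tagging.headnode.sh). A hedged sketch of the registration on a ParallelCluster head node, using the standard ParallelCluster paths; the exact commands are illustrative, not taken from this repo:

sudo install -m 755 prolog.sh /opt/slurm/etc/prolog.sh            # copy the script into Slurm's config tree
echo 'Prolog=/opt/slurm/etc/prolog.sh' | sudo tee -a /opt/slurm/etc/slurm.conf
sudo /opt/slurm/bin/scontrol reconfigure                          # propagate the new Prolog setting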