├── .editorconfig ├── .gitallowed ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .vscode └── settings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Gemfile ├── LICENSE ├── Makefile ├── README.md ├── create-slurm-security-groups.sh ├── create-slurm-security-groups ├── .gitignore ├── README.md ├── app.py ├── cdk.json ├── create-slurm-security-groups.py ├── create_slurm_security_groups │ ├── __init__.py │ └── create_slurm_security_groups_stack.py ├── requirements-dev.txt ├── requirements.txt └── tests │ ├── __init__.py │ └── unit │ ├── __init__.py │ └── test_create_slurm_security_groups_stack.py ├── docs ├── CONTRIBUTING.md ├── config.md ├── containers.md ├── custom-amis.md ├── debug.md ├── delete-cluster.md ├── deploy-parallel-cluster.md ├── deployment-prerequisites.md ├── exostellar-infrastructure-optimizer.md ├── exostellar-workload-optimizer.md ├── federation.md ├── images │ ├── res-Admin-properties.png │ ├── res-ServiceAccount-properties.png │ ├── res-ad-users-and-computers.png │ ├── res-start-ad-users-and-computers.png │ ├── res-users.png │ └── res-windows-administrative-tools.png ├── implementation.md ├── index.md ├── job_preemption.md ├── onprem.md ├── res_integration.md ├── rest_api.md ├── run_jobs.md ├── security-groups.md └── soca_integration.md ├── index.md ├── install.sh ├── mkdocs.yml ├── res ├── .gitignore ├── Makefile ├── create-ldif.py ├── download-res-templates.sh ├── requirements.txt ├── res-demo-original │ ├── bi.yaml │ ├── keycloak.yaml │ ├── networking.yaml │ ├── res-demo-stack.yaml │ ├── res-sso-keycloak.yaml │ └── res.ldif ├── res-demo-with-cidr │ ├── bi.yaml │ ├── keycloak.yaml │ ├── res-bi-only.yaml │ ├── res-demo-stack.yaml │ ├── res-only.yaml │ ├── res-sso-keycloak.yaml │ └── res.ldif ├── upload-res-templates.py └── upload-res-templates.sh ├── security_scan ├── cfn_nag-deny-list.yml └── security_scan.sh ├── setup.sh ├── source ├── .gitignore ├── CDK-README.md ├── EC2InstanceTypeInfoPkg │ ├── EC2InstanceTypeInfo.py │ ├── __init__.py │ ├── get_ec2_instance_info.py │ ├── get_savings_plans.py │ └── retry_boto3_throttling.py ├── Makefile ├── SlurmPlugin.py ├── app.py ├── cdk.json ├── cdk │ ├── __init__.py │ ├── cdk_slurm_stack.py │ └── config_schema.py ├── create-ami-map.py ├── installer.py ├── requirements.txt ├── resources │ ├── config │ │ ├── default_config.yml │ │ ├── slurm_all_arm_instance_types.yml │ │ ├── slurm_all_x86_instance_types.yml │ │ ├── slurm_nodes_on_prem.conf │ │ ├── slurm_recommended_arm_instance_types.yml │ │ └── slurm_recommended_x86_instance_types.yml │ ├── lambdas │ │ ├── CallSlurmRestApi │ │ │ └── CallSlurmRestApi.py │ │ ├── ConfigureExternalLoginNodes │ │ │ └── ConfigureExternalLoginNodes.py │ │ ├── ConfigureUsersGroupsJson │ │ │ └── ConfigureUsersGroupsJson.py │ │ ├── CreateBuildFiles │ │ │ ├── CreateBuildFiles.py │ │ │ └── cfnresponse.py │ │ ├── CreateHeadNodeARecord │ │ │ └── CreateHeadNodeARecord.py │ │ ├── CreateParallelCluster │ │ │ ├── CreateParallelCluster.py │ │ │ └── cfnresponse.py │ │ ├── CreateParallelClusterConfig │ │ │ ├── CreateParallelClusterConfig.py │ │ │ └── cfnresponse.py │ │ ├── DeconfigureExternalLoginNodes │ │ │ ├── DeconfigureExternalLoginNodes.py │ │ │ └── cfnresponse.py │ │ ├── DeconfigureUsersGroupsJson │ │ │ ├── DeconfigureUsersGroupsJson.py │ │ │ └── cfnresponse.py │ │ ├── UpdateHeadNode │ │ │ ├── UpdateHeadNode.py │ │ │ └── cfnresponse.py │ │ └── cfnresponse.py │ ├── parallel-cluster │ │ └── config │ │ │ ├── bin │ │ │ ├── configure-eda.sh │ │ │ ├── 
configure-rootless-docker.sh │ │ │ ├── create_or_update_users_groups_json.sh │ │ │ ├── create_users_groups.py │ │ │ ├── create_users_groups_json.py │ │ │ ├── create_users_groups_json_configure.sh │ │ │ ├── create_users_groups_json_deconfigure.sh │ │ │ ├── exostellar-compute-node-ami-configure.sh │ │ │ ├── external_login_node_configure.sh │ │ │ ├── external_login_node_deconfigure.sh │ │ │ ├── install-ansible.sh │ │ │ ├── install-rootless-docker.sh │ │ │ ├── on_compute_node_configured.sh │ │ │ ├── on_compute_node_start.sh │ │ │ ├── on_head_node_configured.sh │ │ │ ├── on_head_node_start.sh │ │ │ └── on_head_node_updated.sh │ │ │ ├── build-files │ │ │ └── build-file-template.yml │ │ │ └── users_groups.json │ ├── playbooks │ │ ├── ExostellarComputeNodeAmi.yml │ │ ├── ParallelClusterComputeNode.yml │ │ ├── ParallelClusterCreateUsersGroupsJsonConfigure.yml │ │ ├── ParallelClusterCreateUsersGroupsJsonDeconfigure.yml │ │ ├── ParallelClusterExternalLoginNodeConfigure.yml │ │ ├── ParallelClusterExternalLoginNodeDeconfigure.yml │ │ ├── ParallelClusterExternalLoginNodeInstallSlurm.yml │ │ ├── ParallelClusterHeadNode.yml │ │ ├── README.md │ │ ├── ansible.cfg │ │ ├── bug_fixes.yml │ │ ├── configure-rootless-docker.yml │ │ ├── create_users_groups_json.yml │ │ ├── eda_tools.yml │ │ ├── install-rootless-docker.yml │ │ ├── install_slurm.yml │ │ ├── install_vscode.yml │ │ ├── inventories │ │ │ ├── group_vars │ │ │ │ └── all │ │ │ └── local.yml │ │ ├── roles │ │ │ ├── ExostellarComputeNodeAmi │ │ │ │ ├── README.md │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ └── etc │ │ │ │ │ ├── profile.d │ │ │ │ │ ├── slurm.csh │ │ │ │ │ └── slurm.sh │ │ │ │ │ ├── sysconfig │ │ │ │ │ └── slurmd │ │ │ │ │ └── systemd │ │ │ │ │ └── system │ │ │ │ │ └── slurmd.service │ │ │ ├── ParallelClusterComputeNode │ │ │ │ ├── README.md │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── ParallelClusterCreateUsersGroupsJsonConfigure │ │ │ │ ├── README.md │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ └── etc │ │ │ │ │ └── cron.d │ │ │ │ │ └── slurm_update_users_groups_json │ │ │ ├── ParallelClusterCreateUsersGroupsJsonDeconfigure │ │ │ │ ├── README.md │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── ParallelClusterExternalLoginNodeConfigure │ │ │ │ ├── README.md │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ └── etc │ │ │ │ │ └── profile.d │ │ │ │ │ └── slurm_modulefiles.sh │ │ │ ├── ParallelClusterExternalLoginNodeDeconfigure │ │ │ │ ├── README.md │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── ParallelClusterHeadNode │ │ │ │ ├── README.md │ │ │ │ ├── files │ │ │ │ │ ├── etc │ │ │ │ │ │ └── enroot │ │ │ │ │ │ │ └── enroot.conf │ │ │ │ │ └── opt │ │ │ │ │ │ └── slurm │ │ │ │ │ │ ├── config │ │ │ │ │ │ ├── accounts.yml.example │ │ │ │ │ │ └── bin │ │ │ │ │ │ │ └── create_slurm_accounts.py │ │ │ │ │ │ └── etc │ │ │ │ │ │ ├── oci.conf │ │ │ │ │ │ └── plugstack.conf.d │ │ │ │ │ │ └── pyxis.conf │ │ │ │ ├── tasks │ │ │ │ │ ├── config-external-login-node-access.yml │ │ │ │ │ ├── config-high-throughput.yml │ │ │ │ │ ├── config-licenses.yml │ │ │ │ │ ├── config-oci.yml │ │ │ │ │ ├── config-pyxis.yml │ │ │ │ │ ├── config-slurmdb-accounts.yml │ │ │ │ │ ├── config-slurmrestd.yml │ │ │ │ │ ├── config-sshd.yml │ │ │ │ │ ├── config-users-groups.yml │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ ├── etc │ │ │ │ │ ├── cron.d │ │ │ │ │ │ ├── slurm_accounts │ │ │ │ │ │ ├── slurm_users_groups │ │ │ │ │ │ ├── update_slurmrestd_jwt_for_root │ │ │ │ │ │ └── 
update_slurmrestd_jwt_for_slurmrestd │ │ │ │ │ ├── rc.d │ │ │ │ │ │ └── rc.local │ │ │ │ │ ├── sysconfig │ │ │ │ │ │ └── slurmrestd │ │ │ │ │ └── systemd │ │ │ │ │ │ └── system │ │ │ │ │ │ └── slurmrestd.service │ │ │ │ │ └── opt │ │ │ │ │ └── slurm │ │ │ │ │ ├── config │ │ │ │ │ └── bin │ │ │ │ │ │ └── update_slurmrestd_jwt_parameter.sh │ │ │ │ │ ├── etc │ │ │ │ │ └── plugstack.conf │ │ │ │ │ └── modules │ │ │ │ │ └── modulefiles │ │ │ │ │ └── slurm │ │ │ │ │ ├── .template │ │ │ │ │ └── .version │ │ │ ├── all │ │ │ │ ├── README.md │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ ├── templates │ │ │ │ │ ├── etc │ │ │ │ │ │ └── sudoers.d │ │ │ │ │ │ │ └── 10-admins │ │ │ │ │ ├── usr │ │ │ │ │ │ └── bin │ │ │ │ │ │ │ └── pip3 │ │ │ │ │ └── var │ │ │ │ │ │ └── lib │ │ │ │ │ │ └── cloud │ │ │ │ │ │ └── scripts │ │ │ │ │ │ └── per-boot │ │ │ │ │ │ └── 90_mount_ssds.bash │ │ │ │ └── tests │ │ │ │ │ ├── inventory │ │ │ │ │ └── test.yml │ │ │ ├── bug_fixes │ │ │ │ ├── README.md │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── tests │ │ │ │ │ ├── inventory │ │ │ │ │ └── test.yml │ │ │ ├── cloudwatch_agent │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ ├── slurm_node_cloudwatch_agent_config.sh │ │ │ │ │ ├── slurmctl_cloudwatch_agent_config.yml │ │ │ │ │ └── slurmdbd_cloudwatch_agent_config.yml │ │ │ ├── configure-rootless-docker │ │ │ │ ├── README.md │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── eda_tools │ │ │ │ ├── README.md │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ ├── templates │ │ │ │ │ └── etc │ │ │ │ │ │ └── profile.d │ │ │ │ │ │ ├── nodejs.csh │ │ │ │ │ │ └── nodejs.sh │ │ │ │ └── tests │ │ │ │ │ ├── inventory │ │ │ │ │ └── test.yml │ │ │ ├── exostellar_infrastructure_optimizer │ │ │ │ ├── README.md │ │ │ │ ├── files │ │ │ │ │ └── opt │ │ │ │ │ │ └── slurm │ │ │ │ │ │ └── etc │ │ │ │ │ │ └── exostellar │ │ │ │ │ │ └── configure_xio.py │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ └── opt │ │ │ │ │ └── slurm │ │ │ │ │ └── etc │ │ │ │ │ └── exostellar │ │ │ │ │ ├── parse_helper.sh │ │ │ │ │ ├── resume_xspot.sh │ │ │ │ │ ├── suspend_xspot.sh │ │ │ │ │ ├── test_createVm.sh │ │ │ │ │ ├── xspot-vm_custom_user_data.sh │ │ │ │ │ ├── xspot-vm_user_data.sh │ │ │ │ │ └── xspot.slurm.conf │ │ │ ├── exostellar_workload_optimizer │ │ │ │ ├── files │ │ │ │ │ └── opt │ │ │ │ │ │ └── slurm │ │ │ │ │ │ └── etc │ │ │ │ │ │ └── exostellar │ │ │ │ │ │ └── configure_xwo.py │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ └── opt │ │ │ │ │ └── slurm │ │ │ │ │ └── etc │ │ │ │ │ └── exostellar │ │ │ │ │ ├── parse_helper.sh │ │ │ │ │ ├── resume_xspot.sh │ │ │ │ │ ├── suspend_xspot.sh │ │ │ │ │ ├── test_createVm.sh │ │ │ │ │ ├── xspot-vm_custom_user_data.sh │ │ │ │ │ ├── xspot-vm_user_data.sh │ │ │ │ │ └── xspot.slurm.conf │ │ │ ├── install-rootless-docker │ │ │ │ ├── README.md │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── install_slurm │ │ │ │ ├── README.md │ │ │ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── templates │ │ │ │ │ └── opt │ │ │ │ │ └── slurm │ │ │ │ │ └── config │ │ │ │ │ └── modules │ │ │ │ │ └── modulefiles │ │ │ │ │ └── slurm │ │ │ │ │ ├── .template │ │ │ │ │ └── .version │ │ │ ├── install_vscode │ │ │ │ ├── README.md │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── lustre-client │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── mount_extra_fs │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── mount_slurm_fs │ │ │ │ └── tasks │ │ │ │ │ └── main.yml │ │ │ ├── security_updates │ │ │ │ ├── README.md │ │ 
│ │ ├── tasks │ │ │ │ │ └── main.yml │ │ │ │ └── tests │ │ │ │ │ ├── inventory │ │ │ │ │ └── test.yml │ │ │ └── unmount_slurm_fs │ │ │ │ └── tasks │ │ │ │ └── main.yml │ │ └── security_updates.yml │ └── user_data │ │ ├── WaitForAmi.py │ │ ├── slurm_node_ami_config.sh │ │ ├── slurm_node_ami_user_data.sh │ │ ├── slurm_node_ami_user_data_on_exit.sh │ │ ├── slurm_node_ami_user_data_prolog.sh │ │ ├── slurmctl_config.sh │ │ ├── slurmctl_user_data.sh │ │ ├── slurmctl_user_data_on_exit.sh │ │ ├── slurmdbd_config.sh │ │ ├── slurmdbd_user_data.sh │ │ ├── slurmdbd_user_data_on_exit.sh │ │ └── user_data_bootstrap.sh ├── setup.py ├── slurm_installer │ ├── __init__.py │ ├── find_existing_resources.py │ ├── installer.py │ └── prompt.py ├── source.bat └── tests │ ├── __init__.py │ └── unit │ ├── __init__.py │ └── test_cdk_slurm_stack.py ├── tests ├── instance_type_info.old_format.json └── test_slurm_minimal.py └── xio ├── userdata.txt ├── xio-cloudformation-data-plane-iam.yaml └── xio-ems-2.3.2.yaml /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig for configuring multiple IDEs: https://EditorConfig.org 2 | 3 | root = True 4 | 5 | # Unix-style newlines with a newline ending every file 6 | [*] 7 | end_of_line = lf 8 | insert_final_newline = true 9 | 10 | # Matches multiple files with brace expansion notation 11 | # Set default charset 12 | [*.{js,py}] 13 | charset = utf-8 14 | 15 | # 4 space indentation 16 | [*.py] 17 | indent_style = space 18 | indent_size = 4 19 | 20 | # Matches the exact files either package.json or .travis.yml 21 | [*.yml] 22 | indent_style = space 23 | indent_size = 2 24 | 25 | # Tab indentation (no size specified) 26 | [Makefile,*.make] 27 | indent_style = tab 28 | indent_size = 4 29 | -------------------------------------------------------------------------------- /.gitallowed: -------------------------------------------------------------------------------- 1 | key = 'ParallelClusterEnableEnaExpressPolicyArn' 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] Concise and Descriptive Issue Title" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Repository Version** 27 | Version of the repository that you are using. 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] Concise and Descriptive Issue Title" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. 
I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | .mkdocs_venv/ 4 | _site 5 | site/ 6 | .vscode/ 7 | source/resources/parallel-cluster/config/build-files/*/*/parallelcluster-*.yml 8 | security_scan/bandit-env 9 | security_scan/bandit.log 10 | security_scan/cfn_nag.log 11 | security_scan/ScoutSuite 12 | 13 | __pycache__ 14 | 15 | .venv* 16 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.trimFinalNewlines": true, 3 | "files.trimTrailingWhitespace": true, 4 | "files.watcherExclude": { 5 | "**/cdk.out": true 6 | }, 7 | "makefile.configureOnOpen": false 8 | } 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. 
Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gem 'jekyll' 3 | gem 'jekyll-theme-slate' 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | .PHONY: help local-docs security_scan test clean 3 | 4 | help: 5 | @echo "Usage: make [ help | local-docs | github-docs | security_scan | test | clean ]" 6 | 7 | .mkdocs_venv/bin/activate: 8 | rm -rf .mkdocs_venv 9 | python3 -m venv .mkdocs_venv 10 | source .mkdocs_venv/bin/activate; pip install mkdocs 11 | 12 | local-docs: .mkdocs_venv/bin/activate 13 | source .mkdocs_venv/bin/activate; mkdocs serve& 14 | firefox http://127.0.0.1:8000/ 15 | 16 | github-docs: .mkdocs_venv/bin/activate 17 | source .mkdocs_venv/bin/activate; mkdocs gh-deploy --strict 18 | 19 | security_scan: 20 | security_scan/security_scan.sh 21 | 22 | test: 23 | pytest -x -v tests 24 | 25 | ansible-lint: 26 | source setup.sh; pip install ansible ansible-lint; ansible-lint --nocolor source/resources/playbooks 27 | 28 | clean: 29 | git clean -d -f -x 30 | # -d: Recurse into directories 31 | -------------------------------------------------------------------------------- /create-slurm-security-groups.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | 3 | cd create-slurm-security-groups 4 | 5 | python3 -m venv .venv 6 | source .venv/bin/activate 7 | python3 -m pip install -r requirements.txt 8 | pwd 9 | ./create-slurm-security-groups.py "$@" 10 | -------------------------------------------------------------------------------- /create-slurm-security-groups/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | .pytest_cache 4 | *.egg-info 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # Environments 12 | .env 13 | .venv 14 | env/ 15 | venv/ 16 | ENV/ 17 | env.bak/ 18 | venv.bak/ 19 | 20 | # CDK Context & Staging files 21 | .cdk.staging/ 22 | cdk.out/ 23 | 24 | cdk.context.json 25 | -------------------------------------------------------------------------------- /create-slurm-security-groups/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Welcome to your CDK Python project! 3 | 4 | You should explore the contents of this project. It demonstrates a CDK app with an instance of a stack (`create_security_groups_stack`) 5 | which contains an Amazon SQS queue that is subscribed to an Amazon SNS topic. 6 | 7 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 8 | 9 | This project is set up like a standard Python project. The initialization process also creates 10 | a virtualenv within this project, stored under the .venv directory. To create the virtualenv 11 | it assumes that there is a `python3` executable in your path with access to the `venv` package. 12 | If for any reason the automatic creation of the virtualenv fails, you can create the virtualenv 13 | manually once the init process completes. 14 | 15 | To manually create a virtualenv on MacOS and Linux: 16 | 17 | ``` 18 | $ python3 -m venv .venv 19 | ``` 20 | 21 | After the init process completes and the virtualenv is created, you can use the following 22 | step to activate your virtualenv. 
23 | 24 | ``` 25 | $ source .venv/bin/activate 26 | ``` 27 | 28 | If you are a Windows platform, you would activate the virtualenv like this: 29 | 30 | ``` 31 | % .venv\Scripts\activate.bat 32 | ``` 33 | 34 | Once the virtualenv is activated, you can install the required dependencies. 35 | 36 | ``` 37 | $ pip install -r requirements.txt 38 | ``` 39 | 40 | At this point you can now synthesize the CloudFormation template for this code. 41 | 42 | ``` 43 | $ cdk synth 44 | ``` 45 | 46 | You can now begin exploring the source code, contained in the hello directory. 47 | There is also a very trivial test included that can be run like this: 48 | 49 | ``` 50 | $ pytest 51 | ``` 52 | 53 | To add additional dependencies, for example other CDK libraries, just add to 54 | your requirements.txt file and rerun the `pip install -r requirements.txt` 55 | command. 56 | 57 | ## Useful commands 58 | 59 | * `cdk ls` list all stacks in the app 60 | * `cdk synth` emits the synthesized CloudFormation template 61 | * `cdk deploy` deploy this stack to your default AWS account/region 62 | * `cdk diff` compare deployed stack with current state 63 | * `cdk docs` open CDK documentation 64 | 65 | Enjoy! 66 | -------------------------------------------------------------------------------- /create-slurm-security-groups/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import aws_cdk as cdk 4 | from aws_cdk import App, Environment 5 | from create_slurm_security_groups.create_slurm_security_groups_stack import CreateSlurmSecurityGroupsStack 6 | 7 | app = cdk.App() 8 | 9 | cdk_env = Environment( 10 | account = app.node.try_get_context('account_id'), 11 | region = app.node.try_get_context('region') 12 | ) 13 | stack_name = app.node.try_get_context('stack_name') 14 | 15 | CreateSlurmSecurityGroupsStack(app, stack_name, env=cdk_env, termination_protection = True,) 16 | 17 | app.synth() 18 | -------------------------------------------------------------------------------- /create-slurm-security-groups/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 19 | "@aws-cdk/core:checkSecretUsage": true, 20 | "@aws-cdk/core:target-partitions": [ 21 | "aws", 22 | "aws-cn" 23 | ], 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 29 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 30 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 31 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 32 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 33 | "@aws-cdk/core:enablePartitionLiterals": true, 34 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 35 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 36 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 37 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 38 | 
"@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 39 | "@aws-cdk/aws-route53-patters:useCertificate": true, 40 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false, 41 | "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, 42 | "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, 43 | "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, 44 | "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, 45 | "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, 46 | "@aws-cdk/aws-redshift:columnId": true, 47 | "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, 48 | "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, 49 | "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, 50 | "@aws-cdk/aws-kms:aliasNameRef": true, 51 | "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, 52 | "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, 53 | "@aws-cdk/aws-efs:denyAnonymousAccess": true, 54 | "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, 55 | "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, 56 | "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, 57 | "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, 58 | "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, 59 | "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, 60 | "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /create-slurm-security-groups/create_slurm_security_groups/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/create-slurm-security-groups/create_slurm_security_groups/__init__.py -------------------------------------------------------------------------------- /create-slurm-security-groups/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /create-slurm-security-groups/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.111.0 2 | boto3 3 | colored 4 | constructs>=10.0.0,<11.0.0 5 | packaging 6 | -------------------------------------------------------------------------------- /create-slurm-security-groups/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/create-slurm-security-groups/tests/__init__.py -------------------------------------------------------------------------------- /create-slurm-security-groups/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/create-slurm-security-groups/tests/unit/__init__.py -------------------------------------------------------------------------------- /create-slurm-security-groups/tests/unit/test_create_slurm_security_groups_stack.py: -------------------------------------------------------------------------------- 1 | import 
aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | from create_slurm_security_groups.create_slurm_security_groups_stack import CreateSlurmSecurityGroupsStack 4 | 5 | 6 | def test_sqs_queue_created(): 7 | app = core.App() 8 | stack = CreateSlurmSecurityGroupsStack(app, "create-slurm-security-groups") 9 | template = assertions.Template.from_stack(stack) 10 | 11 | template.has_resource_properties("AWS::SQS::Queue", { 12 | "VisibilityTimeout": 300 13 | }) 14 | 15 | 16 | def test_sns_topic_created(): 17 | app = core.App() 18 | stack = CreateSlurmSecurityGroupsStack(app, "create-slurm-security-groups") 19 | template = assertions.Template.from_stack(stack) 20 | 21 | template.resource_count_is("AWS::SNS::Topic", 1) 22 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/delete-cluster.md: -------------------------------------------------------------------------------- 1 | # Delete Cluster 2 | 3 | Before deleting the cluster, you should stop the cluster and make sure that no instances are 4 | connected to the cluster's head node. 5 | 6 | For example, you should deconfigure external login nodes and instances that are creating and updating the users_groups.json file. 7 | 8 | If you specified RESEnvironmentName then it will also deconfigure the creation of `users_groups.json` and deconfigure the VDI 9 | instances so that they are no longer using the cluster. 10 | 11 | If you configured [DomainJoinedInstance](config.md/#domainjoinedinstance) then the creation of `users_groups.json` will be automatically deconfigured. 12 | 13 | If you configured [ExternalLoginNodes](config.md/#externalloginnodes) then they will be automatically deconfigured. 14 | 15 | If you did this configuration manually, then you should also deconfigure them manually before deleting the cluster. 16 | Otherwise, the NFS mounts of the head node will hang and file system related commands on the instance may hang. 17 | The commands to manually deconfigure can be found in the outputs of the configuration stack. 18 | 19 | | Output | Description 20 | |--------|------------- 21 | | command10CreateUsersGroupsJsonDeconfigure | Deconfigure the creation of users_groups.json 22 | | command11ExternalLoginNodeDeconfigure | Deconfigure external login node 23 | 24 | To delete the cluster, all you need to do is delete the configuration CloudFormation stack. 25 | This will delete the ParallelCluster cluster stack and all of the configuration resources. 26 | You should not manually delete the ParallelCluster stack. 27 | If you do, the deconfiguration of login nodes and such may fail. 28 | 29 | If you deployed the Slurm database stack then you can keep it and use it for other clusters. 30 | If you don't need it anymore, then you can delete the stack. 31 | You will also need to manually delete the RDS database. 32 | 33 | If you deployed the ParallelCluster UI then you can keep it and use it with other clusters. 34 | If you don't need it anymore, then you can delete the stack. 35 | -------------------------------------------------------------------------------- /docs/federation.md: -------------------------------------------------------------------------------- 1 | # Federation (legacy) 2 | 3 | To maximize performance, EDA workloads should run in a single AZ.
4 | If you need to run jobs in more than one AZ then you can use the [federation feature of Slurm](https://slurm.schedmd.com/federation.html) so that you can run jobs on multiple clusters. 5 | 6 | The config directory has example configuration files that demonstrate how deploy federated cluster into 3 AZs. 7 | 8 | * [source/config/slurm_eda_az1.yml](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/config/slurm_eda_az1.yml) 9 | * [source/config/slurm_eda_az2.yml](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/config/slurm_eda_az2.yml) 10 | * [source/config/slurm_eda_az3.yml](https://github.com/aws-samples/aws-eda-slurm-cluster/blob/main/source/config/slurm_eda_az3.yml) 11 | 12 | These clusters should be deployed sequentially. 13 | The first cluster creates a cluster and a slurmdbd instance. 14 | The other 2 clusters are deployed into their own AZ by configuring the SubnetId of the cluster. 15 | They reuse the same slurmdbd instance so that they can reuse a common pool of licenses that 16 | is managed by the slurmdbd instance. 17 | 18 | The config files for the 2nd and 3rd clusters provide the stack names from the others 19 | so that the security groups can be updated to allow the required network traffic between the 20 | clusters. 21 | 22 | The following shows an example of the configuration. 23 | 24 | slurm_eda_az1: 25 | ``` 26 | Federation: 27 | Name: slurmeda 28 | FederatedClusterStackNames: [] 29 | ``` 30 | 31 | slurm_eda_az2: 32 | ``` 33 | Federation: 34 | Name: slurmeda 35 | FederatedClusterStackNames: 36 | - slurmedaaz1 37 | ``` 38 | 39 | slurm_eda_az3: 40 | ``` 41 | Federation: 42 | Name: slurmeda 43 | FederatedClusterStackNames: 44 | - slurmedaaz1 45 | - slurmedaaz2 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/images/res-Admin-properties.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/docs/images/res-Admin-properties.png -------------------------------------------------------------------------------- /docs/images/res-ServiceAccount-properties.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/docs/images/res-ServiceAccount-properties.png -------------------------------------------------------------------------------- /docs/images/res-ad-users-and-computers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/docs/images/res-ad-users-and-computers.png -------------------------------------------------------------------------------- /docs/images/res-start-ad-users-and-computers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/docs/images/res-start-ad-users-and-computers.png -------------------------------------------------------------------------------- /docs/images/res-users.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/docs/images/res-users.png 
-------------------------------------------------------------------------------- /docs/images/res-windows-administrative-tools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/docs/images/res-windows-administrative-tools.png -------------------------------------------------------------------------------- /docs/implementation.md: -------------------------------------------------------------------------------- 1 | # Implementation Details (legacy) 2 | 3 | ## Slurm Infrastructure 4 | 5 | All hosts in the cluster must share a uniform user and group namespace. 6 | 7 | The munged service must be running before starting any slurm daemons. 8 | 9 | ## Directory Structure 10 | 11 | All of the configuration files, scripts, and logs can be found under the following directory. 12 | 13 | ``` 14 | /opt/slurm/{{ClusterName}} 15 | ``` 16 | 17 | ## CloudWatch Metrics 18 | 19 | CloudWatch metrics are published by the following sources, but the code is all in `SlurmPlugin.py`. 20 | 21 | * Slurm power saving scripts 22 | * `/opt/slurm/{{ClusterName}}/bin/slurm_ec2_resume.py` 23 | * `/opt/slurm/{{ClusterName}}/bin/slurm_ec2_resume_fail.py` 24 | * `/opt/slurm/{{ClusterName}}/bin/slurm_ec2_stop.py` 25 | * `/opt/slurm/{{ClusterName}}/bin/slurm_ec2_terminate.py` 26 | * Spot monitor running on compute nodes 27 | * `/opt/slurm/{{ClusterName}}/bin/spot_monitor.py` 28 | * Cron jobs running on the Slurm controller 29 | * `/opt/slurm/{{ClusterName}}/bin/slurm_ec2_publish_cw.py` 30 | * `/opt/slurm/{{ClusterName}}/bin/terminate_old_instances.py` 31 | 32 | ## Down Node Handling 33 | 34 | If a node has a problem running jobs then Slurm can mark it DOWN. 35 | This includes the case where the resume script cannot start an instance for any reason, including insufficient EC2 capacity. 36 | This can create 2 issues. First, if the compute node is still running then it is incurring unnecessary EC2 costs. 37 | Second, the node will be unavailable for scheduling, which reduces the configured capacity of the cluster. 38 | 39 | The cluster is configured to periodically check for DOWN nodes so that they aren't left running and wasting compute costs. 40 | This is done by `/opt/slurm/{{ClusterName}}/bin/slurm_down_nodes_clean.sh`. 41 | 42 | The script is called every day by a systemd service: 43 | 44 | `/etc/systemd/system/slurm_down_nodes_clean.service` 45 | 46 | This service is run at boot and once a day as defined in 47 | 48 | `/etc/systemd/system/slurm_down_nodes_clean.timer` 49 | 50 | ## Insufficient Capacity Exception (ICE) Handling 51 | 52 | When Slurm schedules a powered down node, it calls the ResumeScript defined in `slurm.conf`. 53 | This is in `/opt/slurm/{{ClusterName}}/bin/slurm_ec2_resume.py`. 54 | The script will attempt to start an EC2 instance and if it receives an InsufficientCapacityException (ICE) then the node will be marked down and Slurm will requeue the job. 55 | However, this is inadequate because if there are a large number of instances of that instance type configured then 56 | Slurm will schedule them and try to start them with the same result. 57 | Eventually all of the powered down nodes will be marked DOWN and, depending on the job requirements, the job will be allocated 58 | to a node with a different instance type or it will fail. 59 | This can take a substantial amount of time, so `SlurmPlugin.py` does the following when it receives an ICE.
60 | 61 | * Mark the node as DRAIN so no new jobs are scheduled on it. 62 | * Find all other powered down nodes of the same type and mark them DOWN so that they won't be scheduled after this node is marked DOWN. Nodes that are running will be left alone. 63 | * Requeue jobs on the node that failed to resume because of ICE. 64 | * Mark the node DOWN. 65 | * Power down the node. This is so that Slurm knows that the node is powered down so that when it is marked IDLE it will be powered up when a job is scheduled on it. 66 | * The `slurm_down_nodes_clean.service` periodically finds all DOWN Slurm nodes, powers them down, and then marks them IDLE so that they can have jobs scheduled on them. This will allow Slurm to attempt to use more nodes of the instance type in the hopes that there is more capacity. If not, then the cycle repeats. 67 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /docs/job_preemption.md: -------------------------------------------------------------------------------- 1 | # Job Preemption 2 | 3 | The cluster is set up with an interactive partition that has a higher priority than all other partitions. 4 | All other partitions are configured to allow jobs to be preempted by the interactive queue. 5 | When an interactive job is pending because of a lack of compute resources, it can preempt another job and use its resources. 6 | The preempted job will be requeued so that it will rerun when resources become available. 7 | 8 | Jobs should rarely pend because of a lack of compute resources if you've defined enough compute nodes in your configuration. 9 | The more likely reason for a job to pend is if it requires a license and all available licenses are already being used. 10 | However, it appears that Slurm doesn't support preemption based on license availability, so if the reason a job is pending is 11 | because of licenses then it will not preempt jobs in a lower priority queue even if doing so would free up a license. 12 | 13 | ## Documentation 14 | 15 | [https://slurm.schedmd.com/preempt.html](https://slurm.schedmd.com/preempt.html) 16 | -------------------------------------------------------------------------------- /docs/rest_api.md: -------------------------------------------------------------------------------- 1 | # Slurm REST API 2 | 3 | The [Slurm REST API](https://slurm.schedmd.com/rest_api.html) gives a programmatic way to access the features of Slurm. 4 | The REST API can be used, for example, by a Lambda function to submit jobs to the Slurm cluster. 5 | 6 | ## How to use the REST API 7 | 8 | The following shows how to run a simple REST call. 9 | 10 | ``` 11 | source /opt/slurm/{{ClusterName}}/config/slurm_config.sh 12 | unset SLURM_JWT 13 | . <(scontrol token) 14 | wget --header "X-SLURM-USER-TOKEN: $SLURM_JWT" --header "X-SLURM-USER-NAME: $USER" -q $SLURMRESTD_URL/slurm/v0.0.38/diag/ -O - 15 | ``` 16 | 17 | The REST API is documented at [https://slurm.schedmd.com/rest_api.html](https://slurm.schedmd.com/rest_api.html). 18 | 19 | The token returned by `scontrol token` has a default lifetime of 3600 seconds (1 hour). 20 | For automation, a cron job on the Slurm controller creates a new token for the `root` and `slurmrestd` users every 30 minutes and stores them in SSM Parameter Store at `/{{ClusterName}}/slurmrestd/jwt/{{user_name}}`.
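As a rough, hypothetical sketch (not code that ships with this repository), an automation could read that parameter and call the REST API like this. The parameter path, the `X-SLURM-USER-TOKEN` and `X-SLURM-USER-NAME` headers, and the `/slurm/v0.0.38/diag/` endpoint come from the examples above; the `CLUSTER_NAME`, `SLURM_USER`, and `SLURMRESTD_URL` environment variables are assumptions you would supply yourself (for example in a Lambda function's environment), and the caller also needs network access to slurmrestd plus IAM permission for `ssm:GetParameter`.

```
import json
import os
import urllib.request

import boto3

# Assumed environment variables for this sketch; set them to match your deployment.
CLUSTER_NAME = os.environ['CLUSTER_NAME']
USER_NAME = os.environ.get('SLURM_USER', 'slurmrestd')
SLURMRESTD_URL = os.environ['SLURMRESTD_URL']  # Same value that slurm_config.sh exports


def get_jwt(cluster_name: str, user_name: str) -> str:
    # Read the token that the head node cron job refreshes every 30 minutes.
    ssm = boto3.client('ssm')
    parameter = ssm.get_parameter(
        Name=f"/{cluster_name}/slurmrestd/jwt/{user_name}",
        WithDecryption=True)
    return parameter['Parameter']['Value']


def get_diag() -> dict:
    # Call the diag endpoint with the same headers used in the wget example above.
    request = urllib.request.Request(
        f"{SLURMRESTD_URL}/slurm/v0.0.38/diag/",
        headers={
            'X-SLURM-USER-TOKEN': get_jwt(CLUSTER_NAME, USER_NAME),
            'X-SLURM-USER-NAME': USER_NAME,
        })
    with urllib.request.urlopen(request) as response:
        return json.load(response)


if __name__ == '__main__':
    print(json.dumps(get_diag(), indent=2))
```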
21 | These tokens can be used by automations such as a Lambda function to access the REST API. 22 | An example Lambda function called `{{ClusterName}}-CallSlurmRestApiLambda` shows how to call various API functions. 23 | You can use this as a template to write functions that use your Slurm cluster for automations. 24 | -------------------------------------------------------------------------------- /docs/soca_integration.md: -------------------------------------------------------------------------------- 1 | # SOCA Integration 2 | 3 | [Scale Out Computing on AWS (SOCA)](https://aws.amazon.com/solutions/implementations/scale-out-computing-on-aws/) is an AWS solution that 4 | was the basis for the [Research and Engineering Studio (RES)](https://docs.aws.amazon.com/res/latest/ug/overview.html) service. 5 | Unless you are already a SOCA user, it is highly recommended that you use RES, which is a fully supported AWS service. 6 | 7 | Integration with SOCA is straightforward. 8 | 9 | Set the following parameters in your config file. 10 | 11 | | Parameter | Description | Value 12 | |-----------|-------------|------ 13 | | VpcId | VPC id for the SOCA cluster | vpc-xxxxxx 14 | | slurm/SlurmCtl/AdditionalSecurityGroups | Security group ids that give desktop instances access to the head node and that give the head node access to VPC resources such as file systems. 15 | | slurm/InstanceConfig/AdditionalSecurityGroups | Security group ids that give desktop instances access to the compute nodes and that give compute nodes access to VPC resources such as file systems. 16 | | ExtraMounts | Add the mount parameters for the /apps and /data directories. This is required for access to the home directory. | 17 | 18 | Deploy your Slurm cluster. 19 | 20 | Connect to the SOCA Scheduler instance and follow the instructions to [Create users_groups.json](deploy-parallel-cluster.md#create-users_groupsjson). 21 | 22 | Connect to a remote desktop instance and follow the instructions in [Configure submission hosts to use the cluster](deploy-parallel-cluster.md#configure-submission-hosts-to-use-the-cluster). 23 | If all users need to use the cluster then it is probably best to create a custom AMI that is already configured with the configuration 24 | commands. 25 | 26 | You are now ready to run jobs from your SOCA desktop. 27 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | ## Contents 5 | 6 | * [README](docs/index.md) 7 | * [Deploy the Cluster](docs/deploy.md) 8 | * [Run Jobs](docs/run_jobs.md) 9 | * [On-Premises Integration](docs/onprem.md) 10 | * [SOCA Integration](docs/soca_integration.md) 11 | * [SLURM AMI Based On FPGA Developer AMI](docs/f1-ami.md) 12 | * [Federation](docs/federation.md) 13 | * [Implementation Details](docs/implementation.md) 14 | * [Debug](docs/debug.md) 15 | * [To Do List](docs/todo.md) 16 | * [mkdocs](mkdocs.md) 17 | 18 | {% include_relative docs/index.md %} 19 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | # SPDX-License-Identifier: MIT-0 4 | 5 | scriptdir=$(dirname $(readlink -f $0)) 6 | repodir=$scriptdir 7 | 8 | cd $repodir 9 | 10 | source setup.sh 11 | 12 | source/installer.py "$@" 13 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: EDA SLURM Cluster on AWS 2 | repo_url: https://github.com/aws-samples/aws-eda-slurm-cluster 3 | docs_dir: docs 4 | nav: 5 | - 'index.md' 6 | - 'deployment-prerequisites.md' 7 | - 'security-groups.md' 8 | - 'deploy-parallel-cluster.md' 9 | - 'config.md' 10 | - 'res_integration.md' 11 | - 'soca_integration.md' 12 | - 'exostellar-workload-optimizer.md' 13 | - 'exostellar-infrastructure-optimizer.md' 14 | - 'custom-amis.md' 15 | - 'run_jobs.md' 16 | - 'job_preemption.md' 17 | - 'rest_api.md' 18 | - 'onprem.md' 19 | - 'containers.md' 20 | # - 'federation.md' 21 | - 'delete-cluster.md' 22 | # - 'implementation.md' 23 | - 'debug.md' 24 | strict: true 25 | theme: 26 | name: mkdocs 27 | #name: readthedocs 28 | hljs_languages: 29 | - python 30 | - yaml 31 | navigation_depth: 4 32 | nav_style: dark 33 | features: 34 | - navigation.tabs 35 | -------------------------------------------------------------------------------- /res/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .venv 3 | rendered_templates/ 4 | -------------------------------------------------------------------------------- /res/Makefile: -------------------------------------------------------------------------------- 1 | 2 | diff-base: 3 | meld res-demo-original/bi.yaml res-demo-with-cidr/bi.yaml & 4 | meld res-demo-original/keycloak.yaml res-demo-with-cidr/keycloak.yaml & 5 | meld res-demo-original/res-sso-keycloak.yaml res-demo-with-cidr/res-sso-keycloak.yaml & 6 | meld res-demo-original/res.ldif res-demo-with-cidr/res.ldif & 7 | 8 | diff-top: 9 | meld res-demo-original/res-demo-stack.yaml res-demo-with-cidr/res-demo-stack.yaml & 10 | meld res-demo-with-cidr/res-demo-stack.yaml res-demo-with-cidr/res-bi-only.yaml & 11 | meld res-demo-with-cidr/res-demo-stack.yaml res-demo-with-cidr/res-only.yaml & 12 | 13 | diff: diff-base diff-top 14 | -------------------------------------------------------------------------------- /res/download-res-templates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | # 5 | # Download the original templates from s3 6 | # This is so that they can be used to create modified versions 7 | 8 | script_dir=$(dirname $(realpath $0)) 9 | cd $script_dir 10 | aws s3 cp s3://aws-hpc-recipes/main/recipes/res/res_demo_env/assets/bi.yaml res-demo-original/. 11 | aws s3 cp s3://aws-hpc-recipes/main/recipes/res/res_demo_env/assets/keycloak.yaml res-demo-original/. 12 | aws s3 cp s3://aws-hpc-recipes/main/recipes/res/res_demo_env/assets/res-demo-stack.yaml res-demo-original/. 13 | aws s3 cp s3://aws-hpc-recipes/main/recipes/res/res_demo_env/assets/res-sso-keycloak.yaml res-demo-original/. 14 | aws s3 cp s3://aws-hpc-recipes/main/recipes/net/hpc_large_scale/assets/main.yaml res-demo-original/networking.yaml 15 | 16 | aws s3 cp s3://aws-hpc-recipes/main/recipes/res/res_demo_env/assets/res.ldif res-demo-original/. 
17 | -------------------------------------------------------------------------------- /res/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | jinja2 3 | -------------------------------------------------------------------------------- /res/upload-res-templates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | # 5 | # Create a virtual environment and call upload-res-templates.py. 6 | 7 | script_dir=$(dirname $(realpath $0)) 8 | cd $script_dir 9 | 10 | python3 -m venv .venv 11 | source .venv/bin/activate 12 | python3 -m pip install -r requirements.txt 13 | ./upload-res-templates.py "$@" 14 | -------------------------------------------------------------------------------- /security_scan/cfn_nag-deny-list.yml: -------------------------------------------------------------------------------- 1 | 2 | RulesToSuppress: 3 | - id: W12 # IAM policy should not allow * resource 4 | reason: LogRetentionaae0aa3c5b4d4f87b02d85b201efdd8aServiceRoleDefaultPolicyADDA7DEB created by CDK so can't fix. 5 | - id: W58 # Lambda functions require permission to write CloudWatch Logs 6 | reason: Logs permissions granted by AWSLambdaBasicExecutionRole 7 | - id: W76 # SPCM for IAM policy document is higher than 25 8 | reason: "SlurmCtlPolicyD0AD24C6, SlurmNodeAmiRoleDefaultPolicy24A6F225, SlurmNodeAmiPolicyD9697183" 9 | - id: W77 # Secrets Manager Secret should explicitly specify KmsKeyId. Besides control of the key this will allow the secret to be shared cross-account 10 | reason: Using AWS provided key 11 | - id: W89 # Lambda functions should be deployed inside a VPC 12 | reason: No VPC is required so not sure why this is a warning. Seems to violate principle of least privilege. 13 | - id: W92 # Lambda functions should define ReservedConcurrentExecutions to reserve simultaneous executions 14 | reason: Not required by these lambdas which are infrequently called 15 | -------------------------------------------------------------------------------- /security_scan/security_scan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | scriptdir=$(dirname $(readlink -f $0)) 6 | 7 | cd $scriptdir/.. 8 | ./install.sh --config-file ~/slurm/res-eda/res-eda-pc-3-9-1-rhel8-x86-config.yml --cdk-cmd synth 9 | 10 | cfn_nag_scan --input-path $scriptdir/../source/cdk.out/res-eda-pc-3-9-1-rhel8-x86-config.template.json --deny-list-path $scriptdir/cfn_nag-deny-list.yml --fail-on-warnings &> $scriptdir/cfn_nag.log 11 | 12 | cd $scriptdir 13 | if [ ! -e $scriptdir/bandit-env ]; then 14 | python3 -m venv bandit-env 15 | source bandit-env/bin/activate 16 | pip install bandit 17 | python3 -m pip install bandit 18 | fi 19 | source bandit-env/bin/activate 20 | 21 | cd $scriptdir/.. 22 | bandit -r source/resources/playbooks/roles/SlurmCtl/files/opt/slurm/cluster/bin/SlurmPlugin.py &> $scriptdir/bandit.log 23 | 24 | if [ ! -e $scriptdir/ScoutSuite ]; then 25 | cd $scriptdir 26 | git clone https://github.com/nccgroup/ScoutSuite 27 | fi 28 | if [ ! 
-e $scriptdir/ScoutSuite/scoutesuite-venv/bin/activate ]; then 29 | cd $scriptdir/ScoutSuite 30 | rm -f scoutesuite-venv 31 | python3 -m venv scoutesuite-venv 32 | source scoutesuite-venv/bin/activate 33 | python3 -m pip install -r requirements.txt 34 | fi 35 | cd $scriptdir/ScoutSuite 36 | source scoutesuite-venv/bin/activate 37 | python scout.py aws -r us-east-1 38 | -------------------------------------------------------------------------------- /source/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | .pytest_cache 4 | *.egg-info 5 | node_modules 6 | **/__pycache__/ 7 | 8 | # Environments 9 | .env 10 | .venv 11 | env/ 12 | venv/ 13 | ENV/ 14 | env.bak/ 15 | venv.bak/ 16 | 17 | # CDK Context & Staging files 18 | cdk.context.json 19 | .cdk.staging/ 20 | cdk.out/ 21 | 22 | installer_history.txt 23 | .requirements_installed 24 | -------------------------------------------------------------------------------- /source/CDK-README.md: -------------------------------------------------------------------------------- 1 | 2 | # Welcome to your CDK Python project! 3 | 4 | You should explore the contents of this project. It demonstrates a CDK app with an instance of a stack (`cdk_stack`) 5 | which contains an Amazon SQS queue that is subscribed to an Amazon SNS topic. 6 | 7 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 8 | 9 | This project is set up like a standard Python project. The initialization process also creates 10 | a virtualenv within this project, stored under the .venv directory. To create the virtualenv 11 | it assumes that there is a `python3` executable in your path with access to the `venv` package. 12 | If for any reason the automatic creation of the virtualenv fails, you can create the virtualenv 13 | manually once the init process completes. 14 | 15 | To manually create a virtualenv on MacOS and Linux: 16 | 17 | ``` 18 | $ python3 -m venv .venv 19 | ``` 20 | 21 | After the init process completes and the virtualenv is created, you can use the following 22 | step to activate your virtualenv. 23 | 24 | ``` 25 | $ source .venv/bin/activate 26 | ``` 27 | 28 | If you are a Windows platform, you would activate the virtualenv like this: 29 | 30 | ``` 31 | % .venv\Scripts\activate.bat 32 | ``` 33 | 34 | Once the virtualenv is activated, you can install the required dependencies. 35 | 36 | ``` 37 | $ pip install -r requirements.txt 38 | ``` 39 | 40 | At this point you can now synthesize the CloudFormation template for this code. 41 | 42 | ``` 43 | $ cdk synth 44 | ``` 45 | 46 | You can now begin exploring the source code, contained in the hello directory. 47 | There is also a very trivial test included that can be run like this: 48 | 49 | ``` 50 | $ pytest 51 | ``` 52 | 53 | To add additional dependencies, for example other CDK libraries, just add to 54 | your requirements.txt file and rerun the `pip install -r requirements.txt` 55 | command. 56 | 57 | ## Useful commands 58 | 59 | * `cdk ls` list all stacks in the app 60 | * `cdk synth` emits the synthesized CloudFormation template 61 | * `cdk deploy` deploy this stack to your default AWS account/region 62 | * `cdk diff` compare deployed stack with current state 63 | * `cdk docs` open CDK documentation 64 | 65 | Enjoy! 
66 | -------------------------------------------------------------------------------- /source/EC2InstanceTypeInfoPkg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-eda-slurm-cluster/673567ba046ead31d4849534a43e4fb4de2a62cc/source/EC2InstanceTypeInfoPkg/__init__.py -------------------------------------------------------------------------------- /source/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from botocore.exceptions import NoCredentialsError 5 | from EC2InstanceTypeInfoPkg.EC2InstanceTypeInfo import EC2InstanceTypeInfo 6 | import logging 7 | from sys import exit 8 | from VersionCheck import logger as VersionCheck_logger, VersionCheck 9 | 10 | if __name__ == '__main__': 11 | try: 12 | parser = argparse.ArgumentParser(description="Get EC2 instance pricing info.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument("--region", "-r", type=str, default=[], action='append', help="AWS region(s) to get info for.") 14 | parser.add_argument("--input", '-i', type=str, default=None, help="JSON input file. Reads existing info from previous runs. Can speed up rerun if it failed to collect the data for a region.") 15 | parser.add_argument("--output-csv", '-o', type=str, default=None, help="CSV output file. Default: instance_type_info.csv") 16 | parser.add_argument("--disable-version-check", action='store_const', const=True, default=False, help="Disable git version check") 17 | parser.add_argument("--debug", "-d", action='store_const', const=True, default=False, help="Enable debug messages") 18 | args = parser.parse_args() 19 | 20 | if args.debug: 21 | VersionCheck_logger.setLevel(logging.DEBUG) 22 | 23 | if not args.disable_version_check and not VersionCheck().check_git_version(): 24 | exit(1) 25 | 26 | if args.input: 27 | print(f"Reading existing instance info from {args.input}") 28 | ec2InstanceTypeInfo = EC2InstanceTypeInfo(args.region, json_filename=args.input, debug=args.debug) 29 | if args.output_csv: 30 | print(f"\nWriting output to CSV: {args.output_csv}") 31 | ec2InstanceTypeInfo.print_csv(args.output_csv) 32 | except NoCredentialsError as e: 33 | print('No AWS credentials found') 34 | exit(1) 35 | -------------------------------------------------------------------------------- /source/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from botocore.exceptions import ClientError 4 | from functools import wraps 5 | import logging 6 | from logging import error, info, warning, handlers 7 | import random 8 | import time 9 | import traceback 10 | 11 | logger = logging.getLogger(__file__) 12 | 13 | logger_formatter = logging.Formatter('%(levelname)s:%(asctime)s: %(message)s') 14 | logger_streamHandler = logging.StreamHandler() 15 | logger_streamHandler.setFormatter(logger_formatter) 16 | logger.addHandler(logger_streamHandler) 17 | logger.setLevel(logging.INFO) 18 | #logger.setLevel(logging.DEBUG) 19 | 20 | def retry_boto3_throttling(min_delay = 1, max_delay = 10 * 60, max_cumulative_delay = 12 * 3600, base = 1, logger = logger): 21 | """ 22 | Retry calling the decorated function using a linear or exponential backoff. 
23 | 24 | This is to handle EC2 API and resource throttling which uses a token bucket 25 | with a fixed refill rate. Once the bucket is emptied then throttling occurs 26 | until tokens are added. Tokens are added every second so the minimum retry 27 | interval is 1 second up to the specified maximum delay. 28 | 29 | I think I like this one better since it randomly spreads the backoff while 30 | still allowing some short backoffs. 31 | 32 | https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ 33 | 34 | http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/ 35 | original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry 36 | 37 | Decorators described here: 38 | https://docs.python.org/2/whatsnew/2.4.html?highlight=decorator#pep-318-decorators-for-functions-and-methods 39 | 40 | :param min_delay: Minimum delay before retry 41 | :type min_delay: int 42 | 43 | :param max_delay: Maximum delay before retry 44 | :type max_delay: int 45 | 46 | :param max_cumulative_delay: Maximum total time to wait in seconds 47 | :type max_cumulative_delay: int 48 | 49 | :param base: Base for exponential backoff 50 | :type base: int 51 | 52 | :param logger: logger to use. 53 | :type logger: logging.Logger instance 54 | """ 55 | def deco_retry(f): 56 | 57 | @wraps(f) 58 | def f_retry(*args, **kwargs): 59 | attempt = 0 60 | cumulative_delay = 0.0 61 | while (cumulative_delay < max_cumulative_delay): 62 | try: 63 | attempt += 1 64 | return f(*args, **kwargs) 65 | except ClientError as e: 66 | logging.debug("Caught exception") 67 | if e.response['Error']['Code'] in ['RequestLimitExceeded', 'InternalError', 'ThrottlingException']: 68 | pass 69 | else: 70 | logging.debug("Rethrew exception") 71 | raise e 72 | logger.debug("%s" % (traceback.format_exc())) 73 | logger.debug("attempt=%d" % attempt) 74 | current_max_delay = min(max_delay, base * 2 ** attempt) 75 | logger.debug("delay_range=(%f %f)" % (min_delay, current_max_delay)) 76 | delay = random.uniform(min_delay, current_max_delay) # nosec 77 | logger.debug("cumulative delay=%f max=%d" % (cumulative_delay, max_cumulative_delay)) 78 | logger.debug("Retrying in %f seconds..." % (delay)) 79 | time.sleep(delay) 80 | cumulative_delay += delay 81 | return f(*args, **kwargs) 82 | 83 | return f_retry # true decorator 84 | 85 | return deco_retry 86 | -------------------------------------------------------------------------------- /source/Makefile: -------------------------------------------------------------------------------- 1 | 2 | .requirements_installed: requirements.txt 3 | pip install -r requirements.txt 4 | touch $@ 5 | -------------------------------------------------------------------------------- /source/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | SPDX-License-Identifier: MIT-0 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, 9 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so. 
11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | """ 19 | 20 | from aws_cdk import App, Environment 21 | 22 | from cdk.cdk_slurm_stack import CdkSlurmStack 23 | 24 | app = App() 25 | 26 | cdk_env = Environment( 27 | account = app.node.try_get_context('account_id'), 28 | region = app.node.try_get_context('region') 29 | ) 30 | 31 | CdkSlurmStack(app, app.node.try_get_context('stack_name'), env=cdk_env, 32 | termination_protection = True, 33 | ) 34 | 35 | app.synth() 36 | -------------------------------------------------------------------------------- /source/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /source/cdk/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, 8 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 12 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 13 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 14 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 16 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | """ 18 | -------------------------------------------------------------------------------- /source/installer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | SPDX-License-Identifier: MIT-0 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, 9 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 13 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 14 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 15 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 17 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 18 | """ 19 | 20 | """ 21 | It's recommended to trigger this script via ../installer.sh as python's virtual env and all required 22 | libraries/dependencies will be automatically installed. 23 | 24 | If you trigger ./installer.py directly, make sure to have all the Python and CDK dependencies installed 25 | """ 26 | 27 | from slurm_installer.installer import SlurmInstaller 28 | 29 | if __name__ == "__main__": 30 | app = SlurmInstaller() 31 | app.main() 32 | -------------------------------------------------------------------------------- /source/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | aws-cdk-lib==2.179.0 3 | boto3 4 | colored 5 | constructs>=10.0.0 6 | hostlist 7 | isodate 8 | jinja2 9 | pytest 10 | python-hostlist 11 | pip 12 | requests 13 | PyYAML>5.4.1 14 | schema 15 | -------------------------------------------------------------------------------- /source/resources/config/default_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #==================================================================== 3 | # Sample configuraton that creates a minimal Slurm cluster 4 | # 5 | # NOTE: This is just an example. 6 | # Please create your own revision controlled config file. 7 | # 8 | # No SlurmDbd in this configuration. 9 | # Configure 5 each of t3 instance types. 10 | # 11 | # This config doesn't provide required parameters like VpcId so you must 12 | # use the --prompt option with it. 13 | # To use: 14 | # source setup.sh 15 | # ./install.sh --config-file source/config/default_config.yml --prompt 16 | # 17 | # Defaults and valid configuration options are in source/config_schema.py. 18 | # Command line values override values in the config file. 19 | #==================================================================== 20 | 21 | StackName: slurmminimal-config 22 | 23 | # @TODO: Add Region 24 | # Region: {{Region}} 25 | 26 | # @TODO: Add your SshKeyPair 27 | # SshKeyPair: {{SshKeyPair}} 28 | 29 | # @TODO: Update with your VPC 30 | # VpcId: vpc-xxxxxxxxxxxxxxxxx 31 | 32 | # @TODO: Update with your private subnet in your VPC 33 | # SubnetId: subnet-xxxxxxxxxxxxxxxxx 34 | 35 | # @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription 36 | # ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} 37 | 38 | # @TODO: Add your preferred timezone so times aren't in UTC 39 | # TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York 40 | 41 | # @TODO: If using Research and Engineering Studio, update with environment name 42 | # RESEnvironmentName: {{ResEnvironmentName}} 43 | 44 | slurm: 45 | ParallelClusterConfig: 46 | Version: 3.12.0 47 | # @TODO: Choose the CPU architecture: x86_64, arm64. Default: x86_64 48 | # Architecture: x86_64 49 | # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into. 
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 50 | # Database: 51 | # DatabaseStackName: {{DatabaseStackName}} 52 | 53 | MungeKeySecret: SlurmMungeKey 54 | 55 | SlurmCtl: {} 56 | 57 | InstanceConfig: 58 | UseSpot: true 59 | Include: 60 | # @TODO: Update InstanceFamilies and InstanceTypes to use in your cluster 61 | InstanceFamilies: 62 | - t3 63 | InstanceTypes: [] 64 | NodeCounts: 65 | # @TODO: Update the max number of each instance type to configure 66 | DefaultMaxCount: 5 67 | # @TODO: You can update the max instance count for each compute resource 68 | # ComputeResourceCounts: 69 | # od-1024gb-16-cores: # x2iedn.8xlarge, x2iezn.8xlarge 70 | # MaxCount: 1 71 | # sp-1024gb-16-cores: # x2iedn.8xlarge, x2iezn.8xlarge 72 | # MaxCount: 2 73 | 74 | # @TODO: Configure storage mounts 75 | # storage: 76 | # ExtraMounts: 77 | # - dest: /home 78 | # StorageType: Efs 79 | # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' 80 | # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ 81 | # type: nfs4 82 | # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 83 | # ExtraMountSecurityGroups: 84 | # nfs: 85 | # DCV-Host: sg-xxxxxxxxxxxxxxxxx 86 | 87 | # @TODO: Configure license counts 88 | Licenses: 89 | vcs: 90 | Count: 10 91 | Server: synopsys_licenses 92 | Port: '24680' 93 | ServerType: flexlm 94 | -------------------------------------------------------------------------------- /source/resources/config/slurm_all_arm_instance_types.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #==================================================================== 3 | # Minimal cluster with all arm64 instance types 4 | # 5 | # NOTE: This is just an example. 6 | # Please create your own revision controlled config file. 7 | # 8 | # No SlurmDbd in this configuration. 9 | # Configure 5 each of all arm64 instance types. 10 | # 11 | # Defaults and valid configuration options are in source/config_schema.py. 12 | # Command line values override values in the config file. 13 | #==================================================================== 14 | 15 | StackName: slurm-all-arm-config 16 | 17 | # @TODO: Add Region 18 | # Region: {{Region}} 19 | 20 | # @TODO: Add your SshKeyPair 21 | # SshKeyPair: {{SshKeyPair}} 22 | 23 | # @TODO: Update with your VPC 24 | # VpcId: vpc-xxxxxxxxxxxxxxxxx 25 | 26 | # @TODO: Update with your private subnet in your VPC 27 | # SubnetId: subnet-xxxxxxxxxxxxxxxxx 28 | 29 | # @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription 30 | # ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} 31 | 32 | # @TODO: Add your preferred timezone so times aren't in UTC 33 | # TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York 34 | 35 | # @TODO: If using Research and Engineering Studio, update with environment name 36 | # RESEnvironmentName: {{ResEnvironmentName}} 37 | 38 | slurm: 39 | ParallelClusterConfig: 40 | Version: 3.9.1 41 | Architecture: arm64 42 | # @TODO: Update DatabaseStackName with the name of the stack that you deployed the ParallelCluster database into.
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 43 | # Database: 44 | # DatabaseStackName: {{DatabaseStackName}} 45 | 46 | MungeKeySecret: SlurmMungeKey 47 | 48 | SlurmCtl: {} 49 | 50 | InstanceConfig: 51 | UseSpot: true 52 | Include: 53 | InstanceFamilies: ['.*'] 54 | InstanceTypes: [] 55 | NodeCounts: 56 | # @TODO: Update the max number of each instance type to configure 57 | DefaultMaxCount: 5 58 | # @TODO: You can update the max instance count for each compute resource 59 | ComputeResourceCounts: 60 | od-1024gb-64-cores: # x2gd.16xlarge 61 | MaxCount: 1 62 | sp-1024gb-64-cores: # x2gd.16xlarge 63 | MaxCount: 2 64 | 65 | # @TODO: Configure storage mounts 66 | # storage: 67 | # ExtraMounts: 68 | # - dest: /home 69 | # StorageType: Efs 70 | # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' 71 | # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ 72 | # type: nfs4 73 | # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 74 | # ExtraMountSecurityGroups: 75 | # nfs: 76 | # DCV-Host: sg-xxxxxxxxxxxxxxxxx 77 | 78 | # @TODO: Configure license counts 79 | Licenses: 80 | vcs: 81 | Count: 10 82 | Server: synopsys_licenses 83 | Port: '24680' 84 | ServerType: flexlm 85 | -------------------------------------------------------------------------------- /source/resources/config/slurm_all_x86_instance_types.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #==================================================================== 3 | # Minimal cluster with all X86_64 instance types 4 | # 5 | # NOTE: This is just an example. 6 | # Please create your own revision controlled config file. 7 | # 8 | # No SlurmDbd in this configuration. 9 | # Configure 10 each of all x86_64 instance types. 10 | # 11 | # Defaults and valid configuration options are in source/config_schema.py. 12 | # Command line values override values in the config file. 13 | #==================================================================== 14 | 15 | StackName: slurm-all-x86-config 16 | 17 | # @TODO: Add Region 18 | # Region: {{Region}} 19 | 20 | # @TODO: Add your SshKeyPair 21 | # SshKeyPair: {{SshKeyPair}} 22 | 23 | # @TODO: Update with your VPC 24 | # VpcId: vpc-xxxxxxxxxxxxxxxxx 25 | 26 | # @TODO: Update with your private subnet in your VPC 27 | # SubnetId: subnet-xxxxxxxxxxxxxxxxx 28 | 29 | # @TODO: Update with your SNS Topic. Make sure to subscribe your email address to the topic and confirm the subscription 30 | # ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} 31 | 32 | # @TODO: Add your preferred timezone so times aren't in UTC 33 | # TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York 34 | 35 | # @TODO: If using Research and Engineering Studio, update with environment name 36 | # RESEnvironmentName: {{ResEnvironmentName}} 37 | 38 | slurm: 39 | ParallelClusterConfig: 40 | Version: 3.9.1 41 | Architecture: x86_64 42 | # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into. 
See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 43 | # Database: 44 | # DatabaseStackName: {{DatabaseStackName}} 45 | 46 | MungeKeySecret: SlurmMungeKey 47 | 48 | SlurmCtl: {} 49 | 50 | InstanceConfig: 51 | UseSpot: true 52 | Include: 53 | InstanceFamilies: ['.*'] 54 | InstanceTypes: [] 55 | NodeCounts: 56 | # @TODO: Update the max number of each instance type to configure 57 | DefaultMaxCount: 5 58 | # @TODO: You can update the max instance count for each compute resource 59 | ComputeResourceCounts: 60 | od-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge 61 | MaxCount: 1 62 | sp-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge 63 | MaxCount: 2 64 | 65 | # @TODO: Configure storage mounts 66 | # storage: 67 | # ExtraMounts: 68 | # - dest: /home 69 | # StorageType: Efs 70 | # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' 71 | # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ 72 | # type: nfs4 73 | # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 74 | # ExtraMountSecurityGroups: 75 | # nfs: 76 | # DCV-Host: sg-xxxxxxxxxxxxxxxxx 77 | 78 | # @TODO: Configure license counts 79 | Licenses: 80 | vcs: 81 | Count: 10 82 | Server: synopsys_licenses 83 | Port: '24680' 84 | ServerType: flexlm 85 | -------------------------------------------------------------------------------- /source/resources/config/slurm_nodes_on_prem.conf: -------------------------------------------------------------------------------- 1 | # 2 | # ON PREMISES COMPUTE NODES 3 | # 4 | # Config file with list of statically provisioned on-premises compute nodes that 5 | # are managed by this cluster. 6 | # 7 | # These nodes must be addressable on the network and firewalls must allow access on all ports 8 | # required by slurm. 9 | # 10 | # The compute nodes must have mounts that mirror the compute cluster including mounting the slurm file system 11 | # or a mirror of it. 12 | # 13 | # There are no constraints on the node names other than that they should be unique from all other nodes and are ideally descriptive. 14 | # 15 | # By giving these nodes a weight of 1 they should have highest priority for use by the scheduler. 16 | # 17 | # The example nodes are actually static EC2 nodes in an AWS VPC. 18 | # 19 | # You should also create a partition that uses these nodes and exclude the partitiion from power saving so that they alway remain powered up. 20 | # 21 | 22 | # Set the defaults for these nodes. 
23 | NodeName=Default State=DOWN 24 | 25 | NodeName=onprem-c7-x86-t3-2xl-0 NodeAddr=onprem-c7-x86-t3-2xl-0.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 26 | NodeName=onprem-c7-x86-t3-2xl-1 NodeAddr=onprem-c7-x86-t3-2xl-1.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 27 | NodeName=onprem-c7-x86-t3-2xl-2 NodeAddr=onprem-c7-x86-t3-2xl-2.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 28 | NodeName=onprem-c7-x86-t3-2xl-3 NodeAddr=onprem-c7-x86-t3-2xl-3.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 29 | NodeName=onprem-c7-x86-t3-2xl-4 NodeAddr=onprem-c7-x86-t3-2xl-4.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 30 | NodeName=onprem-c7-x86-t3-2xl-5 NodeAddr=onprem-c7-x86-t3-2xl-5.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 31 | NodeName=onprem-c7-x86-t3-2xl-6 NodeAddr=onprem-c7-x86-t3-2xl-6.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 32 | NodeName=onprem-c7-x86-t3-2xl-7 NodeAddr=onprem-c7-x86-t3-2xl-7.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 33 | NodeName=onprem-c7-x86-t3-2xl-8 NodeAddr=onprem-c7-x86-t3-2xl-8.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 34 | NodeName=onprem-c7-x86-t3-2xl-9 NodeAddr=onprem-c7-x86-t3-2xl-9.onprem.com CPUs=4 RealMemory=30512 Feature=c7,CentOS_7_x86_64,x86_64,GHz:2.5,onprem Weight=1 35 | 36 | NodeSet=onprem_nodes Feature=onprem 37 | 38 | # 39 | # 40 | # OnPrem Partition 41 | # 42 | # The is the default partition and includes all nodes from the 1st OS. 43 | # 44 | PartitionName=onprem Default=YES PriorityTier=20000 Nodes=onprem_nodes 45 | 46 | # 47 | # Always on partitions 48 | # 49 | # Prevent the nodes from being powered down. 50 | # 51 | SuspendExcParts=onprem 52 | -------------------------------------------------------------------------------- /source/resources/config/slurm_recommended_arm_instance_types.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #==================================================================== 3 | # Minimal cluster with all X86_64 instance types 4 | # 5 | # NOTE: This is just an example. 6 | # Please create your own revision controlled config file. 7 | # 8 | # No SlurmDbd in this configuration. 9 | # Configure 10 each of all x86_64 instance types. 10 | # 11 | # Defaults and valid configuration options are in source/config_schema.py. 12 | # Command line values override values in the config file. 13 | #==================================================================== 14 | 15 | StackName: slurm-arm-config 16 | 17 | # @TODO: Add Region 18 | # Region: {{Region}} 19 | 20 | # @TODO: Add your SshKeyPair 21 | # SshKeyPair: {{SshKeyPair}} 22 | 23 | # @TODO: Update with your VPC 24 | # VpcId: vpc-xxxxxxxxxxxxxxxxx 25 | 26 | # @TODO: Update with your private subnet in your VPC 27 | # SubnetId: subnet-xxxxxxxxxxxxxxxxx 28 | 29 | # @TODO: Update with your SNS Topic. 
Make sure to subscribe your email address to the topic and confirm the subscription 30 | # ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} 31 | 32 | # @TODO: Add your preferred timezone so times aren't in UTC 33 | # TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York 34 | 35 | # @TODO: If using Research and Engineering Studio, update with environment name 36 | # RESEnvironmentName: {{ResEnvironmentName}} 37 | 38 | slurm: 39 | ParallelClusterConfig: 40 | Version: 3.9.1 41 | Architecture: arm64 42 | # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into. See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 43 | # Database: 44 | # DatabaseStackName: {{DatabaseStackName}} 45 | 46 | MungeKeySecret: SlurmMungeKey 47 | 48 | SlurmCtl: {} 49 | 50 | InstanceConfig: 51 | UseSpot: true 52 | NodeCounts: 53 | # @TODO: Update the max number of each instance type to configure 54 | DefaultMaxCount: 5 55 | # @TODO: You can update the max instance count for each compute resource 56 | ComputeResourceCounts: 57 | od-1024gb-64-cores: # x2gd.16xlarge 58 | MaxCount: 1 59 | sp-1024gb-64-cores: # x2gd.16xlarge 60 | MaxCount: 2 61 | 62 | # @TODO: Configure storage mounts 63 | # storage: 64 | # ExtraMounts: 65 | # - dest: /home 66 | # StorageType: Efs 67 | # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' 68 | # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ 69 | # type: nfs4 70 | # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 71 | # ExtraMountSecurityGroups: 72 | # nfs: 73 | # DCV-Host: sg-xxxxxxxxxxxxxxxxx 74 | 75 | # @TODO: Configure license counts 76 | Licenses: 77 | vcs: 78 | Count: 10 79 | Server: synopsys_licenses 80 | Port: '24680' 81 | ServerType: flexlm 82 | -------------------------------------------------------------------------------- /source/resources/config/slurm_recommended_x86_instance_types.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #==================================================================== 3 | # Minimal cluster with all X86_64 instance types 4 | # 5 | # NOTE: This is just an example. 6 | # Please create your own revision controlled config file. 7 | # 8 | # No SlurmDbd in this configuration. 9 | # Configure 10 each of all x86_64 instance types. 10 | # 11 | # Defaults and valid configuration options are in source/config_schema.py. 12 | # Command line values override values in the config file. 13 | #==================================================================== 14 | 15 | StackName: slurm-x86-config 16 | 17 | # @TODO: Add Region 18 | # Region: {{Region}} 19 | 20 | # @TODO: Add your SshKeyPair 21 | # SshKeyPair: {{SshKeyPair}} 22 | 23 | # @TODO: Update with your VPC 24 | # VpcId: vpc-xxxxxxxxxxxxxxxxx 25 | 26 | # @TODO: Update with your private subnet in your VPC 27 | # SubnetId: subnet-xxxxxxxxxxxxxxxxx 28 | 29 | # @TODO: Update with your SNS Topic. 
Make sure to subscribe your email address to the topic and confirm the subscription 30 | # ErrorSnsTopicArn: arn:aws:sns:{{Region}}:{{AccountId}}:{{TopicName}} 31 | 32 | # @TODO: Add your preferred timezone so times aren't in UTC 33 | # TimeZone: America/Chicago # America/Los_Angeles or America/Denver or America/New_York 34 | 35 | # @TODO: If using Research and Engineering Studio, update with environment name 36 | # RESEnvironmentName: {{ResEnvironmentName}} 37 | 38 | slurm: 39 | ParallelClusterConfig: 40 | Version: 3.9.1 41 | Architecture: x86_64 42 | # @TODO: Update DatabaseStackName with stack name you deployed ParallelCluster database into. See: https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html#slurm-accounting-db-stack-v3 43 | # Database: 44 | # DatabaseStackName: {{DatabaseStackName}} 45 | 46 | MungeKeySecret: SlurmMungeKey 47 | 48 | SlurmCtl: {} 49 | 50 | InstanceConfig: 51 | UseSpot: true 52 | NodeCounts: 53 | # @TODO: Update the max number of each instance type to configure 54 | DefaultMaxCount: 5 55 | # @TODO: You can update the max instance count for each compute resource 56 | ComputeResourceCounts: 57 | od-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge 58 | MaxCount: 1 59 | sp-1024gb-16-cores: # x2iedn.8xlarge', x2iezn.8xlarge 60 | MaxCount: 2 61 | 62 | # @TODO: Configure storage mounts 63 | # storage: 64 | # ExtraMounts: 65 | # - dest: /home 66 | # StorageType: Efs 67 | # FileSystemId: 'fs-xxxxxxxxxxxxxxxxx' 68 | # src: fs-xxxxxxxxxxxxxxxxx.efs.{{Region}}.amazonaws.com:/ 69 | # type: nfs4 70 | # options: nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 71 | # ExtraMountSecurityGroups: 72 | # nfs: 73 | # DCV-Host: sg-xxxxxxxxxxxxxxxxx 74 | 75 | # @TODO: Configure license counts 76 | Licenses: 77 | vcs: 78 | Count: 10 79 | Server: synopsys_licenses 80 | Port: '24680' 81 | ServerType: flexlm 82 | -------------------------------------------------------------------------------- /source/resources/lambdas/CreateBuildFiles/cfnresponse.py: -------------------------------------------------------------------------------- 1 | ../cfnresponse.py -------------------------------------------------------------------------------- /source/resources/lambdas/CreateParallelCluster/cfnresponse.py: -------------------------------------------------------------------------------- 1 | ../cfnresponse.py -------------------------------------------------------------------------------- /source/resources/lambdas/CreateParallelClusterConfig/cfnresponse.py: -------------------------------------------------------------------------------- 1 | ../cfnresponse.py -------------------------------------------------------------------------------- /source/resources/lambdas/DeconfigureExternalLoginNodes/cfnresponse.py: -------------------------------------------------------------------------------- 1 | ../cfnresponse.py -------------------------------------------------------------------------------- /source/resources/lambdas/DeconfigureUsersGroupsJson/cfnresponse.py: -------------------------------------------------------------------------------- 1 | ../cfnresponse.py -------------------------------------------------------------------------------- /source/resources/lambdas/UpdateHeadNode/cfnresponse.py: -------------------------------------------------------------------------------- 1 | ../cfnresponse.py -------------------------------------------------------------------------------- /source/resources/lambdas/cfnresponse.py: 
-------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | from __future__ import print_function 5 | import urllib3 6 | import json 7 | 8 | SUCCESS = "SUCCESS" 9 | FAILED = "FAILED" 10 | 11 | http = urllib3.PoolManager() 12 | 13 | 14 | def send(event, context, responseStatus, responseData, physicalResourceId=None, noEcho=False, reason=None): 15 | responseUrl = event['ResponseURL'] 16 | 17 | print(responseUrl) 18 | 19 | responseBody = { 20 | 'Status' : responseStatus, 21 | 'Reason' : reason or "See the details in CloudWatch Log Stream: {}".format(context.log_stream_name), 22 | 'PhysicalResourceId' : physicalResourceId or context.log_stream_name, 23 | 'StackId' : event['StackId'], 24 | 'RequestId' : event['RequestId'], 25 | 'LogicalResourceId' : event['LogicalResourceId'], 26 | 'NoEcho' : noEcho, 27 | 'Data' : responseData 28 | } 29 | 30 | json_responseBody = json.dumps(responseBody) 31 | 32 | print("Response body:") 33 | print(json_responseBody) 34 | 35 | headers = { 36 | 'content-type' : '', 37 | 'content-length' : str(len(json_responseBody)) 38 | } 39 | 40 | try: 41 | response = http.request('PUT', responseUrl, headers=headers, body=json_responseBody) 42 | print("Status code:", response.status) 43 | 44 | 45 | except Exception as e: 46 | 47 | print("send(..) failed executing http.request(..):", e) 48 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/configure-rootless-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # Configure rootless docker for user. 6 | # The slurm config directory must exist 7 | 8 | script=$0 9 | script_name=$(basename $script) 10 | 11 | # Jinja2 template variables 12 | assets_bucket={{assets_bucket}} 13 | assets_base_key={{assets_base_key}} 14 | export AWS_DEFAULT_REGION={{Region}} 15 | ClusterName={{ClusterName}} 16 | ErrorSnsTopicArn={{ErrorSnsTopicArn}} 17 | playbooks_s3_url={{playbooks_s3_url}} 18 | 19 | # Notify user of errors 20 | function on_exit { 21 | rc=$? 22 | set +e 23 | if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then 24 | tmpfile=$(mktemp) 25 | echo "See log files for more info: 26 | /var/lib/amazon/toe/TOE_* 27 | grep PCImageBuilderEDA /var/log/messages | less" > $tmpfile 28 | aws --region $AWS_DEFAULT_REGION sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} configure-rootless-docker.sh failed" --message file://$tmpfile 29 | rm $tmpfile 30 | fi 31 | } 32 | trap on_exit EXIT 33 | 34 | # Redirect all IO to /var/log/messages and then echo to stderr 35 | exec 1> >(logger -s -t configure-rootless-docker) 2>&1 36 | 37 | # Install ansible 38 | if ! yum list installed ansible &> /dev/null; then 39 | yum install -y ansible || amazon-linux-extras install -y ansible2 40 | fi 41 | 42 | external_login_node_config_dir=/opt/slurm/${ClusterName}/config 43 | if [ -e $external_login_node_config_dir ]; then 44 | config_dir=$external_login_node_config_dir 45 | else 46 | config_dir=/opt/slurm/config 47 | fi 48 | config_bin_dir=$config_dir/bin 49 | ANSIBLE_PATH=$config_dir/ansible 50 | PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks 51 | PLAYBOOKS_ZIP_PATH=$ANSIBLE_PATH/playbooks.zip 52 | 53 | if ! 
[ -e $external_login_node_config_dir ]; then 54 | mkdir -p $config_bin_dir 55 | 56 | ansible_head_node_vars_yml_s3_url="s3://$assets_bucket/$assets_base_key/config/ansible/ansible_head_node_vars.yml" 57 | ansible_compute_node_vars_yml_s3_url="s3://$assets_bucket/$assets_base_key/config/ansible/ansible_compute_node_vars.yml" 58 | ansible_external_login_node_vars_yml_s3_url="s3://$assets_bucket/$assets_base_key/config/ansible/ansible_external_login_node_vars.yml" 59 | 60 | # Download ansible playbooks 61 | aws s3 cp $playbooks_s3_url ${PLAYBOOKS_ZIP_PATH}.new 62 | if ! [ -e $PLAYBOOKS_ZIP_PATH ] || ! diff -q $PLAYBOOKS_ZIP_PATH ${PLAYBOOKS_ZIP_PATH}.new; then 63 | mv $PLAYBOOKS_ZIP_PATH.new $PLAYBOOKS_ZIP_PATH 64 | rm -rf $PLAYBOOKS_PATH 65 | mkdir -p $PLAYBOOKS_PATH 66 | pushd $PLAYBOOKS_PATH 67 | yum -y install unzip 68 | unzip $PLAYBOOKS_ZIP_PATH 69 | chmod -R 0700 $ANSIBLE_PATH 70 | popd 71 | fi 72 | 73 | aws s3 cp $ansible_head_node_vars_yml_s3_url /opt/slurm/config/ansible/ansible_head_node_vars.yml 74 | 75 | aws s3 cp $ansible_compute_node_vars_yml_s3_url /opt/slurm/config/ansible/ansible_compute_node_vars.yml 76 | 77 | aws s3 cp $ansible_external_login_node_vars_yml_s3_url /opt/slurm/config/ansible/ansible_external_login_node_vars.yml 78 | fi 79 | 80 | pushd $PLAYBOOKS_PATH 81 | 82 | ansible-playbook $PLAYBOOKS_PATH/configure-rootless-docker.yml \ 83 | -i inventories/local.yml \ 84 | -e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml 85 | 86 | popd 87 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/create_or_update_users_groups_json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # This script creates the json file with user and group information. 6 | 7 | full_script=$(realpath $0) 8 | script_dir=$(dirname $full_script) 9 | base_script=$(basename $full_script) 10 | 11 | date 12 | echo "Started create_or_update_users_groups_json.sh: $full_script" 13 | 14 | config_dir={{ ExternalLoginNodeSlurmConfigDir }} 15 | config_bin_dir=$config_dir/bin 16 | 17 | $config_bin_dir/create_users_groups_json.py -o $config_dir/users_groups.json.new --subuid_filename $config_dir/subuid.new --subgid_filename $config_dir/subgid.new 18 | if ! diff $config_dir/users_groups.json.new $config_dir/users_groups.json; then 19 | mv $config_dir/users_groups.json.new $config_dir/users_groups.json 20 | mv $config_dir/subuid.new $config_dir/subuid 21 | mv $config_dir/subgid.new $config_dir/subgid 22 | fi 23 | 24 | date 25 | echo "Finished create_or_update_users_groups_json.sh: $full_script" 26 | 27 | exit 0 28 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/create_users_groups_json_configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # This script creates the json file with user and group information. 6 | # It also creates a crontab entry to update the json file every hour. 
7 | # 8 | # The script and ansible playbooks needed to undo this will be installed at: 9 | # 10 | # /opt/aws-eda-slurm-cluster/{{ cluster_name }} 11 | # 12 | # To deconfigure the instance, run the following script: 13 | # 14 | # /opt/aws-eda-slurm-cluster/{{ cluster_name }}/create_users_groups_json_deconfigure.sh 15 | 16 | full_script=$(realpath $0) 17 | script_dir=$(dirname $full_script) 18 | base_script=$(basename $full_script) 19 | 20 | echo "$(date): Started create_users_groups_json_configure.sh: $full_script" 21 | 22 | config_dir={{ ExternalLoginNodeSlurmConfigDir }} 23 | config_bin_dir=$config_dir/bin 24 | 25 | ErrorSnsTopicArn={{ ErrorSnsTopicArn }} 26 | 27 | # Notify user of errors 28 | function on_exit { 29 | rc=$? 30 | set +e 31 | if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then 32 | message_file=$(mktemp) 33 | echo "See log files for more info: 34 | grep ${script_name} /var/log/messages | less" > $message_file 35 | aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file 36 | rm $message_file 37 | fi 38 | } 39 | trap on_exit EXIT 40 | 41 | # Configure using ansible 42 | if ! yum list installed ansible &> /dev/null; then 43 | yum install -y ansible || amazon-linux-extras install -y ansible2 44 | fi 45 | 46 | ANSIBLE_PATH=$config_dir/ansible 47 | PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks 48 | 49 | pushd $PLAYBOOKS_PATH 50 | ansible-playbook $PLAYBOOKS_PATH/ParallelClusterCreateUsersGroupsJsonConfigure.yml \ 51 | -i inventories/local.yml \ 52 | -e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml 53 | popd 54 | 55 | echo "$(date): Finished create_users_groups_json_configure.sh: $full_script" 56 | 57 | exit 0 58 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/create_users_groups_json_deconfigure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # This script deconfigures this instance from creating the json file with user and group information. 6 | 7 | full_script=$(realpath $0) 8 | script_dir=$(dirname $full_script) 9 | base_script=$(basename $full_script) 10 | ANSIBLE_PATH=$(dirname $script_dir)/ansible 11 | PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks 12 | 13 | echo "$(date): Started create_users_groups_json_deconfigure.sh: $full_script" 14 | 15 | ErrorSnsTopicArn={{ ErrorSnsTopicArn }} 16 | 17 | # Notify user of errors 18 | function on_exit { 19 | rc=$? 20 | set +e 21 | if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then 22 | message_file=$(mktemp) 23 | echo "See log files for more info: 24 | grep ${script_name} /var/log/messages | less" > $message_file 25 | aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file 26 | rm $message_file 27 | fi 28 | } 29 | trap on_exit EXIT 30 | 31 | # Install ansible 32 | if ! 
yum list installed ansible &> /dev/null; then 33 | yum install -y ansible || amazon-linux-extras install -y ansible2 34 | fi 35 | 36 | pushd $PLAYBOOKS_PATH 37 | ansible-playbook $PLAYBOOKS_PATH/ParallelClusterCreateUsersGroupsJsonDeconfigure.yml \ 38 | -i inventories/local.yml \ 39 | -e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml 40 | popd 41 | 42 | rm -rf $(dirname $script_dir) 43 | 44 | echo "$(date): Finished create_users_groups_json_deconfigure.sh: $full_script" 45 | 46 | exit 0 47 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/exostellar-compute-node-ami-configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # This script configures an instance so that it can be used to create an AMI to be used by 6 | # Exostellar Infrastructure Optimizer or Workload Optimizer. 7 | # The instance should be launched using a plain RHEL AMI. 8 | 9 | script=$0 10 | script_name=$(basename $script) 11 | 12 | # Jinja2 template variables 13 | assets_bucket={{assets_bucket}} 14 | assets_base_key={{assets_base_key}} 15 | export AWS_DEFAULT_REGION={{Region}} 16 | ClusterName={{ClusterName}} 17 | ErrorSnsTopicArn={{ErrorSnsTopicArn}} 18 | playbooks_s3_url={{playbooks_s3_url}} 19 | 20 | # Redirect all IO to /var/log/messages and then echo to stderr 21 | exec 1> >(logger -s -t exostellar-compute-node-ami-configure.sh) 2>&1 22 | 23 | # Install ansible 24 | if ! yum list installed ansible &> /dev/null; then 25 | yum install -y ansible || amazon-linux-extras install -y ansible2 26 | fi 27 | ansible-galaxy collection install ansible.posix 28 | 29 | config_dir=/opt/slurm/config 30 | config_bin_dir=$config_dir/bin 31 | ANSIBLE_PATH=$config_dir/ansible 32 | PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks 33 | PLAYBOOKS_ZIP_PATH=$ANSIBLE_PATH/playbooks.zip 34 | 35 | pushd $PLAYBOOKS_PATH 36 | 37 | ansible-playbook $PLAYBOOKS_PATH/ExostellarComputeNodeAmi.yml \ 38 | -i inventories/local.yml \ 39 | -e @$ANSIBLE_PATH/ansible_head_node_vars.yml 40 | 41 | popd 42 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/external_login_node_configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # This script configures an instance as an external login node for a ParallelCluster cluster. 6 | # 7 | # The script and ansible playbooks needed to undo this will be installed at: 8 | # 9 | # /opt/aws-eda-slurm-cluster/{{ cluster_name }} 10 | # 11 | # To deconfigure the instance as a login node run the following script: 12 | # 13 | # /opt/aws-eda-slurm-cluster/{{ cluster_name }}/external_login_node_deconfigure.sh 14 | 15 | full_script=$(realpath $0) 16 | script_dir=$(dirname $full_script) 17 | script_name=$(basename $full_script) 18 | 19 | echo "$(date): Started ${script_name}" 20 | 21 | # Jinja2 template variables 22 | assets_bucket={{ assets_bucket }} 23 | assets_base_key={{ assets_base_key }} 24 | export AWS_DEFAULT_REGION={{ Region }} 25 | ClusterName={{ ClusterName }} 26 | config_dir={{ ExternalLoginNodeSlurmConfigDir }} 27 | ErrorSnsTopicArn={{ ErrorSnsTopicArn }} 28 | 29 | # Notify user of errors 30 | function on_exit { 31 | rc=$? 
32 | set +e 33 | if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then 34 | message_file=$(mktemp) 35 | echo "See log files for more info: 36 | grep ${script_name} /var/log/messages | less" > $message_file 37 | aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file 38 | rm $message_file 39 | fi 40 | } 41 | trap on_exit EXIT 42 | 43 | config_bin_dir=$config_dir/bin 44 | 45 | # Configure using ansible 46 | if ! yum list installed ansible &> /dev/null; then 47 | yum install -y ansible || amazon-linux-extras install -y ansible2 48 | fi 49 | 50 | ANSIBLE_PATH=$config_dir/ansible 51 | PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks 52 | 53 | pushd $PLAYBOOKS_PATH 54 | ansible-playbook $PLAYBOOKS_PATH/ParallelClusterExternalLoginNodeConfigure.yml \ 55 | -i inventories/local.yml \ 56 | -e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml 57 | popd 58 | 59 | modulefile_profile=/etc/profile.d/slurm_${ClusterName}_modulefiles.sh 60 | if ! [ -e $modulefile_profile ]; then 61 | echo "error: $modulefile_profile doesn't exist" 62 | exit 1 63 | fi 64 | modulefile=$(cat $modulefile_profile | grep 'module use' | cut -d ' ' -f 3) 65 | if [ -z "$modulefile" ]; then 66 | echo "error: Couldn't get modulefile path from $modulefile_profile:" 67 | cat $modulefile_profile 68 | cat $modulefile_profile | grep 'module use' 69 | exit 1 70 | fi 71 | 72 | pushd $PLAYBOOKS_PATH 73 | ansible-playbook $PLAYBOOKS_PATH/ParallelClusterExternalLoginNodeInstallSlurm.yml \ 74 | -i inventories/local.yml \ 75 | -e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml 76 | popd 77 | 78 | echo "$(date): Finished ${script_name}" 79 | 80 | exit 0 81 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/external_login_node_deconfigure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # This script deconfigures an instance that has been configured as a ParallelCluster Slurm login node. 6 | # 7 | # This script and its ansible playbook are copied to /opt/aws-eda-slurm-cluster/{{ cluster_name }} so 8 | # that they can be executed whether the cluster still exists or not. 9 | 10 | full_script=$(realpath $0) 11 | script_dir=$(dirname $full_script) 12 | base_script=$(basename $full_script) 13 | ANSIBLE_PATH=$(dirname $script_dir)/ansible 14 | PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks 15 | 16 | echo "$(date): Started $base_script: $full_script" 17 | 18 | ErrorSnsTopicArn={{ ErrorSnsTopicArn }} 19 | 20 | # Notify user of errors 21 | function on_exit { 22 | rc=$? 23 | set +e 24 | if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then 25 | message_file=$(mktemp) 26 | echo "See log files for more info: 27 | grep ${script_name} /var/log/messages | less" > $message_file 28 | aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file 29 | rm $message_file 30 | fi 31 | } 32 | trap on_exit EXIT 33 | 34 | # Install ansible 35 | if !
yum list installed ansible &> /dev/null; then 36 | yum install -y ansible || amazon-linux-extras install -y ansible2 37 | fi 38 | 39 | pushd $PLAYBOOKS_PATH 40 | ansible-playbook $PLAYBOOKS_PATH/ParallelClusterExternalLoginNodeDeconfigure.yml \ 41 | -i inventories/local.yml \ 42 | -e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml 43 | popd 44 | 45 | rm -rf $(dirname $script_dir) 46 | 47 | echo "$(date): Finished $base_script: $full_script" 48 | 49 | exit 0 50 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/install-ansible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | script=$0 6 | 7 | yum -y install ansible 8 | 9 | exit 0 10 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/on_compute_node_start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | set -x 6 | set -e 7 | 8 | script_name=on_compute_node_start.sh 9 | 10 | exec 1> >(logger -s -t ${script_name}) 2>&1 11 | 12 | echo "$(date): Started ${script_name}" 13 | 14 | # Jinja2 template variables 15 | ErrorSnsTopicArn={{ErrorSnsTopicArn}} 16 | 17 | # Notify user of errors 18 | function on_exit { 19 | rc=$? 20 | set +e 21 | if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then 22 | message_file=$(mktemp) 23 | echo "See log files for more info: 24 | grep ${script_name} /var/log/messages | less" > $message_file 25 | aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file 26 | rm $message_file 27 | fi 28 | } 29 | trap on_exit EXIT 30 | 31 | # /opt/slurm isn't mounted yet. 32 | 33 | # Configure pyxis and enroot 34 | 35 | # Configure Enroot 36 | ENROOT_PERSISTENT_DIR="/var/enroot" 37 | ENROOT_VOLATILE_DIR="/run/enroot" 38 | 39 | sudo mkdir -p $ENROOT_PERSISTENT_DIR 40 | sudo chmod 1777 $ENROOT_PERSISTENT_DIR 41 | sudo mkdir -p $ENROOT_VOLATILE_DIR 42 | sudo chmod 1777 $ENROOT_VOLATILE_DIR 43 | sudo cp /opt/parallelcluster/examples/enroot/enroot.conf /etc/enroot/enroot.conf 44 | sudo chmod 0644 /etc/enroot/enroot.conf 45 | 46 | # Configure Pyxis 47 | PYXIS_RUNTIME_DIR="/run/pyxis" 48 | 49 | sudo mkdir -p $PYXIS_RUNTIME_DIR 50 | sudo chmod 1777 $PYXIS_RUNTIME_DIR 51 | 52 | echo "$(date): Finished ${script_name}" 53 | 54 | exit 0 55 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/bin/on_head_node_updated.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | set -x 6 | set -e 7 | 8 | script_name=on_head_node_updated.sh 9 | 10 | exec 1> >(logger -s -t ${script_name}) 2>&1 11 | 12 | echo "$(date): Started ${script_name}" 13 | 14 | # Jinja2 template variables 15 | assets_bucket={{assets_bucket}} 16 | assets_base_key={{assets_base_key}} 17 | ErrorSnsTopicArn={{ErrorSnsTopicArn}} 18 | 19 | # Notify user of errors 20 | function on_exit { 21 | rc=$? 
22 | set +e 23 | if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then 24 | message_file=$(mktemp) 25 | echo "See log files for more info: 26 | grep ${script_name} /var/log/messages | less" > $message_file 27 | aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file 28 | rm $message_file 29 | fi 30 | } 31 | trap on_exit EXIT 32 | 33 | config_dir=/opt/slurm/config 34 | config_bin_dir=$config_dir/bin 35 | 36 | # Make sure we're running the latest version 37 | dest_script="$config_bin_dir/${script_name}" 38 | mkdir -p $config_bin_dir 39 | aws s3 cp s3://$assets_bucket/$assets_base_key/config/bin/${script_name} $dest_script.new 40 | chmod 0700 $dest_script.new 41 | if ! [ -e $dest_script ] || ! diff -q $dest_script $dest_script.new; then 42 | mv -f $dest_script.new $dest_script 43 | exec $dest_script 44 | else 45 | rm $dest_script.new 46 | fi 47 | 48 | export PATH=/usr/sbin:$PATH 49 | 50 | $config_bin_dir/on_head_node_configured.sh 51 | 52 | echo "$(date): Finished ${script_name}" 53 | 54 | exit 0 55 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/build-files/build-file-template.yml: -------------------------------------------------------------------------------- 1 | # Template build file 2 | 3 | Image: 4 | Name: {{ImageName}} 5 | RootVolume: 6 | Size: {{RootVolumeSize}} 7 | 8 | Build: 9 | InstanceType: {{InstanceType}} 10 | SubnetId: {{SubnetId}} 11 | SecurityGroupIds: 12 | - {{ImageBuilderSecurityGroupId}} 13 | ParentImage: {{ParentImage}} 14 | UpdateOsPackages: 15 | Enabled: false 16 | {%- if ComponentS3Url %} 17 | Components: 18 | - Type: script 19 | Value: {{ComponentS3Url}} 20 | {%- endif %} 21 | Iam: 22 | AdditionalIamPolicies: 23 | - Policy: {{AssetReadPolicyArn}} 24 | 25 | DevSettings: 26 | TerminateInstanceOnFailure: false 27 | -------------------------------------------------------------------------------- /source/resources/parallel-cluster/config/users_groups.json: -------------------------------------------------------------------------------- 1 | { 2 | "gids": { 3 | }, 4 | "users": { 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /source/resources/playbooks/ExostellarComputeNodeAmi.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Configure instance for Exostellar Infrastructure Optimizer or Workload Optimizer AMI creation 3 | hosts: ExostellarComputeNodeAmi 4 | become_user: root 5 | become: yes 6 | roles: 7 | - eda_tools 8 | - ExostellarComputeNodeAmi 9 | -------------------------------------------------------------------------------- /source/resources/playbooks/ParallelClusterComputeNode.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Configure ParallelCluster Compute Node 3 | hosts: ParallelClusterComputeNode 4 | connection: local 5 | become_user: root 6 | become: yes 7 | roles: 8 | - all 9 | - eda_tools 10 | - security_updates 11 | - bug_fixes 12 | - ParallelClusterComputeNode 13 | - install-rootless-docker 14 | -------------------------------------------------------------------------------- /source/resources/playbooks/ParallelClusterCreateUsersGroupsJsonConfigure.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Create and periodically refresh users_groups.json 4 | hosts: 
ParallelClusterUsersGroupsSource 5 | become_user: root 6 | become: yes 7 | roles: 8 | - ParallelClusterCreateUsersGroupsJsonConfigure 9 | -------------------------------------------------------------------------------- /source/resources/playbooks/ParallelClusterCreateUsersGroupsJsonDeconfigure.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Remove the crontab that refreshes users_groups.json 4 | hosts: ParallelClusterUsersGroupsSource 5 | become_user: root 6 | become: yes 7 | roles: 8 | - ParallelClusterCreateUsersGroupsJsonDeconfigure 9 | -------------------------------------------------------------------------------- /source/resources/playbooks/ParallelClusterExternalLoginNodeConfigure.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Configure ParallelCluster ExternalLoginNode 3 | hosts: ParallelClusterExternalLoginNode 4 | become_user: root 5 | become: yes 6 | roles: 7 | - ParallelClusterExternalLoginNodeConfigure 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/ParallelClusterExternalLoginNodeDeconfigure.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Deconfigure ParallelCluster ExternalLoginNode 3 | hosts: ParallelClusterExternalLoginNode 4 | become_user: root 5 | become: yes 6 | roles: 7 | - ParallelClusterExternalLoginNodeDeconfigure 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/ParallelClusterExternalLoginNodeInstallSlurm.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Compile slurm for ParallelCluster ExternalLoginNode 3 | hosts: ParallelClusterExternalLoginNode 4 | become_user: root 5 | become: yes 6 | roles: 7 | - install_slurm 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/ParallelClusterHeadNode.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Configure ParallelCluster Head Node 3 | hosts: ParallelClusterHeadNode 4 | become_user: root 5 | become: yes 6 | roles: 7 | - all 8 | - ParallelClusterHeadNode 9 | - role: exostellar_infrastructure_optimizer 10 | when: xio_mgt_ip is defined 11 | - role: exostellar_workload_optimizer 12 | when: xwo_mgt_ip is defined 13 | - security_updates 14 | - bug_fixes 15 | -------------------------------------------------------------------------------- /source/resources/playbooks/README.md: -------------------------------------------------------------------------------- 1 | Ansible Playbooks For ParallelCluster 2 | ================= 3 | 4 | ## TOC 5 | 6 | * Quick Start 7 | * Roles 8 | 9 | ## Quick Start 10 | 11 | 12 | ## Roles 13 | -------------------------------------------------------------------------------- /source/resources/playbooks/ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | log_path=/var/log/ansible.log 3 | stdout_callback=yaml 4 | -------------------------------------------------------------------------------- /source/resources/playbooks/bug_fixes.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install yum bug fixes 3 | hosts: 4 | - all 5 | become_user: root 6 | become: yes 7 | roles: 8 | - bug_fixes 9 |
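#
# Illustrative example only: playbooks in this directory are normally invoked locally by
# the scripts in source/resources/parallel-cluster/config/bin using the local inventory.
# The vars-file path below is an assumption based on those scripts; substitute the vars
# file that matches the node type being configured:
#
#   ansible-playbook bug_fixes.yml -i inventories/local.yml -e @/opt/slurm/config/ansible/ansible_head_node_vars.yml
#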
-------------------------------------------------------------------------------- /source/resources/playbooks/configure-rootless-docker.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Configure rootless docker for user 3 | hosts: 4 | - ExternalLoginNode 5 | become_user: root 6 | become: yes 7 | roles: 8 | - configure-rootless-docker 9 | -------------------------------------------------------------------------------- /source/resources/playbooks/create_users_groups_json.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Configure host to save users/groups to json 3 | hosts: localgroup 4 | become_user: root 5 | become: yes 6 | roles: 7 | - create_users_groups_json 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/eda_tools.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Configure eda_tools packages 3 | hosts: 4 | - ami_GridEngineHost 5 | - ami_jenkins_slave 6 | - ami_SlurmNode 7 | - dcv 8 | - grid_hosts 9 | - GridEngineHost 10 | - SlurmNode 11 | - workspaces 12 | become_user: root 13 | become: yes 14 | roles: 15 | - eda_tools 16 | -------------------------------------------------------------------------------- /source/resources/playbooks/install-rootless-docker.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install rootless docker for OCI containers 3 | hosts: 4 | - ParallelClusterComputeNode 5 | become_user: root 6 | become: yes 7 | roles: 8 | - install-rootless-docker 9 | -------------------------------------------------------------------------------- /source/resources/playbooks/install_slurm.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Build slurm 3 | hosts: local 4 | connection: local 5 | become_user: root 6 | become: yes 7 | roles: 8 | - all 9 | - install_slurm 10 | -------------------------------------------------------------------------------- /source/resources/playbooks/install_vscode.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install VSCode 3 | hosts: local 4 | connection: local 5 | become_user: root 6 | become: yes 7 | roles: 8 | - install_vscode 9 | -------------------------------------------------------------------------------- /source/resources/playbooks/inventories/group_vars/all: -------------------------------------------------------------------------------- 1 | --- 2 | ansible_ssh_user: ec2-user 3 | 4 | #ansible_ssh_user: 5 | #ansible_ssh_pass: 6 | 7 | ansible_ssh_common_args: "-o StrictHostKeyChecking=no -o LogLevel=ERROR -o UserKnownHostsFile=/dev/null" 8 | 9 | ansible_architecture: "{{ ansible_facts['architecture'] }}" 10 | distribution: "{{ ansible_facts['distribution'] }}" 11 | distribution_major_version: "{{ ansible_facts['distribution_major_version'] }}" 12 | distribution_version: "{{ ansible_facts['distribution_version'] }}" 13 | kernel: "{{ ansible_facts['kernel'] }}" 14 | memtotal_mb: "{{ ansible_facts['memtotal_mb'] }}" 15 | 16 | # Derived facts 17 | architecture: "{%if ansible_architecture == 'aarch64'%}arm64{%else%}{{ ansible_architecture }}{%endif%}" 18 | amazonlinux2: "{{ distribution == 'Amazon' and distribution_major_version == '2' }}" 19 | alma: "{{ distribution == 'AlmaLinux' }}" 20 | alma8: "{{ alma and distribution_major_version == '8' }}" 21 | 
centos: "{{ distribution == 'CentOS' }}" 22 | centos7: "{{ centos and distribution_major_version == '7' }}" 23 | rhel: "{{ distribution == 'RedHat' }}" 24 | rhel7: "{{ rhel and distribution_major_version == '7' }}" 25 | rhel8: "{{ rhel and distribution_major_version == '8' }}" 26 | rhel9: "{{ rhel and distribution_major_version == '9' }}" 27 | rocky: "{{ distribution == 'Rocky' }}" 28 | rocky8: "{{ rocky and distribution_major_version == '8' }}" 29 | rocky9: "{{ rocky and distribution_major_version == '9' }}" 30 | rhelclone: "{{ alma or centos or rocky }}" 31 | rhel8clone: "{{ rhelclone and distribution_major_version == '8' }}" 32 | rhel9clone: "{{ rhelclone and distribution_major_version == '9' }}" 33 | centos7_5_to_6: "{{ distribution in ['CentOS', 'RedHat'] and distribution_version is match('7\\.[5-6]') }}" 34 | centos7_5_to_9: "{{ distribution in ['CentOS', 'RedHat'] and distribution_version is match('7\\.[5-9]') }}" 35 | centos7_7_to_9: "{{ distribution in ['CentOS', 'RedHat'] and distribution_version is match('7\\.[7-9]') }}" 36 | 37 | # Create separate build and release dirs because binaries built on AmazonLinux2 don't run on CentOS 7 38 | slurm_base_dir: "{{ file_system_mount_path }}" 39 | slurm_sbin_dir: "{{ slurm_base_dir }}/sbin" 40 | slurm_bin_dir: "{{ slurm_base_dir }}/bin" 41 | slurm_scripts_dir: "{{ slurm_base_dir }}/bin" 42 | slurm_root: "{{ slurm_base_dir }}" 43 | 44 | # Cluster specific directories 45 | slurm_config_dir: "{{ slurm_base_dir }}/config" 46 | slurm_etc_dir: "{{ slurm_base_dir }}/etc" 47 | slurm_logs_dir: "{{ slurm_base_dir }}/logs" 48 | slurmrestd_socket_dir: "{{ slurm_base_dir }}/com" 49 | slurmrestd_socket: "{{ slurmrestd_socket_dir }}/slurmrestd.socket" 50 | slurm_spool_dir: "{{ slurm_base_dir }}/var/spool" 51 | slurm_conf: "{{ slurm_etc_dir }}/slurm.conf" 52 | 53 | modulefiles_base_dir: "{{ slurm_base_dir }}/modules/modulefiles" 54 | 55 | pc_modulefiles_base_dir: "{{ slurm_config_dir }}/modules/modulefiles" 56 | external_login_node_slurm_base_dir: "{{ slurm_base_dir }}/{{ cluster_name }}" 57 | external_login_node_slurm_config_dir: "{{ external_login_node_slurm_base_dir }}/config" 58 | external_login_node_modulefiles_base_dir: "{{ external_login_node_slurm_config_dir} }/modules/modulefiles" 59 | 60 | supported_distributions: 61 | - AlmaLinux/8/arm64 62 | - AlmaLinux/8/x86_64 63 | - Amazon/2/arm64 64 | - Amazon/2/x86_64 65 | - CentOS/7/x86_64 66 | - RedHat/8/arm64 67 | - Rocky/8/arm64 68 | - Rocky/8/x86_64 69 | -------------------------------------------------------------------------------- /source/resources/playbooks/inventories/local.yml: -------------------------------------------------------------------------------- 1 | --- 2 | localgroup: 3 | hosts: 4 | local: 5 | ansible_host: 127.0.0.1 6 | ansible_connection: local 7 | ExternalLoginNode: 8 | hosts: 9 | local: 10 | ansible_host: 127.0.0.1 11 | ansible_connection: local 12 | ParallelClusterComputeNode: 13 | hosts: 14 | local: 15 | ansible_host: 127.0.0.1 16 | ansible_connection: local 17 | ParallelClusterHeadNode: 18 | hosts: 19 | local: 20 | ansible_host: 127.0.0.1 21 | ansible_connection: local 22 | ParallelClusterExternalLoginNode: 23 | hosts: 24 | local: 25 | ansible_host: 127.0.0.1 26 | ansible_connection: local 27 | ParallelClusterUsersGroupsSource: 28 | hosts: 29 | local: 30 | ansible_host: 127.0.0.1 31 | ansible_connection: local 32 | SlurmCtl: 33 | hosts: 34 | local: 35 | ansible_host: 127.0.0.1 36 | ansible_connection: local 37 | SlurmDbd: 38 | hosts: 39 | local: 40 | ansible_host: 127.0.0.1 
41 | ansible_connection: local 42 | SlurmNode: 43 | hosts: 44 | local: 45 | ansible_host: 127.0.0.1 46 | ansible_connection: local 47 | SlurmNodeAmi: 48 | hosts: 49 | local: 50 | ansible_host: 127.0.0.1 51 | ansible_connection: local 52 | SlurmExternalLoginNode: 53 | hosts: 54 | local: 55 | ansible_host: 127.0.0.1 56 | ansible_connection: local 57 | ExostellarComputeNodeAmi: 58 | hosts: 59 | local: 60 | ansible_host: 127.0.0.1 61 | ansible_connection: local 62 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ExostellarComputeNodeAmi/README.md: -------------------------------------------------------------------------------- 1 | ExostellarComputeNodeAmi 2 | ========= 3 | 4 | Configure an instance to create an AMI to be used by Exostellar Infrastructure Optimizer (XIO). 5 | The instance should be launched from a base RHEL AMI, not a ParallelCluster AMI. 6 | 7 | * Mount /opt/slurm in /etc/fstab 8 | * Install required packages 9 | * Configure munge 10 | * Configure slurmd. 11 | 12 | Requirements 13 | ------------ 14 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ExostellarComputeNodeAmi/templates/etc/profile.d/slurm.csh: -------------------------------------------------------------------------------- 1 | # slurm.csh: 2 | # Sets the C shell user environment for slurm commands 3 | # 4 | set path = ($path /opt/slurm/bin) 5 | if ( ${?MANPATH} ) then 6 | setenv MANPATH ${MANPATH}:/opt/slurm/share/man 7 | else 8 | setenv MANPATH :/opt/slurm/share/man 9 | endif 10 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ExostellarComputeNodeAmi/templates/etc/profile.d/slurm.sh: -------------------------------------------------------------------------------- 1 | # slurm.sh: 2 | # Setup slurm environment variables 3 | # 4 | 5 | PATH=$PATH:/opt/slurm/bin 6 | MANPATH=$MANPATH:/opt/slurm/share/man 7 | 8 | export PATH MANPATH 9 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ExostellarComputeNodeAmi/templates/etc/sysconfig/slurmd: -------------------------------------------------------------------------------- 1 | SLURMD_OPTIONS='{{ node_name }}' 2 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ExostellarComputeNodeAmi/templates/etc/systemd/system/slurmd.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Slurm node daemon 3 | After=munge.service network-online.target remote-fs.target 4 | Wants=network-online.target 5 | ConditionPathExists=/opt/slurm/etc/slurm.conf 6 | 7 | [Service] 8 | Type=simple 9 | EnvironmentFile=-/etc/sysconfig/slurmd 10 | ExecStart=/opt/slurm/sbin/slurmd -D -s $SLURMD_OPTIONS 11 | ExecReload=/bin/kill -HUP $MAINPID 12 | KillMode=process 13 | LimitNOFILE=131072 14 | LimitMEMLOCK=infinity 15 | LimitSTACK=infinity 16 | Delegate=yes 17 | 18 | [Install] 19 | WantedBy=multi-user.target 20 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterComputeNode/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 
5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 39 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterComputeNode/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for ParallelCluster Compute nodes 3 | 4 | - name: Install slurm_node yum packages 5 | yum: 6 | state: present 7 | name: 8 | - emacs 9 | - hwloc-libs 10 | - mailx 11 | 12 | - name: Create/Update Users 13 | template: 14 | src: ../../ParallelClusterHeadNode/templates/etc/cron.d/slurm_users_groups 15 | dest: /etc/cron.d/slurm_users_groups 16 | owner: root 17 | group: root 18 | mode: 0600 19 | force: yes 20 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/README.md: -------------------------------------------------------------------------------- 1 | ParallelClusterCreateUsersGroupsJsonConfigure 2 | ========= 3 | 4 | Configure the server that is periodically updating the users_groups.json file. 5 | Creates the file and a cron job that refreshes it hourly. 6 | 7 | * Mounts the cluster's /opt/slurm export at /opt/slurm/{{ cluster_name }} 8 | * Updates the /etc/fstab so that the mount works after a reboot. 9 | * Creates a crontab to refresh /opt/slurm/{{ cluster_name }}/config/users_groups.json is refreshed hourly. 10 | 11 | Requirements 12 | ------------ 13 | 14 | This is meant to be run on a server that is joined to your domain so that it 15 | has access to info about all of the users and groups. 16 | For SOCA, this is the scheduler instance. 17 | For RES, this is the {{ EnvironmentName }}-cluster-manager instance. 
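For reference, the cron entry installed by this role (rendered from
`templates/etc/cron.d/slurm_update_users_groups_json`) looks roughly like the
following, with `{{ slurm_config_dir }}` expanded to `/opt/slurm/<cluster_name>/config`
on the configured host (`<cluster_name>` is a placeholder):

    MAILTO=''
    PATH="/opt/slurm/<cluster_name>/config/bin:/sbin:/bin:/usr/sbin:/usr/bin"
    50 * * * * root /opt/slurm/<cluster_name>/config/bin/create_or_update_users_groups_json.sh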
18 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Tasks for ParallelClusterCreateUsersGroupsJsonConfigure 3 | 4 | - name: Show vars used in this playbook 5 | debug: 6 | msg: | 7 | cluster_name: {{ cluster_name }} 8 | region: {{ region }} 9 | slurm_config_dir: {{ slurm_config_dir }} 10 | 11 | - name: Add /opt/slurm/{{ cluster_name }} to /etc/fstab 12 | mount: 13 | path: /opt/slurm/{{ cluster_name }} 14 | src: "head_node.{{ cluster_name }}.pcluster:/opt/slurm" 15 | fstype: nfs 16 | backup: true 17 | state: present # Should already be mounted 18 | 19 | - name: Create {{ slurm_config_dir }}/users_groups.json 20 | shell: | 21 | set -ex 22 | 23 | {{ slurm_config_dir }}/bin/create_or_update_users_groups_json.sh 24 | args: 25 | creates: '{{ slurm_config_dir }}/users_groups.json' 26 | 27 | - name: Create cron to refresh {{ slurm_config_dir }}/users_groups.json every hour 28 | template: 29 | dest: /etc/cron.d/slurm_{{ cluster_name }}_update_users_groups_json 30 | src: etc/cron.d/slurm_update_users_groups_json 31 | owner: root 32 | group: root 33 | mode: 0600 34 | force: yes 35 | 36 | - name: Create /opt/aws-eda-slurm-cluster/{{ cluster_name }} 37 | file: 38 | path: /opt/aws-eda-slurm-cluster/{{ cluster_name }} 39 | owner: root 40 | group: root 41 | mode: 0700 42 | state: directory 43 | 44 | - name: Create /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin 45 | file: 46 | path: /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin 47 | owner: root 48 | group: root 49 | mode: 0700 50 | state: directory 51 | 52 | - name: Copy {{ slurm_config_dir }}/bin/create_users_groups_json_deconfigure.sh to /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin/ 53 | copy: 54 | src: "{{ slurm_config_dir }}/bin/create_users_groups_json_deconfigure.sh" 55 | dest: /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin/create_users_groups_json_deconfigure.sh 56 | remote_src: true 57 | force: true # Has to be true or won't be copied when they are different. 58 | owner: root 59 | group: root 60 | mode: 0700 61 | 62 | - name: Copy {{ slurm_config_dir }}/ansible/ to /opt/aws-eda-slurm-cluster/{{ cluster_name }}/ansible/ 63 | copy: 64 | src: "{{ slurm_config_dir }}/ansible" 65 | dest: /opt/aws-eda-slurm-cluster/{{ cluster_name }}/ 66 | remote_src: true 67 | force: true # Has to be true or won't be copied when they are different. 68 | owner: root 69 | group: root 70 | directory_mode: 0700 71 | mode: 0600 72 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/templates/etc/cron.d/slurm_update_users_groups_json: -------------------------------------------------------------------------------- 1 | MAILTO='' 2 | PATH="{{ slurm_config_dir }}/bin:/sbin:/bin:/usr/sbin:/usr/bin" 3 | 50 * * * * root {{ slurm_config_dir }}/bin/create_or_update_users_groups_json.sh 4 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonDeconfigure/README.md: -------------------------------------------------------------------------------- 1 | ParallelClusterCreateUsersGroupsJsonDeconfigure 2 | ========= 3 | 4 | Deconfigure the server that is periodically updating the users_groups.json file. 5 | Just removes the crontab entry on the server. 
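As an illustration, when the cluster is being deleted this role is typically
driven from the copy of the deconfigure script that
ParallelClusterCreateUsersGroupsJsonConfigure staged outside of the cluster's
NFS mount (`<cluster_name>` is a placeholder):

    /opt/aws-eda-slurm-cluster/<cluster_name>/bin/create_users_groups_json_deconfigure.sh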
6 | 7 | * Copies ansible playbooks to /tmp because the cluster's mount is removed by the playbook. 8 | * Remove crontab that refreshes /opt/slurm/{{ cluster_name }}/config/users_groups.json. 9 | * Remove /opt/slurm/{{ cluster_name }} from /etc/fstab and unmount it. 10 | 11 | Requirements 12 | ------------ 13 | 14 | This is meant to be run on a server that is joined to your domain so that it 15 | has access to info about all of the users and groups. 16 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonDeconfigure/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Tasks for ParallelClusterCreateUsersGroupsJsonDeconfigure 3 | 4 | - name: Show vars used in this playbook 5 | debug: 6 | msg: | 7 | cluster_name: {{ cluster_name }} 8 | 9 | - name: Delete cron to refresh {{ slurm_config_dir }}/users_groups.json every hour 10 | file: 11 | dest: /etc/cron.d/slurm_{{ cluster_name }}_update_users_groups_json 12 | state: absent 13 | 14 | - name: Unmount /opt/slurm/{{ cluster_name }} 15 | shell: | 16 | set -ex 17 | 18 | # Handle case where cluster was already deleted so the mountpoint is hung 19 | if ! timeout 1s /opt/slurm/{{ cluster_name }}; then 20 | echo "Mount point is hung. Source has already been deleted." 21 | umount -lf /opt/slurm/{{ cluster_name }} 22 | fi 23 | if ! mountpoint /opt/slurm/{{ cluster_name }}; then 24 | echo "/opt/slurm/{{ cluster_name }} already unmounted." 25 | exit 0 26 | fi 27 | umount -lf /opt/slurm/{{ cluster_name }} || lsof /opt/slurm/{{ cluster_name }} 28 | register: umount_results 29 | 30 | - name: Show umount results 31 | debug: 32 | msg: | 33 | umount_results: {{ umount_results }} 34 | 35 | - name: Remove /opt/slurm/{{ cluster_name }} from /etc/fstab and ignore errors 36 | mount: 37 | path: /opt/slurm/{{ cluster_name }} 38 | backup: true 39 | fstype: nfs 40 | state: absent 41 | # For some reason umount is failing with device busy even though running out of /tmp. 42 | # Retry it again without ignoring errors. 43 | ignore_errors: true 44 | 45 | - name: Remove /opt/slurm/{{ cluster_name }} from /etc/fstab 46 | mount: 47 | path: /opt/slurm/{{ cluster_name }} 48 | backup: true 49 | fstype: nfs 50 | state: absent 51 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterExternalLoginNodeConfigure/README.md: -------------------------------------------------------------------------------- 1 | ParallelClusterExternalLoginNodeConfigure 2 | ========= 3 | 4 | Configure an ExternalLoginNode instance as a Slurm login node that can submit commands to a ParallelCluster cluster. 5 | 6 | Requirements 7 | ------------ 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterExternalLoginNodeConfigure/templates/etc/profile.d/slurm_modulefiles.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | module use {{ slurm_base_dir }}/config/modules/modulefiles/{{ distribution }}/{{ distribution_major_version }}/{{ architecture }} 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterExternalLoginNodeDeconfigure/README.md: -------------------------------------------------------------------------------- 1 | ParallelClusterExternalLoginNodeDeconfigure 2 | ========= 3 | 4 | Deconfigure an ExternalLoginNode to a ParallelCluster cluster. 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterExternalLoginNodeDeconfigure/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Show vars used in this playbook 4 | debug: 5 | msg: | 6 | cluster_name: {{ cluster_name }} 7 | 8 | - name: Remove modulefile configuration 9 | file: 10 | dest: /etc/profile.d/slurm_{{ cluster_name }}_modulefiles.sh 11 | state: absent 12 | 13 | - name: Remove /opt/slurm/{{ cluster_name }} from /etc/fstab 14 | mount: 15 | path: /opt/slurm/{{ cluster_name }} 16 | backup: true 17 | fstype: nfs 18 | state: absent 19 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/README.md: -------------------------------------------------------------------------------- 1 | ParallelClusterHeadNode 2 | ========= 3 | 4 | Configure the ParallelCluster head node for EDA workloads 5 | 6 | Requirements 7 | ------------ 8 | 9 | This is meant to be run on a ParallelCluster head node that has been configured using aws-eda-slurm-cluster. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | Example Playbook 20 | ---------------- 21 | 22 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 23 | 24 | - hosts: servers 25 | roles: 26 | - { role: username.rolename, x: 42 } 27 | 28 | License 29 | ------- 30 | 31 | BSD 32 | 33 | Author Information 34 | ------------------ 35 | 36 | Allan Carter 37 | cartalla@amazon.com 38 | AWS Specialist Solutions Architect 39 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/files/etc/enroot/enroot.conf: -------------------------------------------------------------------------------- 1 | #ENROOT_LIBRARY_PATH /usr/lib/enroot 2 | #ENROOT_SYSCONF_PATH /etc/enroot 3 | ENROOT_RUNTIME_PATH /run/enroot/runtime/user-$(id -u) 4 | ENROOT_DATA_PATH /run/enroot/data/user-$(id -u) 5 | ENROOT_CONFIG_PATH /var/enroot/config/user-$(id -u) 6 | ENROOT_CACHE_PATH /var/enroot/cache/group-$(id -g) 7 | #ENROOT_TEMP_PATH ${TMPDIR:-/tmp} 8 | 9 | # Gzip program used to uncompress digest layers. 10 | #ENROOT_GZIP_PROGRAM gzip 11 | 12 | # Options passed to zstd to compress digest layers. 13 | #ENROOT_ZSTD_OPTIONS -1 14 | 15 | # Options passed to mksquashfs to produce container images. 
16 | ENROOT_SQUASH_OPTIONS -noI -noD -noF -noX -no-duplicates 17 | 18 | # Make the container root filesystem writable by default. 19 | ENROOT_ROOTFS_WRITABLE yes 20 | 21 | # Remap the current user to root inside containers by default. 22 | #ENROOT_REMAP_ROOT no 23 | 24 | # Maximum number of processors to use for parallel tasks (0 means unlimited). 25 | #ENROOT_MAX_PROCESSORS $(nproc) 26 | 27 | # Maximum number of concurrent connections (0 means unlimited). 28 | #ENROOT_MAX_CONNECTIONS 10 29 | 30 | # Maximum time in seconds to wait for connections establishment (0 means unlimited). 31 | #ENROOT_CONNECT_TIMEOUT 30 32 | 33 | # Maximum time in seconds to wait for network operations to complete (0 means unlimited). 34 | #ENROOT_TRANSFER_TIMEOUT 0 35 | 36 | # Number of times network operations should be retried. 37 | #ENROOT_TRANSFER_RETRIES 0 38 | 39 | # Use a login shell to run the container initialization. 40 | #ENROOT_LOGIN_SHELL yes 41 | 42 | # Allow root to retain his superuser privileges inside containers. 43 | #ENROOT_ALLOW_SUPERUSER no 44 | 45 | # Use HTTP for outgoing requests instead of HTTPS (UNSECURE!). 46 | #ENROOT_ALLOW_HTTP no 47 | 48 | # Include user-specific configuration inside bundles by default. 49 | #ENROOT_BUNDLE_ALL no 50 | 51 | # Generate an embedded checksum inside bundles by default. 52 | #ENROOT_BUNDLE_CHECKSUM no 53 | 54 | # Mount the current user's home directory by default. 55 | ENROOT_MOUNT_HOME no 56 | 57 | # Restrict /dev inside the container to a minimal set of devices. 58 | ENROOT_RESTRICT_DEV no 59 | 60 | # Always use --force on command invocations. 61 | #ENROOT_FORCE_OVERRIDE no 62 | 63 | # SSL certificates settings: 64 | #SSL_CERT_DIR 65 | #SSL_CERT_FILE 66 | 67 | # Proxy settings: 68 | #all_proxy 69 | #no_proxy 70 | #http_proxy 71 | #https_proxy 72 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/files/opt/slurm/config/accounts.yml.example: -------------------------------------------------------------------------------- 1 | --- 2 | jenkins: 3 | fairshare: 10 4 | users: 5 | - jenkins 6 | 7 | project1: 8 | fairshare: 90 9 | 10 | project1-dv: 11 | parent: project1 12 | fairshare: 80 13 | users: 14 | - dvuser1 15 | project1-pd: 16 | parent: project1 17 | fairshare: 10 18 | users: 19 | - pduser1 20 | project1-rtl: 21 | parent: project1 22 | fairshare: 10 23 | users: 24 | - rtluser1 25 | project1-fv: 26 | parent: project1 27 | fairshare: 10 28 | users: 29 | - fvuser1 30 | 31 | project2: 32 | fairshare: 90 33 | 34 | project2-dv: 35 | parent: project2 36 | fairshare: 80 37 | users: 38 | - dvuser1 39 | project2-pd: 40 | parent: project2 41 | fairshare: 10 42 | users: 43 | - pduser1 44 | project2-rtl: 45 | parent: project2 46 | fairshare: 10 47 | users: 48 | - rtluser1 49 | project2-fv: 50 | parent: project2 51 | fairshare: 10 52 | users: 53 | - fvuser1 54 | 55 | infrastructure: 56 | fairshare: 10 57 | users: 58 | - user1 59 | - user2 60 | 61 | # Account for unassigned users so that they belong to an account. 
62 | unassigned: 63 | fairshare: 1 64 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/files/opt/slurm/etc/oci.conf: -------------------------------------------------------------------------------- 1 | EnvExclude="^(SLURM_CONF|SLURM_CONF_SERVER)=" 2 | RunTimeEnvExclude="^(SLURM_CONF|SLURM_CONF_SERVER)=" 3 | RunTimeQuery="runc --rootless=true --root=/run/user/%U/ state %n.%u.%j.%s.%t" 4 | RunTimeKill="runc --rootless=true --root=/run/user/%U/ kill -a %n.%u.%j.%s.%t" 5 | RunTimeDelete="runc --rootless=true --root=/run/user/%U/ delete --force %n.%u.%j.%s.%t" 6 | RunTimeRun="runc --rootless=true --root=/run/user/%U/ run %n.%u.%j.%s.%t -b %b" 7 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/files/opt/slurm/etc/plugstack.conf.d/pyxis.conf: -------------------------------------------------------------------------------- 1 | # Configure pyxis 2 | required /usr/local/lib/slurm/spank_pyxis.so runtime_path=/run/pyxis 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-high-throughput.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Update for a large, high throughput cluster 3 | # See https://slurm.schedmd.com/big_sys.html 4 | # See https://slurm.schedmd.com/high_throughput.html 5 | 6 | - name: Set eth0 txqueuelen to 4096 7 | shell: ifconfig eth0 txqueuelen 4096 8 | 9 | # ParallelCluster sets this to 360,272 10 | # This is set to 392,837 by default on Amazon Linux 2 11 | # - name: Configure kernel parameter fs.file-max 12 | # sysctl: 13 | # name: fs.file-max 14 | # value: "392837" 15 | # sysctl_file: /etc/sysctl.d/slurmctl.conf 16 | 17 | # ParallelCluster sets this to 65535 18 | # - name: Configure kernel parameter net.ipv4.tcp_max_syn_backlog 19 | # sysctl: 20 | # name: net.ipv4.tcp_max_syn_backlog 21 | # value: "4096" 22 | # sysctl_file: /etc/sysctl.d/slurmctl.conf 23 | 24 | # ParallelCluster sets this to 1 25 | # - name: Configure kernel parameter net.ipv4.tcp_syncookies 26 | # sysctl: 27 | # name: net.ipv4.tcp_syncookies 28 | # value: "1" 29 | # sysctl_file: /etc/sysctl.d/slurmctl.conf 30 | 31 | # ParallelCluster sets this to 65535 32 | # - name: Configure kernel parameter net.core.somaxconn 33 | # sysctl: 34 | # name: net.core.somaxconn 35 | # value: "4096" 36 | # sysctl_file: /etc/sysctl.d/slurmctl.conf 37 | 38 | 39 | - name: Create /etc/rc.d/rc.local to set ifconfig eth0 txqueuelen 4096 40 | template: 41 | dest: /etc/rc.d/rc.local 42 | src: etc/rc.d/rc.local 43 | owner: root 44 | group: root 45 | mode: 0700 46 | force: yes 47 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-licenses.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Show vars used in this playbook 4 | debug: 5 | msg: | 6 | accounting_storage_host: {{ accounting_storage_host }} 7 | cluster_name: {{ cluster_name }} 8 | licenses: {{ licenses }} 9 | primary_controller: {{ primary_controller }} 10 | slurm_bin_dir: {{ slurm_bin_dir }} 11 | 12 | - name: Configure remote licenses 13 | # This uses sacctmcr so must do this after slurmctld and slurmd are working. 
14 | when: primary_controller|bool and accounting_storage_host and licenses 15 | shell: 16 | cmd: | 17 | set -ex 18 | # Add or update configured licenses 19 | declare -A licenses 20 | {% for lic in licenses -%} 21 | license='{{ lic }}' 22 | # Using '@' for the port separator instead of ':' because sbatch doesn't work if ':' is in the server name. 23 | server='{% if 'Server' in licenses[lic] %}{{ licenses[lic].Server }}{% if 'Port' in licenses[lic] %}@{{ licenses[lic].Port }}{% endif %}{% else %}slurmdb{% endif %}' 24 | count='{{ licenses[lic].Count }}' 25 | licenses["$license@$server"]="$count" 26 | # Check to see if license has already been created 27 | slurm_license=$({{ slurm_bin_dir }}/sacctmgr -i show resource $license --parsable2 --noheader) 28 | if [ -z $slurm_license ]; then 29 | echo "$license license not in slurmdbd so add it" 30 | {{ slurm_bin_dir }}/sacctmgr -i add resource type=License name=$license server=$server{% if 'ServerType' in licenses[lic] %} servertype={{ licenses[lic].ServerType }}{% endif %} count={{ licenses[lic].Count }} cluster={{ cluster_name }} percentallowed=100 31 | else 32 | echo "$license already in slurmdbd so check count and percent allowed." 33 | slurmdb_count=$({{ slurm_bin_dir }}/sacctmgr -i show resource $license --parsable2 --noheader | cut -d '|' -f 4) 34 | echo "slurmdb count=$slurmdb_count" 35 | if [[ $count != $slurmdb_count ]]; then 36 | echo "Update $license count from $slurmdb_count to $count" 37 | {{ slurm_bin_dir }}/sacctmgr -i modify resource name=$license server=$server set count=$count 38 | fi 39 | 40 | slurmdb_percent_allowed=$({{ slurm_bin_dir }}/sacctmgr -i show resource $license --parsable2 --noheader | cut -d '|' -f 6) 41 | if [[ "100" != $slurmdb_percent_allowed ]]; then 42 | {{ slurm_bin_dir }}/sacctmgr -i modify resource name=$license server=$server cluster={{ cluster_name }} set percentallowed=100 43 | fi 44 | fi 45 | {% endfor -%} 46 | 47 | # Remove deleted licenses 48 | configured_licenses_and_servers=( $({{ slurm_bin_dir }}/sacctmgr --noheader --parsable2 show resource Clusters={{ cluster_name }} format=name,server) ) 49 | echo ${configured_licenses_and_servers[@]} 50 | for configured_license_and_server in ${configured_licenses_and_servers[@]}; do 51 | configured_license=$(echo $configured_license_and_server | cut -d '|' -f 1) 52 | configured_server=$(echo $configured_license_and_server | cut -d '|' -f 2) 53 | if [ -z ${licenses["$configured_license@$configured_server"]} ]; then 54 | {{ slurm_bin_dir }}/sacctmgr -i delete resource name=$configured_license server=$configured_server 55 | fi 56 | done 57 | 58 | register: remote_slurm_licenses_conf_result 59 | # ignore_errors: true 60 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-oci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Create oci.conf 4 | when: primary_controller|bool 5 | copy: 6 | dest: "/opt/slurm/etc/oci.conf" 7 | src: opt/slurm/etc/oci.conf 8 | owner: root 9 | group: root 10 | mode: 0644 11 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-pyxis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # https://docs.aws.amazon.com/en_us/parallelcluster/latest/ug/tutorials_11_running-containerized-jobs-with-pyxis.html 4 | 5 | - name: Set enroot and 
pyxis facts 6 | set_fact: 7 | enroot_persistent_dir: '/var/enroot' 8 | enroot_volatile_dir: '/run/enroot' 9 | pyxis_runtime_dir: '/run/pyxis' 10 | 11 | - name: Show variables used by this role 12 | debug: 13 | msg: | 14 | primary_controller: {{ primary_controller }} 15 | 16 | enable_pyxis: {{ enable_pyxis }} 17 | enroot_persistent_dir: {{ enroot_persistent_dir }} 18 | enroot_volatile_dir: {{ enroot_volatile_dir }} 19 | pyxis_runtime_dir: {{ pyxis_runtime_dir }} 20 | 21 | - name: Configure OCI 22 | when: primary_controller|bool 23 | copy: 24 | dest: "/opt/slurm/etc/oci.conf" 25 | src: opt/slurm/etc/oci.conf 26 | owner: root 27 | group: root 28 | mode: 0644 29 | 30 | - name: Create {{ enroot_persistent_dir }} 31 | when: primary_controller|bool 32 | file: 33 | path: "{{ enroot_persistent_dir }}" 34 | state: directory 35 | owner: root 36 | group: root 37 | mode: 01777 38 | 39 | - name: Create {{ enroot_volatile_dir }} 40 | when: primary_controller|bool 41 | file: 42 | path: "{{ enroot_volatile_dir }}" 43 | state: directory 44 | owner: root 45 | group: root 46 | mode: 01777 47 | 48 | - name: Create {{ pyxis_runtime_dir }} 49 | when: primary_controller|bool 50 | file: 51 | path: "{{ pyxis_runtime_dir }}" 52 | state: directory 53 | owner: root 54 | group: root 55 | mode: 01777 56 | 57 | - name: Create /opt/slurm/etc/plugstack.conf.d 58 | when: primary_controller|bool 59 | file: 60 | path: "/opt/slurm/etc/plugstack.conf.d" 61 | state: directory 62 | owner: root 63 | group: root 64 | mode: 0755 65 | 66 | - name: Create /etc/enroot/enroot.conf 67 | when: primary_controller|bool 68 | copy: 69 | dest: "/etc/enroot/enroot.conf" 70 | src: etc/enroot/enroot.conf 71 | owner: root 72 | group: root 73 | mode: 0644 74 | 75 | - name: Delete /opt/slurm/etc/plugstack.conf 76 | when: primary_controller|bool and not enable_pyxis|bool 77 | file: 78 | path: "/opt/slurm/etc/plugstack.conf" 79 | state: absent 80 | 81 | - name: Create /opt/slurm/etc/plugstack.conf 82 | when: primary_controller|bool and enable_pyxis|bool 83 | template: 84 | dest: "/opt/slurm/etc/plugstack.conf" 85 | src: opt/slurm/etc/plugstack.conf 86 | owner: root 87 | group: root 88 | mode: 0644 89 | backup: false 90 | 91 | - name: Create /opt/slurm/etc/plugstack.conf.d/pyxis.conf 92 | when: primary_controller|bool 93 | copy: 94 | dest: "/opt/slurm/etc/plugstack.conf.d/pyxis.conf" 95 | src: opt/slurm/etc/plugstack.conf.d/pyxis.conf 96 | owner: root 97 | group: root 98 | mode: 0644 99 | backup: false 100 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-slurmdb-accounts.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Install python3 4 | yum: 5 | state: present 6 | name: 7 | - python3 8 | - python3-pip 9 | 10 | - name: Install pyyaml 11 | pip: 12 | executable: /usr/bin/pip3 13 | state: present 14 | name: 15 | - pyyaml 16 | 17 | - name: Create {{ slurm_config_dir }}/bin/create_slurm_accounts.py 18 | when: primary_controller|bool 19 | copy: 20 | dest: "{{ slurm_config_dir }}/bin/create_slurm_accounts.py" 21 | src: opt/slurm/config/bin/create_slurm_accounts.py 22 | owner: root 23 | group: root 24 | mode: 0755 25 | 26 | - name: Create {{ slurm_config_dir }}/accounts.yml 27 | when: primary_controller|bool 28 | copy: 29 | dest: "{{ slurm_config_dir }}/accounts.yml" 30 | src: opt/slurm/config/accounts.yml.example 31 | owner: root 32 | group: root 33 | mode: 0664 34 | backup: yes 35 | force: 
false # Don't overwrite changes 36 | 37 | # create_slurm_accounts.py writes a logfile to /var/logs/slurm 38 | - name: Create /var/log/slurm 39 | when: primary_controller|bool 40 | file: 41 | path: "/var/log/slurm" 42 | state: directory 43 | owner: root 44 | group: root 45 | mode: 0755 46 | 47 | - name: Run {{ slurm_config_dir }}/bin/create_slurm_accounts.py to make sure it works 48 | # This uses sacctmcr so must do this after slurmctld and slurmdbd are working. 49 | when: primary_controller|bool and accounting_storage_host and licenses 50 | shell: 51 | cmd: | 52 | set -ex 53 | 54 | export SLURM_ROOT={{ slurm_root }} 55 | {{ slurm_config_dir }}/bin/create_slurm_accounts.py --accounts {{ slurm_config_dir }}/accounts.yml --users {{ slurm_config_dir }}/users_groups.json --default-account unassigned -d 56 | 57 | - name: Create /etc/cron.d/slurm_accounts 58 | when: primary_controller|bool 59 | template: 60 | src: etc/cron.d/slurm_accounts 61 | dest: /etc/cron.d/slurm_accounts 62 | owner: root 63 | group: root 64 | mode: 0600 65 | force: yes 66 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-sshd.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Allow X11 forwarding to the head node 4 | shell: 5 | cmd: | 6 | set -ex 7 | 8 | grep -v X11Forwarding /etc/ssh/sshd_config > /etc/ssh/sshd_config.new 9 | echo "X11Forwarding yes" >> /etc/ssh/sshd_config.new 10 | if diff /etc/ssh/sshd_config /etc/ssh/sshd_config.new; then 11 | rm -f /etc/ssh/sshd_config.new 12 | else 13 | mv -f /etc/ssh/sshd_config.new /etc/ssh/sshd_config 14 | systemctl restart sshd 15 | fi 16 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-users-groups.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Create/Update Users 4 | when: primary_controller|bool 5 | template: 6 | src: etc/cron.d/slurm_users_groups 7 | dest: /etc/cron.d/slurm_users_groups 8 | owner: root 9 | group: root 10 | mode: 0600 11 | force: yes 12 | 13 | - name: Test create_users_groups.py 14 | shell: | 15 | set -ex 16 | 17 | {{ slurm_config_dir }}/bin//create_users_groups.py -i {{ slurm_config_dir }}/users_groups.json 18 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - { include_tasks: config-users-groups.yml, tags: users-groups } 4 | - { include_tasks: config-high-throughput.yml, tags: high-throughput } 5 | - { include_tasks: config-external-login-node-access.yml, tags: external-login-node } 6 | - { include_tasks: config-sshd.yml, tags: sshd } 7 | - { include_tasks: config-slurmrestd.yml, tags: slurmrestd } 8 | - { include_tasks: config-licenses.yml, tags: licenses } 9 | - { include_tasks: config-slurmdb-accounts.yml, tags: accounts } 10 | - { include_tasks: config-oci.yml } 11 | - { include_tasks: config-pyxis.yml } 12 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/etc/cron.d/slurm_accounts: -------------------------------------------------------------------------------- 1 | MAILTO='' 2 | SLURM_ROOT={{ slurm_root }} 3 | PATH="{{ 
slurm_config_dir }}/bin:{{ slurm_bin_dir }}:/sbin:/bin:/usr/sbin:/usr/bin" 4 | */30 * * * * root {{ slurm_config_dir }}/bin/create_slurm_accounts.py --accounts {{ slurm_config_dir }}/accounts.yml --users {{ slurm_config_dir }}/users_groups.json --default-account unassigned 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/etc/cron.d/slurm_users_groups: -------------------------------------------------------------------------------- 1 | MAILTO='' 2 | SLURM_ROOT={{ slurm_root }} 3 | PATH="{{ slurm_config_dir }}/bin:{{ slurm_bin_dir }}:/sbin:/bin:/usr/sbin:/usr/bin" 4 | */5 * * * * root {{ slurm_config_dir }}/bin/create_users_groups.py -i {{ slurm_config_dir }}/users_groups.json 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/etc/cron.d/update_slurmrestd_jwt_for_root: -------------------------------------------------------------------------------- 1 | MAILTO='' 2 | SLURM_ROOT={{ slurm_root }} 3 | PATH="{{ slurm_scripts_dir }}:{{ slurm_bin_dir }}:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin" 4 | */30 * * * * root {{ slurm_config_dir }}/bin/update_slurmrestd_jwt_parameter.sh root {{ slurmrestd_jwt_for_root_parameter }} 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/etc/cron.d/update_slurmrestd_jwt_for_slurmrestd: -------------------------------------------------------------------------------- 1 | MAILTO='' 2 | SLURM_ROOT={{ slurm_root }} 3 | PATH="{{ slurm_config_dir }}/bin:{{ slurm_scripts_dir }}:{{ slurm_bin_dir }}:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin" 4 | */30 * * * * root {{ slurm_config_dir }}/bin/update_slurmrestd_jwt_parameter.sh slurmrestd {{ slurmrestd_jwt_for_slurmrestd_parameter }} 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/etc/rc.d/rc.local: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ifconfig eth0 txqueuelen 4096 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/etc/sysconfig/slurmrestd: -------------------------------------------------------------------------------- 1 | # Set environment variables for slurmrestd 2 | # Included in /etc/systemd/system/slurmrestd.service 3 | 4 | LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64 5 | 6 | SLURMRESTD_OPTIONS=-u slurmrestd -g slurmrestd -a rest_auth/jwt 7 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/etc/systemd/system/slurmrestd.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Slurm REST daemon 3 | After=network-online.target slurmctld.service remote-fs.target 4 | ConditionPathExists={{ slurm_conf }} 5 | 6 | [Service] 7 | Type=simple 8 | EnvironmentFile=-/etc/sysconfig/slurmrestd 9 | EnvironmentFile=-/etc/default/slurmrestd 10 | Environment="SLURM_JWT=daemon" 11 | # slurmrestd should not run as root or the slurm user. 
12 | # Please either use the -u and -g options in /etc/sysconfig/slurmrestd or 13 | # /etc/default/slurmrestd, or explicitly set the User and Group in this file 14 | # an unpriviledged user to run as. 15 | # User= 16 | # Group= 17 | # Default to listen on both socket and slurmrestd port 18 | #ExecStart={{ slurm_sbin_dir }}/slurmrestd $SLURMRESTD_OPTIONS 0.0.0.0:{{ slurmrestd_port }} unix:{{ slurmrestd_socket }} 19 | ExecStart={{ slurm_sbin_dir }}/slurmrestd $SLURMRESTD_OPTIONS 0.0.0.0:{{ slurmrestd_port }} 20 | # Enable auth/jwt be default, comment out the line to disable it for slurmrestd 21 | ExecReload=/bin/kill -HUP $MAINPID 22 | Restart=on-failure 23 | 24 | [Install] 25 | WantedBy=multi-user.target 26 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/opt/slurm/config/bin/update_slurmrestd_jwt_parameter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, 8 | # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 9 | # permit persons to whom the Software is furnished to do so. 10 | 11 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 12 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 13 | # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 14 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 16 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | exec 1> >(logger -s -t update_slurmrestd_jwt_parameter) 2>&1 19 | 20 | userid=$1 21 | parameter_name=$2 22 | 23 | . 
<({{ slurm_bin_dir }}/scontrol token) 24 | # SLURM_JWT set by scontrol token command 25 | aws ssm put-parameter --region {{ region }} --name $parameter_name --type String --value "$SLURM_JWT" --overwrite 26 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/opt/slurm/etc/plugstack.conf: -------------------------------------------------------------------------------- 1 | 2 | include /opt/slurm/{{ cluster_name }}/etc/plugstack.conf.d/* 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/opt/slurm/modules/modulefiles/slurm/.template: -------------------------------------------------------------------------------- 1 | #%Module1.0 2 | ######################################## 3 | ## 4 | ## SLURM modulefile 5 | ## 6 | ## Docs: https://modules.readthedocs.io/en/latest/modulefile.html 7 | ######################################## 8 | 9 | set nicename "SLURM" 10 | 11 | set thisname [ module-info name ] 12 | set namelen [llength [split $thisname "/"]] 13 | set toolname [lindex [split $thisname "/" ] $namelen-2 ] 14 | set version [lindex [split $thisname "/" ] end ] 15 | 16 | proc ModulesHelp { } { 17 | global thisname toolname nicename 18 | puts stderr "\t$thisname - loads the env for $nicename\n" 19 | } 20 | 21 | module-whatis "loads the env for $toolname version $version" 22 | 23 | prepend-path LD_LIBRARY_PATH {{ slurm_os_dir }}/lib/slurm 24 | 25 | # This overrides the "search" line in /etc/resolv.conf so that the pcluster route53 zone is used to resolve compute node host names. 26 | setenv LOCALDOMAIN "ec2.internal {{ cluster_name }}.pcluster" 27 | 28 | prepend-path PATH {{ slurm_os_dir }}/bin 29 | 30 | setenv SLURM_CONF {{ external_login_node_slurm_base_dir }}/etc/slurm.conf 31 | 32 | setenv SLURM_CLUSTER_NAME {{ cluster_name }} 33 | 34 | # squeue defaults 35 | # 36 | # Output format 37 | # If SQUEUE_FORMAT or SQUEUE_FORMAT2 already set then doen't change them. 38 | # This is to avoid overwriting a user's settings. 39 | # 40 | # SQUEUE_FORMAT doesn't allow all fields so prefer SQUEUE_FORMAT2 41 | # default: "%.18i %9P %8j %8u %12T %.10M %.6D %R" 42 | # JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 43 | # --long: "%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R" 44 | # JOBID PARTITION NAME USER STATE TIME TIME_LIMI NODES NODELIST(REASON) 45 | #setenv SQUEUE_FORMAT '%.9i %.19P %.9j %.8u %.10T %R' 46 | # 47 | # SQUEUE_FORMAT2 has all available fields 48 | # BatchHost: Similar to NodeList 49 | # ClusterFeature 50 | # 51 | # Priority 52 | # Reason 53 | # 54 | # TimeUsed 55 | # tres-alloc 56 | # UserId 57 | if { [ module-info mode load ] || [ module-info mode display ] } { 58 | if { ! [ info exists ::env(SQUEUE_SORT) ] } { 59 | # Sort by state, priority 60 | setenv SQUEUE_SORT "t,-p" 61 | setenv SQUEUE_SORT_SET "" 62 | } 63 | if { ! 
( [ info exists ::env(SQUEUE_FORMAT) ] || [ info exists ::env(SQUEUE_FORMAT2) ] ) } { 64 | setenv SQUEUE_FORMAT2 "Cluster:16 ,Partition:15 ,JobArrayId:16 ,Priority:12 ,State:11 ,UserName:8 ,Name:16 ,NumNodes:.5 ,NumCPUs:.4 ,MinMemory:.10 ,Feature:15 ,Dependency:10 ,Licenses:8 ,ReasonList:35" 65 | # 66 | # Time and priority information 67 | #setenv SQUEUE_FORMAT2 "JobId:.6 ,Partition:9 ,State:7 ,UserName:8 ,Name:16 ,SubmitTime:16 ,PendingTime:12 ,TimeLimit:18 ,EndTime:18 ,ReasonList" 68 | setenv SQUEUE_FORMAT2_SET "" 69 | } 70 | } elseif { [ module-info mode remove ] } { 71 | if { [ info exists ::env(SQUEUE_SORT_SET) ] } { 72 | unsetenv SQUEUE_SORT 73 | unsetenv SQUEUE_SORT_SET 74 | } 75 | if { [ info exists ::env(SQUEUE_FORMAT2_SET) ] } { 76 | unsetenv SQUEUE_FORMAT2 77 | unsetenv SQUEUE_FORMAT2_SET 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/ParallelClusterHeadNode/templates/opt/slurm/modules/modulefiles/slurm/.version: -------------------------------------------------------------------------------- 1 | #%Module1.0 2 | ## 3 | ## Default version of this module 4 | ## 5 | set ModulesVersion "{{ parallel_cluster_version }}" 6 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/all/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
39 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/all/templates/etc/sudoers.d/10-admins: -------------------------------------------------------------------------------- 1 | %Admins@{{domain}} ALL=(ALL:ALL) NOPASSWD:ALL 2 | %sudoers@{{domain}} ALL=(ALL:ALL) NOPASSWD:AL 3 | %admins ALL=(ALL:ALL) NOPASSWD: ALL 4 | %sudoers ALL=(ALL:ALL) NOPASSWD: ALL 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/all/templates/usr/bin/pip3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python3 -m pip $@ 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/all/templates/var/lib/cloud/scripts/per-boot/90_mount_ssds.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | if ! yum list installed nvme-cli; then 4 | yum -y install nvme-cli 5 | fi 6 | 7 | ssds=( $(nvme list | grep 'Amazon EC2 NVMe Instance Storage' | awk '{print $1}') ) 8 | if [ -z "$ssds" ]; then 9 | echo "No nvme SSDs found" 10 | # Search for ssd block devices 11 | ssds=() 12 | ephemerals=( ephemeral0 ephemeral1 ephemeral2 ephemeral3 ) 13 | for ephemeral in ${ephemerals[@]}; do 14 | device=$(cat /run/cloud-init/instance-data.json | jq -r ".ds.\"meta-data\".\"block-device-mapping\".\"$ephemeral\"") 15 | if [ ":$device" = ":" ] || [ ":$device" = ":null" ]; then 16 | continue 17 | fi 18 | if [ -e /dev/$device ]; then 19 | ssds+=(/dev/${device/sd/xvd}) 20 | continue 21 | fi 22 | device=${device/sd/xvd} 23 | if [ -e /dev/$device ]; then 24 | ssds+=(/dev/$device) 25 | fi 26 | done 27 | fi 28 | {% raw %} 29 | if [[ ${#ssds} == 0 ]]; then 30 | {% endraw %} 31 | echo "No SSDs found" 32 | exit 0 33 | fi 34 | {% raw %} 35 | echo "Found ${#ssds[@]} SSDs: ${ssds[@]}" 36 | {% endraw %} 37 | if ! yum list installed lvm2; then 38 | yum -y install lvm2 39 | fi 40 | for ssd in ${ssds[@]}; do 41 | if pvs $ssd; then 42 | echo "Physical volumes already exist: pv$ssd" 43 | else 44 | pvcreate $ssd 45 | fi 46 | done 47 | if vgs vgssd; then 48 | echo "vgssd volume group already exists" 49 | else 50 | vgcreate vgssd ${ssds[@]} 51 | fi 52 | if lvs vgssd/tmp; then 53 | echo "vgssd/tmp logical volume exists" 54 | else 55 | lvcreate -n tmp -l 100%VG vgssd 56 | mkfs.ext4 /dev/vgssd/tmp 57 | fi 58 | if [ ! -d /ssd ]; then 59 | mkdir /ssd 60 | else 61 | echo "/ssd already exists" 62 | fi 63 | if [ ! -d /mnt/ssd/tmp ]; then 64 | mkdir -p /mnt/ssd/tmp 65 | else 66 | echo "/mnt/ssd/tmp already exists" 67 | fi 68 | if ! findmnt --source /dev/vgssd/tmp --target /mnt/ssd/tmp; then 69 | mount /dev/vgssd/tmp /mnt/ssd/tmp 70 | chmod a+rwx /mnt/ssd/tmp 71 | fi 72 | if ! findmnt --source /dev/vgssd/tmp --target /ssd; then 73 | mount /dev/vgssd/tmp /ssd 74 | chmod a+rwx /ssd 75 | fi 76 | if ! findmnt --source /dev/vgssd/tmp --target /tmp; then 77 | mount /dev/vgssd/tmp /tmp 78 | chmod a+rwx /tmp 79 | fi 80 | 81 | ssd_size=$(lvs -o lv_size --units b --noheadings --nosuffix vgssd/tmp) 82 | 83 | 84 | if [ ! 
-e /tmp/swapfile ]; then 85 | swap_size=$( expr $ssd_size / 2 ) 86 | fallocate -l $swap_size /tmp/swapfile 87 | chmod 0600 /tmp/swapfile 88 | mkswap /tmp/swapfile 89 | swapon /tmp/swapfile 90 | free 91 | fi 92 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/all/tests/inventory: -------------------------------------------------------------------------------- 1 | localhost 2 | 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/all/tests/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: localhost 3 | remote_user: root 4 | roles: 5 | - all -------------------------------------------------------------------------------- /source/resources/playbooks/roles/bug_fixes/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
39 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/bug_fixes/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for bug_fixes 3 | 4 | - name: Install bug fixes 5 | yum: 6 | bugfix: yes 7 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/bug_fixes/tests/inventory: -------------------------------------------------------------------------------- 1 | localhost 2 | 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/bug_fixes/tests/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: localhost 3 | remote_user: root 4 | roles: 5 | - bug_fixes -------------------------------------------------------------------------------- /source/resources/playbooks/roles/cloudwatch_agent/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/cloudwatch_agent/templates/slurm_node_cloudwatch_agent_config.sh: -------------------------------------------------------------------------------- 1 | --- 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | logs: 5 | logs_collected: 6 | files: 7 | collect_list: 8 | - file_path: /var/log/cfn-init.log 9 | log_group_name: cfn-init.log 10 | log_stream_name: "{instance_id}" 11 | - file_path: /opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log 12 | log_group_name: amazon-cloudwatch-agent.log 13 | log_stream_name: "{instance_id}" 14 | - file_path: /var/log/messages 15 | log_group_name: messages 16 | log_stream_name: "{instance_id}" 17 | - file_path: /var/log/secure 18 | log_group_name: secure 19 | log_stream_name: "{instance_id}" 20 | - file_path: /var/log/slurm/slurmd.log 21 | log_group_name: slurmd.log 22 | log_stream_name: "{instance_id}" 23 | metrics: 24 | append_dimensions: 25 | InstanceId: "${aws:InstanceId}" 26 | metrics_collected: 27 | collectd: 28 | metrics_aggregation_interval: 60 29 | cpu: 30 | measurement: 31 | - cpu_usage_idle 32 | - cpu_usage_iowait 33 | - cpu_usage_user 34 | - cpu_usage_system 35 | metrics_collection_interval: 60 36 | resources: ["*"] 37 | totalcpu: true 38 | disk: 39 | measurement: 40 | - used_percent 41 | - inodes_free 42 | metrics_collection_interval: 60 43 | resources: ["/"] 44 | diskio: 45 | measurement: 46 | - io_time 47 | - write_bytes 48 | - read_bytes 49 | - writes 50 | - reads 51 | metrics_collection_interval: 60 52 | resources: ["*"] 53 | netstat: 54 | measurement: 55 | - tcp_established 56 | - tcp_time_wait 57 | metrics_collection_interval: 60 58 | swap: 59 | measurement: 60 | - swap_used_percent 61 | metrics_collection_interval: 60 62 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/cloudwatch_agent/templates/slurmctl_cloudwatch_agent_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # CloudWatch Agent config for slurmctl 3 | logs: 4 | logs_collected: 5 | files: 6 | collect_list: 7 | - file_path: /var/log/cfn-init.log 8 | log_group_name: cfn-init.log 9 | log_stream_name: "{instance_id}" 10 | - file_path: 
/opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log 11 | log_group_name: amazon-cloudwatch-agent.log 12 | log_stream_name: "{instance_id}" 13 | - file_path: /var/log/messages 14 | log_group_name: messages 15 | log_stream_name: "{instance_id}" 16 | - file_path: /var/log/secure 17 | log_group_name: secure 18 | log_stream_name: "{instance_id}" 19 | - file_path: /var/log/slurm/slurmctld.log 20 | log_group_name: slurmctld.log 21 | log_stream_name: "{instance_id}" 22 | - file_path: /var/log/slurm/power_save.log 23 | log_group_name: power_save.log 24 | log_stream_name: "{instance_id}" 25 | metrics: 26 | append_dimensions: 27 | InstanceId: "${aws:InstanceId}" 28 | metrics_collected: 29 | collectd: 30 | metrics_aggregation_interval: 60 31 | cpu: 32 | measurement: 33 | - cpu_usage_idle 34 | - cpu_usage_iowait 35 | - cpu_usage_user 36 | - cpu_usage_system 37 | metrics_collection_interval: 60 38 | resources: ["*"] 39 | totalcpu: true 40 | disk: 41 | measurement: 42 | - used_percent 43 | - inodes_free 44 | metrics_collection_interval: 60 45 | resources: ["/"] 46 | diskio: 47 | measurement: 48 | - io_time 49 | - write_bytes 50 | - read_bytes 51 | - writes 52 | - reads 53 | metrics_collection_interval: 60 54 | resources: ["*"] 55 | netstat: 56 | measurement: 57 | - tcp_established 58 | - tcp_time_wait 59 | metrics_collection_interval: 60 60 | swap: 61 | measurement: 62 | - swap_used_percent 63 | metrics_collection_interval: 60 64 | procstat: 65 | - pid_file: "/var/run/slurmctld.pid" 66 | measurement: 67 | - cpu_time 68 | - cpu_usage 69 | - memory_data 70 | - memory_locked 71 | - memory_rss 72 | - memory_stack 73 | - memory_swap 74 | - memory_vms 75 | - pid_count 76 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/cloudwatch_agent/templates/slurmdbd_cloudwatch_agent_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # CloudWatch Agent config for slurmdbd 3 | logs: 4 | logs_collected: 5 | files: 6 | collect_list: 7 | - file_path: /var/log/cfn-init.log 8 | log_group_name: cfn-init.log 9 | log_stream_name: "{instance_id}" 10 | - file_path: /opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log 11 | log_group_name: amazon-cloudwatch-agent.log 12 | log_stream_name: "{instance_id}" 13 | - file_path: /var/log/messages 14 | log_group_name: messages 15 | log_stream_name: "{instance_id}" 16 | - file_path: /var/log/secure 17 | log_group_name: secure 18 | log_stream_name: "{instance_id}" 19 | - file_path: /var/log/slurm/slurmd.log 20 | log_group_name: slurm 21 | log_stream_name: "{instance_id}" 22 | metrics: 23 | append_dimensions: 24 | InstanceId: "${aws:InstanceId}" 25 | metrics_collected: 26 | collectd: 27 | metrics_aggregation_interval: 60 28 | cpu: 29 | measurement: 30 | - cpu_usage_idle 31 | - cpu_usage_iowait 32 | - cpu_usage_user 33 | - cpu_usage_system 34 | metrics_collection_interval: 60 35 | resources: ["*"] 36 | totalcpu: true 37 | disk: 38 | measurement: 39 | - used_percent 40 | - inodes_free 41 | metrics_collection_interval: 60 42 | resources: ["/"] 43 | diskio: 44 | measurement: 45 | - io_time 46 | - write_bytes 47 | - read_bytes 48 | - writes 49 | - reads 50 | metrics_collection_interval: 60 51 | resources: ["*"] 52 | netstat: 53 | measurement: 54 | - tcp_established 55 | - tcp_time_wait 56 | metrics_collection_interval: 60 57 | swap: 58 | measurement: 59 | - swap_used_percent 60 | metrics_collection_interval: 60 61 | procstat: 62 | - pid_file: 
"/var/run/slurmdbd.pid" 63 | measurement: 64 | - cpu_time 65 | - cpu_usage 66 | - memory_data 67 | - memory_locked 68 | - memory_rss 69 | - memory_stack 70 | - memory_swap 71 | - memory_vms 72 | - pid_count 73 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/configure-rootless-docker/README.md: -------------------------------------------------------------------------------- 1 | configure-rootless-docker 2 | ========= 3 | 4 | Configure user to run rootless docker. 5 | 6 | License 7 | ------- 8 | 9 | mit0 10 | 11 | Author Information 12 | ------------------ 13 | 14 | Allan Carter (cartalla@amazon.com) 15 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/configure-rootless-docker/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Configure rootless docker 4 | shell: 5 | cmd: | 6 | dockerd-rootless-setuptool.sh install 7 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/eda_tools/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 39 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/eda_tools/templates/etc/profile.d/nodejs.csh: -------------------------------------------------------------------------------- 1 | 2 | setenv PATH {{nodejs_dir}}/bin:$PATH 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/eda_tools/templates/etc/profile.d/nodejs.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | export PATH={{nodejs_dir}}/bin:$PATH 5 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/eda_tools/tests/inventory: -------------------------------------------------------------------------------- 1 | localhost 2 | 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/eda_tools/tests/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: localhost 3 | remote_user: root 4 | roles: 5 | - eda_tools -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_infrastructure_optimizer/README.md: -------------------------------------------------------------------------------- 1 | exostellar_infrastructure_optimizer 2 | ========= 3 | 4 | Configure Exostellar Infrastructure Optimizer (XIO). 5 | 6 | This is run on the Slurm head node and uploads configuration files to the XIO management server using curl commands. 7 | 8 | Requirements 9 | ------------ 10 | 11 | Requires root permissions so that it can install the packages required by slurm. 12 | 13 | Role Variables 14 | -------------- 15 | cluster_name 16 | xio_mgt_ip 17 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/suspend_xspot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ############################################################################### 3 | ## Copyright (c) 2024 Exostellar Inc. All rights reserved. ## 4 | ## Email: support@exostellar.io ## 5 | ############################################################################### 6 | 7 | SLURM_AWS_LOG=/var/log/slurm/power_save.log 8 | 9 | exec 1> >(logger -s -t suspend_xspot.sh) 2>&1 10 | 11 | set -x 12 | set -e 13 | 14 | SLURM_BIN_PATH="/opt/slurm/{{ cluster_name }}/bin" 15 | SLURM_CONF_PATH="/opt/slurm/{{ cluster_name }}/etc" 16 | 17 | XCOMPUTE_HEAD_IP={{ xio_mgt_ip }} 18 | 19 | function suspend_xspot() 20 | { 21 | hostname=$1 22 | 23 | echo "xspot suspending $hostname" 24 | curl -v -X DELETE http://$XCOMPUTE_HEAD_IP:5000/v1/xcompute/vm/$hostname 25 | } 26 | 27 | echo "`date` Suspend invoked $0 $*" >> $SLURM_AWS_LOG 28 | 29 | {% raw -%} 30 | hostnames=$(${SLURM_BIN_PATH}/scontrol show hostnames $1) 31 | xspot_hostnames=( ) 32 | pc_hostnames=( ) 33 | for hostname in $hostnames 34 | do 35 | if [[ "$hostname" == "xspot-vm"* ]]; then 36 | xspot_hostnames+=( $hostname ) 37 | else 38 | pc_hostnames+=( $hostname ) 39 | fi 40 | done 41 | 42 | if [[ ${#pc_hostnames[@]} -gt 0 ]]; then 43 | pc_hostlist=$(${SLURM_BIN_PATH}/scontrol show hostlistsorted $(IFS=,; echo "${pc_hostnames[*]}")) 44 | echo "ParallelCluster suspending $pc_hostlist" 45 | /opt/parallelcluster/scripts/slurm/slurm_suspend $pc_hostlist 46 | fi 47 | 48 | if [[ ${#xspot_hostnames[@]} -gt 0 ]]; then 49 | for hostname in ${xspot_hostnames[@]}; do 50 | suspend_xspot $hostname 51 | done 52 | fi 53 | {% endraw %} 54 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/test_createVm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 |
############################################################################### 3 | ## Copyright (c) 2024 Exostellar Inc. All rights reserved. ## 4 | ## Email: support@exostellar.io ## 5 | ############################################################################### 6 | 7 | XCOMPUTE_HEAD_IP={{ xio_mgt_ip }} 8 | 9 | pool="" 10 | profile="" 11 | image_name="" 12 | host="" 13 | u_file="user_data.sh" 14 | 15 | while [ "$1" != "" ]; do 16 | case $1 in 17 | 18 | --pool ) 19 | shift 20 | pool=$1 21 | ;; 22 | --profile ) 23 | shift 24 | profile=$1 25 | ;; 26 | -i | --image_name ) 27 | shift 28 | image_name="$1" 29 | ;; 30 | -h | --host ) 31 | shift 32 | host="$1" 33 | ;; 34 | -u | --u_file ) 35 | shift 36 | u_file="$1" 37 | ;; 38 | --cpus ) 39 | shift 40 | cpus="$1" 41 | ;; 42 | --mem ) 43 | shift 44 | mem="$1" 45 | ;; 46 | * ) echo "USAGE: test_createVm.sh --pool --profile -i -h -u --cpus --mem " 47 | exit 1 48 | ;; 49 | esac 50 | shift 51 | done 52 | 53 | if [ -z "$pool" ] || [ -z "$profile" ] || [ -z "$image_name" ] || [ -z "$host" ] || [ -z "$cpus" ] || [ -z $mem ]; then 54 | echo "please provide --pool --profile -i -h --cpus --mem " 55 | exit 2 56 | fi 57 | 58 | if [ -f $u_file ]; then 59 | user_data=$(cat $u_file | sed "s/XSPOT_NODENAME/$host/g" | base64 -w 0) 60 | fi 61 | 62 | TMP_FILE=$(mktemp).json 63 | OUT_FILE=$(mktemp).json 64 | 65 | cat << END > $TMP_FILE 66 | { 67 | "NodeName": "$host", 68 | "PoolName": "$pool", 69 | "ProfileName": "$profile", 70 | "VM": { 71 | "CPUs": "$cpus", 72 | "ImageName": "$image_name", 73 | "MaxMemory": "$mem", 74 | "UserData": "$user_data", 75 | "VolumeSize": 4 76 | } 77 | } 78 | END 79 | 80 | echo "########## user_data ##########" 81 | echo $user_data | base64 -d 82 | echo 83 | echo "########## json ##########" 84 | cat $TMP_FILE 85 | echo 86 | 87 | echo "########## createVm ##########" 88 | http_code=$(curl -s -w "%{http_code}" -d "@$TMP_FILE" -H 'Content-Type: application/json' -X POST http://$XCOMPUTE_HEAD_IP:5000/v1/xcompute/vm -o $OUT_FILE) 89 | if [ $http_code -ne 200 ]; then 90 | echo "parse FAILED; curl = $http_code" 91 | fi 92 | id=`jq -r '.JobId' $OUT_FILE` 93 | echo -e "** OUT: JobId = $id\n" 94 | 95 | for i in {0..59}; do 96 | echo -ne "Waiting for $host... $((i * 10))s\033[0K\r" 97 | http_code=$(curl -s -w "%{http_code}" -X GET http://$XCOMPUTE_HEAD_IP:5000/v1/xcompute/vm/$host?detailedInfo=true -o $OUT_FILE) 98 | echo 99 | jq -r '' $OUT_FILE 100 | if [ $http_code -eq 200 ]; then 101 | echo "NodeName: `jq -r '.NodeName' $OUT_FILE`" 102 | echo "Controller: `jq -r '.Controller.NodeName' $OUT_FILE`" 103 | echo "Controller IP: `jq -r '.Controller.PrivateIpAddress' $OUT_FILE`" 104 | echo "Vm IP: `jq -r '.Vm.PrivateIpAddress' $OUT_FILE`" 105 | if [ "$(jq '.Vm | has("PrivateIpAddress")' $OUT_FILE)" == "true" ]; then 106 | echo "########## done ##########" 107 | break 108 | fi 109 | echo 110 | fi 111 | sleep 10 112 | done 113 | 114 | rm -f $TMP_FILE $OUT_FILE 115 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/xspot-vm_custom_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | ############################################################################### 3 | ## Copyright (c) 2024 Exostellar Inc. All rights reserved. 
## 4 | ## Email: support@exostellar.io ## 5 | ############################################################################### 6 | 7 | # Custom user data script 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/xspot-vm_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | ############################################################################### 3 | ## Copyright (c) 2024 Exostellar Inc. All rights reserved. ## 4 | ## Email: support@exostellar.io ## 5 | ############################################################################### 6 | 7 | # Do not edit this file 8 | # If you need to customize the VM image add your customizations to xspot-vm_custom_user_data.sh. 9 | 10 | # Patch eth0 configuration to remove workers IP address 11 | if ip addr show dev eth0 | grep -q 'global secondary noprefixroute eth0'; then 12 | worker_ip_address=$(ip addr show dev eth0 | grep 'global noprefixroute eth0' | awk '{print $2}') 13 | vm_ip_address=$(ip addr show dev eth0 | grep 'global secondary noprefixroute eth0' | awk '{print $2}') 14 | echo "Found 2 ip addresses for eth0" 15 | ip addr show dev eth0 16 | echo "Worker ip address: $worker_ip_address" 17 | echo "VM ip address: $vm_ip_address" 18 | echo "Deleting worker IP address from eth0 configuration" 19 | ip addr del $worker_ip_address dev eth0 20 | else 21 | echo "No secondary IP address found" 22 | fi 23 | 24 | systemctl stop amazon-ssm-agent || true 25 | systemctl disable amazon-ssm-agent || true 26 | 27 | if ! [[ -d /opt/slurm ]]; then 28 | mkdir /opt/slurm 29 | fi 30 | if ! mountpoint /opt/slurm; then 31 | mount -t nfs -o defaults head_node.{{ cluster_name }}.pcluster:/opt/slurm /opt/slurm 32 | fi 33 | 34 | if ! [[ -d /opt/parallelcluster/shared ]]; then 35 | mkdir -p /opt/parallelcluster/shared 36 | fi 37 | if ! mountpoint /opt/parallelcluster/shared; then 38 | mount -t nfs -o defaults head_node.{{ cluster_name }}.pcluster:/opt/parallelcluster/shared /opt/parallelcluster/shared 39 | fi 40 | 41 | if ! [[ -d /opt/intel ]]; then 42 | mkdir -p /opt/intel 43 | fi 44 | if ! mountpoint /opt/intel; then 45 | mount -t nfs -o defaults head_node.{{ cluster_name }}.pcluster:/opt/intel /opt/intel 46 | fi 47 | 48 | {% for mount_config in xio_config.ExtraMounts %} 49 | if ! [[ -d "{{ mount_config.dest }}" ]]; then 50 | mkdir -p "{{ mount_config.dest }}" 51 | fi 52 | mount -t "{{ mount_config.type }}" -o "{{ mount_config.options }}" "{{ mount_config.src }}" "{{ mount_config.dest }}" 53 | 54 | {% endfor %} 55 | if [[ -e /opt/slurm/config/users_groups.json ]]; then 56 | /opt/slurm/config/bin/create_users_groups.py -i /opt/slurm/config/users_groups.json 57 | fi 58 | 59 | if ! [[ -e /var/log/parallelcluster ]]; then 60 | mkdir -p /var/log/parallelcluster 61 | chmod 0777 /var/log/parallelcluster 62 | fi 63 | 64 | if [[ -e /opt/slurm/etc/exostellar/custom_xio_user_data.sh ]]; then 65 | /opt/slurm/etc/exostellar/custom_xio_user_data.sh 66 | fi 67 | 68 | if ! [[ -e /etc/profile.d/slurm.sh ]]; then 69 | cat <<'EOF' > /etc/profile.d/slurm.sh 70 | PATH=$PATH:/opt/slurm/bin 71 | MANPATH=$MANPATH:/opt/slurm/share/man 72 | 73 | export PATH MANPATH 74 | EOF 75 | fi 76 | source /etc/profile.d/slurm.sh 77 | 78 | if !
[[ -e /etc/profile.d/slurm.csh ]]; then 79 | cat <<'EOF' > /etc/profile.d/slurm.csh 80 | set path = ($path /opt/slurm/bin) 81 | if ( ${?MANPATH} ) then 82 | setenv MANPATH ${MANPATH}:/opt/slurm/share/man 83 | else 84 | setenv MANPATH :/opt/slurm/share/man 85 | endif 86 | EOF 87 | fi 88 | 89 | cat <<'EOF' > /etc/sysconfig/slurmd 90 | SLURMD_OPTIONS='-N XSPOT_NODENAME' 91 | EOF 92 | 93 | hostnamectl set-hostname XSPOT_NODENAME 94 | 95 | echo XSPOT_NODENAME > /var/run/nodename 96 | 97 | scontrol update nodename=XSPOT_NODENAME nodeaddr=$(hostname -I) 98 | 99 | systemctl start slurmd 100 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_infrastructure_optimizer/templates/opt/slurm/etc/exostellar/xspot.slurm.conf: -------------------------------------------------------------------------------- 1 | 2 | ResumeProgram=/opt/slurm/{{ cluster_name }}/etc/exostellar/resume_xspot.sh 3 | #ResumeRate=100 4 | #ResumeTimeout=600 5 | SuspendProgram=/opt/slurm/{{ cluster_name }}/etc/exostellar/suspend_xspot.sh 6 | #SuspendRate=100 7 | #SuspendTime=350 8 | #TreeWidth=60000 9 | 10 | {% set ns = namespace(node_names = []) %} 11 | {% for pool_config in xio_config.Pools %} 12 | {% set node_name = 'xspot-vm-' + pool_config.PoolName + '-[1-' + pool_config.PoolSize|string + ']' %} 13 | {% set ns.node_names = ns.node_names + [node_name] %} 14 | NodeName={{ node_name }} CPUs={{ pool_config.CPUs }} RealMemory={{ pool_config.MaxMemory }} State=CLOUD Feature=xio,{{ pool_config.PoolName }},{{ pool_config.ProfileName }} Weight={{ pool_config.Weight }} 15 | 16 | PartitionName=xio-{{ pool_config.PoolName }} MaxTime=INFINITE State=Up Nodes={{ node_name }} 17 | 18 | {% endfor %} 19 | 20 | PartitionName=xio MaxTime=INFINITE State=UP \ 21 | Nodes=\ 22 | {{ ns.node_names|join(',\\\n') }} 23 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_workload_optimizer/templates/opt/slurm/etc/exostellar/suspend_xspot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ############################################################################### 3 | ## Copyright (c) 2024 Exostellar Inc. All rights reserved. ## 4 | ## Email: support@exostellar.io ## 5 | ############################################################################### 6 | 7 | SLURM_AWS_LOG=/var/log/slurm/power_save.log 8 | 9 | exec 1> >(logger -s -t suspend_xspot.sh) 2>&1 10 | 11 | set -x 12 | set -e 13 | 14 | # NOTE: ExostellarRootCA.crt and ExostellarClient.pem are now required.
15 | CERT_PATH=/etc/ssl/certs/ 16 | 17 | SLURM_BIN_PATH="/opt/slurm/{{ cluster_name }}/bin" 18 | SLURM_CONF_PATH="/opt/slurm/{{ cluster_name }}/etc" 19 | 20 | XCOMPUTE_HEAD_IP={{ xwo_mgt_ip }} 21 | 22 | function suspend_xspot() 23 | { 24 | hostname=$1 25 | 26 | echo "xspot suspending $hostname" 27 | 28 | mapfile -t curl_out < <(curl -s -w "%{http_code}" -H 'Content-Type: application/json' -X DELETE --cacert $CERT_PATH/ExostellarRootCA.crt --cert $CERT_PATH/ExostellarClient.pem https://ExostellarHeadNode/v1/xcompute/vm/$hostname --resolve ExostellarHeadNode:443:$XCOMPUTE_HEAD_IP -o /dev/null) 29 | echo "$curl_out" 30 | http_code=${curl_out[-1]} 31 | echo "http_code=$http_code" 32 | if [ $http_code -ne 200 ]; then 33 | echo "`date` Suspend $hostname FAILED; curl = $http_code" >> $SLURM_AWS_LOG 34 | fi 35 | } 36 | 37 | echo "`date` Suspend invoked $0 $*" >> $SLURM_AWS_LOG 38 | 39 | {% raw -%} 40 | hostnames=$(${SLURM_BIN_PATH}/scontrol show hostnames $1) 41 | xspot_hostnames=( ) 42 | pc_hostnames=( ) 43 | for hostname in $hostnames 44 | do 45 | if [[ "$hostname" == "xspot-vm"* ]] || [[ "$hostname" == "xio"* ]] || [[ "$hostname" == "xwo"* ]]; then 46 | xspot_hostnames+=( $hostname ) 47 | else 48 | pc_hostnames+=( $hostname ) 49 | fi 50 | done 51 | 52 | if [[ ${#pc_hostnames[@]} -gt 0 ]]; then 53 | pc_hostlist=$(${SLURM_BIN_PATH}/scontrol show hostlistsorted $(IFS=,; echo "${pc_hostnames[*]}")) 54 | echo "ParallelCluster suspending $pc_hostlist" 55 | /opt/parallelcluster/scripts/slurm/slurm_suspend $pc_hostlist 56 | fi 57 | 58 | if [[ ${#xspot_hostnames[@]} -gt 0 ]]; then 59 | for hostname in ${xspot_hostnames[@]}; do 60 | suspend_xspot $hostname 61 | done 62 | fi 63 | {% endraw %} 64 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_workload_optimizer/templates/opt/slurm/etc/exostellar/xspot-vm_custom_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | ############################################################################### 3 | ## Copyright (c) 2024 Exostellar Inc. All rights reserved. 
## 4 | ## Email: support@exostellar.io ## 5 | ############################################################################### 6 | 7 | # Custom user data script 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/exostellar_workload_optimizer/templates/opt/slurm/etc/exostellar/xspot.slurm.conf: -------------------------------------------------------------------------------- 1 | 2 | ResumeProgram=/opt/slurm/{{ cluster_name }}/etc/exostellar/resume_xspot.sh 3 | SuspendProgram=/opt/slurm/{{ cluster_name }}/etc/exostellar/suspend_xspot.sh 4 | 5 | {% set ns = namespace(node_names = []) %} 6 | {% for pool_name, pool_config in xwo_config.Pools.items() %} 7 | {% set node_name = 'xwo-' + pool_name + '-[1-' + pool_config.PoolSize|string + ']' %} 8 | {% set ns.node_names = ns.node_names + [node_name] %} 9 | NodeName={{ node_name }} CPUs={{ pool_config.CPUs }} RealMemory={{ pool_config.MaxMemory }} State=CLOUD Feature=xwo,{{ pool_name }},{{ pool_config.ProfileName }} Weight={{ pool_config.Weight }} 10 | 11 | PartitionName=xwo-{{ pool_name }} MaxTime=INFINITE State=Up Nodes={{ node_name }} 12 | 13 | {% endfor %} 14 | 15 | PartitionName=xwo MaxTime=INFINITE State=UP \ 16 | Nodes=\ 17 | {{ ns.node_names|join(',\\\n') }} 18 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/install-rootless-docker/README.md: -------------------------------------------------------------------------------- 1 | install-rootless-docker 2 | ========= 3 | 4 | Install rootless docker for use by OCI containers. 5 | 6 | License 7 | ------- 8 | 9 | mit0 10 | 11 | Author Information 12 | ------------------ 13 | 14 | Allan Carter (cartalla@amazon.com) 15 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/install-rootless-docker/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for install-rootless-docker 3 | 4 | - name: Remove Docker packages 5 | yum: 6 | state: removed 7 | name: 8 | - docker 9 | - docker-client 10 | - docker-client-latest 11 | - docker-common 12 | - docker-latest 13 | - docker-latest-logrotate 14 | - docker-logrotate 15 | - docker-engine 16 | - podman 17 | - runc 18 | 19 | - name: Install dnf-plugins-core 20 | when: (rhel8 or rhel8clone or rhel9 or rhel9clone) 21 | yum: 22 | state: present 23 | name: 24 | - dnf-plugins-core 25 | 26 | - name: Set up Docker repository 27 | shell: 28 | cmd: | 29 | dnf config-manager --add-repo https://download.docker.com/linux/rhel/docker-ce.repo 30 | creates: /etc/yum.repos.d/docker-ce.repo 31 | 32 | - name: Install docker packages 33 | yum: 34 | state: present 35 | name: 36 | - containerd.io 37 | - docker-buildx-plugin 38 | - docker-ce 39 | - docker-ce-cli 40 | - docker-ce-rootless-extras 41 | - docker-compose-plugin 42 | - fuse-overlayfs 43 | - iptables 44 | - runc 45 | - shadow-utils 46 | 47 | - name: Disable docker.service 48 | systemd_service: 49 | name: docker.service 50 | enabled: false 51 | state: stopped 52 | 53 | - name: Disable docker.socket 54 | systemd_service: 55 | name: docker.socket 56 | enabled: false 57 | state: stopped 58 | 59 | - name: Remove /var/run/docker.sock 60 | file: 61 | path: /var/run/docker.sock 62 | state: absent 63 | 64 | - name: Load ip_tables kernel module 65 | community.general.modprobe: 66 | name: ip_tables 67 | state: present 68 | persistent: present 69 | 
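A minimal playbook sketch for the two rootless-docker roles above (illustrative only; the host group name is an assumption, not a file from this repository). The install role needs root to install the Docker packages and disable the system daemon, while the configure role runs dockerd-rootless-setuptool.sh as the target user:

    ---
    # Hypothetical usage: install rootless docker system-wide, then configure it per user.
    - hosts: external_login_nodes
      become: yes
      roles:
        - install-rootless-docker

    - hosts: external_login_nodes
      become: no   # dockerd-rootless-setuptool.sh must run as the unprivileged user
      roles:
        - configure-rootless-docker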
-------------------------------------------------------------------------------- /source/resources/playbooks/roles/install_slurm/README.md: -------------------------------------------------------------------------------- 1 | install_slurm 2 | ========= 3 | 4 | Compile and install slurm on the ParallelCluster NFS share using the external login node host if the external login node's OS is different from the cluster's. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Requires root permissions so that it can install the packages required by slurm. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/install_slurm/templates/opt/slurm/config/modules/modulefiles/slurm/.template: -------------------------------------------------------------------------------- 1 | ../../../../../../../../ParallelClusterHeadNode/templates/opt/slurm/modules/modulefiles/slurm/.template -------------------------------------------------------------------------------- /source/resources/playbooks/roles/install_slurm/templates/opt/slurm/config/modules/modulefiles/slurm/.version: -------------------------------------------------------------------------------- 1 | ../../../../../../../../ParallelClusterHeadNode/templates/opt/slurm/modules/modulefiles/slurm/.version -------------------------------------------------------------------------------- /source/resources/playbooks/roles/install_vscode/README.md: -------------------------------------------------------------------------------- 1 | install_vscode 2 | ========= 3 | 4 | Install vscode 5 | 6 | Requirements 7 | ------------ 8 | 9 | Requires root permissions so that it can install the repo and package 10 | 11 | Role Variables 12 | -------------- 13 | 14 | None 15 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/install_vscode/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for install_vscode 3 | 4 | - name: Show variables used by this role 5 | debug: 6 | msg: | 7 | ansible_architecture: {{ ansible_architecture }} 8 | architecture: {{ architecture }} 9 | distribution: {{ distribution }} 10 | distribution_major_version: {{ distribution_major_version }} 11 | distribution_version: {{ distribution_version }} 12 | 13 | amazonlinux2: {{ amazonlinux2 }} 14 | alma: {{ alma }} 15 | alma8: {{ alma8 }} 16 | centos: {{ centos }} 17 | centos7: {{ centos7 }} 18 | rhel: {{ rhel }} 19 | rhel7: {{ rhel7 }} 20 | rhel8: {{ rhel8 }} 21 | rhel9: {{ rhel9 }} 22 | rocky: {{ rocky }} 23 | rocky8: {{ rocky8 }} 24 | rocky9: {{ rocky9 }} 25 | rhelclone: {{ rhelclone }} 26 | rhel8clone: {{ rhel8clone }} 27 | rhel9clone: {{ rhel9clone }} 28 | 29 | - name: Install vscode key and yum repository 30 | shell: 31 | cmd: | 32 | rpm --import https://packages.microsoft.com/keys/microsoft.asc 33 | echo -e "[code]\nname=Visual Studio Code\nbaseurl=https://packages.microsoft.com/yumrepos/vscode\nenabled=1\ngpgcheck=1\ngpgkey=https://packages.microsoft.com/keys/microsoft.asc" | sudo tee /etc/yum.repos.d/vscode.repo > /dev/null
34 | args: 35 | creates: "/etc/yum.repos.d/vscode.repo" 36 | 37 | - name: Update package cache and install vscode 38 | shell: 39 | cmd: | 40 | dnf check-update || yum check-update 41 | dnf -y install code || yum -y install code 42 | args: 43 | creates: "/bin/code" 44 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/lustre-client/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Tasks to install lustre-client 3 | 4 | - name: Check if lustre-client installed 5 | register: lustre_client 6 | shell: 7 | cmd: | 8 | set -ex 9 | if yum list installed kmod-lustre-client lustre-client &> /dev/null; then 10 | echo "installed" 11 | else 12 | echo "needs_install" 13 | fi 14 | - name: Set lustre_client_installed 15 | set_fact: 16 | lustre_client_installed: "{{ lustre_client['stdout_lines'][0] == 'installed' }}" 17 | 18 | - name: Show variables 19 | debug: 20 | msg: | 21 | lustre_client_installed: {{ lustre_client_installed }} 22 | amazonlinux2: {{ amazonlinux2 }} 23 | centos7_5_to_6: {{ centos7_5_to_6 }} 24 | centos7_5_to_9: {{ centos7_5_to_9 }} 25 | centos7_7_to_9: {{ centos7_7_to_9 }} 26 | kernel: {{ kernel }} 27 | 28 | - name: Install lustre client on Amazon Linux 2 29 | when: not lustre_client_installed and amazonlinux2 30 | shell: 31 | cmd: | 32 | set -ex 33 | amazon-linux-extras install -y lustre2.10 34 | 35 | - name: Install lustre rpm public key 36 | when: centos7_5_to_9 37 | shell: 38 | creates: /tmp/fsx-rpm-public-key.asc 39 | cmd: | 40 | set -ex 41 | curl https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -o /tmp/fsx-rpm-public-key.asc 42 | sudo rpm --import /tmp/fsx-rpm-public-key.asc 43 | 44 | - name: Install lustre client on CentOS or RedHat 7.5 or 7.6 kernel 3.10.0-862.* 45 | when: not lustre_client_installed and centos7_5_to_6 and kernel is match('3\.10\.0\-862\.*') 46 | shell: 47 | cmd: | 48 | set -ex 49 | yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.5/el7/client/RPMS/x86_64/kmod-lustre-client-2.10.5-1.el7.x86_64.rpm 50 | yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.5/el7/client/RPMS/x86_64/lustre-client-2.10.5-1.el7.x86_64.rpm 51 | 52 | - name: Install lustre client on CentOS or RedHat 7.5 or 7.6 kernel 3.10.0-957.* 53 | when: not lustre_client_installed and centos7_5_to_6 and kernel is match('3\.10\.0\-957\.*') 54 | shell: 55 | cmd: | 56 | set -ex 57 | yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.8/el7/client/RPMS/x86_64/kmod-lustre-client-2.10.8-1.el7.x86_64.rpm 58 | yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.8/el7/client/RPMS/x86_64/lustre-client-2.10.8-1.el7.x86_64.rpm 59 | 60 | - name: Install lustre repo on CentOS or RedHat 7.7-9 61 | when: centos7_7_to_9 62 | shell: 63 | creates: /etc/yum.repos.d/aws-fsx.repo 64 | cmd: | 65 | set -ex 66 | curl https://fsx-lustre-client-repo.s3.amazonaws.com/el/7/fsx-lustre-client.repo -o /etc/yum.repos.d/aws-fsx.repo 67 | 68 | - name: Install kernel-3.10.0-1160.* required by lustre 69 | when: centos7_7_to_9 and not kernel is match('3\.10\.0\-1160\.*') 70 | register: kernel_update 71 | yum: 72 | state: present 73 | name: 74 | - kernel-3.10.0-1160.* 75 | 76 | - name: Reboot after kernel update 77 | when: kernel_update['changed'] 78 | reboot: 79 | 80 | - name: Install lustre client on CentOS or RedHat 7.7-9 81 | when: not lustre_client_installed and centos7_7_to_9 and kernel is 
match('3\.10\.0\-1160\.*') 82 | yum: 83 | state: present 84 | name: 85 | - kmod-lustre-client 86 | - lustre-client 87 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/mount_extra_fs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Mount extra file systems 3 | 4 | - name: Print vars 5 | debug: 6 | msg: | 7 | extra_mounts: {{ extra_mounts }} 8 | 9 | - name: Install nfs-utils 10 | yum: 11 | state: present 12 | name: 13 | - nfs-utils 14 | 15 | - name: Mount extra_mounts 16 | when: extra_mounts|length > 0 17 | loop: "{{ extra_mounts }}" 18 | mount: 19 | backup: yes 20 | fstype: "{{ item.type }}" 21 | opts: "{{ item.options }}" 22 | path: "{{ item.dest }}" 23 | src: "{{ item.src }}" 24 | state: mounted 25 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/mount_slurm_fs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Mount SLURM file system 3 | 4 | - name: Print vars 5 | debug: 6 | msg: | 7 | file_system_dns: {{ file_system_dns }} 8 | file_system_mount_path: {{ file_system_mount_path }} 9 | file_system_mount_src: {{ file_system_mount_src }} 10 | file_system_options: {{ file_system_options }} 11 | file_system_port: {{ file_system_port }} 12 | file_system_type: {{ file_system_type }} 13 | extra_mounts: {{ extra_mounts }} 14 | 15 | - name: Install nfs-utils 16 | yum: 17 | state: present 18 | name: 19 | - nfs-utils 20 | 21 | - name: Create {{ file_system_mount_path }} 22 | file: 23 | path: "{{ file_system_mount_path }}" 24 | state: directory 25 | owner: root 26 | group: root 27 | mode: 0755 28 | 29 | - name: Wait for file system dns to exist 30 | wait_for: 31 | host: "{{ file_system_dns }}" 32 | port: "{{ file_system_port }}" 33 | timeout: 1800 # 30 minutes 34 | 35 | - name: Mount SLURM file system 36 | mount: 37 | backup: yes 38 | fstype: "{{ file_system_type }}" 39 | opts: "{{ file_system_options }}" 40 | path: "{{ file_system_mount_path }}" 41 | src: "{{ file_system_mount_src }}" 42 | state: mounted 43 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/security_updates/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 
20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 39 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/security_updates/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for security_updates 3 | 4 | - name: Install Security updates 5 | yum: 6 | disablerepo: "{{ yum_disablerepo|default(omit) }}" 7 | security: yes 8 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/security_updates/tests/inventory: -------------------------------------------------------------------------------- 1 | localhost 2 | 3 | -------------------------------------------------------------------------------- /source/resources/playbooks/roles/security_updates/tests/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: localhost 3 | remote_user: root 4 | roles: 5 | - security_updates -------------------------------------------------------------------------------- /source/resources/playbooks/roles/unmount_slurm_fs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Unmount SLURM file system 3 | 4 | - name: Print vars 5 | debug: 6 | msg: | 7 | file_system_dns: {{ file_system_dns }} 8 | file_system_mount_path: {{ file_system_mount_path }} 9 | file_system_mount_src: {{ file_system_mount_src }} 10 | file_system_options: {{ file_system_options }} 11 | file_system_port: {{ file_system_port }} 12 | file_system_type: {{ file_system_type }} 13 | 14 | - name: Unmount SLURM file system 15 | mount: 16 | fstype: "{{ file_system_type }}" 17 | path: "{{ file_system_mount_path }}" 18 | state: absent 19 | 20 | # For some reason the ansible module is leaving the file system mounted 21 | - name: Unmount SLURM file system 22 | shell: | 23 | umount -f {{ file_system_mount_path }} 24 | 25 | - name: Remove {{ file_system_mount_path }} 26 | file: 27 | state: absent 28 | path: "{{ file_system_mount_path }}" 29 | -------------------------------------------------------------------------------- /source/resources/playbooks/security_updates.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install yum security updates 3 | hosts: 4 | - security_updates 5 | become_user: root 6 | become: yes 7 | roles: 8 | - security_updates 9 | -------------------------------------------------------------------------------- /source/resources/user_data/slurm_node_ami_user_data.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | # The environment variables are set before this part of the user_data runs. 
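# The commands below rely on AWS_DEFAULT_REGION, STACK_NAME, and CONFIG_SCRIPT_PATH in particular.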
5 | 6 | script=$(readlink -f $0) 7 | script_name=$(basename $script) 8 | 9 | # Tag EBS disks manually 10 | AWS_AVAIL_ZONE=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) 11 | AWS_INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) 12 | EBS_IDS=$(aws ec2 describe-volumes --filters Name=attachment.instance-id,Values="$AWS_INSTANCE_ID" --region $AWS_DEFAULT_REGION --query "Volumes[*].[VolumeId]" --out text | tr "\n" " ") 13 | aws ec2 create-tags --resources $EBS_IDS --region $AWS_DEFAULT_REGION --tags Key=Name,Value="${STACK_NAME} SlurmNodeAMI Root Disk" 14 | 15 | # Tag Network Adapter 16 | ENI_IDS=$(aws ec2 describe-network-interfaces --filters Name=attachment.instance-id,Values="$AWS_INSTANCE_ID" --region $AWS_DEFAULT_REGION --query "NetworkInterfaces[*].[NetworkInterfaceId]" --out text | tr "\n" " ") 17 | aws ec2 create-tags --resources $ENI_IDS --region $AWS_DEFAULT_REGION --tags Key=Name,Value="${STACK_NAME} SlurmNodeAMI Network Adapter" 18 | 19 | chmod +x $CONFIG_SCRIPT_PATH 20 | $CONFIG_SCRIPT_PATH 21 | -------------------------------------------------------------------------------- /source/resources/user_data/slurm_node_ami_user_data_on_exit.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | if [[ $exitCode -ne 0 ]] && [[ ":{{ERROR_SNS_TOPIC_ARN}}" != ":" ]]; then 5 | instance_id=$(curl --silent http://169.254.169.254/latest/meta-data/instance-id) 6 | msg_file=$(mktemp) 7 | echo -e "\nINSTANCE NAME: $INSTANCE_NAME" > $msg_file 8 | echo -e "\nINSTANCE ID: $instance_id" >> $msg_file 9 | echo -e "\ngrep cloud-init /var/log/messages | tail -n 200:\n\n" >> $msg_file 10 | grep cloud-init /var/log/messages |tail -n 200 >> $msg_file 11 | if [ -e /var/log/cloud-init.log ]; then 12 | echo -e "\n\n\ntail -n 200 /var/log/cloud-init.log:\n\n" >> $msg_file 13 | tail -n 200 /var/log/cloud-init.log >> $msg_file 14 | fi 15 | # --subject is limited to 100 characters 16 | aws sns publish --region {{AWS_DEFAULT_REGION}} --topic-arn {{ERROR_SNS_TOPIC_ARN}} --subject "$instance_id UserData failed" --message "file://$msg_file" 17 | rm $msg_file 18 | fi 19 | 20 | if ! needs-restarting -r; then 21 | reboot 22 | fi 23 | -------------------------------------------------------------------------------- /source/resources/user_data/slurm_node_ami_user_data_prolog.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | # This is the first thing to run in the user_data 5 | 6 | script=$(readlink -f $0) 7 | script_name=$(basename $script) 8 | 9 | # Rerun after reboot 10 | # We want the user_data to be run every time the instance boots so that all of the latest S3 assets and other configuration is downloaded. 11 | # But this is only for the AMI instances. 
12 | # Check the "role" tag and delete the rerun script if not "slurm_node_ami" 13 | old_rerun_script=/var/lib/cloud/scripts/per-boot/10_user_data 14 | rm -f $old_rerun_script 15 | rerun_script=/var/lib/cloud/scripts/per-boot/10_slurm_node_ami_user_data 16 | AWS_INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) 17 | node_type=$(aws ec2 describe-tags --filters '[{"Name":"resource-id","Values":["'$AWS_INSTANCE_ID'"]},{"Name":"tag:NodeType","Values":["*"]}]' --query 'Tags[0].Value' --output text) 18 | if [[ $node_type == 'slurm_node_ami' ]]; then 19 | ln -sf $script $rerun_script 20 | else 21 | trap - EXIT 22 | rm -f $rerun_script 23 | exit 0 24 | fi 25 | -------------------------------------------------------------------------------- /source/resources/user_data/slurmctl_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | source /etc/profile.d/instance_vars.sh 6 | 7 | function on_exit { 8 | rc=$? 9 | set +e 10 | 11 | if [[ $rc -ne 0 ]] && [[ ":$ERROR_SNS_TOPIC_ARN" != ":" ]]; then 12 | aws sns publish --region $AWS_DEFAULT_REGION --topic-arn $ERROR_SNS_TOPIC_ARN --subject "$INSTANCE_NAME slurmctl_config.sh failed" --message "See /var/log/cloud-init.log or grep cloud-init /var/log/messages | less for more info." 13 | fi 14 | 15 | # Make sure that security patches that require a reboot are applied 16 | if ! needs-restarting -r; then 17 | reboot 18 | fi 19 | } 20 | trap on_exit EXIT 21 | 22 | # Install security updates first. 23 | # Since this is Amazon Linux 2 don't need to configure proxy because yum repos are in S3. 24 | # Disable epel because it isn't in S3 and requires configuration. 25 | yum -y update --security --bugfix --skip-broken 26 | 27 | # Update to latest ssm agent 28 | if yum install -y https://s3.$AWS_DEFAULT_REGION.amazonaws.com/amazon-ssm-$AWS_DEFAULT_REGION/latest/linux_amd64/amazon-ssm-agent.rpm; then 29 | systemctl restart amazon-ssm-agent 30 | fi 31 | 32 | # Configure using ansible 33 | if ! yum list installed ansible &> /dev/null; then 34 | amazon-linux-extras install -y ansible2 35 | fi 36 | 37 | hostnamectl set-hostname --static $SlurmCtlHostname.$Domain 38 | 39 | PLAYBOOKS_PATH=/root/playbooks 40 | if [ -e $PLAYBOOKS_ZIP_PATH ]; then 41 | rm -rf $PLAYBOOKS_PATH 42 | mkdir -p $PLAYBOOKS_PATH 43 | pushd $PLAYBOOKS_PATH 44 | unzip -q $PLAYBOOKS_ZIP_PATH 45 | rm $PLAYBOOKS_ZIP_PATH 46 | popd 47 | fi 48 | 49 | pushd $PLAYBOOKS_PATH 50 | ansible-playbook $PLAYBOOKS_PATH/SlurmCtl.yml \ 51 | -i inventories/local.yml \ 52 | -e @/root/ansible_extra_vars.yml 53 | popd 54 | 55 | # Disable automatic motd update 56 | /usr/sbin/update-motd --disable 57 | rm -f /etc/cron.d/update-motd 58 | rm -f /etc/update-motd.d/* 59 | 60 | # Set up motd 61 | if ! yum list installed figlet &> /dev/null; then 62 | yum install -y figlet 63 | fi 64 | figlet -f slant "SlurmCtl" > /etc/motd 65 | echo -e "Stack Name: ${STACK_NAME} 66 | " >> /etc/motd 67 | -------------------------------------------------------------------------------- /source/resources/user_data/slurmctl_user_data.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | # Rerun after reboot 5 | script=$(readlink -f $0) 6 | script_name=$(basename $script) 7 | rerun_script=/var/lib/cloud/scripts/per-boot/10_$script_name 8 | ln -sf $script $rerun_script 9 | 10 | # Tag EBS disks manually as CFN ASG does not support it 11 | AWS_AVAIL_ZONE=$(curl http://169.254.169.254/latest/meta-data/placement/availability-zone) 12 | AWS_INSTANCE_ID=$(curl http://169.254.169.254/latest/meta-data/instance-id) 13 | EBS_IDS=$(aws ec2 describe-volumes --filters Name=attachment.instance-id,Values="$AWS_INSTANCE_ID" --region $AWS_DEFAULT_REGION --query "Volumes[*].[VolumeId]" --out text | tr "\n" " ") 14 | aws ec2 create-tags --resources $EBS_IDS --region $AWS_DEFAULT_REGION --tags Key=Name,Value="${STACK_NAME} SlurmCtl Root Disk" 15 | 16 | # Tag Network Adapter for the Proxy 17 | ENI_IDS=$(aws ec2 describe-network-interfaces --filters Name=attachment.instance-id,Values="$AWS_INSTANCE_ID" --region $AWS_DEFAULT_REGION --query "NetworkInterfaces[*].[NetworkInterfaceId]" --out text | tr "\n" " ") 18 | aws ec2 create-tags --resources $ENI_IDS --region $AWS_DEFAULT_REGION --tags Key=Name,Value="${STACK_NAME} SlurmCtl Network Adapter" 19 | 20 | chmod +x $CONFIG_SCRIPT_PATH 21 | $CONFIG_SCRIPT_PATH 22 | -------------------------------------------------------------------------------- /source/resources/user_data/slurmctl_user_data_on_exit.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | if [[ $exitCode -ne 0 ]] && [[ ":{{ERROR_SNS_TOPIC_ARN}}" != ":" ]]; then 5 | instance_id=$(curl --silent http://169.254.169.254/latest/meta-data/instance-id) 6 | msg_file=$(mktemp) 7 | echo -e "grep cloud-init /var/log/messages | tail -n 200:\n\n" > $msg_file 8 | grep cloud-init /var/log/messages |tail -n 200 >> $msg_file 9 | if [ -e /var/log/cloud-init.log ]; then 10 | echo -e "\n\n\ntail -n 200 /var/log/cloud-init.log:\n\n" >> $msg_file 11 | tail -n 200 /var/log/cloud-init.log >> $msg_file 12 | fi 13 | aws sns publish --region {{AWS_DEFAULT_REGION}} --topic-arn {{ERROR_SNS_TOPIC_ARN}} --subject "$INSTANCE_NAME($instance_id) UserData failed" --message "file://$msg_file" 14 | rm $msg_file 15 | fi 16 | 17 | if ! needs-restarting -r; then 18 | reboot 19 | fi 20 | -------------------------------------------------------------------------------- /source/resources/user_data/slurmdbd_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | 5 | source /etc/profile.d/instance_vars.sh 6 | 7 | function on_exit { 8 | rc=$? 9 | set +e 10 | 11 | if [[ $rc -ne 0 ]] && [[ ":$SOCA_ERROR_SNS_TOPIC_ARN" != ":" ]]; then 12 | aws sns publish --region $AWS_DEFAULT_REGION --topic-arn $SOCA_ERROR_SNS_TOPIC_ARN --subject "$INSTANCE_NAME slurmdb_config.sh failed" --message "See /var/log/cloud-init.log or grep cloud-init /var/log/messages | less for more info." 13 | fi 14 | 15 | # Make sure that security patches that require a reboot are applied 16 | if ! needs-restarting -r; then 17 | reboot 18 | fi 19 | } 20 | trap on_exit EXIT 21 | 22 | # Install security updates first. 23 | # Since this is Amazon Linux 2 don't need to configure proxy because yum repos are in S3. 24 | # Disable epel because it isn't in S3 and requires configuration. 
25 | yum -y update --security --bugfix --skip-broken 26 | 27 | # Update to latest ssm agent 28 | if yum install -y https://s3.$AWS_DEFAULT_REGION.amazonaws.com/amazon-ssm-$AWS_DEFAULT_REGION/latest/linux_amd64/amazon-ssm-agent.rpm; then 29 | systemctl restart amazon-ssm-agent 30 | fi 31 | 32 | # Configure using ansible 33 | if ! yum list installed ansible &> /dev/null; then 34 | amazon-linux-extras install -y ansible2 35 | fi 36 | 37 | hostnamectl set-hostname --static $SlurmDbdHostname.$Domain 38 | 39 | PLAYBOOKS_PATH=/root/playbooks 40 | if [ -e $PLAYBOOKS_ZIP_PATH ]; then 41 | rm -rf $PLAYBOOKS_PATH 42 | mkdir -p $PLAYBOOKS_PATH 43 | pushd $PLAYBOOKS_PATH 44 | unzip -q $PLAYBOOKS_ZIP_PATH 45 | rm $PLAYBOOKS_ZIP_PATH 46 | popd 47 | fi 48 | 49 | pushd $PLAYBOOKS_PATH 50 | ansible-playbook $PLAYBOOKS_PATH/SlurmDbd.yml \ 51 | -i inventories/local.yml \ 52 | -e @/root/ansible_extra_vars.yml 53 | popd 54 | 55 | # Disable automatic motd update 56 | /usr/sbin/update-motd --disable 57 | rm -f /etc/cron.d/update-motd 58 | rm -f /etc/update-motd.d/* 59 | 60 | # Set up motd 61 | figlet -f slant "SLURM DB" > /etc/motd 62 | echo -e "Stack Name: ${STACK_NAME} 63 | " >> /etc/motd 64 | -------------------------------------------------------------------------------- /source/resources/user_data/slurmdbd_user_data.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | # Rerun after reboot 5 | script=$(readlink -f $0) 6 | script_name=$(basename $script) 7 | rerun_script=/var/lib/cloud/scripts/per-boot/10_$script_name 8 | ln -sf $script $rerun_script 9 | 10 | # Tag EBS disks manually as CFN ASG does not support it 11 | AWS_AVAIL_ZONE=$(curl http://169.254.169.254/latest/meta-data/placement/availability-zone) 12 | AWS_REGION="`echo \"$AWS_AVAIL_ZONE\" | sed "s/[a-z]$//"`" 13 | AWS_INSTANCE_ID=$(curl http://169.254.169.254/latest/meta-data/instance-id) 14 | EBS_IDS=$(aws ec2 describe-volumes --filters Name=attachment.instance-id,Values="$AWS_INSTANCE_ID" --region $AWS_DEFAULT_REGION --query "Volumes[*].[VolumeId]" --out text | tr "\n" " ") 15 | aws ec2 create-tags --resources $EBS_IDS --region $AWS_DEFAULT_REGION --tags Key=Name,Value="${STACK_NAME} SLURM DB Root Disk" 16 | 17 | # Tag Network Adapter for the Proxy 18 | ENI_IDS=$(aws ec2 describe-network-interfaces --filters Name=attachment.instance-id,Values="$AWS_INSTANCE_ID" --region $AWS_DEFAULT_REGION --query "NetworkInterfaces[*].[NetworkInterfaceId]" --out text | tr "\n" " ") 19 | aws ec2 create-tags --resources $ENI_IDS --region $AWS_DEFAULT_REGION --tags Key=Name,Value="${STACK_NAME} SLURM DB Network Adapter" 20 | 21 | chmod +x $CONFIG_SCRIPT_PATH 22 | $CONFIG_SCRIPT_PATH 23 | -------------------------------------------------------------------------------- /source/resources/user_data/slurmdbd_user_data_on_exit.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | if [[ $exitCode -ne 0 ]] && [[ ":{{ERROR_SNS_TOPIC_ARN}}" != ":" ]]; then 5 | instance_id=$(curl --silent http://169.254.169.254/latest/meta-data/instance-id) 6 | msg_file=$(mktemp) 7 | echo -e "grep cloud-init /var/log/messages | tail -n 200:\n\n" > $msg_file 8 | grep cloud-init /var/log/messages |tail -n 200 >> $msg_file 9 | if [ -e /var/log/cloud-init.log ]; then 10 | echo -e "\n\n\ntail -n 200 /var/log/cloud-init.log:\n\n" >> $msg_file 11 | tail -n 200 /var/log/cloud-init.log >> $msg_file 12 | fi 13 | aws sns publish --region {{AWS_DEFAULT_REGION}} --topic-arn {{ERROR_SNS_TOPIC_ARN}} --subject "$INSTANCE_NAME($instance_id) UserData failed" --message "file://$msg_file" 14 | rm $msg_file 15 | fi 16 | 17 | if ! needs-restarting -r; then 18 | reboot 19 | fi 20 | -------------------------------------------------------------------------------- /source/resources/user_data/user_data_bootstrap.sh: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | # Update to latest ssm agent 5 | if [[ $ARCHITECTURE == 'x86_64' ]]; then 6 | amazon_ssm_agent_url=https://s3.$AWS_DEFAULT_REGION.amazonaws.com/amazon-ssm-$AWS_DEFAULT_REGION/latest/linux_amd64/amazon-ssm-agent.rpm 7 | elif [[ $ARCHITECTURE == 'arm64' ]]; then 8 | amazon_ssm_agent_url=https://s3.$AWS_DEFAULT_REGION.amazonaws.com/amazon-ssm-$AWS_DEFAULT_REGION/latest/linux_arm64/amazon-ssm-agent.rpm 9 | fi 10 | if yum install -y $amazon_ssm_agent_url; then 11 | systemctl restart amazon-ssm-agent 12 | else 13 | if ! yum list installed amazon-ssm-agent &> /dev/null; then 14 | echo "error: Could not install amazon-ssm-agent" 15 | exit 1 16 | fi 17 | fi 18 | systemctl enable amazon-ssm-agent || true 19 | 20 | # Install epel-release. Contains ansible 21 | if ! yum list installed epel-release &> /dev/null; then 22 | if [[ $DISTRIBUTION == 'Amazon' ]]; then 23 | amazon-linux-extras install -y epel 24 | else 25 | if [[ $DISTRIBUTION_MAJOR_VERSION == '7' ]]; then 26 | yum -y install epel-release || yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm 27 | elif [[ $DISTRIBUTION_MAJOR_VERSION == '8' ]]; then 28 | yum -y install epel-release || yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm 29 | fi 30 | fi 31 | fi 32 | 33 | # Install ansible 34 | if ! yum list installed ansible &> /dev/null; then 35 | if [[ $DISTRIBUTION == 'Amazon' ]]; then 36 | amazon-linux-extras install -y ansible2 37 | else 38 | yum -y install ansible 39 | fi 40 | fi 41 | 42 | # Install unzip. Used to install awscli 43 | if ! yum list installed unzip &> /dev/null; then 44 | yum -y install unzip 45 | fi 46 | 47 | # Add path to aws cli 48 | export PATH=/usr/local/bin:$PATH 49 | 50 | # Install/update awscli to make sure running version 2 51 | if !
aws --version | grep aws-cli/2; then 52 | pushd /tmp 53 | if yum list installed awscli &> /dev/null; then 54 | yum -y remove awscli 55 | fi 56 | rm -rf /usr/local/aws-cli 57 | rm -f awscliv2.zip 58 | rm -rf aws 59 | if [[ $ARCHITECTURE == 'x86_64' ]]; then 60 | awscli_url=https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip 61 | elif [[ $ARCHITECTURE == 'arm64' ]]; then 62 | awscli_url=https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip 63 | else 64 | echo "error: Unsupported $ARCHITECTURE architecture" 65 | exit 1 66 | fi 67 | curl "$awscli_url" -o "awscliv2.zip" 68 | unzip -q awscliv2.zip 69 | ./aws/install 70 | rm -f awscliv2.zip 71 | rm -rf aws 72 | popd 73 | if ! aws --version | grep aws-cli/2; then 74 | echo "error: Could not update awscli to v2" 75 | exit 1 76 | fi 77 | fi 78 | -------------------------------------------------------------------------------- /source/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | import setuptools 5 | 6 | 7 | with open("CDK-README.md") as fp: 8 | long_description = fp.read() 9 | 10 | 11 | setuptools.setup( 12 | name="cdk", 13 | version="0.0.1", 14 | 15 | description="A sample CDK Python app", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | 19 | author="author", 20 | 21 | package_dir={"": "cdk"}, 22 | packages=setuptools.find_packages(where="cdk"), 23 | 24 | install_requires=[ 25 | "aws-cdk-lib>=2.12.0", 26 | "constructs>=10.0.0", 27 | ], 28 | 29 | python_requires=">=3.6", 30 | 31 | classifiers=[ 32 | "Development Status :: 4 - Beta", 33 | 34 | "Intended Audience :: Developers", 35 | 36 | "Programming Language :: JavaScript", 37 | "Programming Language :: Python :: 3 :: Only", 38 | "Programming Language :: Python :: 3.6", 39 | "Programming Language :: Python :: 3.7", 40 | "Programming Language :: Python :: 3.8", 41 | 42 | "Topic :: Software Development :: Code Generators", 43 | "Topic :: Utilities", 44 | 45 | "Typing :: Typed", 46 | ], 47 | ) 48 | -------------------------------------------------------------------------------- /source/slurm_installer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, 8 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 12 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 13 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 14 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 16 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
17 | """ 18 | -------------------------------------------------------------------------------- /source/slurm_installer/prompt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, 8 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 12 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 13 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 14 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 16 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | """ 18 | 19 | import sys 20 | from colored import fg, bg, attr 21 | import getpass 22 | 23 | def get_input(prompt, specified_value=None, expected_answers=None, expected_type=int, hide=False): 24 | if expected_answers is None: 25 | expected_answers = [] 26 | response = None 27 | if specified_value: 28 | # Value specified, validating user provided input 29 | if expected_answers: 30 | if specified_value not in expected_answers: 31 | print(f"{fg('red')}{specified_value} is an invalid choice. Choose something from {expected_answers}{attr('reset')}") 32 | sys.exit(1) 33 | return specified_value 34 | 35 | else: 36 | # Value not specified, prompt user 37 | while isinstance(response, expected_type) is False: 38 | if sys.version_info[0] >= 3: 39 | if expected_answers: 40 | question = input(f"{fg('misty_rose_3')} >> {prompt} {expected_answers}{attr('reset')}: ") 41 | else: 42 | if hide is True: 43 | question = getpass.getpass(prompt=f"{fg('misty_rose_3')} >> {prompt}{attr('reset')}: ") 44 | else: 45 | question = input(f"{fg('misty_rose_3')} >> {prompt}{attr('reset')}: ") 46 | else: 47 | # Python 2 48 | if expected_answers: 49 | question = raw_input(f"{fg('misty_rose_3')} >> {prompt} {expected_answers}{attr('reset')}: ") 50 | else: 51 | question = raw_input(f"{fg('misty_rose_3')} >> {prompt}{attr('reset')}: ") 52 | 53 | try: 54 | response = expected_type(question.rstrip().lstrip()) 55 | except ValueError: 56 | print(f"Sorry, expected answer is something from {expected_answers}") 57 | 58 | 59 | if expected_answers: 60 | if response not in expected_answers: 61 | print(f"{fg('red')}{response} is an invalid choice. Choose something from {expected_answers}{attr('reset')}") 62 | response = None 63 | 64 | return response 65 | -------------------------------------------------------------------------------- /source/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 
9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /source/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | -------------------------------------------------------------------------------- /source/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | -------------------------------------------------------------------------------- /source/tests/unit/test_cdk_slurm_stack.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | import json 5 | import pytest 6 | 7 | from aws_cdk import core 8 | from cdk.cdk_slurm_stack import CdkSlurmStack 9 | 10 | 11 | def get_template(): 12 | app = core.App() 13 | CdkSlurmStack(app, "cdk") 14 | return json.dumps(app.synth().get_stack("cdk").template) 15 | 16 | 17 | def test_sqs_queue_created(): 18 | assert("AWS::SQS::Queue" in get_template()) 19 | 20 | 21 | def test_sns_topic_created(): 22 | assert("AWS::SNS::Topic" in get_template()) 23 | -------------------------------------------------------------------------------- /tests/test_slurm_minimal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #import filecmp 4 | from os import path, system 5 | from os.path import abspath, dirname 6 | import pytest 7 | import subprocess 8 | from subprocess import CalledProcessError, check_output 9 | 10 | 11 | REPO_DIR = abspath(f"{dirname(__file__)}/..") 12 | 13 | def test_slurm_minimal(): 14 | try: 15 | output = check_output([f"{REPO_DIR}/install.sh", '--cdk-cmd', 'create', '--region', 'us-east-1'], stderr=subprocess.STDOUT, encoding='utf8') 16 | except CalledProcessError as e: 17 | print(f"returncode: {e.returncode}") 18 | print(f"output:\n{e.stdout}") 19 | raise 20 | -------------------------------------------------------------------------------- /xio/userdata.txt: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | runcmd: 3 | - [sh, -c, "mkdir -p /xcompute"] 4 | - [sh, -c, "mount 172.31.24.5:/xcompute /xcompute"] 5 | - [sh, -c, "mkdir -p /home/slurm"] 6 | - [sh, -c, "mount 172.31.24.5:/home/slurm /home/slurm"] 7 | - [sh, -c, "rm -rf /etc/slurm"] 8 | - [sh, -c, "ln -s /xcompute/slurm/ /etc/slurm"] 9 | - [sh, -c, "cp /xcompute/slurm/munge.key /etc/munge/munge.key"] 10 | - [sh, -c, "systemctl restart munge"] 11 | # ALWAYS LAST! 12 | - [ 13 | sh, 14 | -c, 15 | "echo XSPOT_NODENAME > /var/run/nodename; scontrol update nodename=XSPOT_NODENAME nodeaddr=`hostname -I`", 16 | ] 17 | 18 | --------------------------------------------------------------------------------
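For readers who want to prototype the user-data steps outside of a shell, the EBS and ENI tagging performed by source/resources/user_data/slurmdbd_user_data.sh can also be expressed with boto3. The sketch below is an illustrative translation only and is not part of the repository: it assumes it runs on an EC2 instance with the instance metadata service reachable, credentials that allow ec2:DescribeVolumes, ec2:DescribeNetworkInterfaces, and ec2:CreateTags, and a STACK_NAME environment variable standing in for the value the CloudFormation template substitutes into the script.

# Illustrative boto3 sketch of the tagging done in slurmdbd_user_data.sh.
# Assumptions (not from the repository): runs on an EC2 instance, boto3 is
# installed, and STACK_NAME is set in the environment.
import os
import urllib.request

import boto3


def main():
    # Instance ID and region come from the instance metadata service (IMDSv1,
    # mirroring the curl calls in the shell script).
    instance_id = urllib.request.urlopen(
        "http://169.254.169.254/latest/meta-data/instance-id", timeout=5
    ).read().decode()
    az = urllib.request.urlopen(
        "http://169.254.169.254/latest/meta-data/placement/availability-zone", timeout=5
    ).read().decode()
    region = az[:-1]  # strip the trailing AZ letter, e.g. us-east-1a -> us-east-1
    stack_name = os.environ.get("STACK_NAME", "slurm")  # hypothetical default

    ec2 = boto3.client("ec2", region_name=region)

    # Tag the EBS volumes attached to this instance.
    volumes = ec2.describe_volumes(
        Filters=[{"Name": "attachment.instance-id", "Values": [instance_id]}]
    )["Volumes"]
    volume_ids = [v["VolumeId"] for v in volumes]
    if volume_ids:
        ec2.create_tags(
            Resources=volume_ids,
            Tags=[{"Key": "Name", "Value": f"{stack_name} SLURM DB Root Disk"}],
        )

    # Tag the network interfaces attached to this instance.
    enis = ec2.describe_network_interfaces(
        Filters=[{"Name": "attachment.instance-id", "Values": [instance_id]}]
    )["NetworkInterfaces"]
    eni_ids = [eni["NetworkInterfaceId"] for eni in enis]
    if eni_ids:
        ec2.create_tags(
            Resources=eni_ids,
            Tags=[{"Key": "Name", "Value": f"{stack_name} SLURM DB Network Adapter"}],
        )


if __name__ == "__main__":
    main()

The describe/create-tags pairing is the same as in the CLI version of the script; only the transport differs, so the sketch can be used to test the tagging logic and IAM permissions before baking it into user data.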