├── .bandit.ini ├── .flake8 ├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── bump_version.yml │ ├── changelog_enforcer.yml │ ├── ci.yml │ ├── codeql-analysis.yml │ ├── security_exclusions_checker.yml │ └── unsafe_patterns_checker.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .pylintrc ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── THIRD-PARTY-LICENSES.txt ├── requirements.txt ├── setup.py ├── src ├── __init__.py ├── aws │ ├── __init__.py │ ├── common.py │ └── ec2.py ├── common │ ├── __init__.py │ ├── ec2_utils.py │ ├── schedulers │ │ ├── __init__.py │ │ ├── slurm_commands.py │ │ └── slurm_reservation_commands.py │ ├── time_utils.py │ └── utils.py └── slurm_plugin │ ├── __init__.py │ ├── capacity_block_manager.py │ ├── cluster_event_publisher.py │ ├── clustermgtd.py │ ├── common.py │ ├── computemgtd.py │ ├── console_logger.py │ ├── fleet_manager.py │ ├── fleet_status_manager.py │ ├── instance_manager.py │ ├── logging │ ├── parallelcluster_clustermgtd_logging.conf │ ├── parallelcluster_computemgtd_logging.conf │ ├── parallelcluster_fleet_status_manager_logging.conf │ ├── parallelcluster_resume_logging.conf │ └── parallelcluster_suspend_logging.conf │ ├── resume.py │ ├── slurm_resources.py │ ├── suspend.py │ └── task_executor.py ├── tests ├── __init__.py ├── aws │ └── test_ec2.py ├── common.py ├── common │ ├── schedulers │ │ ├── __init__.py │ │ ├── test_slurm_commands.py │ │ ├── test_slurm_commands │ │ │ └── TestPartitionNodelistMapping │ │ │ │ ├── test_get_partition_nodelist_mapping │ │ │ │ └── slurm_dir │ │ │ │ │ └── etc │ │ │ │ │ └── pcluster │ │ │ │ │ └── parallelcluster_partition_nodelist_mapping.json │ │ │ │ └── test_get_partitions │ │ │ │ └── slurm_dir │ │ │ │ └── etc │ │ │ │ └── pcluster │ │ │ │ └── parallelcluster_partition_nodelist_mapping.json │ │ └── test_slurm_reservation_commands.py │ ├── test_ec2_utils.py │ ├── 
test_time_utils.py │ └── test_utils.py ├── conftest.py ├── requirements.txt └── slurm_plugin │ ├── __init__.py │ ├── slurm_resources │ ├── __init__.py │ └── test_slurm_resources.py │ ├── test_capacity_block_manager.py │ ├── test_cluster_event_publisher.py │ ├── test_clustermgtd.py │ ├── test_clustermgtd │ ├── TestClustermgtdConfig │ │ ├── test_config_comparison │ │ │ ├── config.conf │ │ │ └── config_modified.conf │ │ └── test_config_parsing │ │ │ ├── all_options.conf │ │ │ ├── default.conf │ │ │ └── health_check.conf │ └── test_manage_cluster_boto3 │ │ └── default.conf │ ├── test_common.py │ ├── test_common │ └── test_read_json │ │ ├── faulty.json │ │ └── standard.json │ ├── test_computemgtd.py │ ├── test_computemgtd │ └── test_computemgtd_config │ │ ├── all_options.conf │ │ └── default.conf │ ├── test_console_logger.py │ ├── test_fleet_manager.py │ ├── test_fleet_manager │ └── TestEc2CreateFleetManager │ │ └── test_evaluate_launch_params │ │ ├── all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet-multi-az-multi-it-all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet-multi-az-multi-it │ │ └── expected_launch_params.json │ │ ├── fleet-multi-az-single-it-all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet-single-az-multi-it-all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet_capacity_block │ │ └── expected_launch_params.json │ │ ├── fleet_ondemand │ │ └── expected_launch_params.json │ │ ├── fleet_spot │ │ └── expected_launch_params.json │ │ └── launch_overrides │ │ └── expected_launch_params.json │ ├── test_fleet_status_manager.py │ ├── test_fleet_status_manager │ ├── test_fleet_status_manager_config │ │ ├── all_options.conf │ │ └── default.conf │ └── test_get_computefleet_status │ │ ├── correct_status.json │ │ ├── malformed_status │ │ ├── no_status.json │ │ └── wrong_status.json │ ├── test_instance_manager.py │ ├── test_resume.py │ ├── test_resume │ ├── test_get_slurm_resume │ │ ├── malformed.json │ │ └── 
resume.json │ └── test_resume_config │ │ ├── all_options.conf │ │ └── default.conf │ ├── test_suspend.py │ ├── test_suspend │ └── test_suspend_config │ │ ├── all_options.conf │ │ └── default.conf │ └── test_task_executor.py ├── tox.ini └── util ├── bump-version.sh ├── create-attribution-doc.sh └── upload-node.sh /.bandit.ini: -------------------------------------------------------------------------------- 1 | skips: [] 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # D105: Missing docstring in magic method 4 | D105, 5 | # D100: Missing docstring in public module 6 | D100, 7 | # D101: Missing docstring in public class 8 | D101, 9 | # D102: Missing docstring in public method 10 | D102, 11 | # D103: Missing docstring in public function 12 | D103, 13 | # D104: Missing docstring in public package 14 | D104, 15 | # D107: Missing docstring in __init__ 16 | D107, 17 | # W503: line break before binary operator => Conflicts with black style. 
18 | W503, 19 | # N818: exception name should be named with an Error suffix 20 | N818 21 | exclude = 22 | .tox, 23 | .git, 24 | .pytest_cache, 25 | docs/source/conf.py, 26 | build, 27 | dist, 28 | tests/fixtures/*, 29 | *.pyc, 30 | *.egg-info, 31 | .cache, 32 | .eggs 33 | max-complexity = 10 34 | max-line-length = 120 35 | import-order-style = google 36 | application-import-names = flake8 37 | format = %(cyan)s%(path)s%(reset)s:%(bold)s%(yellow)s%(row)d%(reset)s:%(bold)s%(green)s%(col)d%(reset)s: %(bold)s%(red)s%(code)s%(reset)s %(text)s 38 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @aws/aws-parallelcluster-admins @aws/aws-parallelcluster-maintainers 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | ### Description of changes 3 | * Describe *what* you're changing and *why* you're doing these changes. 4 | 5 | ### Tests 6 | * Describe the automated and/or manual tests executed to validate the patch. 7 | * Describe the added/modified tests. 8 | 9 | ### References 10 | * Link to impacted open issues. 11 | * Link to related PRs in other packages (i.e. cookbook, node). 12 | * Link to documentation useful to understand the changes. 13 | 14 | ### Checklist 15 | - Make sure you are pointing to **the right branch**. 16 | - If you're creating a patch for a branch other than `develop` add the branch name as prefix in the PR title (e\.g\. `[release-3.6]`). 17 | - Check all commits' messages are clear, describing what and why vs how. 18 | - Make sure **to have added unit tests or integration tests** to cover the new/modified code. 19 | - Check if documentation is impacted by this change. 
20 | 21 | Please review the [guidelines for contributing](../CONTRIBUTING.md) and [Pull Request Instructions](https://github.com/aws/aws-parallelcluster/wiki/Git-Pull-Request-Instructions). 22 | 23 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 24 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Documentation for all configuration options: 2 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | 4 | version: 2 5 | updates: 6 | - package-ecosystem: "pip" 7 | directory: "/" 8 | schedule: 9 | interval: "daily" 10 | -------------------------------------------------------------------------------- /.github/workflows/bump_version.yml: -------------------------------------------------------------------------------- 1 | # Bump Version workflow that is triggered manually 2 | name: Bump Version 3 | 4 | on: 5 | workflow_dispatch: 6 | # Inputs the workflow accepts. 7 | inputs: 8 | pcluster-version: 9 | description: 'The target version of ParallelCluster CLI' 10 | required: true 11 | type: string 12 | branch: 13 | description: 'The Github branch name' 14 | required: true 15 | type: string 16 | 17 | jobs: 18 | create-pull-requests: 19 | permissions: 20 | contents: write 21 | pull-requests: write 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | fetch-depth: 0 27 | ref: ${{ inputs.branch }} 28 | - name: Modify Code to Change version 29 | run: ./util/bump-version.sh ${{ inputs.pcluster-version }} 30 | 31 | - name: Create a Pull Request 32 | uses: peter-evans/create-pull-request@v6 33 | with: 34 | commit-message: 'Bump version to ${{ inputs.pcluster-version }}' 35 | title: 'Bump version to ${{ inputs.pcluster-version }}' 36 | body: | 37 | This PR contains a version bump. 
38 | Auto-generated by Github Action 39 | branch: versionbump${{ inputs.branch }}${{ inputs.pcluster-version }} 40 | delete-branch: true 41 | labels: skip-changelog-update 42 | -------------------------------------------------------------------------------- /.github/workflows/changelog_enforcer.yml: -------------------------------------------------------------------------------- 1 | name: Enforce Changelog Updates 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] 5 | 6 | jobs: 7 | # Enforces the update of a changelog file on every pull request 8 | changelog: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: dangoslen/changelog-enforcer@v1.4.0 13 | with: 14 | changeLogPath: CHANGELOG.md 15 | skipLabel: skip-changelog-update 16 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ParallelCluster CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | depcheck: 7 | runs-on: ubuntu-latest 8 | name: DepCheck 9 | steps: 10 | - uses: actions/checkout@v1 11 | - name: Dependency Check 12 | uses: dependency-check/Dependency-Check_Action@main 13 | with: 14 | path: '.' 
15 | format: 'HTML' 16 | project: 'aws-parallelcluster-node' 17 | - name: Upload Test results 18 | uses: actions/upload-artifact@master 19 | with: 20 | name: Depcheck report 21 | path: ${{github.workspace}}/reports 22 | build: 23 | runs-on: ${{ matrix.os }} 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | os: [ubuntu-latest] 28 | name: 29 | - Python 3.9 Tests 30 | - Python 3.10 Tests 31 | - Python 3.9 Tests Coverage 32 | - Code Checks 33 | include: 34 | - name: Python 3.9 Tests 35 | python: 3.9 36 | toxdir: cli 37 | toxenv: py39-nocov 38 | - name: Python 3.10 Tests 39 | python: '3.10' 40 | toxdir: cli 41 | toxenv: py310-nocov 42 | - name: Python 3.9 Tests Coverage 43 | python: 3.9 44 | toxdir: cli 45 | toxenv: py39-cov 46 | - name: Code Checks 47 | python: 3.9 48 | toxdir: cli 49 | toxenv: code-linters 50 | 51 | steps: 52 | - uses: actions/checkout@v2 53 | - name: Setup Python 54 | uses: actions/setup-python@v2 55 | with: 56 | python-version: ${{ matrix.python }} 57 | - name: Install Tox and any other packages 58 | run: pip install tox 59 | - name: Run Tox 60 | run: tox -e ${{ matrix.toxenv }} 61 | - name: Upload code coverage report to Codecov 62 | uses: codecov/codecov-action@v3 63 | if: ${{ endsWith(matrix.toxenv, '-cov') }} 64 | with: 65 | files: coverage.xml 66 | flags: unittests 67 | verbose: true 68 | shellcheck: 69 | name: Shellcheck 70 | runs-on: ubuntu-latest 71 | steps: 72 | - uses: actions/checkout@v2 73 | - name: Run ShellCheck 74 | uses: ludeeus/action-shellcheck@master 75 | with: 76 | severity: warning 77 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: '0 10 * * 2' 8 | 9 | jobs: 10 | analyze: 11 | name: Analyze 12 | runs-on: ubuntu-latest 13 | permissions: 14 | actions: read 15 | contents: read 16 | 
security-events: write 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | language: [ 'python' ] 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@v2 24 | - name: Initialize CodeQL 25 | uses: github/codeql-action/init@v2 26 | with: 27 | languages: ${{ matrix.language }} 28 | queries: +security-and-quality 29 | - name: Perform CodeQL Analysis 30 | uses: github/codeql-action/analyze@v2 31 | -------------------------------------------------------------------------------- /.github/workflows/security_exclusions_checker.yml: -------------------------------------------------------------------------------- 1 | name: Security Exclusions Checker 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] 5 | 6 | jobs: 7 | # Prevent security exclusions 8 | security-exclusions-check: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check PR 12 | uses: francesco-giordano/gh-pr-content-checker@v1.0.0 13 | with: 14 | diffDoesNotContainRegex: "\\bnosec\\b|\\bnosemgrep\\b" 15 | skipLabels: skip-security-exclusions-check 16 | -------------------------------------------------------------------------------- /.github/workflows/unsafe_patterns_checker.yml: -------------------------------------------------------------------------------- 1 | name: Unsafe Patterns Checker 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] 5 | 6 | jobs: 7 | # Prevent bad URL suffix 8 | bad-url-suffix-check: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check PR for Disallowed URL Suffixes 12 | uses: francesco-giordano/gh-pr-content-checker@v1.0.0 13 | with: 14 | diffDoesNotContainRegex: "amazonaws\\.com|amazonaws\\.com\\.cn|c2s\\.ic\\.gov|sc2s\\.sgov\\.gov" 15 | skipLabels: skip-bad-url-suffix-check 16 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | *.class 4 | *~ 5 | *# 6 | dist/ 7 | build/ 8 | *.egg-info/ 9 | .idea/ 10 | *.iml 11 | .DS_Store 12 | .tox/ 13 | .coverage 14 | coverage.xml 15 | assets/ 16 | report.html 17 | 18 | aws-parallelcluster-node-*.tgz 19 | aws-parallelcluster-node-*.md5 20 | aws-parallelcluster-node-*.date 21 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length=120 3 | known_third_party=assertpy,boto3,botocore,common,pytest,retrying,setuptools,slurm_plugin 4 | # 3 - Vertical Hanging Indent 5 | # from third_party import ( 6 | # lib1, 7 | # lib2, 8 | # lib3, 9 | # lib4, 10 | # ) 11 | multi_line_output=3 12 | include_trailing_comma=true 13 | profile=black -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-added-large-files 7 | - id: check-ast 8 | - id: check-executables-have-shebangs 9 | - id: check-json 10 | - id: check-merge-conflict 11 | - id: check-xml 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: detect-private-key 15 | - id: check-symlinks 16 | - id: end-of-file-fixer 17 | - id: pretty-format-json 18 | args: ['--autofix', '--indent=4'] 19 | - id: requirements-txt-fixer 20 | - id: mixed-line-ending 21 | args: ['--fix=no'] 22 | 23 | - repo: https://github.com/PyCQA/flake8 24 | rev: 6.0.0 25 | hooks: 26 | - id: flake8 27 | additional_dependencies: [flake8-docstrings, flake8-bugbear, flake8-colors, pep8-naming] 28 | 29 | - repo: https://github.com/timothycrosley/isort 30 | rev: 5.12.0 31 | hooks: 32 | - id: isort 33 | args: ['-rc', '-w 120'] 34 
| 35 | - repo: https://github.com/ambv/black 36 | rev: 23.1.0 37 | hooks: 38 | - id: black 39 | args: ['-l 120'] 40 | 41 | - repo: https://github.com/PyCQA/bandit 42 | rev: 1.7.4 43 | hooks: 44 | - id: bandit 45 | args: ['-r', '-c', '.bandit.ini', '--exclude', 'tests'] 46 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws/aws-parallelcluster-node/issues), or [recently closed](https://github.com/aws/aws-parallelcluster-node/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *develop* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-parallelcluster-node/labels/help%20wanted) issues is a great place to start. 
45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws/aws-parallelcluster-node/blob/develop/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | aws-parallelcluster-node 2 | Copyright 2014-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AWS ParallelCluster Node 2 | ======================== 3 | 4 | [![PyPI Version](https://img.shields.io/pypi/v/aws-parallelcluster-node)](https://pypi.org/project/aws-parallelcluster-node/) 5 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 6 | [![codecov](https://codecov.io/gh/aws/aws-parallelcluster-node/branch/develop/graph/badge.svg)](https://codecov.io/gh/aws/aws-parallelcluster-node) 7 | [![ParallelCluster CI](https://github.com/aws/aws-parallelcluster-node/workflows/ParallelCluster%20CI/badge.svg)](https://github.com/aws/aws-parallelcluster-node/actions) 8 | 9 | This repo contains the aws-parallelcluster-node package installed on the Amazon EC2 instances launched 10 | as part of AWS ParallelCluster. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.7.55 2 | retrying~=1.3 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | 12 | import os 13 | 14 | from setuptools import find_packages, setup 15 | 16 | # Utility function to read the README file. 17 | # Used for the long_description. It's nice, because now 1) we have a top level 18 | # README file and 2) it's easier to type in the README file than to put a raw 19 | # string in below ... 20 | 21 | 22 | def read(fname): 23 | path = os.path.join(os.path.dirname(__file__), fname) 24 | with open(path, "r") as file: 25 | return file.read() 26 | 27 | 28 | console_scripts = [ 29 | "slurm_resume = slurm_plugin.resume:main", 30 | "slurm_suspend = slurm_plugin.suspend:main", 31 | "slurm_fleet_status_manager = slurm_plugin.fleet_status_manager:main", 32 | "clustermgtd = slurm_plugin.clustermgtd:main", 33 | "computemgtd = slurm_plugin.computemgtd:main", 34 | ] 35 | version = "3.14.0" 36 | requires = ["boto3>=1.7.55", "retrying>=1.3.3"] 37 | 38 | setup( 39 | name="aws-parallelcluster-node", 40 | version=version, 41 | author="Amazon Web Services", 42 | description="aws-parallelcluster-node provides the scripts for an AWS ParallelCluster node.", 43 | url="https://github.com/aws/aws-parallelcluster-node", 44 | license="Apache License 2.0", 45 | packages=find_packages("src", exclude=["tests"]), 46 | package_dir={"": "src"}, 47 | python_requires=">=3.9", 48 | install_requires=requires, 49 | entry_points=dict(console_scripts=console_scripts), 50 | zip_safe=False, 51 | package_data={"slurm_plugin": ["logging/*.conf"]}, 52 | long_description=( 53 | "aws-parallelcluster-node is the python package installed on the Amazon EC2 instances launched " 54 | "as part of AWS ParallelCluster." 
55 | ), 56 | classifiers=[ 57 | "Development Status :: 5 - Production/Stable", 58 | "Environment :: Console", 59 | "Programming Language :: Python", 60 | "Topic :: Scientific/Engineering", 61 | "License :: OSI Approved :: Apache Software License", 62 | ], 63 | ) 64 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/aws/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | -------------------------------------------------------------------------------- /src/aws/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | import functools 13 | import logging 14 | import time 15 | from enum import Enum 16 | 17 | import boto3 18 | from botocore.config import Config 19 | from botocore.exceptions import BotoCoreError, ClientError, ParamValidationError 20 | 21 | LOGGER = logging.getLogger(__name__) 22 | 23 | 24 | class AWSClientError(Exception): 25 | """Error during execution of some AWS calls.""" 26 | 27 | class ErrorCode(Enum): 28 | """Error codes for AWS ClientError.""" 29 | 30 | VALIDATION_ERROR = "ValidationError" 31 | REQUEST_LIMIT_EXCEEDED = "RequestLimitExceeded" 32 | THROTTLING_EXCEPTION = "ThrottlingException" 33 | CONDITIONAL_CHECK_FAILED_EXCEPTION = "ConditionalCheckFailedException" 34 | 35 | @classmethod 36 | def throttling_error_codes(cls): 37 | """Return a set of error codes returned when service rate limits are exceeded.""" 38 | return {cls.REQUEST_LIMIT_EXCEEDED.value, cls.THROTTLING_EXCEPTION.value} 39 | 40 | def __init__(self, function_name: str, message: str, error_code: str = None): 41 | super().__init__(message) 42 | self.message = message 43 | self.error_code = error_code 44 | self.function_name = function_name 45 | 46 | 47 | class LimitExceededError(AWSClientError): 48 | """Error caused 
by exceeding the limits of a downstream AWS service.""" 49 | 50 | def __init__(self, function_name: str, message: str, error_code: str = None): 51 | super().__init__(function_name=function_name, message=message, error_code=error_code) 52 | 53 | 54 | class BadRequestError(AWSClientError): 55 | """Error caused by a problem in the request.""" 56 | 57 | def __init__(self, function_name: str, message: str, error_code: str = None): 58 | super().__init__(function_name=function_name, message=message, error_code=error_code) 59 | 60 | 61 | class AWSExceptionHandler: 62 | """AWS Exception handler.""" 63 | 64 | @staticmethod 65 | def handle_client_exception(func): 66 | """Handle Boto3 errors, can be used as a decorator.""" 67 | 68 | @functools.wraps(func) 69 | def wrapper(*args, **kwargs): 70 | try: 71 | return func(*args, **kwargs) 72 | except ParamValidationError as validation_error: 73 | error = BadRequestError( 74 | func.__name__, 75 | "Error validating parameter. Failed with exception: {0}".format(str(validation_error)), 76 | ) 77 | except BotoCoreError as e: 78 | error = AWSClientError(func.__name__, str(e)) 79 | except ClientError as e: 80 | # add request id 81 | message = e.response["Error"]["Message"] 82 | error_code = e.response["Error"]["Code"] 83 | 84 | if error_code in AWSClientError.ErrorCode.throttling_error_codes(): 85 | error = LimitExceededError(func.__name__, message, error_code) 86 | elif error_code == AWSClientError.ErrorCode.VALIDATION_ERROR: 87 | error = BadRequestError(func.__name__, message, error_code) 88 | else: 89 | error = AWSClientError(func.__name__, message, error_code) 90 | LOGGER.error("Encountered error when performing boto3 call in %s: %s", error.function_name, error.message) 91 | raise error 92 | 93 | return wrapper 94 | 95 | @staticmethod 96 | def retry_on_boto3_throttling(func): 97 | """Retry boto3 calls on throttling, can be used as a decorator.""" 98 | 99 | @functools.wraps(func) 100 | def wrapper(*args, **kwargs): 101 | while True: 102 
| try: 103 | return func(*args, **kwargs) 104 | except ClientError as e: 105 | if e.response["Error"]["Code"] != "Throttling": 106 | raise 107 | LOGGER.debug("Throttling when calling %s function. Will retry in %d seconds.", func.__name__, 5) 108 | time.sleep(5) 109 | 110 | return wrapper 111 | 112 | 113 | def _log_boto3_calls(params, **kwargs): 114 | service = kwargs["event_name"].split(".")[-2] 115 | operation = kwargs["event_name"].split(".")[-1] 116 | region = kwargs["context"].get("client_region", boto3.session.Session().region_name) 117 | LOGGER.info( 118 | "Executing boto3 call: region=%s, service=%s, operation=%s, params=%s", region, service, operation, params 119 | ) 120 | 121 | 122 | class Boto3Client: 123 | """Boto3 client Class.""" 124 | 125 | def __init__(self, client_name: str, config: Config, region: str = None): 126 | region = region if region else get_region() 127 | self._client = boto3.client(client_name, region_name=region, config=config if config else None) 128 | self._client.meta.events.register("provide-client-params.*.*", _log_boto3_calls) 129 | 130 | def _paginate_results(self, method, **kwargs): 131 | """ 132 | Return a generator for a boto3 call, this allows pagination over an arbitrary number of responses. 
133 | 134 | :param method: boto3 method 135 | :param kwargs: arguments to method 136 | :return: generator with boto3 results 137 | """ 138 | paginator = self._client.get_paginator(method.__name__) 139 | for page in paginator.paginate(**kwargs).result_key_iters(): 140 | for result in page: 141 | yield result 142 | 143 | 144 | class Boto3Resource: 145 | """Boto3 resource Class.""" 146 | 147 | def __init__(self, resource_name: str): 148 | self._resource = boto3.resource(resource_name) 149 | self._resource.meta.client.meta.events.register("provide-client-params.*.*", _log_boto3_calls) 150 | 151 | 152 | def get_region(): 153 | """Get region used internally for all the AWS calls.""" 154 | region = boto3.session.Session().region_name 155 | if region is None: 156 | raise AWSClientError("get_region", "AWS region not configured") 157 | return region 158 | -------------------------------------------------------------------------------- /src/aws/ec2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | from typing import List 12 | 13 | from common.utils import ApiMocker 14 | 15 | from aws.common import AWSExceptionHandler, Boto3Client 16 | 17 | 18 | class CapacityReservationInfo: 19 | """ 20 | Data object wrapping the result of a describe-capacity-reservations call. 
21 | 22 | { 23 | "CapacityReservationId": "cr-123456", 24 | "OwnerId": "123", 25 | "CapacityReservationArn": "arn:aws:ec2:us-east-2:123:capacity-reservation/cr-123456", 26 | "AvailabilityZoneId": "use2-az1", 27 | "InstanceType": "t3.large", 28 | "InstancePlatform": "Linux/UNIX", 29 | "AvailabilityZone": "eu-west-1a", 30 | "Tenancy": "default", 31 | "TotalInstanceCount": 1, 32 | "AvailableInstanceCount": 1, 33 | "EbsOptimized": false, 34 | "EphemeralStorage": false, 35 | "State": "active", 36 | "StartDate": "2023-11-15T11:30:00+00:00", 37 | "EndDate": "2023-11-16T11:30:00+00:00", # capacity-block only 38 | "EndDateType": "limited", 39 | "InstanceMatchCriteria": "targeted", 40 | "CreateDate": "2023-10-25T20:40:13+00:00", 41 | "Tags": [ 42 | { 43 | "Key": "aws:ec2capacityreservation:incrementalRequestedQuantity", 44 | "Value": "1" 45 | }, 46 | { 47 | "Key": "aws:ec2capacityreservation:capacityReservationType", 48 | "Value": "capacity-block" 49 | } 50 | ], 51 | "CapacityAllocations": [], 52 | "ReservationType": "capacity-block" # capacity-block only 53 | } 54 | """ 55 | 56 | def __init__(self, capacity_reservation_data): 57 | self.capacity_reservation_data = capacity_reservation_data 58 | 59 | def capacity_reservation_id(self): 60 | """Return the id of the Capacity Reservation.""" 61 | return self.capacity_reservation_data.get("CapacityReservationId") 62 | 63 | def state(self): 64 | """Return the state of the Capacity Reservation.""" 65 | return self.capacity_reservation_data.get("State") 66 | 67 | def __eq__(self, other): 68 | return self.__dict__ == other.__dict__ 69 | 70 | 71 | class Ec2Client(Boto3Client): 72 | """Implement EC2 Boto3 client.""" 73 | 74 | def __init__(self, config=None, region=None): 75 | super().__init__("ec2", region=region, config=config) 76 | 77 | @AWSExceptionHandler.handle_client_exception 78 | @ApiMocker.mockable 79 | def describe_capacity_reservations(self, capacity_reservation_ids: List[str]) -> List[CapacityReservationInfo]: 80 | """Accept 
a space separated list of reservation ids. Return a list of CapacityReservationInfo.""" 81 | result = [] 82 | response = list( 83 | self._paginate_results( 84 | self._client.describe_capacity_reservations, 85 | CapacityReservationIds=capacity_reservation_ids, 86 | # ReservationType=reservation_type, # not yet available 87 | ) 88 | ) 89 | for capacity_reservation in response: 90 | result.append(CapacityReservationInfo(capacity_reservation)) 91 | 92 | return result 93 | -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/common/ec2_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. 10 | # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
11 | # See the License for the specific language governing permissions and limitations under the License. 12 | 13 | 14 | def get_private_ip_address_and_dns_name(instance_info): 15 | """ 16 | Return the PrivateIpAddress and PrivateDnsName of the EC2 instance. 17 | 18 | The PrivateIpAddress and PrivateDnsName are considered to be the ones for the 19 | network interface with DeviceIndex = NetworkCardIndex = 0. 20 | :param instance_info: the dictionary returned by a EC2:DescribeInstances call. 21 | :return: the PrivateIpAddress and PrivateDnsName of the instance. 22 | """ 23 | private_ip = instance_info["PrivateIpAddress"] 24 | private_dns_name = instance_info["PrivateDnsName"] 25 | all_private_ips = [private_ip] 26 | for network_interface in instance_info["NetworkInterfaces"]: 27 | all_private_ips.append(network_interface.get("PrivateIpAddress", private_ip)) 28 | attachment = network_interface["Attachment"] 29 | if attachment.get("DeviceIndex", -1) == 0 and attachment.get("NetworkCardIndex", -1) == 0: 30 | private_ip = network_interface.get("PrivateIpAddress", private_ip) 31 | private_dns_name = network_interface.get("PrivateDnsName", private_dns_name) 32 | return private_ip, private_dns_name, set(all_private_ips) 33 | -------------------------------------------------------------------------------- /src/common/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/common/schedulers/slurm_reservation_commands.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | import logging 12 | 13 | # A nosec comment is appended to the following line in order to disable the B404 check. 14 | # In this file the input of the module subprocess is trusted. 15 | import subprocess # nosec B404 16 | from datetime import datetime 17 | from typing import List, Union 18 | 19 | from common.schedulers.slurm_commands import DEFAULT_SCONTROL_COMMAND_TIMEOUT, SCONTROL 20 | from common.utils import ( 21 | SlurmCommandError, 22 | SlurmCommandErrorHandler, 23 | check_command_output, 24 | run_command, 25 | validate_subprocess_argument, 26 | ) 27 | from retrying import retry 28 | from slurm_plugin.slurm_resources import SlurmReservation 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | SCONTROL_SHOW_RESERVATION_OUTPUT_AWK_PARSER = ( 34 | 'awk \'BEGIN{{RS="\\n\\n" ; ORS="######\\n";}} {{print}}\' | ' 35 | + "grep -oP '^(ReservationName=\\S+)|(? List[SlurmReservation]: 246 | """ 247 | List existing slurm reservations with scontrol call. 
248 | 249 | The output of the command is something like the following: 250 | $ scontrol show reservations 251 | ReservationName=root_7 StartTime=2023-10-25T09:46:49 EndTime=2024-10-24T09:46:49 Duration=365-00:00:00 252 | Nodes=queuep4d-dy-crp4d-[1-5] NodeCnt=5 CoreCnt=480 Features=(null) PartitionName=(null) Flags=MAINT,SPEC_NODES 253 | TRES=cpu=480 254 | Users=root Groups=(null) Accounts=(null) Licenses=(null) State=ACTIVE BurstBuffer=(null) Watts=n/a 255 | MaxStartDelay=(null) 256 | 257 | Official documentation is https://slurm.schedmd.com/reservations.html 258 | """ 259 | # awk is used to replace the \n\n record separator with '######\n' 260 | show_reservations_command = f"{SCONTROL} show reservations | {SCONTROL_SHOW_RESERVATION_OUTPUT_AWK_PARSER}" 261 | slurm_reservations_info = check_command_output( 262 | show_reservations_command, raise_on_error=raise_on_error, timeout=command_timeout, shell=True 263 | ) # nosec B604 264 | 265 | return _parse_reservations_info(slurm_reservations_info) 266 | 267 | 268 | def _parse_reservations_info(slurm_reservations_info: str) -> List[SlurmReservation]: 269 | """Parse slurm reservations info into SlurmReservation objects.""" 270 | # $ /opt/slurm/bin/scontrol show reservations awk 'BEGIN{{RS="\n\n" ; ORS="######\n";}} {{print}}' | 271 | # grep -oP '^(ReservationName=\S+)|(?= grace_time 300 | 301 | 302 | def read_json(file_path, default=None): 303 | """Read json file into a dict.""" 304 | try: 305 | with open(file_path) as mapping_file: 306 | return json.load(mapping_file) 307 | except Exception as e: 308 | if default is None: 309 | log.error("Unable to read file from '%s'. Failed with exception: %s", file_path, e) 310 | raise 311 | else: 312 | if not isinstance(e, FileNotFoundError): 313 | log.info("Unable to read file '%s' due to an exception: %s. 
Using default: %s", file_path, e, default) 314 | return default 315 | 316 | 317 | def validate_subprocess_argument(argument): 318 | """ 319 | Validate an argument used to build a subprocess command. 320 | 321 | The validation is done forcing the encoding to be the standard 322 | Python Unicode / UTF-8 and searching for forbidden patterns. 323 | 324 | :param argument: an argument string to validate 325 | :raise: Exception if the argument contains a forbidden pattern 326 | :return: True if the argument does not contain forbidden patterns 327 | """ 328 | forbidden_patterns = ["&", "|", ";", "$", ">", "<", "`", "\\", "!", "#", "\n"] 329 | 330 | # Forcing the encoding to be the standard Python Unicode / UTF-8 331 | # https://docs.python.org/3/howto/unicode.html 332 | # https://docs.python.org/3/library/codecs.html#standard-encodings 333 | _argument = (str(argument).encode("utf-8", "ignore")).decode() 334 | 335 | if any(pattern in _argument for pattern in forbidden_patterns): 336 | raise ValueError("Value of provided argument contains at least a forbidden pattern") 337 | return True 338 | 339 | 340 | def validate_absolute_path(path): 341 | """ 342 | Validate if a path string represents is a valid absolute path. 
343 | 344 | :param path: path to validate 345 | :raise: Exception if the path is not a valid absolute path 346 | :return: True if the path is a valid absolute path 347 | """ 348 | if not os.path.isabs(path): 349 | raise ValueError(f"The path {path} is not a valid absolute path") 350 | return True 351 | 352 | 353 | @contextlib.contextmanager 354 | def setup_logging_filter(logger: logging.Logger, custom_field: str): 355 | """Set up a custom logging filter and remove it at the end of the context.""" 356 | 357 | class CustomFilter(logging.Filter): 358 | def __init__(self, custom_field: str): 359 | super().__init__() 360 | self.field = custom_field 361 | self.value = None 362 | 363 | def set_custom_value(self, custom_value: str): 364 | self.value = custom_value 365 | 366 | def filter(self, record: logging.LogRecord) -> bool: 367 | if self.value: 368 | record.msg = f"{self.field} {self.value} - {record.msg}" 369 | return True 370 | 371 | custom_filter = CustomFilter(custom_field) 372 | logger.addFilter(custom_filter) 373 | try: 374 | yield custom_filter 375 | finally: 376 | # Remove the custom log filter 377 | logger.removeFilter(custom_filter) 378 | 379 | 380 | class ApiMocker: 381 | """API mocker.""" 382 | 383 | @staticmethod 384 | def mockable(func): 385 | """ 386 | Try to mock passed function by searching for an overrides.py file in the same path of the given func. 387 | 388 | This function can be used a decorator and applied any method. 389 | 390 | The function will check if a function called with the name of the given function exists 391 | in the /overrides.py, and if it does, the function will execute it. 392 | 393 | E.g. if the method with ApiMocker.mockable decorator is defined in Ec2Client class 394 | of the ${node_virtualenv_path}/aws/ec2.py module, the mocked function should be defined 395 | in the ${node_virtualenv_path}/aws/overrides.py file. 
396 | """ 397 | 398 | def wrapper(*args, **kwargs): 399 | try: 400 | function_name = func.__name__ 401 | # retrieve parent module of the given function that has the ApiMocker.mockable decorator 402 | func_module = func.__module__ 403 | func_parent_module = func_module[: func_module.rindex(".")] 404 | # try to import overrides.py module in the same folder of the module to mock 405 | overrides_module = __import__(f"{func_parent_module}.overrides", fromlist=function_name) 406 | overrided_func = getattr(overrides_module, function_name) 407 | log.info("Calling %s override with args: %s and kwargs: %s", function_name, args, kwargs) 408 | result = overrided_func(*args, **kwargs) 409 | except (ImportError, AttributeError): 410 | result = func(*args, **kwargs) 411 | return result 412 | 413 | return wrapper 414 | -------------------------------------------------------------------------------- /src/slurm_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/slurm_plugin/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | 13 | import functools 14 | import logging 15 | from concurrent.futures import Future 16 | from datetime import datetime 17 | from enum import Enum 18 | from typing import Callable, Optional, Protocol, TypedDict 19 | 20 | from common.utils import check_command_output, time_is_up, validate_absolute_path 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | # timestamp used by clustermgtd and computemgtd should be in default ISO format 25 | # YYYY-MM-DDTHH:MM:SS.ffffff+HH:MM[:SS[.ffffff]] 26 | TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z" 27 | DEFAULT_COMMAND_TIMEOUT = 30 28 | 29 | ComputeInstanceDescriptor = TypedDict( 30 | "ComputeInstanceDescriptor", 31 | { 32 | "Name": str, 33 | "InstanceId": str, 34 | }, 35 | ) 36 | 37 | 38 | class ScalingStrategy(Enum): 39 | ALL_OR_NOTHING = "all-or-nothing" 40 | BEST_EFFORT = "best-effort" 41 | GREEDY_ALL_OR_NOTHING = "greedy-all-or-nothing" 42 | 43 | @classmethod 44 | def _missing_(cls, strategy): 45 | # Ref: https://docs.python.org/3/library/enum.html#enum.Enum._missing_ 46 | _strategy = str(strategy).lower() 47 | for member in cls: 48 | if member.value == _strategy: 49 | return member 50 | return cls.ALL_OR_NOTHING # Default to all-or-nothing 51 | 52 | def __str__(self): 53 | return str(self.value) 54 | 55 | 56 | class TaskController(Protocol): 57 | class TaskShutdownError(RuntimeError): 58 | """Exception raised if shutdown has been requested.""" 59 | 60 | pass 61 | 62 | def queue_task(self, task: Callable[[], None]) -> Optional[Future]: 63 | """Queue a task and returns a Future 
for the task or None if the task could not be queued.""" 64 | 65 | def is_shutdown(self) -> bool: 66 | """Return True if a shutdown has been requested.""" 67 | 68 | def raise_if_shutdown(self) -> None: 69 | """Raise an error if a shutdown has been requested.""" 70 | 71 | def wait_unless_shutdown(self, seconds_to_wait: float) -> None: 72 | """Wait for seconds_to_wait or will raise an error if a shutdown has been requested.""" 73 | 74 | def shutdown(self, wait: bool, cancel_futures: bool) -> None: 75 | """Request that all tasks be shutdown.""" 76 | 77 | 78 | def log_exception( 79 | logger, 80 | action_desc, 81 | log_level=logging.ERROR, 82 | catch_exception=Exception, 83 | raise_on_error=True, 84 | exception_to_raise=None, 85 | ): 86 | def _log_exception(function): 87 | @functools.wraps(function) 88 | def wrapper(*args, **kwargs): 89 | wrapped = None 90 | try: 91 | wrapped = function(*args, **kwargs) 92 | except catch_exception as e: 93 | logger.log(log_level, "Failed when %s with exception %s, message: %s", action_desc, type(e).__name__, e) 94 | if raise_on_error: 95 | if exception_to_raise: 96 | # preserve the exception message if the exception to raise is the same type as the actual exception 97 | raise e if isinstance(e, exception_to_raise) else exception_to_raise 98 | else: 99 | raise 100 | 101 | return wrapped 102 | 103 | return wrapper 104 | 105 | return _log_exception 106 | 107 | 108 | def print_with_count(resource_list): 109 | """Print resource list with the len of the list.""" 110 | if isinstance(resource_list, str): 111 | return resource_list 112 | resource_list = [str(elem) for elem in resource_list] 113 | return f"(x{len(resource_list)}) {str(resource_list)}" 114 | 115 | 116 | def get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path): 117 | """Get clustermgtd's last heartbeat.""" 118 | # Use subprocess based method to read shared file to prevent hanging when NFS is down 119 | # Do not copy to local.
Different users need to access the file, but file should be writable by root only 120 | # Only use last line of output to avoid taking unexpected output in stdout 121 | 122 | # Validation to sanitize the input argument and make it safe to use the function affected by B604 123 | validate_absolute_path(clustermgtd_heartbeat_file_path) 124 | 125 | heartbeat = ( 126 | check_command_output( 127 | f"cat {clustermgtd_heartbeat_file_path}", 128 | timeout=DEFAULT_COMMAND_TIMEOUT, 129 | shell=True, # nosec B604 130 | ) 131 | .splitlines()[-1] 132 | .strip() 133 | ) 134 | # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str 135 | # datetime.strptime will not work with str(datetime) 136 | # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00 137 | return datetime.strptime(heartbeat, TIMESTAMP_FORMAT) 138 | 139 | 140 | def expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout): 141 | """Test if clustermgtd heartbeat is expired.""" 142 | if time_is_up(last_heartbeat, current_time, clustermgtd_timeout): 143 | logger.error( 144 | "Clustermgtd has been offline since %s. Current time is %s. 
Timeout of %s seconds has expired!", 145 | last_heartbeat, 146 | current_time, 147 | clustermgtd_timeout, 148 | ) 149 | return True 150 | return False 151 | 152 | 153 | def is_clustermgtd_heartbeat_valid(current_time, clustermgtd_timeout, clustermgtd_heartbeat_file_path): 154 | try: 155 | last_heartbeat = get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path) 156 | logger.info("Latest heartbeat from clustermgtd: %s", last_heartbeat) 157 | return not expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout) 158 | except Exception as e: 159 | logger.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e) 160 | return False 161 | -------------------------------------------------------------------------------- /src/slurm_plugin/computemgtd.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | import logging 13 | import os 14 | import time 15 | from configparser import ConfigParser 16 | from datetime import datetime, timezone 17 | from io import StringIO 18 | from logging.config import fileConfig 19 | 20 | # A nosec comment is appended to the following line in order to disable the B404 check. 21 | # In this file the input of the module subprocess is trusted. 
22 | from subprocess import CalledProcessError # nosec B404 23 | 24 | from botocore.config import Config 25 | from common.schedulers.slurm_commands import get_nodes_info 26 | from common.time_utils import seconds 27 | from common.utils import check_command_output, run_command, sleep_remaining_loop_time, validate_absolute_path 28 | from retrying import retry 29 | from slurm_plugin.common import ( 30 | DEFAULT_COMMAND_TIMEOUT, 31 | expired_clustermgtd_heartbeat, 32 | get_clustermgtd_heartbeat, 33 | log_exception, 34 | ) 35 | from slurm_plugin.slurm_resources import CONFIG_FILE_DIR 36 | 37 | LOOP_TIME = 60 38 | RELOAD_CONFIG_ITERATIONS = 10 39 | # Computemgtd config is under /opt/slurm/etc/pcluster/.slurm_plugin/; all compute nodes share a config 40 | SLURM_PLUGIN_DIR = "/opt/slurm/etc/pcluster/.slurm_plugin" 41 | COMPUTEMGTD_CONFIG_PATH = f"{SLURM_PLUGIN_DIR}/parallelcluster_computemgtd.conf" 42 | log = logging.getLogger(__name__) 43 | 44 | 45 | class ComputemgtdConfig: 46 | DEFAULTS = { 47 | # Basic configs 48 | "max_retry": 1, 49 | "loop_time": LOOP_TIME, 50 | "proxy": "NONE", 51 | "disable_computemgtd_actions": False, 52 | "clustermgtd_timeout": 600, 53 | "slurm_nodename_file": os.path.join(CONFIG_FILE_DIR, "slurm_nodename"), 54 | "logging_config": os.path.join( 55 | os.path.dirname(__file__), "logging", "parallelcluster_computemgtd_logging.conf" 56 | ), 57 | } 58 | 59 | def __init__(self, config_file_path): 60 | self._get_config(config_file_path) 61 | 62 | def __repr__(self): 63 | attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) 64 | return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs) 65 | 66 | @log_exception(log, "reading computemgtd config", catch_exception=Exception, raise_on_error=True) 67 | def _get_config(self, config_file_path): 68 | """Get computemgtd configuration.""" 69 | log.info("Reading %s", config_file_path) 70 | config = ConfigParser() 71 | try: 72 | # 
Validation to sanitize the input argument and make it safe to use the function affected by B604 73 | validate_absolute_path(config_file_path) 74 | # Use subprocess based method to copy shared file to local to prevent hanging when NFS is down 75 | config_str = check_command_output( 76 | f"cat {config_file_path}", 77 | timeout=DEFAULT_COMMAND_TIMEOUT, 78 | shell=True, # nosec B604 79 | ) 80 | config.read_file(StringIO(config_str)) 81 | except Exception: 82 | log.error("Cannot read computemgtd configuration file: %s", config_file_path) 83 | raise 84 | 85 | # Get config settings 86 | self.region = config.get("computemgtd", "region") 87 | self.cluster_name = config.get("computemgtd", "cluster_name") 88 | # Configure boto3 to retry 1 times by default 89 | self._boto3_retry = config.getint("clustermgtd", "boto3_retry", fallback=self.DEFAULTS.get("max_retry")) 90 | self._boto3_config = {"retries": {"max_attempts": self._boto3_retry, "mode": "standard"}} 91 | self.loop_time = config.getint("computemgtd", "loop_time", fallback=self.DEFAULTS.get("loop_time")) 92 | self.clustermgtd_timeout = config.getint( 93 | "computemgtd", 94 | "clustermgtd_timeout", 95 | fallback=self.DEFAULTS.get("clustermgtd_timeout"), 96 | ) 97 | self.disable_computemgtd_actions = config.getboolean( 98 | "computemgtd", 99 | "disable_computemgtd_actions", 100 | fallback=self.DEFAULTS.get("disable_computemgtd_actions"), 101 | ) 102 | self.clustermgtd_heartbeat_file_path = config.get("computemgtd", "clustermgtd_heartbeat_file_path") 103 | self._slurm_nodename_file = config.get( 104 | "computemgtd", "slurm_nodename_file", fallback=self.DEFAULTS.get("slurm_nodename_file") 105 | ) 106 | self.nodename = ComputemgtdConfig._read_nodename_from_file(self._slurm_nodename_file) 107 | 108 | proxy = config.get("computemgtd", "proxy", fallback=self.DEFAULTS.get("proxy")) 109 | if proxy != "NONE": 110 | self._boto3_config["proxies"] = {"https": proxy} 111 | self.boto3_config = Config(**self._boto3_config) 112 | 
self.logging_config = config.get("computemgtd", "logging_config", fallback=self.DEFAULTS.get("logging_config")) 113 | # Log configuration 114 | log.info(self.__repr__()) 115 | 116 | @staticmethod 117 | def _read_nodename_from_file(nodename_file_path): 118 | """Read self nodename from a file.""" 119 | try: 120 | with open(nodename_file_path, "r") as nodename_file: 121 | nodename = nodename_file.read() 122 | return nodename 123 | except Exception as e: 124 | log.error("Unable to read self nodename from %s with exception: %s\n", nodename_file_path, e) 125 | raise 126 | 127 | 128 | @log_exception(log, "self terminating compute instance", catch_exception=CalledProcessError, raise_on_error=False) 129 | def _self_terminate(): 130 | """Self terminate the instance.""" 131 | # Sleep for 10 seconds so termination log entries are uploaded to CW logs 132 | log.info("Preparing to self terminate the instance in 10 seconds!") 133 | time.sleep(10) 134 | log.info("Self terminating instance now!") 135 | run_command("sudo shutdown -h now") 136 | 137 | 138 | @retry(stop_max_attempt_number=3, wait_fixed=1500) 139 | def _get_nodes_info_with_retry(nodes): 140 | return get_nodes_info(nodes) 141 | 142 | 143 | def _is_self_node_down(self_nodename): 144 | """ 145 | Check if self node is healthy according to the scheduler. 146 | 147 | Node is considered healthy if: 148 | 1. Node is not in DOWN 149 | 2. Node is not in POWER_SAVE 150 | Note: node that is incorrectly attached to the scheduler will be in DOWN* after SlurmdTimeout. 
151 | """ 152 | try: 153 | self_node = _get_nodes_info_with_retry(self_nodename)[0] 154 | log.info("Current self node state %s", self_node.__repr__()) 155 | if self_node.is_down() or self_node.is_power(): 156 | log.warning("Node is incorrectly attached to scheduler, preparing for self termination...") 157 | return True 158 | log.info("Node is correctly attached to scheduler, not terminating...") 159 | return False 160 | except Exception as e: 161 | # This could happen if slurmctld is down completely; fail closed and treat the node as down 162 | log.error("Unable to retrieve current node state from slurm with exception: %s\nConsidering node as down!", e) 163 | 164 | return True 165 | 166 | 167 | def _load_daemon_config(config_file):  # returns a ComputemgtdConfig parsed from config_file 168 | # Get program config 169 | computemgtd_config = ComputemgtdConfig(config_file) 170 | # Configure root logger; fall back to the default logging setup on any error 171 | try: 172 | fileConfig(computemgtd_config.logging_config, disable_existing_loggers=False) 173 | except Exception as e: 174 | log.warning( 175 | "Unable to configure logging from %s, using default logging settings.\nException: %s", 176 | computemgtd_config.logging_config, 177 | e, 178 | ) 179 | return computemgtd_config 180 | 181 | 182 | def _run_computemgtd(config_file): 183 | """Run computemgtd actions.""" 184 | # Initial default heartbeat time as computemgtd startup time 185 | last_heartbeat = datetime.now(tz=timezone.utc) 186 | log.info("Initializing clustermgtd heartbeat to be computemgtd startup time: %s", last_heartbeat) 187 | computemgtd_config = _load_daemon_config(config_file) 188 | reload_config_counter = RELOAD_CONFIG_ITERATIONS 189 | while True: 190 | # Get current time 191 | current_time = datetime.now(tz=timezone.utc) 192 | 193 | if reload_config_counter <= 0: 194 | try: 195 | computemgtd_config = _load_daemon_config(config_file) 196 | reload_config_counter = RELOAD_CONFIG_ITERATIONS 197 | except Exception as e: 198 | log.warning("Unable to reload daemon config, using previous one.\nException: %s", e) 199 | else: 200 | reload_config_counter -= 1 
201 | 202 | # Check heartbeat 203 | try: 204 | last_heartbeat = get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path) 205 | log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat) 206 | except Exception as e: 207 | log.warning( 208 | "Unable to retrieve clustermgtd heartbeat. Using last known heartbeat: %s with exception: %s", 209 | last_heartbeat, 210 | e, 211 | ) 212 | if expired_clustermgtd_heartbeat(last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout): 213 | if computemgtd_config.disable_computemgtd_actions: 214 | log.info("All computemgtd actions currently disabled") 215 | elif _is_self_node_down(computemgtd_config.nodename): 216 | _self_terminate() 217 | 218 | sleep_remaining_loop_time(computemgtd_config.loop_time, current_time) 219 | 220 | 221 | @retry(wait_fixed=seconds(LOOP_TIME)) 222 | def main(): 223 | logging.basicConfig( 224 | level=logging.INFO, format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s" 225 | ) 226 | log.info("Computemgtd Startup") 227 | try: 228 | clustermgtd_config_file = os.environ.get("CONFIG_FILE", COMPUTEMGTD_CONFIG_PATH) 229 | _run_computemgtd(clustermgtd_config_file) 230 | except Exception as e: 231 | log.exception("An unexpected error occurred: %s", e) 232 | raise 233 | 234 | 235 | if __name__ == "__main__": 236 | main() 237 | -------------------------------------------------------------------------------- /src/slurm_plugin/console_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the 5 | # License. A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. 
This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 10 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import logging 14 | import re 15 | from typing import Any, Callable, Iterable 16 | 17 | import boto3 18 | from slurm_plugin.common import ComputeInstanceDescriptor, TaskController 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class ConsoleLogger: 24 | """Class for retrieving and logging instance console output.""" 25 | 26 | def __init__(self, enabled: bool, region: str, console_output_consumer: Callable[[str, str, str], None]): 27 | self._region = region 28 | self._console_logging_enabled = enabled 29 | self._console_output_consumer = console_output_consumer 30 | self._boto3_client_factory = lambda service_name: boto3.session.Session().client( 31 | service_name, region_name=region 32 | ) 33 | 34 | def report_console_output_from_nodes( 35 | self, 36 | compute_instances: Iterable[ComputeInstanceDescriptor], 37 | task_controller: TaskController, 38 | task_wait_function: Callable[[], None], 39 | ): 40 | """Queue a task that will retrieve the console output for failed compute nodes.""" 41 | if not self._console_logging_enabled: 42 | return None 43 | 44 | # Only schedule a task if we have any compute_instances to query. We also need to realize any lazy instance ID 45 | # lookups before we schedule the task since the instance ID mapping may change after we return from this 46 | # call but before the task is executed. 
47 | compute_instances = tuple(compute_instances) 48 | if len(compute_instances) < 1: 49 | return None 50 | 51 | task = self._get_console_output_task( 52 | raise_if_shutdown=task_controller.raise_if_shutdown, 53 | task_wait_function=task_wait_function, 54 | client_factory=self._boto3_client_factory, 55 | compute_instances=compute_instances, 56 | ) 57 | 58 | return task_controller.queue_task(task) 59 | 60 | def _get_console_output_task( 61 | self, 62 | task_wait_function: Callable[[], None], 63 | raise_if_shutdown: Callable[[], None], 64 | client_factory: Callable[[str], Any], 65 | compute_instances: Iterable[ComputeInstanceDescriptor], 66 | ): 67 | def console_collector(): 68 | try: 69 | # Sleep to allow EC2 time to publish the console output after the node terminates. 70 | task_wait_function() 71 | ec2client = client_factory("ec2") 72 | 73 | for output in ConsoleLogger._get_console_output_from_nodes(ec2client, compute_instances): 74 | # If shutdown, raise an exception so that any interested threads will know 75 | # this task was not completed. 
76 | raise_if_shutdown() 77 | self._console_output_consumer( 78 | output.get("Name"), 79 | output.get("InstanceId"), 80 | output.get("ConsoleOutput"), 81 | ) 82 | except Exception as e: 83 | logger.error("Encountered exception while retrieving compute console output: %s", e) 84 | raise 85 | 86 | return console_collector 87 | 88 | @staticmethod 89 | def _get_console_output_from_nodes(ec2client, compute_instances): 90 | pattern = re.compile(r"\r\n|\n") 91 | for instance in compute_instances: 92 | instance_name = instance.get("Name") 93 | instance_id = instance.get("InstanceId") 94 | logger.info("Retrieving Console Output for node %s (%s)", instance_id, instance_name) 95 | response = ec2client.get_console_output(InstanceId=instance_id) 96 | output = response.get("Output") 97 | yield { 98 | "Name": instance_name, 99 | "InstanceId": instance_id, 100 | "ConsoleOutput": pattern.sub("\r", output) if output else None, 101 | } 102 | -------------------------------------------------------------------------------- /src/slurm_plugin/fleet_status_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | 12 | 13 | import argparse 14 | import json 15 | import logging 16 | import os 17 | import sys 18 | from configparser import ConfigParser 19 | from logging.config import fileConfig 20 | 21 | from botocore.config import Config 22 | from common.schedulers.slurm_commands import resume_powering_down_nodes, update_all_partitions 23 | from slurm_plugin.clustermgtd import ComputeFleetStatus, ComputeFleetStatusManager 24 | from slurm_plugin.common import log_exception 25 | from slurm_plugin.instance_manager import InstanceManager 26 | from slurm_plugin.slurm_resources import CONFIG_FILE_DIR, PartitionStatus 27 | 28 | log = logging.getLogger(__name__) 29 | 30 | 31 | class SlurmFleetManagerConfig: 32 | DEFAULTS = { 33 | "max_retry": 5, 34 | "terminate_max_batch_size": 1000, 35 | "proxy": "NONE", 36 | "logging_config": os.path.join( 37 | os.path.dirname(__file__), "logging", "parallelcluster_fleet_status_manager_logging.conf" 38 | ), 39 | } 40 | 41 | def __init__(self, config_file_path): 42 | self._get_config(config_file_path) 43 | 44 | def __repr__(self): 45 | attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) 46 | return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs) 47 | 48 | @log_exception(log, "reading fleet status manager configuration file", catch_exception=IOError, raise_on_error=True) 49 | def _get_config(self, config_file_path): 50 | """Get fleetmanager configuration.""" 51 | log.info("Reading %s", config_file_path) 52 | 53 | config = ConfigParser() 54 | try: 55 | config.read_file(open(config_file_path, "r")) 56 | except IOError: 57 | log.error("Cannot read slurm fleet manager configuration file: %s", config_file_path) 58 | raise 59 | 60 | self.region = config.get("slurm_fleet_status_manager", "region") 61 | self.cluster_name = config.get("slurm_fleet_status_manager", "cluster_name") 62 | self.terminate_max_batch_size = config.getint( 63 | "slurm_fleet_status_manager", 
64 | "terminate_max_batch_size", 65 | fallback=self.DEFAULTS.get("terminate_max_batch_size"), 66 | ) 67 | self._boto3_retry = config.getint( 68 | "slurm_fleet_status_manager", "boto3_retry", fallback=self.DEFAULTS.get("max_retry") 69 | ) 70 | self._boto3_config = {"retries": {"max_attempts": self._boto3_retry, "mode": "standard"}} 71 | proxy = config.get("slurm_fleet_status_manager", "proxy", fallback=self.DEFAULTS.get("proxy")) 72 | if proxy != "NONE": 73 | self._boto3_config["proxies"] = {"https": proxy} 74 | self.boto3_config = Config(**self._boto3_config) 75 | 76 | self.logging_config = config.get( 77 | "slurm_fleet_status_manager", "logging_config", fallback=self.DEFAULTS.get("logging_config") 78 | ) 79 | 80 | log.debug(self.__repr__()) 81 | 82 | 83 | def _manage_fleet_status_transition(config, computefleet_status_data_path): 84 | computefleet_status = _get_computefleet_status(computefleet_status_data_path) 85 | 86 | if ComputeFleetStatus.is_stop_requested(computefleet_status): 87 | _stop_partitions(config) 88 | elif ComputeFleetStatus.is_start_requested(computefleet_status): 89 | _start_partitions() 90 | 91 | 92 | def _start_partitions(): 93 | log.info("Setting slurm partitions to UP and resuming nodes...") 94 | update_all_partitions(PartitionStatus.UP, reset_node_addrs_hostname=False) 95 | # TODO: This function was added due to Slurm ticket 12915. The bug is not reproducible and the ticket was then 96 | # closed. This operation may now be useless: we need to check this. 
97 | resume_powering_down_nodes() 98 | 99 | 100 | def _stop_partitions(config): 101 | log.info("Setting slurm partitions to INACTIVE and terminating all compute nodes...") 102 | update_all_partitions(PartitionStatus.INACTIVE, reset_node_addrs_hostname=True) 103 | instance_manager = InstanceManager( 104 | config.region, 105 | config.cluster_name, 106 | config.boto3_config, 107 | ) 108 | instance_manager.terminate_all_compute_nodes(config.terminate_max_batch_size) 109 | 110 | 111 | def _get_computefleet_status(computefleet_status_data_path): 112 | try: 113 | with open(computefleet_status_data_path, "r", encoding="utf-8") as computefleet_status_data_file: 114 | computefleet_status = ComputeFleetStatus( 115 | json.load(computefleet_status_data_file).get(ComputeFleetStatusManager.COMPUTE_FLEET_STATUS_ATTRIBUTE) 116 | ) 117 | log.info("ComputeFleet status is: %s", computefleet_status) 118 | except Exception as e: 119 | log.error("Cannot read compute fleet status data file: %s.\nException: %s", computefleet_status_data_path, e) 120 | raise 121 | 122 | return computefleet_status 123 | 124 | 125 | def main(): 126 | default_log_file = "/var/log/parallelcluster/slurm_fleet_status_manager.log" 127 | logging.basicConfig( 128 | filename=default_log_file, 129 | level=logging.INFO, 130 | format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s", 131 | ) 132 | log.info("FleetManager startup.") 133 | args = _parse_arguments() 134 | try: 135 | config_file = os.environ.get( 136 | "CONFIG_FILE", os.path.join(CONFIG_FILE_DIR, "parallelcluster_slurm_fleet_status_manager.conf") 137 | ) 138 | fleet_status_manager_config = SlurmFleetManagerConfig(config_file) 139 | try: 140 | # Configure root logger 141 | fileConfig(fleet_status_manager_config.logging_config, disable_existing_loggers=False) 142 | except Exception as e: 143 | log.warning( 144 | "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s", 145 | 
fleet_status_manager_config.logging_config, 146 | default_log_file, 147 | e, 148 | ) 149 | log.info("FleetManager config: %s", fleet_status_manager_config) 150 | _manage_fleet_status_transition(fleet_status_manager_config, args.computefleet_status_data) 151 | log.info("FleetManager finished.") 152 | except Exception as e: 153 | log.exception("Encountered exception when running fleet manager: %s", e) 154 | sys.exit(1) 155 | 156 | 157 | def _parse_arguments(): 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument("-cf", "--computefleet-status-data", help="Path to compute fleet status data", required=True) 160 | args = parser.parse_args() 161 | return args 162 | 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_clustermgtd_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,computeConsole,events 3 | 4 | [handlers] 5 | keys=streamHandler,computeConsoleHandler,eventsHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter,computeConsoleFormatter,eventsFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=streamHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_streamHandler] 18 | class=StreamHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=(sys.stdout,) 22 | 23 | [logger_computeConsole] 24 | level=INFO 25 | handlers=computeConsoleHandler 26 | propagate=0 27 | qualname=slurm_plugin.clustermgtd.console_output 28 | 29 | [formatter_computeConsoleFormatter] 30 | format=%(asctime)s - %(message)s 31 | 32 | [handler_computeConsoleHandler] 33 | class=FileHandler 34 | level=INFO 35 | formatter=computeConsoleFormatter 36 | args=('/var/log/parallelcluster/compute_console_output.log', 'a', None, False) 37 | 38 | [logger_events] 39 | level=INFO 40 | 
handlers=eventsHandler 41 | propagate=0 42 | qualname=slurm_plugin.clustermgtd.events 43 | 44 | [formatter_eventsFormatter] 45 | format=%(message)s 46 | 47 | [handler_eventsHandler] 48 | class=FileHandler 49 | level=INFO 50 | formatter=eventsFormatter 51 | args=('/var/log/parallelcluster/clustermgtd.events', 'a', None, False) 52 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_computemgtd_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=streamHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=streamHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_streamHandler] 18 | class=StreamHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=(sys.stdout,) 22 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_fleet_status_manager_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=fileHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=fileHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_fileHandler] 18 | class=FileHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=("/var/log/parallelcluster/slurm_fleet_status_manager.log",) 22 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_resume_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,events 3 | 4 | [handlers] 5 
| keys=fileHandler,eventsHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter,eventsFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=fileHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - %(process)d - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_fileHandler] 18 | class=FileHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=("/var/log/parallelcluster/slurm_resume.log",) 22 | 23 | [logger_events] 24 | level=WARNING 25 | handlers=eventsHandler 26 | propagate=0 27 | qualname=slurm_plugin.resume.events 28 | 29 | [formatter_eventsFormatter] 30 | format=%(message)s 31 | 32 | [handler_eventsHandler] 33 | class=FileHandler 34 | level=WARNING 35 | formatter=eventsFormatter 36 | args=('/var/log/parallelcluster/slurm_resume.events', 'a', None, False) 37 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_suspend_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=fileHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=fileHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_fileHandler] 18 | class=FileHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=("/var/log/parallelcluster/slurm_suspend.log",) 22 | -------------------------------------------------------------------------------- /src/slurm_plugin/resume.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | 13 | import argparse 14 | import logging 15 | import os 16 | from configparser import ConfigParser 17 | from datetime import datetime, timezone 18 | from logging.config import fileConfig 19 | 20 | from botocore.config import Config 21 | from common.schedulers.slurm_commands import get_nodes_info, set_nodes_down 22 | from common.utils import read_json 23 | from slurm_plugin.cluster_event_publisher import ClusterEventPublisher 24 | from slurm_plugin.common import ScalingStrategy, is_clustermgtd_heartbeat_valid, print_with_count 25 | from slurm_plugin.instance_manager import InstanceManager 26 | from slurm_plugin.slurm_resources import CONFIG_FILE_DIR 27 | 28 | log = logging.getLogger(__name__) 29 | event_logger = log.getChild("events") 30 | 31 | 32 | class SlurmResumeConfig: 33 | DEFAULTS = { 34 | "max_retry": 1, 35 | "launch_max_batch_size": 500, 36 | "assign_node_max_batch_size": 500, 37 | "terminate_max_batch_size": 1000, 38 | "update_node_address": True, 39 | "clustermgtd_timeout": 300, 40 | "proxy": "NONE", 41 | "logging_config": os.path.join(os.path.dirname(__file__), "logging", "parallelcluster_resume_logging.conf"), 42 | "hosted_zone": None, 43 | "dns_domain": None, 44 | "use_private_hostname": False, 45 | "run_instances_overrides": "/opt/slurm/etc/pcluster/run_instances_overrides.json", 46 | "create_fleet_overrides": "/opt/slurm/etc/pcluster/create_fleet_overrides.json", 47 | "fleet_config_file": "/etc/parallelcluster/slurm_plugin/fleet-config.json", 48 | "job_level_scaling": True, 49 | "scaling_strategy": "all-or-nothing", 50 | } 51 | 52 | def __init__(self, config_file_path): 53 | 
self._get_config(config_file_path) 54 | 55 | def __repr__(self): 56 | attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) 57 | return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs) 58 | 59 | def _get_config(self, config_file_path): 60 | """Get resume program configuration.""" 61 | log.info("Reading %s", config_file_path) 62 | 63 | config = ConfigParser() 64 | try: 65 | with open(config_file_path, "r") as config_file: 66 | config.read_file(config_file) 67 | except IOError: 68 | log.error("Cannot read slurm cloud bursting scripts configuration file: %s", config_file_path) 69 | raise 70 | 71 | self.region = config.get("slurm_resume", "region") 72 | self.cluster_name = config.get("slurm_resume", "cluster_name") 73 | self.dynamodb_table = config.get("slurm_resume", "dynamodb_table") 74 | self.hosted_zone = config.get("slurm_resume", "hosted_zone", fallback=self.DEFAULTS.get("hosted_zone")) 75 | self.dns_domain = config.get("slurm_resume", "dns_domain", fallback=self.DEFAULTS.get("dns_domain")) 76 | self.use_private_hostname = config.getboolean( 77 | "slurm_resume", "use_private_hostname", fallback=self.DEFAULTS.get("use_private_hostname") 78 | ) 79 | self.head_node_private_ip = config.get("slurm_resume", "head_node_private_ip") 80 | self.head_node_hostname = config.get("slurm_resume", "head_node_hostname") 81 | self.launch_max_batch_size = config.getint( 82 | "slurm_resume", "launch_max_batch_size", fallback=self.DEFAULTS.get("launch_max_batch_size") 83 | ) 84 | self.assign_node_max_batch_size = config.getint( 85 | "slurm_resume", "assign_node_max_batch_size", fallback=self.DEFAULTS.get("assign_node_max_batch_size") 86 | ) 87 | self.terminate_max_batch_size = config.getint( 88 | "slurm_resume", "terminate_max_batch_size", fallback=self.DEFAULTS.get("terminate_max_batch_size") 89 | ) 90 | self.update_node_address = config.getboolean( 91 | "slurm_resume", "update_node_address", 
fallback=self.DEFAULTS.get("update_node_address") 92 | ) 93 | self.scaling_strategy = config.get( 94 | "slurm_resume", "scaling_strategy", fallback=self.DEFAULTS.get("scaling_strategy") 95 | ) # TODO: Check if it's a valid scaling strategy before calling expensive downstream APIs 96 | self.job_level_scaling = config.getboolean( 97 | "slurm_resume", "job_level_scaling", fallback=self.DEFAULTS.get("job_level_scaling") 98 | ) 99 | fleet_config_file = config.get( 100 | "slurm_resume", "fleet_config_file", fallback=self.DEFAULTS.get("fleet_config_file") 101 | ) 102 | self.fleet_config = read_json(fleet_config_file) 103 | 104 | # run_instances_overrides_file and create_fleet_overrides_file contain a json with the following format: 105 | # { 106 | # "queue_name": { 107 | # "compute_resource_name": { 108 | # 109 | # }, 110 | # ... 111 | # }, 112 | # ... 113 | # } 114 | run_instances_overrides_file = config.get( 115 | "slurm_resume", "run_instances_overrides", fallback=self.DEFAULTS.get("run_instances_overrides") 116 | ) 117 | self.run_instances_overrides = read_json(run_instances_overrides_file, default={}) 118 | create_fleet_overrides_file = config.get( 119 | "slurm_resume", "create_fleet_overrides", fallback=self.DEFAULTS.get("create_fleet_overrides") 120 | ) 121 | self.create_fleet_overrides = read_json(create_fleet_overrides_file, default={}) 122 | 123 | self.clustermgtd_timeout = config.getint( 124 | "slurm_resume", 125 | "clustermgtd_timeout", 126 | fallback=self.DEFAULTS.get("clustermgtd_timeout"), 127 | ) 128 | self.clustermgtd_heartbeat_file_path = config.get("slurm_resume", "clustermgtd_heartbeat_file_path") 129 | 130 | # Configure boto3 to retry 1 times by default 131 | self._boto3_retry = config.getint("slurm_resume", "boto3_retry", fallback=self.DEFAULTS.get("max_retry")) 132 | self._boto3_config = {"retries": {"max_attempts": self._boto3_retry, "mode": "standard"}} 133 | proxy = config.get("slurm_resume", "proxy", fallback=self.DEFAULTS.get("proxy")) 134 | if 
proxy != "NONE": 135 | self._boto3_config["proxies"] = {"https": proxy} 136 | self.boto3_config = Config(**self._boto3_config) 137 | self.logging_config = config.get("slurm_resume", "logging_config", fallback=self.DEFAULTS.get("logging_config")) 138 | self.head_node_instance_id = config.get("slurm_resume", "instance_id", fallback="unknown") 139 | 140 | log.debug(self.__repr__()) 141 | 142 | 143 | def _handle_failed_nodes(node_list, reason="Failure when resuming nodes"): 144 | """ 145 | Fall back mechanism to handle failure when launching instances. 146 | 147 | When encountering a failure, want slurm to deallocate current nodes, 148 | and re-queue job to be run automatically by new nodes. 149 | To do this, set node to DOWN, so slurm will automatically re-queue job. 150 | Then set node to POWER_DOWN so suspend program will be run and node can be reset back to power saving. 151 | 152 | If this process is not done explicitly, slurm will wait until ResumeTimeout, 153 | then execute this process of setting nodes to DOWN then POWER_DOWN. 154 | To save time, should explicitly set nodes to DOWN in ResumeProgram so clustermgtd can maintain failed nodes. 155 | Clustermgtd will be responsible for running full DOWN -> POWER_DOWN process. 
156 | """ 157 | if node_list: 158 | try: 159 | log.info( 160 | "Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason 161 | ) 162 | set_nodes_down(node_list, reason=reason) 163 | except Exception as e: 164 | log.error( 165 | "Failed to place nodes %s into DOWN for reason %s with exception: %s", 166 | print_with_count(node_list), 167 | reason, 168 | e, 169 | ) 170 | 171 | 172 | def _resume(arg_nodes, resume_config, slurm_resume): 173 | """Launch new EC2 nodes according to nodes requested by slurm.""" 174 | # Check heartbeat 175 | current_time = datetime.now(tz=timezone.utc) 176 | if not is_clustermgtd_heartbeat_valid( 177 | current_time, resume_config.clustermgtd_timeout, resume_config.clustermgtd_heartbeat_file_path 178 | ): 179 | log.error( 180 | "No valid clustermgtd heartbeat detected, clustermgtd is down!\n" 181 | "Please check clustermgtd log for error.\n" 182 | "Not launching nodes %s", 183 | arg_nodes, 184 | ) 185 | _handle_failed_nodes(arg_nodes) 186 | return 187 | log.info("Launching EC2 instances for the following Slurm nodes: %s", arg_nodes) 188 | node_list = [] 189 | node_list_with_status = [] 190 | for node in get_nodes_info(arg_nodes): 191 | node_list.append(node.name) 192 | node_list_with_status.append((node.name, node.state_string)) 193 | log.info("Current state of Slurm nodes to resume: %s", node_list_with_status) 194 | 195 | instance_manager = InstanceManager( 196 | region=resume_config.region, 197 | cluster_name=resume_config.cluster_name, 198 | boto3_config=resume_config.boto3_config, 199 | table_name=resume_config.dynamodb_table, 200 | hosted_zone=resume_config.hosted_zone, 201 | dns_domain=resume_config.dns_domain, 202 | use_private_hostname=resume_config.use_private_hostname, 203 | head_node_private_ip=resume_config.head_node_private_ip, 204 | head_node_hostname=resume_config.head_node_hostname, 205 | fleet_config=resume_config.fleet_config, 206 | 
def _resume(arg_nodes, resume_config, slurm_resume):
    """Launch new EC2 nodes according to nodes requested by slurm."""
    # Refuse to launch anything if clustermgtd is not alive: instances started
    # without the management daemon would be left unmanaged.
    now = datetime.now(tz=timezone.utc)
    if not is_clustermgtd_heartbeat_valid(
        now, resume_config.clustermgtd_timeout, resume_config.clustermgtd_heartbeat_file_path
    ):
        log.error(
            "No valid clustermgtd heartbeat detected, clustermgtd is down!\n"
            "Please check clustermgtd log for error.\n"
            "Not launching nodes %s",
            arg_nodes,
        )
        _handle_failed_nodes(arg_nodes)
        return
    log.info("Launching EC2 instances for the following Slurm nodes: %s", arg_nodes)
    nodes = list(get_nodes_info(arg_nodes))
    node_list = [node.name for node in nodes]
    node_list_with_status = [(node.name, node.state_string) for node in nodes]
    log.info("Current state of Slurm nodes to resume: %s", node_list_with_status)

    instance_manager = InstanceManager(
        region=resume_config.region,
        cluster_name=resume_config.cluster_name,
        boto3_config=resume_config.boto3_config,
        table_name=resume_config.dynamodb_table,
        hosted_zone=resume_config.hosted_zone,
        dns_domain=resume_config.dns_domain,
        use_private_hostname=resume_config.use_private_hostname,
        head_node_private_ip=resume_config.head_node_private_ip,
        head_node_hostname=resume_config.head_node_hostname,
        fleet_config=resume_config.fleet_config,
        run_instances_overrides=resume_config.run_instances_overrides,
        create_fleet_overrides=resume_config.create_fleet_overrides,
        job_level_scaling=resume_config.job_level_scaling,
    )
    instance_manager.add_instances(
        slurm_resume=slurm_resume,
        node_list=node_list,
        launch_batch_size=resume_config.launch_max_batch_size,
        assign_node_batch_size=resume_config.assign_node_max_batch_size,
        terminate_batch_size=resume_config.terminate_max_batch_size,
        update_node_address=resume_config.update_node_address,
        scaling_strategy=ScalingStrategy(resume_config.scaling_strategy),
    )
    # failed_nodes maps error code -> set of node names; flatten for reporting.
    failed_nodes = set().union(*instance_manager.failed_nodes.values())
    success_nodes = [node for node in node_list if node not in failed_nodes]
    if success_nodes:
        log.info("Successfully launched nodes %s", print_with_count(success_nodes))

    if failed_nodes:
        log.error(
            "Failed to launch following nodes, setting nodes to DOWN: %s",
            print_with_count(failed_nodes),
        )
        # Distinct loop variable on purpose: do not shadow the outer node_list.
        for error_code, failed_node_list in instance_manager.failed_nodes.items():
            _handle_failed_nodes(failed_node_list, reason=f"(Code:{error_code})Failure when resuming nodes")

        event_publisher = ClusterEventPublisher.create_with_default_publisher(
            event_logger,
            resume_config.cluster_name,
            "HeadNode",
            "slurm-resume",
            resume_config.head_node_instance_id,
        )
        event_publisher.publish_node_launch_events(instance_manager.failed_nodes)


def main():
    """ResumeProgram entry point: parse the requested node list and launch backing instances."""
    default_log_file = "/var/log/parallelcluster/slurm_resume.log"
    logging.basicConfig(
        filename=default_log_file,
        level=logging.INFO,
        format="%(asctime)s - %(process)d - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
    )
    log.info("ResumeProgram startup.")
    parser = argparse.ArgumentParser()
    parser.add_argument("nodes", help="Nodes to burst")
    args = parser.parse_args()
    try:
        default_config = os.path.join(CONFIG_FILE_DIR, "parallelcluster_slurm_resume.conf")
        resume_config = SlurmResumeConfig(os.environ.get("CONFIG_FILE", default_config))
        try:
            # Configure root logger
            fileConfig(resume_config.logging_config, disable_existing_loggers=False)
        except Exception as err:
            log.warning(
                "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
                resume_config.logging_config,
                default_log_file,
                err,
            )
        log.info("ResumeProgram config: %s", resume_config)

        _resume(args.nodes, resume_config, _get_slurm_resume())
        log.info("ResumeProgram finished.")
    except Exception as err:
        # Catch-all boundary: fail the requested nodes so Slurm re-queues their jobs.
        log.exception("Encountered exception when requesting instances for %s: %s", args.nodes, err)
        _handle_failed_nodes(args.nodes)


def _get_slurm_resume():
    """Read the SLURM_RESUME_FILE payload; return {} (and log at ERROR) when missing or unreadable."""
    slurm_resume = read_json(os.environ.get("SLURM_RESUME_FILE"), default={})
    log.log(logging.INFO if slurm_resume else logging.ERROR, "Slurm Resume File content: %s", slurm_resume)
    return slurm_resume


if __name__ == "__main__":
    main()
class SlurmSuspendConfig:
    """Typed view over the [slurm_suspend] section of the suspend configuration file."""

    DEFAULTS = {
        "clustermgtd_timeout": 300,
        "logging_config": os.path.join(os.path.dirname(__file__), "logging", "parallelcluster_suspend_logging.conf"),
    }

    def __init__(self, config_file_path):
        """Load configuration from *config_file_path*; re-raise IOError when the file cannot be read."""
        config_parser = ConfigParser()
        try:
            with open(config_file_path, "r") as config_file:
                config_parser.read_file(config_file)
        except IOError:
            log.error("Cannot read slurm cloud bursting scripts configuration file: %s", config_file_path)
            raise

        # How stale the clustermgtd heartbeat may be before it is considered dead.
        self.clustermgtd_timeout = config_parser.getint(
            "slurm_suspend",
            "clustermgtd_timeout",
            fallback=self.DEFAULTS.get("clustermgtd_timeout"),
        )
        # Mandatory option: no fallback, a missing value raises from configparser.
        self.clustermgtd_heartbeat_file_path = config_parser.get("slurm_suspend", "clustermgtd_heartbeat_file_path")
        self.logging_config = config_parser.get(
            "slurm_suspend", "logging_config", fallback=self.DEFAULTS.get("logging_config")
        )
        log.info(self.__repr__())


def main():
    """SuspendProgram entry point: log released nodes and verify clustermgtd is alive."""
    default_log_file = "/var/log/parallelcluster/slurm_suspend.log"
    logging.basicConfig(
        filename=default_log_file,
        level=logging.INFO,
        format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
    )
    log.info("SuspendProgram startup.")
    parser = argparse.ArgumentParser()
    parser.add_argument("nodes", help="Nodes to release")
    args = parser.parse_args()
    config_file = os.environ.get("CONFIG_FILE", os.path.join(CONFIG_FILE_DIR, "parallelcluster_slurm_suspend.conf"))
    suspend_config = SlurmSuspendConfig(config_file)
    try:
        # Switch to the full logging configuration; keep basicConfig output on failure.
        fileConfig(suspend_config.logging_config, disable_existing_loggers=False)
    except Exception as err:
        log.warning(
            "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
            suspend_config.logging_config,
            default_log_file,
            err,
        )

    log.info("Suspending following nodes. Clustermgtd will cleanup orphaned instances: %s", args.nodes)
    current_time = datetime.now(tz=timezone.utc)
    heartbeat_ok = is_clustermgtd_heartbeat_valid(
        current_time, suspend_config.clustermgtd_timeout, suspend_config.clustermgtd_heartbeat_file_path
    )
    if heartbeat_ok:
        log.info("SuspendProgram finished. Nodes will be available after SuspendTimeout")
    else:
        log.error(
            "No valid clustermgtd heartbeat detected, clustermgtd is down! "
            "Please check clustermgtd log for error.\n"
            "Nodes will be reset to POWER_SAVE state after SuspendTimeout. "
            "The backing EC2 instances may not be correctly terminated.\n"
            "Please check and terminate any orphaned instances in EC2!"
        )


if __name__ == "__main__":
    main()
class TaskExecutor:
    """Class for managing execution of asynchronous tasks.

    Tasks run on a bounded thread pool; a counting semaphore caps the number
    of tasks that may be queued or in flight at once (the "backlog").
    """

    class MaximumBacklogExceededError(RuntimeError):
        """Exception raised when a task can't be queued due to backlog."""

        def __init__(self, task, maximum_backlog):
            # BUGFIX: populate the base RuntimeError so str(e) is meaningful
            # (previously the exception carried no message at all).
            super().__init__(f"Task backlog exceeded the maximum of {maximum_backlog}")
            self.failed_task = task
            self.maximum_backlog = maximum_backlog

    def __init__(self, worker_pool_size, max_backlog):
        """Create an executor with `worker_pool_size` threads and at most `max_backlog` outstanding tasks."""
        self._max_backlog = max_backlog
        # One permit per task that may be queued or running; released on completion.
        self._executor_limit = Semaphore(max_backlog)
        self._shutdown_event = Event()
        self._executor_pool = ThreadPoolExecutor(max_workers=worker_pool_size)

    def is_shutdown(self) -> bool:
        """Return True once shutdown() has been invoked."""
        return self._shutdown_event.is_set()

    def raise_if_shutdown(self) -> None:
        """Raise TaskController.TaskShutdownError if the executor is shutting down."""
        if self.is_shutdown():
            raise TaskController.TaskShutdownError()

    def wait_unless_shutdown(self, seconds_to_wait: float) -> None:
        """Sleep up to `seconds_to_wait`; raise TaskShutdownError if shutdown happens first."""
        shutdown = self._shutdown_event.wait(seconds_to_wait)
        if shutdown:
            raise TaskController.TaskShutdownError()

    def queue_task(self, task: Callable[[], None]) -> Optional[Future]:
        """Submit `task` to the worker pool.

        Returns the task's Future, or None when `task` is falsy.
        Raises MaximumBacklogExceededError when the backlog is full, and
        TaskController.TaskShutdownError when the executor is shutting down.
        """

        def queue_executor_task_callback(semaphore, *args):
            # Return the backlog permit once the task completes (or is cancelled).
            semaphore.release()

        if task:
            self.raise_if_shutdown()

            if self._executor_limit.acquire(blocking=False):
                future = self._executor_pool.submit(task)
                future.add_done_callback(partial(queue_executor_task_callback, self._executor_limit))

                return future
            else:
                # BUGFIX: use the module logger rather than the root logger
                # (was `logging.error`, bypassing this module's configuration).
                logger.error(
                    "Unable to queue task due to exceeding backlog limit of %d",
                    self._max_backlog,
                )
                raise TaskExecutor.MaximumBacklogExceededError(task=task, maximum_backlog=self._max_backlog)

        return None

    def shutdown(self, wait: bool = False, cancel_futures: bool = True) -> None:
        """Shut the pool down, waking any wait_unless_shutdown() callers first."""
        if self._executor_pool:
            # Notify any waiters that we are shutting down
            self._shutdown_event.set()

            # `cancel_futures` parameter does not exist in python pre-3.9
            can_cancel = "cancel_futures" in inspect.getfullargspec(self._executor_pool.shutdown).kwonlyargs
            if can_cancel:
                self._executor_pool.shutdown(wait=wait, cancel_futures=cancel_futures)
            else:
                # BUGFIX: honor the caller's `wait` flag on pre-3.9 interpreters
                # (previously hard-coded to wait=False, silently returning before
                # in-flight tasks finished even when wait=True was requested).
                self._executor_pool.shutdown(wait=wait)
            self._executor_pool = None
This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | import os 12 | from collections import namedtuple 13 | 14 | import pytest 15 | from assertpy import assert_that 16 | 17 | from aws.common import AWSClientError 18 | from aws.ec2 import CapacityReservationInfo, Ec2Client 19 | 20 | MockedBoto3Request = namedtuple( 21 | "MockedBoto3Request", ["method", "response", "expected_params", "generate_error", "error_code"] 22 | ) 23 | # Set defaults for attributes of the namedtuple. Since fields with a default value must come after any fields without 24 | # a default, the defaults are applied to the rightmost parameters. In this case generate_error = False and 25 | # error_code = None 26 | MockedBoto3Request.__new__.__defaults__ = (False, None) 27 | 28 | 29 | @pytest.fixture() 30 | def boto3_stubber_path(): 31 | # we need to set the region in the environment because the Boto3ClientFactory requires it. 
@pytest.mark.parametrize("generate_error", [True, False])
def test_describe_capacity_reservations(boto3_stubber, generate_error):
    """Verify that describe_capacity_reservations behaves as expected."""
    dummy_message = "dummy error message"
    # On the error path the stubber returns the raw message; otherwise a normal payload.
    stubbed_response = dummy_message if generate_error else {"CapacityReservations": [FAKE_CAPACITY_BLOCK_INFO]}
    boto3_stubber(
        "ec2",
        [
            MockedBoto3Request(
                method="describe_capacity_reservations",
                expected_params={"CapacityReservationIds": [FAKE_CAPACITY_BLOCK_ID]},
                response=stubbed_response,
                generate_error=generate_error,
                error_code=None,
            )
        ],
    )
    if generate_error:
        with pytest.raises(AWSClientError, match=dummy_message):
            Ec2Client().describe_capacity_reservations(capacity_reservation_ids=[FAKE_CAPACITY_BLOCK_ID])
    else:
        result = Ec2Client().describe_capacity_reservations(capacity_reservation_ids=[FAKE_CAPACITY_BLOCK_ID])
        assert_that(result).is_equal_to([CapacityReservationInfo(FAKE_CAPACITY_BLOCK_INFO)])
# Boto3 request stub consumed by the boto3_stubber fixture. The two rightmost
# fields default (generate_error=False, error_code=None) so most tests only
# need to provide method, response and expected_params.
MockedBoto3Request = namedtuple(
    "MockedBoto3Request",
    ["method", "response", "expected_params", "generate_error", "error_code"],
    defaults=(False, None),
)


def read_text(path):
    """Return the full text content of the file at *path* (a pathlib.Path)."""
    with path.open() as file_handle:
        return file_handle.read()


def client_error(error_code):
    """Build a botocore ClientError carrying *error_code* for a dummy operation."""
    return ClientError({"Error": {"Code": error_code}}, "failed_operation")
"create-fleet", 71 | "Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}], 72 | "AllocationStrategy": "lowest-price", 73 | "CapacityType": "on-demand", 74 | "Networking": SINGLE_SUBNET, 75 | }, 76 | }, 77 | "queue5": { 78 | "c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]}, 79 | "fleet1": { 80 | "Api": "create-fleet", 81 | "Instances": [{"InstanceType": "t2.medium"}], 82 | "AllocationStrategy": "lowest-price", 83 | "CapacityType": "on-demand", 84 | "Networking": MULTIPLE_SUBNETS, 85 | }, 86 | }, 87 | "queue6": { 88 | "c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]}, 89 | "fleet1": { 90 | "Api": "create-fleet", 91 | "Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}], 92 | "AllocationStrategy": "lowest-price", 93 | "CapacityType": "on-demand", 94 | "Networking": MULTIPLE_SUBNETS, 95 | }, 96 | }, 97 | "queue-cb": { 98 | "run-instances-capacity-block": { 99 | "Api": "run-instances", 100 | "Instances": [{"InstanceType": "c5.xlarge"}], 101 | "CapacityType": "capacity-block", 102 | "Networking": SINGLE_SUBNET, 103 | "CapacityReservationId": "cr-123456", 104 | }, 105 | "fleet-capacity-block": { 106 | "Api": "create-fleet", 107 | "Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}], 108 | "CapacityType": "capacity-block", 109 | "Networking": SINGLE_SUBNET, 110 | "CapacityReservationId": "cr-234567", 111 | }, 112 | }, 113 | } 114 | 115 | LAUNCH_OVERRIDES = {} 116 | -------------------------------------------------------------------------------- /tests/common/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /tests/common/schedulers/test_slurm_commands/TestPartitionNodelistMapping/test_get_partition_nodelist_mapping/slurm_dir/etc/pcluster/parallelcluster_partition_nodelist_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": "test-st-cr1-[1-10],test-dy-cr2-[1-2]", 3 | "test2": "test2-st-cr1-[1-10],test2-dy-cr2-[1-2]" 4 | } 5 | -------------------------------------------------------------------------------- /tests/common/schedulers/test_slurm_commands/TestPartitionNodelistMapping/test_get_partitions/slurm_dir/etc/pcluster/parallelcluster_partition_nodelist_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": "test-st-cr1-[1-10],test-dy-cr2-[1-2]", 3 | "test2": "test2-st-cr1-[1-10],test2-dy-cr2-[1-2]" 4 | } 5 | -------------------------------------------------------------------------------- /tests/common/test_ec2_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. 10 | # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
11 | # See the License for the specific language governing permissions and limitations under the License. 12 | import pytest 13 | from assertpy import assert_that 14 | from common.ec2_utils import get_private_ip_address_and_dns_name 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "instance_info, expected_private_ip, expected_private_dns_name, expected_all_private_ips", 19 | [ 20 | ( 21 | { 22 | "InstanceId": "i-12345", 23 | "InstanceType": "c5.xlarge", 24 | "PrivateIpAddress": "ip.1.0.0.1", 25 | "PrivateDnsName": "ip-1-0-0-1", 26 | "NetworkInterfaces": [ 27 | { 28 | "Attachment": { 29 | "DeviceIndex": 0, 30 | "NetworkCardIndex": 0, 31 | }, 32 | "PrivateIpAddress": "ip.1.0.0.1", 33 | "PrivateDnsName": "ip-1-0-0-1", 34 | }, 35 | ], 36 | }, 37 | "ip.1.0.0.1", 38 | "ip-1-0-0-1", 39 | {"ip.1.0.0.1"}, 40 | ), 41 | ( 42 | { 43 | "InstanceId": "i-12345", 44 | "InstanceType": "c5.xlarge", 45 | "PrivateIpAddress": "ip.1.0.0.1", 46 | "PrivateDnsName": "ip-1-0-0-1", 47 | "NetworkInterfaces": [ 48 | { 49 | "Attachment": { 50 | "DeviceIndex": 0, 51 | "NetworkCardIndex": 0, 52 | }, 53 | }, 54 | ], 55 | }, 56 | "ip.1.0.0.1", 57 | "ip-1-0-0-1", 58 | {"ip.1.0.0.1"}, 59 | ), 60 | ( 61 | { 62 | "InstanceId": "i-12345", 63 | "InstanceType": "c5.xlarge", 64 | "PrivateIpAddress": "ip.1.0.0.1", 65 | "PrivateDnsName": "ip-1-0-0-1", 66 | "NetworkInterfaces": [ 67 | { 68 | "Attachment": {}, 69 | }, 70 | ], 71 | }, 72 | "ip.1.0.0.1", 73 | "ip-1-0-0-1", 74 | {"ip.1.0.0.1"}, 75 | ), 76 | ( 77 | { 78 | "InstanceId": "i-12345", 79 | "InstanceType": "c5.xlarge", 80 | "PrivateIpAddress": "ip.1.0.0.1", 81 | "PrivateDnsName": "ip-1-0-0-1", 82 | "NetworkInterfaces": [ 83 | { 84 | "Attachment": { 85 | "DeviceIndex": 0, 86 | "NetworkCardIndex": 1, 87 | }, 88 | "PrivateIpAddress": "ip.1.0.0.1", 89 | "PrivateDnsName": "ip-1-0-0-1", 90 | }, 91 | { 92 | "Attachment": { 93 | "DeviceIndex": 0, 94 | "NetworkCardIndex": 0, 95 | }, 96 | "PrivateIpAddress": "ip.1.0.0.2", 97 | "PrivateDnsName": "ip-1-0-0-2", 98 | }, 
99 | ], 100 | }, 101 | "ip.1.0.0.2", 102 | "ip-1-0-0-2", 103 | {"ip.1.0.0.1", "ip.1.0.0.2"}, 104 | ), 105 | ( 106 | { 107 | "InstanceId": "i-12345", 108 | "InstanceType": "c5.xlarge", 109 | "PrivateIpAddress": "ip.1.0.0.1", 110 | "PrivateDnsName": "ip-1-0-0-1", 111 | "NetworkInterfaces": [ 112 | { 113 | "Attachment": { 114 | "DeviceIndex": 0, 115 | "NetworkCardIndex": 0, 116 | }, 117 | "PrivateIpAddress": "ip.1.0.0.1", 118 | "PrivateDnsName": "ip-1-0-0-1", 119 | }, 120 | { 121 | "Attachment": { 122 | "DeviceIndex": 0, 123 | "NetworkCardIndex": 1, 124 | }, 125 | "PrivateIpAddress": "ip.1.0.0.2", 126 | "PrivateDnsName": "ip-1-0-0-2", 127 | }, 128 | ], 129 | }, 130 | "ip.1.0.0.1", 131 | "ip-1-0-0-1", 132 | {"ip.1.0.0.1", "ip.1.0.0.2"}, 133 | ), 134 | ], 135 | ) 136 | def test_get_private_ip_address_and_dns_name( 137 | mocker, instance_info, expected_private_ip, expected_private_dns_name, expected_all_private_ips 138 | ): 139 | actual_private_ip, actual_private_dns_name, actual_all_private_ips = get_private_ip_address_and_dns_name( 140 | instance_info 141 | ) 142 | assert_that(actual_private_ip).is_equal_to(expected_private_ip) 143 | assert_that(actual_private_dns_name).is_equal_to(expected_private_dns_name) 144 | assert_that(actual_all_private_ips).is_equal_to(expected_all_private_ips) 145 | -------------------------------------------------------------------------------- /tests/common/test_time_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. 
# Each case is (input seconds, expected whole minutes): conversion truncates.
@pytest.mark.parametrize(
    "value_in_seconds, expected_output",
    [(0, 0), (12, 0), (60, 1), (66, 1), (1202, 20)],
)
def test_seconds_to_minutes(value_in_seconds, expected_output):
    """Verify seconds_to_minutes drops any partial minute."""
    result = seconds_to_minutes(value_in_seconds)
    assert_that(result).is_equal_to(expected_output)
@pytest.mark.parametrize(
    "source_object, chunk_size, expected_grouped_output",
    [
        ([1, 2, 3, 4, 5], 2, [(1, 2), (3, 4), (5,)]),
        ([1, 2, 3, 4, 5, 6], 3, [(1, 2, 3), (4, 5, 6)]),
        ({"A": 1, "B": 2, "C": 3}, 2, [("A", "B"), ("C",)]),
        ((1, 2, 3, 4, 5), 2, [(1, 2), (3, 4), (5,)]),
        ((1, 2, 3), 1, [(1,), (2,), (3,)]),
    ],
)
def test_grouper(source_object, chunk_size, expected_grouped_output):
    """grouper must yield chunk_size-sized tuples, with a shorter final chunk when needed."""
    grouped = list(utils.grouper(source_object, chunk_size))
    assert_that(grouped).is_equal_to(expected_grouped_output)
mocker.MagicMock(return_value=loop_end_time, spec=datetime.now) 87 | mocker.patch("common.utils.datetime", datetime_now_mock) 88 | 89 | utils.sleep_remaining_loop_time(loop_total_time, loop_start_time) 90 | 91 | if expected_sleep_time: 92 | sleep_mock.assert_called_with(expected_sleep_time) 93 | elif expected_sleep_time == 0: 94 | sleep_mock.assert_not_called() 95 | datetime_now_mock.now.assert_called_with(tz=timezone.utc) 96 | 97 | 98 | @pytest.mark.parametrize( 99 | "argument,raises_exception", 100 | [ 101 | ("standard parameter name", False), 102 | ("my/parameter", False), 103 | ("execute this & then this", True), 104 | ("redirect | my output", True), 105 | ("execute\nmultiline", True), 106 | ], 107 | ) 108 | def test_validate_subprocess_argument(argument, raises_exception): 109 | if raises_exception: 110 | with pytest.raises(ValueError): 111 | utils.validate_subprocess_argument(argument) 112 | else: 113 | assert_that(utils.validate_subprocess_argument(argument)).is_true() 114 | 115 | 116 | @pytest.mark.parametrize( 117 | "argument,raises_exception", 118 | [ 119 | ("/usr/my_path", False), 120 | ("./my_path", True), 121 | ("my_path", True), 122 | (".my_path", True), 123 | ], 124 | ) 125 | def test_validate_absolute_path(argument, raises_exception): 126 | if raises_exception: 127 | with pytest.raises(ValueError): 128 | utils.validate_absolute_path(argument) 129 | else: 130 | assert_that(utils.validate_absolute_path(argument)).is_true() 131 | 132 | 133 | @pytest.mark.parametrize( 134 | "raw_input, default, expected_output, expected_exception", 135 | [ 136 | ("", None, None, True), 137 | ("", {}, {}, True), 138 | ("{}", {}, {}, True), 139 | ("malformed", {}, {}, True), 140 | ("{malformed}", {}, {}, True), 141 | ( 142 | '{"jobs":[{"extra":null,"job_id":91,"features":null,"nodes_alloc":"q1-dy-c1-3","nodes_resume":"q1-dy-c1-3",' 143 | '"oversubscribe":"NO","partition":"q1","reservation":null}],"all_nodes_resume":"q1-dy-c1-3"}', 144 | {}, 145 | { 146 | 
"all_nodes_resume": "q1-dy-c1-3", 147 | "jobs": [ 148 | { 149 | "extra": None, 150 | "features": None, 151 | "job_id": 91, 152 | "nodes_alloc": "q1-dy-c1-3", 153 | "nodes_resume": "q1-dy-c1-3", 154 | "oversubscribe": "NO", 155 | "partition": "q1", 156 | "reservation": None, 157 | } 158 | ], 159 | }, 160 | False, 161 | ), 162 | ], 163 | ) 164 | def test_read_json(mocker, raw_input, default, expected_output, expected_exception, caplog): 165 | if default is not None: 166 | mocker.patch("builtins.open", mocker.mock_open(read_data=raw_input)) 167 | if expected_exception: 168 | assert_that(read_json(None, default=default)).is_equal_to(default) 169 | else: 170 | assert_that(read_json(None, default=default)).is_equal_to(expected_output) 171 | else: 172 | with pytest.raises(TypeError): 173 | read_json(None) 174 | assert_that(caplog.text).contains("Unable to read file") 175 | 176 | 177 | def test_custom_filter(caplog): 178 | logger = logging.getLogger(__name__) 179 | caplog.set_level(logging.INFO) 180 | 181 | logger.info("This is a log") 182 | assert_that(caplog.text).matches("This is a log") 183 | 184 | with utils.setup_logging_filter(logger, "CustomField") as custom_log_filter: 185 | custom_log_filter.set_custom_value("CustomValue") 186 | logger.info("This is a another log") 187 | assert_that(caplog.text).matches("CustomField CustomValue - This is a another log") 188 | 189 | caplog.clear() 190 | logger.info("This is another log with no filter") 191 | assert_that(caplog.text).matches("This is another log with no filter") 192 | assert_that(caplog.text).does_not_match("CustomField CustomValue") 193 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | import boto3 12 | import pytest 13 | from botocore.stub import Stubber 14 | 15 | 16 | @pytest.fixture() 17 | def test_datadir(request, datadir): 18 | """ 19 | Inject the datadir with resources for the specific test function. 20 | 21 | If the test function is declared in a class then datadir is ClassName/FunctionName 22 | otherwise it is only FunctionName. 23 | """ 24 | function_name = request.function.__name__ 25 | if not request.cls: 26 | return datadir / function_name 27 | 28 | class_name = request.cls.__name__ 29 | return datadir / "{0}/{1}".format(class_name, function_name) 30 | 31 | 32 | @pytest.fixture() 33 | def boto3_stubber(mocker, boto3_stubber_path): 34 | """ 35 | Create a function to easily mock boto3 clients. 36 | 37 | To mock a boto3 service simply pass the name of the service to mock and 38 | the mocked requests, where mocked_requests is an object containing the method to mock, 39 | the response to return and the expected params for the boto3 method that gets called. 40 | 41 | The function makes use of botocore.Stubber to mock the boto3 API calls. 42 | Multiple boto3 services can be mocked as part of the same test. 43 | 44 | :param boto3_stubber_path is the path of the boto3 import to mock. (e.g. pcluster.config.validators.boto3) 45 | """ 46 | __tracebackhide__ = True 47 | created_stubbers = [] 48 | mocked_clients = {} 49 | 50 | mocked_client_factory = mocker.patch(boto3_stubber_path, autospec=True) 51 | # use **kwargs to skip parameters passed to the boto3.client other than the "service" 52 | # e.g. 
boto3.client("ec2", region_name=region, ...) --> x = ec2 53 | mocked_client_factory.client.side_effect = lambda x, **kwargs: mocked_clients[x] 54 | 55 | def _boto3_stubber(service, mocked_requests): 56 | client = boto3.client(service) 57 | stubber = Stubber(client) 58 | # Save a ref to the stubber so that we can deactivate it at the end of the test. 59 | created_stubbers.append(stubber) 60 | 61 | # Attach mocked requests to the Stubber and activate it. 62 | if not isinstance(mocked_requests, list): 63 | mocked_requests = [mocked_requests] 64 | for mocked_request in mocked_requests: 65 | if mocked_request.generate_error: 66 | stubber.add_client_error( 67 | mocked_request.method, 68 | service_message=mocked_request.response, 69 | expected_params=mocked_request.expected_params, 70 | service_error_code=mocked_request.error_code, 71 | ) 72 | else: 73 | stubber.add_response( 74 | mocked_request.method, mocked_request.response, expected_params=mocked_request.expected_params 75 | ) 76 | stubber.activate() 77 | 78 | # Add stubber to the collection of mocked clients. This allows to mock multiple clients. 79 | # Mocking twice the same client will replace the previous one. 80 | mocked_clients[service] = client 81 | return client 82 | 83 | # yield allows to return the value and then continue the execution when the test is over. 84 | # Used for resources cleanup. 85 | yield _boto3_stubber 86 | 87 | # Assert that all mocked requests were consumed and deactivate all stubbers. 
88 | for stubber in created_stubbers: 89 | stubber.assert_no_pending_responses() 90 | stubber.deactivate() 91 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | assertpy 2 | pytest 3 | pytest-cov 4 | pytest-datadir 5 | pytest-html 6 | pytest-mock 7 | pytest-xdist 8 | retrying -------------------------------------------------------------------------------- /tests/slurm_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | -------------------------------------------------------------------------------- /tests/slurm_plugin/slurm_resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-parallelcluster-node/3438c8fcc22bb818660c7d47875e8c3a8a52fa31/tests/slurm_plugin/slurm_resources/__init__.py -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_comparison/config.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_comparison/config_modified.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name-2 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_parsing/all_options.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-1 4 | heartbeat_file_path = /home/ubuntu/clustermgtd_heartbeat 5 | loop_time = 30 6 | boto3_retry = 10 7 | disable_all_cluster_management = true 8 | proxy = https://fake.proxy 9 | logging_config = /my/logging/config 10 | update_node_address = false 11 | launch_max_batch_size = 1 12 | 
terminate_max_batch_size = 500 13 | node_replacement_timeout = 10 14 | terminate_drain_nodes = false 15 | terminate_down_nodes = false 16 | orphaned_instance_timeout = 60 17 | disable_ec2_health_check = True 18 | disable_scheduled_event_health_check = True 19 | disable_all_health_checks = False 20 | health_check_timeout = 10 21 | dynamodb_table = table-name 22 | head_node_private_ip = head.node.ip 23 | head_node_hostname = head-node-hostname 24 | hosted_zone = hosted-zone 25 | dns_domain = dns.domain 26 | use_private_hostname = false 27 | protected_failure_count = 5 28 | insufficient_capacity_timeout = 50.5 29 | compute_console_logging_enabled = False 30 | compute_console_logging_max_sample_size = 50 31 | compute_console_wait_time: 10 32 | worker_pool_size: 2 33 | worker_pool_max_backlog: 5 34 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_parsing/default.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_parsing/health_check.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-1 4 | heartbeat_file_path = /home/ubuntu/clustermgtd_heartbeat 5 | loop_time = 30 6 | disable_all_cluster_management = true 7 | proxy = https://fake.proxy 8 | logging_config = /my/logging/config 9 | update_node_address = false 10 | launch_max_batch_size = 1 11 | terminate_max_batch_size = 500 12 | node_replacement_timeout = 10 13 | terminate_drain_nodes = false 14 | 
terminate_down_nodes = false 15 | orphaned_instance_timeout = 60 16 | disable_ec2_health_check = True 17 | disable_scheduled_event_health_check = True 18 | health_check_timeout = 10 19 | dynamodb_table = table-name 20 | head_node_private_ip = head.node.ip 21 | head_node_hostname = head-node-hostname 22 | hosted_zone = hosted-zone 23 | dns_domain = dns.domain 24 | use_private_hostname = false 25 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/test_manage_cluster_boto3/default.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | hosted_zone = hosted-zone 9 | dns_domain = dns.domain 10 | use_private_hostname = no 11 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
import logging
from datetime import datetime, timedelta, timezone

import pytest
from assertpy import assert_that
from common.utils import read_json, time_is_up
from slurm_plugin.common import TIMESTAMP_FORMAT, ScalingStrategy, get_clustermgtd_heartbeat


@pytest.mark.parametrize(
    "initial_time, current_time, grace_time, expected_result",
    [
        (datetime(2020, 1, 1, 0, 0, 0), datetime(2020, 1, 1, 0, 0, 29), 30, False),
        (datetime(2020, 1, 1, 0, 0, 0), datetime(2020, 1, 1, 0, 0, 30), 30, True),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour ahead of UTC, so this time stamp is actually 30 mins before initial_time
            datetime(2020, 1, 1, 0, 30, 0, tzinfo=timezone(timedelta(hours=1))),
            30 * 60,
            False,
        ),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour ahead of UTC, so this time stamp is actually 30 mins after initial_time
            datetime(2020, 1, 1, 1, 30, 0, tzinfo=timezone(timedelta(hours=1))),
            30 * 60,
            True,
        ),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour behind UTC, so this time stamp is actually 1.5 hrs after initial_time
            datetime(2020, 1, 1, 0, 30, 0, tzinfo=timezone(-timedelta(hours=1))),
            90 * 60,
            True,
        ),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour behind UTC, so this time stamp is actually 1 hr after initial_time
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone(-timedelta(hours=1))),
            90 * 60,
            False,
        ),
        (
            # a missing initial_time is treated as "time is up" — presumably the timer never started; confirm in utils
            None,
            datetime(2020, 1, 24, 23, 42, 12),
            180,
            True,
        ),
    ],
)
def test_time_is_up(initial_time, current_time, grace_time, expected_result):
    """time_is_up reports whether grace_time seconds have elapsed between the two timestamps."""
    assert_that(time_is_up(initial_time, current_time, grace_time)).is_equal_to(expected_result)


@pytest.mark.parametrize(
    "time, expected_parsed_time",
    [
        (
            datetime(2020, 7, 30, 19, 34, 2, 613338, tzinfo=timezone.utc),
            datetime(2020, 7, 30, 19, 34, 2, 613338, tzinfo=timezone.utc),
        ),
        (
            datetime(2020, 7, 30, 10, 1, 1, tzinfo=timezone(timedelta(hours=1))),
            datetime(2020, 7, 30, 10, 1, 1, tzinfo=timezone(timedelta(hours=1))),
        ),
    ],
)
def test_get_clustermgtd_heartbeat(time, expected_parsed_time, mocker):
    """The heartbeat is parsed from the last line of the command output, formatted with TIMESTAMP_FORMAT."""
    mocker.patch(
        "slurm_plugin.common.check_command_output",
        return_value=f"some_random_stdout\n{time.strftime(TIMESTAMP_FORMAT)}",
    )
    assert_that(get_clustermgtd_heartbeat("/some/file/path")).is_equal_to(expected_parsed_time)


@pytest.mark.parametrize(
    "json_file, default_value, raises_exception, message_in_log",
    [
        ("faulty.json", None, True, "Failed with exception"),
        ("faulty.json", {}, False, "due to an exception"),  # info message
        ("standard.json", None, False, None),
        ("non_existing.json", None, True, "Failed with exception"),
        ("non_existing.json", {}, False, None),  # info message not displayed
    ],
)
def test_read_json(test_datadir, caplog, json_file, default_value, raises_exception, message_in_log):
    """read_json raises without a default, falls back to the default otherwise, and logs accordingly."""
    caplog.set_level(logging.INFO)
    json_file_path = str(test_datadir.joinpath(json_file))
    if raises_exception:
        with pytest.raises((ValueError, FileNotFoundError)):
            read_json(json_file_path, default_value)
    else:
        read_json(json_file_path, default_value)

    if message_in_log:
        assert_that(caplog.text).matches(message_in_log)
    else:
        assert_that(caplog.text).does_not_match("exception")


@pytest.mark.parametrize(
    "strategy_as_value, expected_strategy_enum",
    [
        ("best-effort", ScalingStrategy.BEST_EFFORT),
        ("all-or-nothing", ScalingStrategy.ALL_OR_NOTHING),
        # Unknown or empty values fall back to the all-or-nothing strategy.
        ("", ScalingStrategy.ALL_OR_NOTHING),
        ("invalid-strategy",
ScalingStrategy.ALL_OR_NOTHING), 118 | ], 119 | ) 120 | def test_scaling_strategies_enum_from_value(strategy_as_value, expected_strategy_enum): 121 | strategy_enum = ScalingStrategy(strategy_as_value) 122 | assert_that(strategy_enum).is_equal_to(expected_strategy_enum) 123 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_common/test_read_json/faulty.json: -------------------------------------------------------------------------------- 1 | { 2 |     "test_property_1": { 3 |                     "test_property_2": "test_value" 4 |                 } 5 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_common/test_read_json/standard.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_property_1": { 3 | "test_property_2": "test_value" 4 | } 5 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_computemgtd.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
import logging
import os

import pytest
import slurm_plugin
from assertpy import assert_that
from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate
from slurm_plugin.slurm_resources import DynamicNode


@pytest.mark.parametrize(
    ("config_file", "expected_attributes"),
    [
        (
            "default.conf",
            {
                "cluster_name": "hit",
                "region": "us-east-2",
                "_boto3_config": {"retries": {"max_attempts": 1, "mode": "standard"}},
                "clustermgtd_timeout": 600,
                "clustermgtd_heartbeat_file_path": "/home/ec2-user/clustermgtd_heartbeat",
                "disable_computemgtd_actions": False,
                "_slurm_nodename_file": "/etc/parallelcluster/slurm_plugin/slurm_nodename",
                "nodename": "some_nodename",
                "loop_time": 60,
                "logging_config": os.path.join(
                    os.path.dirname(slurm_plugin.__file__), "logging", "parallelcluster_computemgtd_logging.conf"
                ),
            },
        ),
        (
            "all_options.conf",
            {
                "cluster_name": "hit",
                "region": "us-east-2",
                "loop_time": 300,
                "clustermgtd_timeout": 30,
                "clustermgtd_heartbeat_file_path": "/home/ubuntu/clustermgtd_heartbeat",
                "_slurm_nodename_file": "/my/nodename/path",
                "nodename": "some_nodename",
                "disable_computemgtd_actions": True,
                "_boto3_config": {
                    "retries": {"max_attempts": 1, "mode": "standard"},
                    "proxies": {"https": "my.resume.proxy"},
                },
                "logging_config": "/path/to/logging/config",
            },
        ),
    ],
)
def test_computemgtd_config(config_file, expected_attributes, test_datadir, mocker):
    """ComputemgtdConfig exposes defaults and explicit options read from the config file."""
    mocker.patch("slurm_plugin.computemgtd.ComputemgtdConfig._read_nodename_from_file", return_value="some_nodename")
    mocker.patch("slurm_plugin.computemgtd.check_command_output", return_value=(test_datadir / config_file).read_text())
    compute_config = ComputemgtdConfig("/mocked/config/path")
    for key in expected_attributes:
        assert_that(compute_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key))


@pytest.mark.parametrize(
    "mock_node_info, expected_result",
    [
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "DOWN+CLOUD+NOT_RESPONDING", "queue1")],
            True,
        ),
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "IDLE+CLOUD+DRAIN", "queue1")],
            False,
        ),
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "DOWN+CLOUD+DRAIN", "queue1")],
            True,
        ),
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "IDLE+CLOUD+POWERED_DOWN", "queue1")],
            True,
        ),
        (
            # when node info cannot be retrieved at all, the node is conservatively considered down
            Exception,
            True,
        ),
    ],
    ids=["node_down", "node_drained_idle", "node_drained_down", "node_power_save", "cant_get_node_info"],
)
def test_is_self_node_down(mock_node_info, expected_result, mocker):
    """A node counts as down when scontrol reports it DOWN/POWERED_DOWN or node info cannot be read."""
    if mock_node_info is Exception:
        mocker.patch("slurm_plugin.computemgtd._get_nodes_info_with_retry", side_effect=Exception())
    else:
        mocker.patch("slurm_plugin.computemgtd._get_nodes_info_with_retry", return_value=mock_node_info)

    assert_that(_is_self_node_down("queue1-st-c5xlarge-1")).is_equal_to(expected_result)


def test_self_terminate(mocker, caplog):
    """Verify self-termination is implemented via a shutdown command rather than calling TerminateInstances."""
    run_command_patch = mocker.patch("slurm_plugin.computemgtd.run_command")
    sleep_patch = mocker.patch("slurm_plugin.computemgtd.time.sleep")
    with caplog.at_level(logging.INFO):
        _self_terminate()
    assert_that(caplog.text).contains("Preparing to self terminate the instance in 10 seconds!")
    assert_that(caplog.text).contains("Self terminating instance now!")
    run_command_patch.assert_called_with("sudo shutdown -h now")
    sleep_patch.assert_called_with(10)
-------------------------------------------------------------------------------- /tests/slurm_plugin/test_computemgtd/test_computemgtd_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [computemgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | clustermgtd_heartbeat_file_path = /home/ubuntu/clustermgtd_heartbeat 5 | disable_computemgtd_actions = True 6 | slurm_nodename_file = /my/nodename/path 7 | loop_time = 300 8 | clustermgtd_timeout = 30 9 | logging_config = /path/to/logging/config 10 | proxy = my.resume.proxy 11 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_computemgtd/test_computemgtd_config/default.conf: -------------------------------------------------------------------------------- 1 | [computemgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | clustermgtd_heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_console_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the 5 | # License. A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 10 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | import base64 14 | import os 15 | import re 16 | from concurrent.futures import Future 17 | from typing import Callable, Optional 18 | 19 | import boto3 20 | import pytest 21 | from assertpy import assert_that 22 | from botocore.stub import Stubber 23 | from slurm_plugin.common import TaskController 24 | from slurm_plugin.console_logger import ConsoleLogger 25 | 26 | from tests.common import MockedBoto3Request 27 | 28 | 29 | class _TestController(TaskController): 30 | def __init__(self): 31 | self.tasks_queued: int = 0 32 | self._shutdown = False 33 | 34 | def queue_task(self, task: Callable[[], None]) -> Optional[Future]: 35 | self.tasks_queued += 1 36 | task() 37 | return None 38 | 39 | def is_shutdown(self) -> bool: 40 | return self._shutdown 41 | 42 | def raise_if_shutdown(self) -> None: 43 | if self._shutdown: 44 | raise TaskController.TaskShutdownError() 45 | 46 | def wait_unless_shutdown(self, seconds_to_wait: float) -> None: 47 | self.raise_if_shutdown() 48 | 49 | def shutdown(self, wait: bool = False, cancel_futures: bool = True) -> None: 50 | self._shutdown = True 51 | 52 | 53 | @pytest.fixture() 54 | def boto3_stubber_path(): 55 | # we need to set the region in the environment because the Boto3ClientFactory requires it. 
56 | os.environ["AWS_DEFAULT_REGION"] = "us-east-2" 57 | return "slurm_plugin.instance_manager.boto3" # FIXME 58 | 59 | 60 | @pytest.mark.parametrize( 61 | "compute_instances", 62 | [ 63 | [ 64 | { 65 | "Name": "node-0", 66 | "InstanceId": "i-005457f0c2beb9ad2", 67 | }, 68 | { 69 | "Name": "node-1", 70 | "InstanceId": "i-105457f0c2beb9ad2", 71 | }, 72 | ], 73 | [], 74 | ], 75 | ) 76 | def test_get_console_output_from_nodes(compute_instances): 77 | def console_callback(name, instance_id, output): 78 | actual_results.update({instance_id: output}) 79 | 80 | expected_instances = tuple(node.get("InstanceId") for node in compute_instances if node.get("InstanceId")) 81 | expected_results = {instance: f"{instance}:\rConsole output for you too." for instance in expected_instances} 82 | 83 | mocked_ec2_requests = [ 84 | MockedBoto3Request( 85 | method="get_console_output", 86 | response={ 87 | "InstanceId": instance, 88 | "Output": str(base64.b64encode(re.sub(r"\r", "\r\n", output).encode("utf-8")), "latin-1"), 89 | "Timestamp": "2022-11-18T23:37:25.000Z", 90 | }, 91 | expected_params={ 92 | "InstanceId": instance, 93 | }, 94 | generate_error=False, 95 | ) 96 | for instance, output in expected_results.items() 97 | ] 98 | 99 | actual_results = {} 100 | 101 | console_logger = ConsoleLogger( 102 | enabled=True, 103 | region="us-east-2", 104 | console_output_consumer=console_callback, 105 | ) 106 | 107 | ec2client = boto3.session.Session().client("ec2", "us-east-2") 108 | 109 | task_controller = _TestController() 110 | 111 | with Stubber(ec2client) as ec2_stub: 112 | for request in mocked_ec2_requests: 113 | ec2_stub.add_response(request.method, request.response, expected_params=request.expected_params) 114 | console_logger._boto3_client_factory = lambda service_name: ec2client 115 | console_logger.report_console_output_from_nodes( 116 | compute_instances=compute_instances, 117 | task_controller=task_controller, 118 | task_wait_function=lambda: None, 119 | ) 120 | 
ec2_stub.assert_no_pending_responses() 121 | 122 | ( 123 | assert_that(task_controller.tasks_queued).is_equal_to(1) 124 | if len(compute_instances) > 0 125 | else assert_that(task_controller.tasks_queued).is_zero() 126 | ) 127 | assert_that(actual_results).is_length(len(mocked_ec2_requests)) 128 | 129 | for instance, actual_output in actual_results.items(): 130 | assert_that(actual_output).is_equal_to(expected_results.get(instance)) 131 | 132 | 133 | def test_exception_handling(): 134 | class EC2Client: 135 | def get_console_output(*args, **kwargs): 136 | test_controller.shutdown() 137 | 138 | instance_id = kwargs.get("InstanceId") 139 | return {"Output": f"Output for {instance_id}"} 140 | 141 | def boto3_factory(*args, **kwargs): 142 | return EC2Client() 143 | 144 | def callback(*args): 145 | nonlocal call_count 146 | call_count += 1 147 | 148 | call_count = 0 149 | 150 | console_logger = ConsoleLogger(enabled=True, region="us-east-2", console_output_consumer=callback) 151 | 152 | console_logger._boto3_client_factory = boto3_factory 153 | 154 | test_controller = _TestController() 155 | 156 | assert_that(console_logger.report_console_output_from_nodes).raises( 157 | TaskController.TaskShutdownError 158 | ).when_called_with( 159 | compute_instances=[{"Name": "hello", "InstanceId": "instance-id"}], 160 | task_controller=test_controller, 161 | task_wait_function=lambda: None, 162 | ) 163 | 164 | assert_that(test_controller.tasks_queued).is_equal_to(1) 165 | assert_that(call_count).is_zero() 166 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue1-fleet-spot", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | 
{ 10 | "MaxPrice":"10", 11 | "InstanceType":"t2.medium", 12 | "SubnetId":"1234567" 13 | }, 14 | { 15 | "MaxPrice":"10", 16 | "InstanceType":"t2.large", 17 | "SubnetId":"1234567" 18 | } 19 | ] 20 | } 21 | ], 22 | "TargetCapacitySpecification":{ 23 | "TotalTargetCapacity":5, 24 | "DefaultTargetCapacityType":"spot" 25 | }, 26 | "Type":"instant", 27 | "SpotOptions":{ 28 | "AllocationStrategy":"capacity-optimized", 29 | "SingleInstanceType":false, 30 | "SingleAvailabilityZone":true, 31 | "MinTargetCapacity":5 32 | } 33 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it-all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs": [ 3 | { 4 | "LaunchTemplateSpecification": { 5 | "LaunchTemplateName": "hit-queue6-fleet1", 6 | "Version": "$Latest" 7 | }, 8 | "Overrides": [ 9 | { 10 | "InstanceType": "t2.medium", 11 | "SubnetId": "1234567" 12 | }, 13 | { 14 | "InstanceType": "t2.medium", 15 | "SubnetId": "7654321" 16 | }, 17 | { 18 | "InstanceType": "t2.large", 19 | "SubnetId": "1234567" 20 | }, 21 | { 22 | "InstanceType": "t2.large", 23 | "SubnetId": "7654321" 24 | } 25 | ] 26 | } 27 | ], 28 | "OnDemandOptions": { 29 | "AllocationStrategy": "lowest-price", 30 | "SingleInstanceType": false, 31 | "SingleAvailabilityZone": false, 32 | "CapacityReservationOptions": { 33 | "UsageStrategy": "use-capacity-reservations-first" 34 | } 35 | }, 36 | "TargetCapacitySpecification": { 37 | "TotalTargetCapacity": 5, 38 | "DefaultTargetCapacityType": "on-demand" 39 | }, 40 | "Type": "instant" 41 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it/expected_launch_params.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs": [ 3 | { 4 | "LaunchTemplateSpecification": { 5 | "LaunchTemplateName": "hit-queue6-fleet1", 6 | "Version": "$Latest" 7 | }, 8 | "Overrides": [ 9 | { 10 | "InstanceType": "t2.medium", 11 | "SubnetId": "1234567" 12 | }, 13 | { 14 | "InstanceType": "t2.medium", 15 | "SubnetId": "7654321" 16 | }, 17 | { 18 | "InstanceType": "t2.large", 19 | "SubnetId": "1234567" 20 | }, 21 | { 22 | "InstanceType": "t2.large", 23 | "SubnetId": "7654321" 24 | } 25 | ] 26 | } 27 | ], 28 | "OnDemandOptions": { 29 | "AllocationStrategy": "lowest-price", 30 | "SingleInstanceType": false, 31 | "SingleAvailabilityZone": false, 32 | "CapacityReservationOptions": { 33 | "UsageStrategy": "use-capacity-reservations-first" 34 | } 35 | }, 36 | "TargetCapacitySpecification": { 37 | "TotalTargetCapacity": 5, 38 | "DefaultTargetCapacityType": "on-demand" 39 | }, 40 | "Type": "instant" 41 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-single-it-all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue5-fleet1", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.medium", 15 | "SubnetId":"7654321" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions":{ 21 | "AllocationStrategy":"lowest-price", 22 | "SingleInstanceType":true, 23 | "SingleAvailabilityZone":false, 24 | "MinTargetCapacity":5, 25 | "CapacityReservationOptions":{ 26 | "UsageStrategy":"use-capacity-reservations-first" 27 | } 28 | }, 29 | "TargetCapacitySpecification":{ 30 | "TotalTargetCapacity":5, 31 | 
"DefaultTargetCapacityType":"on-demand" 32 | }, 33 | "Type":"instant" 34 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-single-az-multi-it-all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs": [ 3 | { 4 | "LaunchTemplateSpecification": { 5 | "LaunchTemplateName": "hit-queue4-fleet1", 6 | "Version": "$Latest" 7 | }, 8 | "Overrides": [ 9 | { 10 | "InstanceType": "t2.medium", 11 | "SubnetId": "1234567" 12 | }, 13 | { 14 | "InstanceType": "t2.large", 15 | "SubnetId": "1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions": { 21 | "AllocationStrategy": "lowest-price", 22 | "SingleInstanceType": false, 23 | "SingleAvailabilityZone": true, 24 | "MinTargetCapacity": 5, 25 | "CapacityReservationOptions": { 26 | "UsageStrategy": "use-capacity-reservations-first" 27 | } 28 | }, 29 | "TargetCapacitySpecification": { 30 | "TotalTargetCapacity": 5, 31 | "DefaultTargetCapacityType": "on-demand" 32 | }, 33 | "Type": "instant" 34 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_capacity_block/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue-cb-fleet-capacity-block", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.large", 15 | "SubnetId":"1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions":{ 21 | "SingleInstanceType":false, 22 | "SingleAvailabilityZone":true, 23 | "MinTargetCapacity":1, 24 | "CapacityReservationOptions":{ 
25 | "UsageStrategy":"use-capacity-reservations-first" 26 | } 27 | }, 28 | "TargetCapacitySpecification":{ 29 | "TotalTargetCapacity":5, 30 | "DefaultTargetCapacityType":"capacity-block" 31 | }, 32 | "Type":"instant" 33 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_ondemand/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue2-fleet-ondemand", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.large", 15 | "SubnetId":"1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions":{ 21 | "AllocationStrategy":"lowest-price", 22 | "SingleInstanceType":false, 23 | "SingleAvailabilityZone":true, 24 | "MinTargetCapacity":1, 25 | "CapacityReservationOptions":{ 26 | "UsageStrategy":"use-capacity-reservations-first" 27 | } 28 | }, 29 | "TargetCapacitySpecification":{ 30 | "TotalTargetCapacity":5, 31 | "DefaultTargetCapacityType":"on-demand" 32 | }, 33 | "Type":"instant" 34 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_spot/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue1-fleet-spot", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "MaxPrice":"10", 11 | "InstanceType":"t2.medium", 12 | "SubnetId":"1234567" 13 | }, 14 | { 15 | "MaxPrice":"10", 16 | "InstanceType":"t2.large", 17 | "SubnetId":"1234567" 18 | } 19 | ] 20 | } 21 | ], 22 | "SpotOptions":{ 23 | 
"AllocationStrategy":"capacity-optimized", 24 | "SingleInstanceType":false, 25 | "SingleAvailabilityZone":true, 26 | "MinTargetCapacity":1 27 | }, 28 | "TargetCapacitySpecification":{ 29 | "TotalTargetCapacity":5, 30 | "DefaultTargetCapacityType":"spot" 31 | }, 32 | "Type":"instant" 33 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/launch_overrides/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue2-fleet-ondemand", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.large", 15 | "SubnetId":"1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "TargetCapacitySpecification":{ 21 | "TotalTargetCapacity":5, 22 | "DefaultTargetCapacityType":"on-demand" 23 | }, 24 | "Type":"instant", 25 | "OnDemandOptions":{ 26 | "AllocationStrategy":"lowest-price", 27 | "SingleInstanceType":false, 28 | "SingleAvailabilityZone":true, 29 | "MinTargetCapacity":1, 30 | "CapacityReservationOptions":{ 31 | "UsageStrategy":"use-capacity-reservations-first" 32 | } 33 | }, 34 | "TagSpecifications":[ 35 | { 36 | "ResourceType":"capacity-reservation", 37 | "Tags":[ 38 | { 39 | "Key":"string", 40 | "Value":"string" 41 | } 42 | ] 43 | } 44 | ] 45 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | 13 | import os 14 | from types import SimpleNamespace 15 | from unittest.mock import ANY 16 | 17 | import botocore 18 | import pytest 19 | import slurm_plugin 20 | from assertpy import assert_that 21 | from slurm_plugin.clustermgtd import ComputeFleetStatus 22 | from slurm_plugin.fleet_status_manager import ( 23 | SlurmFleetManagerConfig, 24 | _get_computefleet_status, 25 | _manage_fleet_status_transition, 26 | _start_partitions, 27 | _stop_partitions, 28 | ) 29 | from slurm_plugin.slurm_resources import PartitionStatus 30 | 31 | 32 | @pytest.fixture() 33 | def boto3_stubber_path(): 34 | # we need to set the region in the environment because the Boto3ClientFactory requires it. 
35 | os.environ["AWS_DEFAULT_REGION"] = "us-east-2" 36 | return "slurm_plugin.instance_manager.boto3" 37 | 38 | 39 | @pytest.mark.parametrize( 40 | ("config_file", "expected_attributes"), 41 | [ 42 | ( 43 | "default.conf", 44 | { 45 | "cluster_name": "test", 46 | "region": "us-east-2", 47 | "terminate_max_batch_size": 1000, 48 | "_boto3_config": {"retries": {"max_attempts": 5, "mode": "standard"}}, 49 | "logging_config": os.path.join( 50 | os.path.dirname(slurm_plugin.__file__), 51 | "logging", 52 | "parallelcluster_fleet_status_manager_logging.conf", 53 | ), 54 | }, 55 | ), 56 | ( 57 | "all_options.conf", 58 | { 59 | "cluster_name": "test_again", 60 | "region": "us-east-1", 61 | "terminate_max_batch_size": 50, 62 | "_boto3_config": { 63 | "retries": {"max_attempts": 10, "mode": "standard"}, 64 | "proxies": {"https": "my.resume.proxy"}, 65 | }, 66 | "logging_config": "/path/to/fleet_status_manager_logging/config", 67 | }, 68 | ), 69 | ], 70 | ) 71 | def test_fleet_status_manager_config(config_file, expected_attributes, test_datadir): 72 | resume_config = SlurmFleetManagerConfig(test_datadir / config_file) 73 | for key in expected_attributes: 74 | assert_that(resume_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key)) 75 | 76 | 77 | @pytest.mark.parametrize( 78 | ("computefleet_status_data_path", "status", "action"), 79 | [ 80 | ("path_to_file_1", ComputeFleetStatus.STOPPED, None), 81 | ("path_to_file_2", ComputeFleetStatus.RUNNING, None), 82 | ("path_to_file_3", ComputeFleetStatus.STOPPING, None), 83 | ("path_to_file_4", ComputeFleetStatus.STARTING, None), 84 | ("path_to_file_5", ComputeFleetStatus.STOP_REQUESTED, "stop"), 85 | ("path_to_file_6", ComputeFleetStatus.START_REQUESTED, "start"), 86 | ("path_to_file_7", ComputeFleetStatus.PROTECTED, None), 87 | ], 88 | ) 89 | def test_fleet_status_manager(mocker, test_datadir, computefleet_status_data_path, status, action): 90 | # mocks 91 | config = SimpleNamespace(some_key_1="some_value_1", 
some_key_2="some_value_2") 92 | get_computefleet_status_mocked = mocker.patch("slurm_plugin.fleet_status_manager._get_computefleet_status") 93 | get_computefleet_status_mocked.return_value = status 94 | stop_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager._stop_partitions") 95 | start_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager._start_partitions") 96 | 97 | # method to test 98 | _manage_fleet_status_transition(config, computefleet_status_data_path) 99 | 100 | # assertions 101 | get_computefleet_status_mocked.assert_called_once_with(computefleet_status_data_path) 102 | if action == "start": 103 | start_partitions_mocked.assert_called_once() 104 | stop_partitions_mocked.assert_not_called() 105 | elif action == "stop": 106 | stop_partitions_mocked.assert_called_once_with(config) 107 | start_partitions_mocked.assert_not_called() 108 | else: 109 | start_partitions_mocked.assert_not_called() 110 | stop_partitions_mocked.assert_not_called() 111 | 112 | 113 | @pytest.mark.parametrize( 114 | ("config_file", "expected_status"), 115 | [ 116 | ("correct_status.json", ComputeFleetStatus.RUNNING), 117 | ("no_status.json", ValueError), 118 | ("malformed_status.json", FileNotFoundError), 119 | ("wrong_status.json", ValueError), 120 | (None, TypeError), 121 | ], 122 | ) 123 | def test_get_computefleet_status(test_datadir, config_file, expected_status): 124 | if isinstance(expected_status, ComputeFleetStatus): 125 | status = _get_computefleet_status(test_datadir / config_file) 126 | assert_that(status).is_equal_to(expected_status) 127 | else: 128 | with pytest.raises(expected_status): 129 | _get_computefleet_status(test_datadir / config_file) 130 | 131 | 132 | def test_start_partitions(mocker): 133 | update_all_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions") 134 | resume_powering_down_nodes_mocked = mocker.patch("slurm_plugin.fleet_status_manager.resume_powering_down_nodes") 135 | 136 | 
_start_partitions() 137 | 138 | update_all_partitions_mocked.assert_called_once_with(PartitionStatus.UP, reset_node_addrs_hostname=False) 139 | resume_powering_down_nodes_mocked.assert_called_once() 140 | 141 | 142 | def test_stop_partitions(mocker): 143 | # mocks 144 | config = SimpleNamespace( 145 | terminate_max_batch_size="3", region="us-east-1", cluster_name="test", boto3_config=botocore.config.Config() 146 | ) 147 | update_all_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions") 148 | 149 | terminate_all_compute_nodes_mocked = mocker.patch.object( 150 | slurm_plugin.instance_manager.InstanceManager, "terminate_all_compute_nodes", autospec=True 151 | ) 152 | 153 | # method to test 154 | _stop_partitions(config) 155 | 156 | # assertions 157 | update_all_partitions_mocked.assert_called_once_with(PartitionStatus.INACTIVE, reset_node_addrs_hostname=True) 158 | terminate_all_compute_nodes_mocked.assert_called_once_with(ANY, config.terminate_max_batch_size) 159 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_fleet_status_manager_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [slurm_fleet_status_manager] 2 | cluster_name = test_again 3 | region = us-east-1 4 | proxy = my.resume.proxy 5 | boto3_retry = 10 6 | terminate_max_batch_size = 50 7 | logging_config = /path/to/fleet_status_manager_logging/config -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_fleet_status_manager_config/default.conf: -------------------------------------------------------------------------------- 1 | [slurm_fleet_status_manager] 2 | cluster_name = test 3 | region = us-east-2 4 | proxy = NONE -------------------------------------------------------------------------------- 
/tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/correct_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": "RUNNING", 3 | "lastStatusUpdatedTime": "2022-01-26T11:08:18.000Z" 4 | } 5 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/malformed_status: -------------------------------------------------------------------------------- 1 | RUNNING 2 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/no_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "lastStatusUpdatedTime": "2022-01-26T11:08:18.000Z" 3 | } 4 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/wrong_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": "NO_EXIST", 3 | "lastStatusUpdatedTime": "2022-01-26T11:08:18.000Z" 4 | } 5 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_get_slurm_resume/malformed.json: -------------------------------------------------------------------------------- 1 | malformed json -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_get_slurm_resume/resume.json: -------------------------------------------------------------------------------- 1 | { 2 | "all_nodes_resume" : "cloud[1-3]", 3 | "jobs" : [ 4 | { 5 | "extra" : "An arbitrary string from --extra", 6 | "features" : "c1,c2", 7 | "job_id" : 140814, 8 | "nodes_alloc" : "cloud[1-4]", 9 | "nodes_resume" : "cloud[1-3]", 10 | "oversubscribe" : "OK", 11 | "partition" : "cloud", 12 | 
"reservation" : "resv_1234" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_resume_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [slurm_resume] 2 | cluster_name = hit 3 | instance_id = i-instance-id 4 | region = us-east-2 5 | proxy = my.resume.proxy 6 | boto3_retry = 10 7 | launch_max_batch_size = 50 8 | update_node_address = False 9 | logging_config = /path/to/resume_logging/config 10 | dynamodb_table = table-name 11 | head_node_private_ip = head.node.ip 12 | head_node_hostname = head-node-hostname 13 | hosted_zone = hosted-zone 14 | dns_domain = dns.domain 15 | use_private_hostname = False 16 | all_or_nothing_batch = True 17 | clustermgtd_heartbeat_file_path = alternate/clustermgtd_heartbeat 18 | clustermgtd_timeout = 5 19 | job_level_scaling = False 20 | assign_node_max_batch_size = 400 21 | terminate_max_batch_size = 600 22 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_resume_config/default.conf: -------------------------------------------------------------------------------- 1 | [slurm_resume] 2 | cluster_name = hit 3 | instance_id = i-instance-id 4 | region = us-east-2 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | clustermgtd_heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 9 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_suspend.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | import os 13 | 14 | import pytest 15 | import slurm_plugin 16 | from assertpy import assert_that 17 | from slurm_plugin.suspend import SlurmSuspendConfig 18 | 19 | 20 | @pytest.mark.parametrize( 21 | ("config_file", "expected_attributes"), 22 | [ 23 | ( 24 | "default.conf", 25 | { 26 | "logging_config": os.path.join( 27 | os.path.dirname(slurm_plugin.__file__), "logging", "parallelcluster_suspend_logging.conf" 28 | ), 29 | "clustermgtd_timeout": 300, 30 | "clustermgtd_heartbeat_file_path": "/home/ec2-user/clustermgtd_heartbeat", 31 | }, 32 | ), 33 | ( 34 | "all_options.conf", 35 | { 36 | "logging_config": "/path/to/suspend_logging/config", 37 | "clustermgtd_timeout": 5, 38 | "clustermgtd_heartbeat_file_path": "alternate/clustermgtd_heartbeat", 39 | }, 40 | ), 41 | ], 42 | ) 43 | def test_suspend_config(config_file, expected_attributes, test_datadir): 44 | suspend_config = SlurmSuspendConfig(test_datadir / config_file) 45 | for key in expected_attributes: 46 | assert_that(suspend_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key)) 47 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_suspend/test_suspend_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [slurm_suspend] 2 | logging_config = /path/to/suspend_logging/config 3 | clustermgtd_heartbeat_file_path = alternate/clustermgtd_heartbeat 4 | clustermgtd_timeout = 5 5 | -------------------------------------------------------------------------------- 
/tests/slurm_plugin/test_suspend/test_suspend_config/default.conf: -------------------------------------------------------------------------------- 1 | [slurm_suspend] 2 | clustermgtd_heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 3 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_task_executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the 5 | # License. A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 10 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 11 | # limitations under the License. 
import time
from datetime import datetime, timezone

from assertpy import assert_that, soft_assertions
from slurm_plugin.common import TaskController
from slurm_plugin.task_executor import TaskExecutor


def test_task_executor():
    """Queue several tasks and verify each future resolves to its task's return value."""

    def get_task(value):
        def task():
            return value + 1

        return task

    task_executor = TaskExecutor(worker_pool_size=3, max_backlog=10)

    futures = {value: task_executor.queue_task(get_task(value)) for value in range(10, 20)}

    with soft_assertions():
        for value, future in futures.items():
            assert_that(future.result()).is_equal_to(value + 1)

    task_executor.shutdown()


def test_exceeding_max_backlog():
    """Verify that queueing beyond max_backlog raises MaximumBacklogExceededError."""

    def get_task(value):
        def task():
            time.sleep(value)
            return value + 1

        return task

    task_executor = TaskExecutor(worker_pool_size=1, max_backlog=1)

    # First task occupies the single worker; the second exceeds the backlog of 1.
    future = task_executor.queue_task(get_task(10))
    assert_that(task_executor.queue_task).raises(TaskExecutor.MaximumBacklogExceededError).when_called_with(
        get_task(20)
    )

    assert_that(future.result()).is_equal_to(11)

    task_executor.shutdown()


def test_that_shutdown_does_not_block():
    """Verify shutdown(wait=True) interrupts a long wait_unless_shutdown instead of blocking.

    The queued task would wait 600s; shutdown must return well before that and the
    task's future must carry a TaskShutdownError.
    """

    def get_task(value):
        def task():
            task_executor.wait_unless_shutdown(value)
            return value + 1

        return task

    def callback(*args):
        nonlocal callback_called
        callback_called = True

    task_executor = TaskExecutor(worker_pool_size=1, max_backlog=1)

    callback_called = False
    start_wait = datetime.now(tz=timezone.utc)
    future = task_executor.queue_task(get_task(600))
    future.add_done_callback(callback)

    task_executor.shutdown(wait=True)

    delta = (datetime.now(tz=timezone.utc) - start_wait).total_seconds()
    assert_that(delta).is_less_than(300)

    # BUG FIX: the original `assert_that(future.exception).raises(...)` asserted nothing:
    # assertpy's raises() only records the expected type and checks it when
    # `.when_called_with(...)` is invoked, which the original never did. Moreover
    # Future.exception() *returns* the exception rather than raising it. Future.result()
    # re-raises the task's exception, so assert on that callable instead.
    assert_that(future.result).raises(TaskController.TaskShutdownError).when_called_with()
| assert_that(callback_called).is_true() 85 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py{39,310}-cov 4 | code-linters 5 | 6 | # Default testenv. Used to run tests on all python versions. 7 | [testenv] 8 | passenv = 9 | CI 10 | GITHUB_* 11 | usedevelop = 12 | cov: true 13 | nocov: false 14 | allowlist_externals = 15 | bash 16 | deps = 17 | -r tests/requirements.txt 18 | commands = 19 | nocov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/ 20 | cov: python setup.py clean --all build_ext --force --inplace 21 | cov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --cov-report=xml --cov=src tests/ 22 | 23 | # Section used to define common variables used by multiple testenvs. 24 | [vars] 25 | code_dirs = 26 | setup.py \ 27 | src/ \ 28 | tests/ 29 | 30 | ############################## 31 | ### AUTO-FORMATTER ### 32 | ############################## 33 | 34 | # black is a code formatter for python: https://github.com/ambv/black. 35 | # The following target formats python files with black formatter. 36 | [testenv:black] 37 | basepython = python3 38 | skip_install = true 39 | deps = 40 | black 41 | commands = 42 | black -l 120 \ 43 | {[vars]code_dirs} \ 44 | {posargs} 45 | 46 | # Checks that python files are correctly formatted. 47 | [testenv:black-check] 48 | basepython = python3 49 | skip_install = true 50 | deps = 51 | {[testenv:black]deps} 52 | commands = 53 | {[testenv:black]commands} --check --diff 54 | 55 | # isort is an imports sorter for python: https://github.com/timothycrosley/isort 56 | # The following target sorts the import according to .isort.cfg file. 
57 | [testenv:isort] 58 | basepython = python3 59 | skip_install = true 60 | deps = 61 | isort 62 | seed-isort-config 63 | commands = 64 | isort -w 120 \ 65 | {[vars]code_dirs} \ 66 | {posargs} 67 | 68 | # Checks that python imports are correctly sorted. 69 | [testenv:isort-check] 70 | basepython = python3 71 | skip_install = true 72 | deps = {[testenv:isort]deps} 73 | commands = {[testenv:isort]commands} --check --diff 74 | 75 | # Reformats code with black and isort. 76 | [testenv:autoformat] 77 | basepython = python3 78 | skip_install = true 79 | deps = 80 | {[testenv:isort]deps} 81 | {[testenv:black]deps} 82 | commands = 83 | {[testenv:isort]commands} 84 | {[testenv:black]commands} 85 | 86 | 87 | ############################# 88 | ### LINTERS ### 89 | ############################# 90 | 91 | # flake8 python linter: https://github.com/PyCQA/flake8. 92 | # flake8 config is located in .flake8 file 93 | [testenv:flake8] 94 | basepython = python3 95 | skip_install = true 96 | deps = 97 | flake8 98 | flake8-docstrings 99 | flake8-bugbear 100 | # flake8-import-order # delegated to isort 101 | flake8-colors 102 | pep8-naming 103 | commands = 104 | flake8 \ 105 | {[vars]code_dirs} \ 106 | {posargs} 107 | 108 | # bandit security linter for python: https://github.com/PyCQA/bandit 109 | [testenv:bandit] 110 | basepython = python3 111 | skip_install = true 112 | deps = 113 | bandit 114 | commands = 115 | bandit -r \ 116 | -c .bandit.ini \ 117 | --exclude tests \ 118 | {[vars]code_dirs} \ 119 | {posargs} 120 | 121 | # checks that README file is well-formed. 122 | [testenv:readme] 123 | basepython = python3 124 | skip_install = true 125 | deps = 126 | readme_renderer 127 | commands = 128 | python setup.py check -r -s 129 | 130 | # Pylint linter for python: https://www.pylint.org/ 131 | # Pylint config is located in .pylintrc file. 
132 | [testenv:pylint] 133 | basepython = python3 134 | deps = 135 | pyflakes 136 | pylint 137 | commands = 138 | pylint \ 139 | {[vars]code_dirs} \ 140 | {posargs} 141 | 142 | # Vulture finds unused code in python: https://github.com/jendrikseipp/vulture 143 | [testenv:vulture] 144 | basepython = python3 145 | skip_install = true 146 | deps = 147 | vulture 148 | commands = 149 | vulture \ 150 | {[vars]code_dirs} \ 151 | {posargs} 152 | 153 | # Static type checker for Python: http://mypy-lang.org/ 154 | [testenv:mypy] 155 | basepython = python3 156 | deps = 157 | mypy 158 | commands = 159 | mypy \ 160 | {[vars]code_dirs} \ 161 | {posargs} 162 | 163 | # semgrep is used to check for security issues 164 | # https://semgrep.dev/ 165 | [testenv:semgrep] 166 | basepython = python3 167 | deps = 168 | semgrep>=1.8.0 169 | commands = 170 | semgrep \ 171 | --config p/r2c-security-audit \ 172 | --config p/secrets \ 173 | --exclude 'third-party/**' \ 174 | --error 175 | 176 | # Target that groups all code linters to run in Travis. 177 | [testenv:code-linters] 178 | basepython = python3 179 | skip_install = true 180 | deps = 181 | {[testenv:black-check]deps} 182 | {[testenv:isort-check]deps} 183 | {[testenv:flake8]deps} 184 | {[testenv:bandit]deps} 185 | {[testenv:semgrep]deps} 186 | # {[testenv:pylint]deps} 187 | # {[testenv:readme]deps} 188 | commands = 189 | {[testenv:black-check]commands} 190 | {[testenv:isort-check]commands} 191 | {[testenv:flake8]commands} 192 | {[testenv:bandit]commands} 193 | {[testenv:semgrep]commands} 194 | # {[testenv:pylint]commands} 195 | # {[testenv:readme]commands} 196 | -------------------------------------------------------------------------------- /util/bump-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # On Mac OS, the default implementation of sed is BSD sed, but this script requires GNU sed. 
# bump-version.sh (tail): rewrite the `version = "..."` line in setup.py.
# On macOS, require GNU sed (BSD sed's -i semantics differ) and prepend it to PATH.
if [ "$(uname)" == "Darwin" ]; then
  command -v gsed >/dev/null 2>&1 || { echo >&2 "[ERROR] Mac OS detected: please install GNU sed with 'brew install gnu-sed'"; exit 1; }
  PATH="/usr/local/opt/gnu-sed/libexec/gnubin:$PATH"
fi

if [ -z "$1" ]; then
  echo "New version not specified. Usage: bump-version.sh NEW_VERSION"
  exit 1
fi

NEW_VERSION=$1
# Extract the current version from the `version = "X.Y.Z"` line of setup.py.
CURRENT_VERSION=$(sed -ne "s/^version = \"\(.*\)\"/\1/p" setup.py)

sed -i "s/version = \"$CURRENT_VERSION\"/version = \"$NEW_VERSION\"/g" setup.py
-------------------------------------------------------------------------------- /util/create-attribution-doc.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Generate the THIRD-PARTY-LICENSES.txt attribution document inside a throwaway
# pyenv virtualenv, using pip-licenses plus manually appended entries.

set -e -o xtrace

append_package_details_to_final_license_file(){
  # Append one package's details to the THIRD-PARTY-LICENSES file.
  # Arguments: 1=package name, 2=package version, 3=license type, 4=package URL,
  #            5 (and optionally 6, 7 for dual/triple licenses)=license URL(s).
  # Header with package name, version, license type and package URL.
  echo -e "\n\n\n$1 \n$2 \n$3 \n$4" >> "$final_license_file"
  # -f: fail on HTTP errors instead of appending the server's error page to the
  # license file; -sS: quiet but still report failures; -L: follow redirects.
  curl -fsSL "$5" >> "$final_license_file"
  # Append the additional licenses when the package is dual/triple licensed.
  if [ $# -gt 5 ]
  then
    curl -fsSL "$6" >> "$final_license_file"
    curl -fsSL "$7" >> "$final_license_file"
  fi
}

function create_attribution_doc() {
  ATTR_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

  # Install the requested python version if it doesn't exist yet.
  if test ! -d "${PYENV_ROOT}/versions/${PYTHON_VERSION}"
  then
    env PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install "${PYTHON_VERSION}"
  fi

  pyenv virtualenv "${PYTHON_VERSION}" attribution-doc-env
  # Switch to the dedicated virtual env.
  source "${PYENV_ROOT}/versions/attribution-doc-env/bin/activate"

  # Update pip
  pip3 install --upgrade pip

  # Tools needed to build and inspect the package licenses.
  pip3 install pyinstaller
  pip3 install pip-licenses

  # Install this package from source (repo root is the parent of util/).
  pip3 install -e "$(dirname "$ATTR_SCRIPT_DIR")"

  final_license_file=$(dirname "$ATTR_SCRIPT_DIR")/THIRD-PARTY-LICENSES.txt

  # Create a pip license document; -i excludes our own package and
  # pip-licenses itself from the attribution list.
  pip-licenses -i aws-parallelcluster-node pip-licenses --format=plain-vertical --with-license-file --with-urls --no-license-path --with-authors --output-file="$final_license_file"

  # Extract MAJOR.MINOR (e.g. "3.9") — this matches a cpython branch name used
  # in the raw.githubusercontent.com URLs below.
  cpy_version=$(python -V | grep -Eo '([0-9]+)(\.?[0-9]+)' | head -1)

  # Python itself is not a pip package, so append it manually.
  append_package_details_to_final_license_file "Python" "$cpy_version" "PSF License Version 2; Zero-Clause BSD license" "https://raw.githubusercontent.com/python/cpython/$cpy_version/LICENSE" "https://raw.githubusercontent.com/python/cpython/$cpy_version/LICENSE"

  deactivate
  pyenv virtualenv-delete -f attribution-doc-env
}

_error_exit() {
  echo "$1"
  exit 1
}

_help() {
  local -- _cmd
  _cmd=$(basename "$0")

  # NOTE(review): this usage text was garbled in the dump (heredoc marker and
  # placeholders eaten); reconstructed from the options parse_options handles —
  # verify against the original script.
  cat <<EOF
Usage: ${_cmd} [OPTION]...

  --python-version <version>  Python version with which you want to create the attribution document.
  -h, --help                  Print this help message

Examples:
  ${_cmd}
  $_cmd --python-version 3.9.10
EOF
}

function parse_options () {

  while [ $# -gt 0 ] ; do
    case "$1" in
      --python-version) PYTHON_VERSION="$2"; shift;;
      -h|--help|help) _help; exit 0;;
      *) _help; _error_exit "[error] Unrecognized option '$1'";;
    esac
    shift
  done

}

function main() {
  parse_options "$@"
  # Fail fast with a clear message instead of letting `pyenv install ""` fail
  # cryptically when --python-version was not supplied.
  if [ -z "${PYTHON_VERSION}" ]; then
    _help
    _error_exit "[error] --python-version not specified"
  fi
  create_attribution_doc
}

main "$@"
-------------------------------------------------------------------------------- /util/upload-node.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Package the node sources (git archive of the working tree) and upload the
# tarball, its md5 and its LastModified date file to an S3 bucket.

_error_exit() {
  echo "$1"
  exit 1
}

_info() {
  echo "INFO: $1"
}

_help() {
  local -- _cmd
  _cmd=$(basename "$0")

  # NOTE(review): this usage text was garbled in the dump (heredoc marker and
  # placeholders eaten); reconstructed from the options main parses — verify
  # against the original script.
  cat <<EOF
Usage: ${_cmd} [OPTION]...

  --bucket <bucket>      Bucket where upload the package
  --srcdir <srcdir>      Root folder of the node package
  --profile <profile>    AWS profile name to use for the upload
                         (optional, default is AWS_PROFILE env variable or "default")
  --region <region>      Region to use for AWSCli commands (optional, default is "us-east-1")
  --scope <scope>        Disambiguation string used in the S3 path to avoid collisions (default is empty)
  -h, --help             Print this help message
EOF
}

main() {
  # parse input options
  while [ $# -gt 0 ] ; do
    case "$1" in
      --bucket) _bucket="$2"; shift;;
      --bucket=*) _bucket="${1#*=}";;
      --srcdir) _srcdir="$2"; shift;;
      --srcdir=*) _srcdir="${1#*=}";;
      --profile) _profile="$2"; shift;;
      --profile=*) _profile="${1#*=}";;
      --region) _region="$2"; shift;;
      --region=*) _region="${1#*=}";;
      --scope) _scope="$2"; shift;;
      --scope=*) _scope="${1#*=}";;
      -h|--help|help) _help; exit 0;;
      *) _help; echo "[error] Unrecognized option '$1'"; exit 1;;
    esac
    shift
  done

  # verify required parameters
  # (print the help BEFORE exiting: _error_exit never returns, so the original
  # `_error_exit ...; _help` order left the help unreachable dead code)
  if [ -z "${_bucket}" ]; then
    _help
    _error_exit "--bucket parameter not specified"
  fi
  if [ -z "${_srcdir}" ]; then
    _help
    _error_exit "--srcdir parameter not specified"
  fi

  # initialize optional parameters
  if [ -z "${AWS_PROFILE}" ] && [ -z "${_profile}" ]; then
    _info "--profile parameter not specified, using 'default'"
  elif [ -n "${_profile}" ]; then
    # Intentionally stored as "--profile <name>" and expanded UNQUOTED below so
    # it word-splits into two aws CLI arguments.
    _profile="--profile ${_profile}"
  fi
  if [ -z "${_region}" ]; then
    _info "--region parameter not specified, using 'us-east-1'"
    _region="us-east-1"
  fi
  if [ -z "${_scope}" ]; then
    _info "--scope parameter not specified, no scope will be used"
    _scope=""
  fi

  # check bucket or create it
  aws ${_profile} s3api head-bucket --bucket "${_bucket}" --region "${_region}"
  if [ $? -ne 0 ]; then
    _info "Bucket ${_bucket} do not exist, trying to create it"
    aws ${_profile} s3api create-bucket --bucket "${_bucket}" --region "${_region}"
    if [ $? -ne 0 ]; then
      _error_exit "Unable to create bucket ${_bucket}"
    fi
  fi

  # Detect the version from the `version = "X.Y.Z"` line of setup.py.
  _version=$(grep "version = \"" "${_srcdir}/setup.py" |awk '{print $3}' | tr -d \")
  if [ -z "${_version}" ]; then
    _error_exit "Unable to detect node version, are you in the right directory?"
  fi
  _info "Detected version ${_version}"

  # Create archive from a stash of the current working tree; `git stash create`
  # prints nothing on a clean tree, so fall back to HEAD.
  _cwd=$(pwd)
  pushd "${_srcdir}" > /dev/null || exit
  _stashName=$(git stash create)
  git archive --format tar --prefix="aws-parallelcluster-node-${_version}/" "${_stashName:-HEAD}" | gzip > "${_cwd}/aws-parallelcluster-node-${_version}.tgz"
  popd > /dev/null || exit
  md5sum "aws-parallelcluster-node-${_version}.tgz" > "aws-parallelcluster-node-${_version}.md5"

  # upload package
  _key_path="parallelcluster/${_version}/node"
  if [ -n "${_scope}" ]; then
    _key_path="${_key_path}/${_scope}"
  fi
  aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-node-${_version}.tgz s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.tgz || _error_exit 'Failed to push node to S3'
  aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-node-${_version}.md5 s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.md5 || _error_exit 'Failed to push node md5 to S3'
  aws ${_profile} --region "${_region}" s3api head-object --bucket ${_bucket} --key ${_key_path}/aws-parallelcluster-node-${_version}.tgz --output text --query LastModified > aws-parallelcluster-node-${_version}.tgz.date || _error_exit 'Failed to fetch LastModified date'
  aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-node-${_version}.tgz.date s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.tgz.date || _error_exit 'Failed to push node date'

  # Buckets in us-east-1 report LocationConstraint "None"; quote the expansion
  # so `[` does not break when the lookup returns an empty string.
  _bucket_region=$(aws ${_profile} s3api get-bucket-location --bucket ${_bucket} --output text)
  if [ "${_bucket_region}" = "None" ]; then
    _bucket_region=""
  else
    _bucket_region=".${_bucket_region}"
  fi

  echo ""
  echo "Done. Add the following configuration to the pcluster create config file:"
  echo ""
  echo "DevSettings:"
  echo " NodePackage: s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.tgz"
}

main "$@"

# vim:syntax=sh
--------------------------------------------------------------------------------