├── .bandit.ini ├── .flake8 ├── .github ├── CODEOWNERS ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── bump_version.yml │ ├── changelog_enforcer.yml │ ├── ci.yml │ ├── codeql-analysis.yml │ ├── security_exclusions_checker.yml │ └── unsafe_patterns_checker.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .pylintrc ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── THIRD-PARTY-LICENSES.txt ├── requirements.txt ├── setup.py ├── src ├── __init__.py ├── aws │ ├── __init__.py │ ├── common.py │ └── ec2.py ├── common │ ├── __init__.py │ ├── ec2_utils.py │ ├── schedulers │ │ ├── __init__.py │ │ ├── slurm_commands.py │ │ └── slurm_reservation_commands.py │ ├── time_utils.py │ └── utils.py └── slurm_plugin │ ├── __init__.py │ ├── capacity_block_manager.py │ ├── cluster_event_publisher.py │ ├── clustermgtd.py │ ├── common.py │ ├── computemgtd.py │ ├── console_logger.py │ ├── fleet_manager.py │ ├── fleet_status_manager.py │ ├── instance_manager.py │ ├── logging │ ├── parallelcluster_clustermgtd_logging.conf │ ├── parallelcluster_computemgtd_logging.conf │ ├── parallelcluster_fleet_status_manager_logging.conf │ ├── parallelcluster_resume_logging.conf │ └── parallelcluster_suspend_logging.conf │ ├── resume.py │ ├── slurm_resources.py │ ├── suspend.py │ └── task_executor.py ├── tests ├── __init__.py ├── aws │ └── test_ec2.py ├── common.py ├── common │ ├── schedulers │ │ ├── __init__.py │ │ ├── test_slurm_commands.py │ │ ├── test_slurm_commands │ │ │ └── TestPartitionNodelistMapping │ │ │ │ ├── test_get_partition_nodelist_mapping │ │ │ │ └── slurm_dir │ │ │ │ │ └── etc │ │ │ │ │ └── pcluster │ │ │ │ │ └── parallelcluster_partition_nodelist_mapping.json │ │ │ │ └── test_get_partitions │ │ │ │ └── slurm_dir │ │ │ │ └── etc │ │ │ │ └── pcluster │ │ │ │ └── parallelcluster_partition_nodelist_mapping.json │ │ └── test_slurm_reservation_commands.py │ ├── test_ec2_utils.py │ ├── 
test_time_utils.py │ └── test_utils.py ├── conftest.py ├── requirements.txt └── slurm_plugin │ ├── __init__.py │ ├── slurm_resources │ ├── __init__.py │ └── test_slurm_resources.py │ ├── test_capacity_block_manager.py │ ├── test_cluster_event_publisher.py │ ├── test_clustermgtd.py │ ├── test_clustermgtd │ ├── TestClustermgtdConfig │ │ ├── test_config_comparison │ │ │ ├── config.conf │ │ │ └── config_modified.conf │ │ └── test_config_parsing │ │ │ ├── all_options.conf │ │ │ ├── default.conf │ │ │ └── health_check.conf │ └── test_manage_cluster_boto3 │ │ └── default.conf │ ├── test_common.py │ ├── test_common │ └── test_read_json │ │ ├── faulty.json │ │ └── standard.json │ ├── test_computemgtd.py │ ├── test_computemgtd │ └── test_computemgtd_config │ │ ├── all_options.conf │ │ └── default.conf │ ├── test_console_logger.py │ ├── test_fleet_manager.py │ ├── test_fleet_manager │ └── TestEc2CreateFleetManager │ │ └── test_evaluate_launch_params │ │ ├── all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet-multi-az-multi-it-all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet-multi-az-multi-it │ │ └── expected_launch_params.json │ │ ├── fleet-multi-az-single-it-all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet-single-az-multi-it-all_or_nothing │ │ └── expected_launch_params.json │ │ ├── fleet_capacity_block │ │ └── expected_launch_params.json │ │ ├── fleet_ondemand │ │ └── expected_launch_params.json │ │ ├── fleet_spot │ │ └── expected_launch_params.json │ │ └── launch_overrides │ │ └── expected_launch_params.json │ ├── test_fleet_status_manager.py │ ├── test_fleet_status_manager │ ├── test_fleet_status_manager_config │ │ ├── all_options.conf │ │ └── default.conf │ └── test_get_computefleet_status │ │ ├── correct_status.json │ │ ├── malformed_status │ │ ├── no_status.json │ │ └── wrong_status.json │ ├── test_instance_manager.py │ ├── test_resume.py │ ├── test_resume │ ├── test_get_slurm_resume │ │ ├── malformed.json │ │ └── 
resume.json │ └── test_resume_config │ │ ├── all_options.conf │ │ └── default.conf │ ├── test_suspend.py │ ├── test_suspend │ └── test_suspend_config │ │ ├── all_options.conf │ │ └── default.conf │ └── test_task_executor.py ├── tox.ini └── util ├── bump-version.sh ├── create-attribution-doc.sh └── upload-node.sh /.bandit.ini: -------------------------------------------------------------------------------- 1 | skips: [] 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # D105: Missing docstring in magic method 4 | D105, 5 | # D100: Missing docstring in public module 6 | D100, 7 | # D101: Missing docstring in public class 8 | D101, 9 | # D102: Missing docstring in public method 10 | D102, 11 | # D103: Missing docstring in public function 12 | D103, 13 | # D104: Missing docstring in public package 14 | D104, 15 | # D107: Missing docstring in __init__ 16 | D107, 17 | # W503: line break before binary operator => Conflicts with black style. 
18 | W503, 19 | # N818: exception name should be named with an Error suffix 20 | N818 21 | exclude = 22 | .tox, 23 | .git, 24 | .pytest_cache, 25 | docs/source/conf.py, 26 | build, 27 | dist, 28 | tests/fixtures/*, 29 | *.pyc, 30 | *.egg-info, 31 | .cache, 32 | .eggs 33 | max-complexity = 10 34 | max-line-length = 120 35 | import-order-style = google 36 | application-import-names = flake8 37 | format = %(cyan)s%(path)s%(reset)s:%(bold)s%(yellow)s%(row)d%(reset)s:%(bold)s%(green)s%(col)d%(reset)s: %(bold)s%(red)s%(code)s%(reset)s %(text)s 38 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @aws/aws-parallelcluster-admins @aws/aws-parallelcluster-maintainers 2 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | ### Description of changes 3 | * Describe *what* you're changing and *why* you're doing these changes. 4 | 5 | ### Tests 6 | * Describe the automated and/or manual tests executed to validate the patch. 7 | * Describe the added/modified tests. 8 | 9 | ### References 10 | * Link to impacted open issues. 11 | * Link to related PRs in other packages (i.e. cookbook, node). 12 | * Link to documentation useful to understand the changes. 13 | 14 | ### Checklist 15 | - Make sure you are pointing to **the right branch**. 16 | - If you're creating a patch for a branch other than `develop` add the branch name as prefix in the PR title (e\.g\. `[release-3.6]`). 17 | - Check all commits' messages are clear, describing what and why vs how. 18 | - Make sure **to have added unit tests or integration tests** to cover the new/modified code. 19 | - Check if documentation is impacted by this change. 
20 | 21 | Please review the [guidelines for contributing](../CONTRIBUTING.md) and [Pull Request Instructions](https://github.com/aws/aws-parallelcluster/wiki/Git-Pull-Request-Instructions). 22 | 23 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 24 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Documentation for all configuration options: 2 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | 4 | version: 2 5 | updates: 6 | - package-ecosystem: "pip" 7 | directory: "/" 8 | schedule: 9 | interval: "daily" 10 | -------------------------------------------------------------------------------- /.github/workflows/bump_version.yml: -------------------------------------------------------------------------------- 1 | # Bump Version workflow that is triggered manually 2 | name: Bump Version 3 | 4 | on: 5 | workflow_dispatch: 6 | # Inputs the workflow accepts. 7 | inputs: 8 | pcluster-version: 9 | description: 'The target version of ParallelCluster CLI' 10 | required: true 11 | type: string 12 | branch: 13 | description: 'The Github branch name' 14 | required: true 15 | type: string 16 | 17 | jobs: 18 | create-pull-requests: 19 | permissions: 20 | contents: write 21 | pull-requests: write 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | fetch-depth: 0 27 | ref: ${{ inputs.branch }} 28 | - name: Modify Code to Change version 29 | run: ./util/bump-version.sh ${{ inputs.pcluster-version }} 30 | 31 | - name: Create a Pull Request 32 | uses: peter-evans/create-pull-request@v6 33 | with: 34 | commit-message: 'Bump version to ${{ inputs.pcluster-version }}' 35 | title: 'Bump version to ${{ inputs.pcluster-version }}' 36 | body: | 37 | This PR contains a version bump. 
38 | Auto-generated by Github Action 39 | branch: versionbump${{ inputs.branch }}${{ inputs.pcluster-version }} 40 | delete-branch: true 41 | labels: skip-changelog-update 42 | -------------------------------------------------------------------------------- /.github/workflows/changelog_enforcer.yml: -------------------------------------------------------------------------------- 1 | name: Enforce Changelog Updates 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] 5 | 6 | jobs: 7 | # Enforces the update of a changelog file on every pull request 8 | changelog: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: dangoslen/changelog-enforcer@v1.4.0 13 | with: 14 | changeLogPath: CHANGELOG.md 15 | skipLabel: skip-changelog-update 16 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ParallelCluster CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | depcheck: 7 | runs-on: ubuntu-latest 8 | name: DepCheck 9 | steps: 10 | - uses: actions/checkout@v1 11 | - name: Dependency Check 12 | uses: dependency-check/Dependency-Check_Action@main 13 | with: 14 | path: '.' 
15 | format: 'HTML' 16 | project: 'aws-parallelcluster-node' 17 | - name: Upload Test results 18 | uses: actions/upload-artifact@master 19 | with: 20 | name: Depcheck report 21 | path: ${{github.workspace}}/reports 22 | build: 23 | runs-on: ${{ matrix.os }} 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | os: [ubuntu-latest] 28 | name: 29 | - Python 3.9 Tests 30 | - Python 3.10 Tests 31 | - Python 3.9 Tests Coverage 32 | - Code Checks 33 | include: 34 | - name: Python 3.9 Tests 35 | python: 3.9 36 | toxdir: cli 37 | toxenv: py39-nocov 38 | - name: Python 3.10 Tests 39 | python: '3.10' 40 | toxdir: cli 41 | toxenv: py310-nocov 42 | - name: Python 3.9 Tests Coverage 43 | python: 3.9 44 | toxdir: cli 45 | toxenv: py39-cov 46 | - name: Code Checks 47 | python: 3.9 48 | toxdir: cli 49 | toxenv: code-linters 50 | 51 | steps: 52 | - uses: actions/checkout@v2 53 | - name: Setup Python 54 | uses: actions/setup-python@v2 55 | with: 56 | python-version: ${{ matrix.python }} 57 | - name: Install Tox and any other packages 58 | run: pip install tox 59 | - name: Run Tox 60 | run: tox -e ${{ matrix.toxenv }} 61 | - name: Upload code coverage report to Codecov 62 | uses: codecov/codecov-action@v3 63 | if: ${{ endsWith(matrix.toxenv, '-cov') }} 64 | with: 65 | files: coverage.xml 66 | flags: unittests 67 | verbose: true 68 | shellcheck: 69 | name: Shellcheck 70 | runs-on: ubuntu-latest 71 | steps: 72 | - uses: actions/checkout@v2 73 | - name: Run ShellCheck 74 | uses: ludeeus/action-shellcheck@master 75 | with: 76 | severity: warning 77 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: '0 10 * * 2' 8 | 9 | jobs: 10 | analyze: 11 | name: Analyze 12 | runs-on: ubuntu-latest 13 | permissions: 14 | actions: read 15 | contents: read 16 | 
security-events: write 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | language: [ 'python' ] 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@v2 24 | - name: Initialize CodeQL 25 | uses: github/codeql-action/init@v2 26 | with: 27 | languages: ${{ matrix.language }} 28 | queries: +security-and-quality 29 | - name: Perform CodeQL Analysis 30 | uses: github/codeql-action/analyze@v2 31 | -------------------------------------------------------------------------------- /.github/workflows/security_exclusions_checker.yml: -------------------------------------------------------------------------------- 1 | name: Security Exclusions Checker 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] 5 | 6 | jobs: 7 | # Prevent security exclusions 8 | security-exclusions-check: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check PR 12 | uses: francesco-giordano/gh-pr-content-checker@v1.0.0 13 | with: 14 | diffDoesNotContainRegex: "\\bnosec\\b|\\bnosemgrep\\b" 15 | skipLabels: skip-security-exclusions-check 16 | -------------------------------------------------------------------------------- /.github/workflows/unsafe_patterns_checker.yml: -------------------------------------------------------------------------------- 1 | name: Unsafe Patterns Checker 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] 5 | 6 | jobs: 7 | # Prevent bad URL suffix 8 | bad-url-suffix-check: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check PR for Disallowed URL Suffixes 12 | uses: francesco-giordano/gh-pr-content-checker@v1.0.0 13 | with: 14 | diffDoesNotContainRegex: "amazonaws\\.com|amazonaws\\.com\\.cn|c2s\\.ic\\.gov|sc2s\\.sgov\\.gov" 15 | skipLabels: skip-bad-url-suffix-check 16 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | *.class 4 | *~ 5 | *# 6 | dist/ 7 | build/ 8 | *.egg-info/ 9 | .idea/ 10 | *.iml 11 | .DS_Store 12 | .tox/ 13 | .coverage 14 | coverage.xml 15 | assets/ 16 | report.html 17 | 18 | aws-parallelcluster-node-*.tgz 19 | aws-parallelcluster-node-*.md5 20 | aws-parallelcluster-node-*.date 21 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length=120 3 | known_third_party=assertpy,boto3,botocore,common,pytest,retrying,setuptools,slurm_plugin 4 | # 3 - Vertical Hanging Indent 5 | # from third_party import ( 6 | # lib1, 7 | # lib2, 8 | # lib3, 9 | # lib4, 10 | # ) 11 | multi_line_output=3 12 | include_trailing_comma=true 13 | profile=black -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-added-large-files 7 | - id: check-ast 8 | - id: check-executables-have-shebangs 9 | - id: check-json 10 | - id: check-merge-conflict 11 | - id: check-xml 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: detect-private-key 15 | - id: check-symlinks 16 | - id: end-of-file-fixer 17 | - id: pretty-format-json 18 | args: ['--autofix', '--indent=4'] 19 | - id: requirements-txt-fixer 20 | - id: mixed-line-ending 21 | args: ['--fix=no'] 22 | 23 | - repo: https://github.com/PyCQA/flake8 24 | rev: 6.0.0 25 | hooks: 26 | - id: flake8 27 | additional_dependencies: [flake8-docstrings, flake8-bugbear, flake8-colors, pep8-naming] 28 | 29 | - repo: https://github.com/timothycrosley/isort 30 | rev: 5.12.0 31 | hooks: 32 | - id: isort 33 | args: ['-rc', '-w 120'] 34 
| 35 | - repo: https://github.com/ambv/black 36 | rev: 23.1.0 37 | hooks: 38 | - id: black 39 | args: ['-l 120'] 40 | 41 | - repo: https://github.com/PyCQA/bandit 42 | rev: 1.7.4 43 | hooks: 44 | - id: bandit 45 | args: ['-r', '-c', '.bandit.ini', '--exclude', 'tests'] 46 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws/aws-parallelcluster-node/issues), or [recently closed](https://github.com/aws/aws-parallelcluster-node/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *develop* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-parallelcluster-node/labels/help%20wanted) issues is a great place to start. 
45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws/aws-parallelcluster-node/blob/develop/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | aws-parallelcluster-node 2 | Copyright 2014-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AWS ParallelCluster Node 2 | ======================== 3 | 4 | [![PyPI Version](https://img.shields.io/pypi/v/aws-parallelcluster-node)](https://pypi.org/project/aws-parallelcluster-node/) 5 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 6 | [![codecov](https://codecov.io/gh/aws/aws-parallelcluster-node/branch/develop/graph/badge.svg)](https://codecov.io/gh/aws/aws-parallelcluster-node) 7 | [![ParallelCluster CI](https://github.com/aws/aws-parallelcluster-node/workflows/ParallelCluster%20CI/badge.svg)](https://github.com/aws/aws-parallelcluster-node/actions) 8 | 9 | This repo contains the aws-parallelcluster-node package installed on the Amazon EC2 instances launched 10 | as part of AWS ParallelCluster. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.7.55 2 | retrying~=1.3 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | 12 | import os 13 | 14 | from setuptools import find_packages, setup 15 | 16 | # Utility function to read the README file. 17 | # Used for the long_description. It's nice, because now 1) we have a top level 18 | # README file and 2) it's easier to type in the README file than to put a raw 19 | # string in below ... 20 | 21 | 22 | def read(fname): 23 | path = os.path.join(os.path.dirname(__file__), fname) 24 | with open(path, "r") as file: 25 | return file.read() 26 | 27 | 28 | console_scripts = [ 29 | "slurm_resume = slurm_plugin.resume:main", 30 | "slurm_suspend = slurm_plugin.suspend:main", 31 | "slurm_fleet_status_manager = slurm_plugin.fleet_status_manager:main", 32 | "clustermgtd = slurm_plugin.clustermgtd:main", 33 | "computemgtd = slurm_plugin.computemgtd:main", 34 | ] 35 | version = "3.14.0" 36 | requires = ["boto3>=1.7.55", "retrying>=1.3.3"] 37 | 38 | setup( 39 | name="aws-parallelcluster-node", 40 | version=version, 41 | author="Amazon Web Services", 42 | description="aws-parallelcluster-node provides the scripts for an AWS ParallelCluster node.", 43 | url="https://github.com/aws/aws-parallelcluster-node", 44 | license="Apache License 2.0", 45 | packages=find_packages("src", exclude=["tests"]), 46 | package_dir={"": "src"}, 47 | python_requires=">=3.9", 48 | install_requires=requires, 49 | entry_points=dict(console_scripts=console_scripts), 50 | zip_safe=False, 51 | package_data={"slurm_plugin": ["logging/*.conf"]}, 52 | long_description=( 53 | "aws-parallelcluster-node is the python package installed on the Amazon EC2 instances launched " 54 | "as part of AWS ParallelCluster." 
55 | ), 56 | classifiers=[ 57 | "Development Status :: 5 - Production/Stable", 58 | "Environment :: Console", 59 | "Programming Language :: Python", 60 | "Topic :: Scientific/Engineering", 61 | "License :: OSI Approved :: Apache Software License", 62 | ], 63 | ) 64 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/aws/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | -------------------------------------------------------------------------------- /src/aws/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | import functools 13 | import logging 14 | import time 15 | from enum import Enum 16 | 17 | import boto3 18 | from botocore.config import Config 19 | from botocore.exceptions import BotoCoreError, ClientError, ParamValidationError 20 | 21 | LOGGER = logging.getLogger(__name__) 22 | 23 | 24 | class AWSClientError(Exception): 25 | """Error during execution of some AWS calls.""" 26 | 27 | class ErrorCode(Enum): 28 | """Error codes for AWS ClientError.""" 29 | 30 | VALIDATION_ERROR = "ValidationError" 31 | REQUEST_LIMIT_EXCEEDED = "RequestLimitExceeded" 32 | THROTTLING_EXCEPTION = "ThrottlingException" 33 | CONDITIONAL_CHECK_FAILED_EXCEPTION = "ConditionalCheckFailedException" 34 | 35 | @classmethod 36 | def throttling_error_codes(cls): 37 | """Return a set of error codes returned when service rate limits are exceeded.""" 38 | return {cls.REQUEST_LIMIT_EXCEEDED.value, cls.THROTTLING_EXCEPTION.value} 39 | 40 | def __init__(self, function_name: str, message: str, error_code: str = None): 41 | super().__init__(message) 42 | self.message = message 43 | self.error_code = error_code 44 | self.function_name = function_name 45 | 46 | 47 | class LimitExceededError(AWSClientError): 48 | """Error caused 
by exceeding the limits of a downstream AWS service.""" 49 | 50 | def __init__(self, function_name: str, message: str, error_code: str = None): 51 | super().__init__(function_name=function_name, message=message, error_code=error_code) 52 | 53 | 54 | class BadRequestError(AWSClientError): 55 | """Error caused by a problem in the request.""" 56 | 57 | def __init__(self, function_name: str, message: str, error_code: str = None): 58 | super().__init__(function_name=function_name, message=message, error_code=error_code) 59 | 60 | 61 | class AWSExceptionHandler: 62 | """AWS Exception handler.""" 63 | 64 | @staticmethod 65 | def handle_client_exception(func): 66 | """Handle Boto3 errors, can be used as a decorator.""" 67 | 68 | @functools.wraps(func) 69 | def wrapper(*args, **kwargs): 70 | try: 71 | return func(*args, **kwargs) 72 | except ParamValidationError as validation_error: 73 | error = BadRequestError( 74 | func.__name__, 75 | "Error validating parameter. Failed with exception: {0}".format(str(validation_error)), 76 | ) 77 | except BotoCoreError as e: 78 | error = AWSClientError(func.__name__, str(e)) 79 | except ClientError as e: 80 | # add request id 81 | message = e.response["Error"]["Message"] 82 | error_code = e.response["Error"]["Code"] 83 | 84 | if error_code in AWSClientError.ErrorCode.throttling_error_codes(): 85 | error = LimitExceededError(func.__name__, message, error_code) 86 | elif error_code == AWSClientError.ErrorCode.VALIDATION_ERROR: 87 | error = BadRequestError(func.__name__, message, error_code) 88 | else: 89 | error = AWSClientError(func.__name__, message, error_code) 90 | LOGGER.error("Encountered error when performing boto3 call in %s: %s", error.function_name, error.message) 91 | raise error 92 | 93 | return wrapper 94 | 95 | @staticmethod 96 | def retry_on_boto3_throttling(func): 97 | """Retry boto3 calls on throttling, can be used as a decorator.""" 98 | 99 | @functools.wraps(func) 100 | def wrapper(*args, **kwargs): 101 | while True: 102 
| try: 103 | return func(*args, **kwargs) 104 | except ClientError as e: 105 | if e.response["Error"]["Code"] != "Throttling": 106 | raise 107 | LOGGER.debug("Throttling when calling %s function. Will retry in %d seconds.", func.__name__, 5) 108 | time.sleep(5) 109 | 110 | return wrapper 111 | 112 | 113 | def _log_boto3_calls(params, **kwargs): 114 | service = kwargs["event_name"].split(".")[-2] 115 | operation = kwargs["event_name"].split(".")[-1] 116 | region = kwargs["context"].get("client_region", boto3.session.Session().region_name) 117 | LOGGER.info( 118 | "Executing boto3 call: region=%s, service=%s, operation=%s, params=%s", region, service, operation, params 119 | ) 120 | 121 | 122 | class Boto3Client: 123 | """Boto3 client Class.""" 124 | 125 | def __init__(self, client_name: str, config: Config, region: str = None): 126 | region = region if region else get_region() 127 | self._client = boto3.client(client_name, region_name=region, config=config if config else None) 128 | self._client.meta.events.register("provide-client-params.*.*", _log_boto3_calls) 129 | 130 | def _paginate_results(self, method, **kwargs): 131 | """ 132 | Return a generator for a boto3 call, this allows pagination over an arbitrary number of responses. 
133 | 134 | :param method: boto3 method 135 | :param kwargs: arguments to method 136 | :return: generator with boto3 results 137 | """ 138 | paginator = self._client.get_paginator(method.__name__) 139 | for page in paginator.paginate(**kwargs).result_key_iters(): 140 | for result in page: 141 | yield result 142 | 143 | 144 | class Boto3Resource: 145 | """Boto3 resource Class.""" 146 | 147 | def __init__(self, resource_name: str): 148 | self._resource = boto3.resource(resource_name) 149 | self._resource.meta.client.meta.events.register("provide-client-params.*.*", _log_boto3_calls) 150 | 151 | 152 | def get_region(): 153 | """Get region used internally for all the AWS calls.""" 154 | region = boto3.session.Session().region_name 155 | if region is None: 156 | raise AWSClientError("get_region", "AWS region not configured") 157 | return region 158 | -------------------------------------------------------------------------------- /src/aws/ec2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | from typing import List 12 | 13 | from common.utils import ApiMocker 14 | 15 | from aws.common import AWSExceptionHandler, Boto3Client 16 | 17 | 18 | class CapacityReservationInfo: 19 | """ 20 | Data object wrapping the result of a describe-capacity-reservations call. 
21 | 22 | { 23 | "CapacityReservationId": "cr-123456", 24 | "OwnerId": "123", 25 | "CapacityReservationArn": "arn:aws:ec2:us-east-2:123:capacity-reservation/cr-123456", 26 | "AvailabilityZoneId": "use2-az1", 27 | "InstanceType": "t3.large", 28 | "InstancePlatform": "Linux/UNIX", 29 | "AvailabilityZone": "eu-west-1a", 30 | "Tenancy": "default", 31 | "TotalInstanceCount": 1, 32 | "AvailableInstanceCount": 1, 33 | "EbsOptimized": false, 34 | "EphemeralStorage": false, 35 | "State": "active", 36 | "StartDate": "2023-11-15T11:30:00+00:00", 37 | "EndDate": "2023-11-16T11:30:00+00:00", # capacity-block only 38 | "EndDateType": "limited", 39 | "InstanceMatchCriteria": "targeted", 40 | "CreateDate": "2023-10-25T20:40:13+00:00", 41 | "Tags": [ 42 | { 43 | "Key": "aws:ec2capacityreservation:incrementalRequestedQuantity", 44 | "Value": "1" 45 | }, 46 | { 47 | "Key": "aws:ec2capacityreservation:capacityReservationType", 48 | "Value": "capacity-block" 49 | } 50 | ], 51 | "CapacityAllocations": [], 52 | "ReservationType": "capacity-block" # capacity-block only 53 | } 54 | """ 55 | 56 | def __init__(self, capacity_reservation_data): 57 | self.capacity_reservation_data = capacity_reservation_data 58 | 59 | def capacity_reservation_id(self): 60 | """Return the id of the Capacity Reservation.""" 61 | return self.capacity_reservation_data.get("CapacityReservationId") 62 | 63 | def state(self): 64 | """Return the state of the Capacity Reservation.""" 65 | return self.capacity_reservation_data.get("State") 66 | 67 | def __eq__(self, other): 68 | return self.__dict__ == other.__dict__ 69 | 70 | 71 | class Ec2Client(Boto3Client): 72 | """Implement EC2 Boto3 client.""" 73 | 74 | def __init__(self, config=None, region=None): 75 | super().__init__("ec2", region=region, config=config) 76 | 77 | @AWSExceptionHandler.handle_client_exception 78 | @ApiMocker.mockable 79 | def describe_capacity_reservations(self, capacity_reservation_ids: List[str]) -> List[CapacityReservationInfo]: 80 | """Accept 
a space separated list of reservation ids. Return a list of CapacityReservationInfo.""" 81 | result = [] 82 | response = list( 83 | self._paginate_results( 84 | self._client.describe_capacity_reservations, 85 | CapacityReservationIds=capacity_reservation_ids, 86 | # ReservationType=reservation_type, # not yet available 87 | ) 88 | ) 89 | for capacity_reservation in response: 90 | result.append(CapacityReservationInfo(capacity_reservation)) 91 | 92 | return result 93 | -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/common/ec2_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. 10 | # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
11 | # See the License for the specific language governing permissions and limitations under the License. 12 | 13 | 14 | def get_private_ip_address_and_dns_name(instance_info): 15 | """ 16 | Return the PrivateIpAddress and PrivateDnsName of the EC2 instance. 17 | 18 | The PrivateIpAddress and PrivateDnsName are considered to be the ones for the 19 | network interface with DeviceIndex = NetworkCardIndex = 0. 20 | :param instance_info: the dictionary returned by a EC2:DescribeInstances call. 21 | :return: the PrivateIpAddress and PrivateDnsName of the instance. 22 | """ 23 | private_ip = instance_info["PrivateIpAddress"] 24 | private_dns_name = instance_info["PrivateDnsName"] 25 | all_private_ips = [private_ip] 26 | for network_interface in instance_info["NetworkInterfaces"]: 27 | all_private_ips.append(network_interface.get("PrivateIpAddress", private_ip)) 28 | attachment = network_interface["Attachment"] 29 | if attachment.get("DeviceIndex", -1) == 0 and attachment.get("NetworkCardIndex", -1) == 0: 30 | private_ip = network_interface.get("PrivateIpAddress", private_ip) 31 | private_dns_name = network_interface.get("PrivateDnsName", private_dns_name) 32 | return private_ip, private_dns_name, set(all_private_ips) 33 | -------------------------------------------------------------------------------- /src/common/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/common/schedulers/slurm_reservation_commands.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | import logging 12 | 13 | # A nosec comment is appended to the following line in order to disable the B404 check. 14 | # In this file the input of the module subprocess is trusted. 15 | import subprocess # nosec B404 16 | from datetime import datetime 17 | from typing import List, Union 18 | 19 | from common.schedulers.slurm_commands import DEFAULT_SCONTROL_COMMAND_TIMEOUT, SCONTROL 20 | from common.utils import ( 21 | SlurmCommandError, 22 | SlurmCommandErrorHandler, 23 | check_command_output, 24 | run_command, 25 | validate_subprocess_argument, 26 | ) 27 | from retrying import retry 28 | from slurm_plugin.slurm_resources import SlurmReservation 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | SCONTROL_SHOW_RESERVATION_OUTPUT_AWK_PARSER = ( 34 | 'awk \'BEGIN{{RS="\\n\\n" ; ORS="######\\n";}} {{print}}\' | ' 35 | + "grep -oP '^(ReservationName=\\S+)|(? List[SlurmReservation]: 246 | """ 247 | List existing slurm reservations with scontrol call. 
248 | 249 | The output of the command is something like the following: 250 | $ scontrol show reservations 251 | ReservationName=root_7 StartTime=2023-10-25T09:46:49 EndTime=2024-10-24T09:46:49 Duration=365-00:00:00 252 | Nodes=queuep4d-dy-crp4d-[1-5] NodeCnt=5 CoreCnt=480 Features=(null) PartitionName=(null) Flags=MAINT,SPEC_NODES 253 | TRES=cpu=480 254 | Users=root Groups=(null) Accounts=(null) Licenses=(null) State=ACTIVE BurstBuffer=(null) Watts=n/a 255 | MaxStartDelay=(null) 256 | 257 | Official documentation is https://slurm.schedmd.com/reservations.html 258 | """ 259 | # awk is used to replace the \n\n record separator with '######\n' 260 | show_reservations_command = f"{SCONTROL} show reservations | {SCONTROL_SHOW_RESERVATION_OUTPUT_AWK_PARSER}" 261 | slurm_reservations_info = check_command_output( 262 | show_reservations_command, raise_on_error=raise_on_error, timeout=command_timeout, shell=True 263 | ) # nosec B604 264 | 265 | return _parse_reservations_info(slurm_reservations_info) 266 | 267 | 268 | def _parse_reservations_info(slurm_reservations_info: str) -> List[SlurmReservation]: 269 | """Parse slurm reservations info into SlurmReservation objects.""" 270 | # $ /opt/slurm/bin/scontrol show reservations awk 'BEGIN{{RS="\n\n" ; ORS="######\n";}} {{print}}' | 271 | # grep -oP '^(ReservationName=\S+)|(?= grace_time 300 | 301 | 302 | def read_json(file_path, default=None): 303 | """Read json file into a dict.""" 304 | try: 305 | with open(file_path) as mapping_file: 306 | return json.load(mapping_file) 307 | except Exception as e: 308 | if default is None: 309 | log.error("Unable to read file from '%s'. Failed with exception: %s", file_path, e) 310 | raise 311 | else: 312 | if not isinstance(e, FileNotFoundError): 313 | log.info("Unable to read file '%s' due to an exception: %s. 
Using default: %s", file_path, e, default) 314 | return default 315 | 316 | 317 | def validate_subprocess_argument(argument): 318 | """ 319 | Validate an argument used to build a subprocess command. 320 | 321 | The validation is done forcing the encoding to be the standard 322 | Python Unicode / UTF-8 and searching for forbidden patterns. 323 | 324 | :param argument: an argument string to validate 325 | :raise: Exception if the argument contains a forbidden pattern 326 | :return: True if the argument does not contain forbidden patterns 327 | """ 328 | forbidden_patterns = ["&", "|", ";", "$", ">", "<", "`", "\\", "!", "#", "\n"] 329 | 330 | # Forcing the encoding to be the standard Python Unicode / UTF-8 331 | # https://docs.python.org/3/howto/unicode.html 332 | # https://docs.python.org/3/library/codecs.html#standard-encodings 333 | _argument = (str(argument).encode("utf-8", "ignore")).decode() 334 | 335 | if any(pattern in _argument for pattern in forbidden_patterns): 336 | raise ValueError("Value of provided argument contains at least a forbidden pattern") 337 | return True 338 | 339 | 340 | def validate_absolute_path(path): 341 | """ 342 | Validate if a path string represents is a valid absolute path. 
343 | 344 | :param path: path to validate 345 | :raise: Exception if the path is not a valid absolute path 346 | :return: True if the path is a valid absolute path 347 | """ 348 | if not os.path.isabs(path): 349 | raise ValueError(f"The path {path} is not a valid absolute path") 350 | return True 351 | 352 | 353 | @contextlib.contextmanager 354 | def setup_logging_filter(logger: logging.Logger, custom_field: str): 355 | """Set up a custom logging filter and remove it at the end of the context.""" 356 | 357 | class CustomFilter(logging.Filter): 358 | def __init__(self, custom_field: str): 359 | super().__init__() 360 | self.field = custom_field 361 | self.value = None 362 | 363 | def set_custom_value(self, custom_value: str): 364 | self.value = custom_value 365 | 366 | def filter(self, record: logging.LogRecord) -> bool: 367 | if self.value: 368 | record.msg = f"{self.field} {self.value} - {record.msg}" 369 | return True 370 | 371 | custom_filter = CustomFilter(custom_field) 372 | logger.addFilter(custom_filter) 373 | try: 374 | yield custom_filter 375 | finally: 376 | # Remove the custom log filter 377 | logger.removeFilter(custom_filter) 378 | 379 | 380 | class ApiMocker: 381 | """API mocker.""" 382 | 383 | @staticmethod 384 | def mockable(func): 385 | """ 386 | Try to mock passed function by searching for an overrides.py file in the same path of the given func. 387 | 388 | This function can be used a decorator and applied any method. 389 | 390 | The function will check if a function called with the name of the given function exists 391 | in the /overrides.py, and if it does, the function will execute it. 392 | 393 | E.g. if the method with ApiMocker.mockable decorator is defined in Ec2Client class 394 | of the ${node_virtualenv_path}/aws/ec2.py module, the mocked function should be defined 395 | in the ${node_virtualenv_path}/aws/overrides.py file. 
396 | """ 397 | 398 | def wrapper(*args, **kwargs): 399 | try: 400 | function_name = func.__name__ 401 | # retrieve parent module of the given function that has the ApiMocker.mockable decorator 402 | func_module = func.__module__ 403 | func_parent_module = func_module[: func_module.rindex(".")] 404 | # try to import overrides.py module in the same folder of the module to mock 405 | overrides_module = __import__(f"{func_parent_module}.overrides", fromlist=function_name) 406 | overrided_func = getattr(overrides_module, function_name) 407 | log.info("Calling %s override with args: %s and kwargs: %s", function_name, args, kwargs) 408 | result = overrided_func(*args, **kwargs) 409 | except (ImportError, AttributeError): 410 | result = func(*args, **kwargs) 411 | return result 412 | 413 | return wrapper 414 | -------------------------------------------------------------------------------- /src/slurm_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /src/slurm_plugin/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | 13 | import functools 14 | import logging 15 | from concurrent.futures import Future 16 | from datetime import datetime 17 | from enum import Enum 18 | from typing import Callable, Optional, Protocol, TypedDict 19 | 20 | from common.utils import check_command_output, time_is_up, validate_absolute_path 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | # timestamp used by clustermgtd and computemgtd should be in default ISO format 25 | # YYYY-MM-DDTHH:MM:SS.ffffff+HH:MM[:SS[.ffffff]] 26 | TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z" 27 | DEFAULT_COMMAND_TIMEOUT = 30 28 | 29 | ComputeInstanceDescriptor = TypedDict( 30 | "ComputeInstanceDescriptor", 31 | { 32 | "Name": str, 33 | "InstanceId": str, 34 | }, 35 | ) 36 | 37 | 38 | class ScalingStrategy(Enum): 39 | ALL_OR_NOTHING = "all-or-nothing" 40 | BEST_EFFORT = "best-effort" 41 | GREEDY_ALL_OR_NOTHING = "greedy-all-or-nothing" 42 | 43 | @classmethod 44 | def _missing_(cls, strategy): 45 | # Ref: https://docs.python.org/3/library/enum.html#enum.Enum._missing_ 46 | _strategy = str(strategy).lower() 47 | for member in cls: 48 | if member.value == _strategy: 49 | return member 50 | return cls.ALL_OR_NOTHING # Default to all-or-nothing 51 | 52 | def __str__(self): 53 | return str(self.value) 54 | 55 | 56 | class TaskController(Protocol): 57 | class TaskShutdownError(RuntimeError): 58 | """Exception raised if shutdown has been requested.""" 59 | 60 | pass 61 | 62 | def queue_task(self, task: Callable[[], None]) -> Optional[Future]: 63 | """Queue a task and returns a Future 
for the task or None if the task could not be queued.""" 64 | 65 | def is_shutdown(self) -> bool: 66 | """Return True if a shutdown has been requested.""" 67 | 68 | def raise_if_shutdown(self) -> None: 69 | """Raise an error if a shutdown has been requested.""" 70 | 71 | def wait_unless_shutdown(self, seconds_to_wait: float) -> None: 72 | """Wait for seconds_to_wait or will raise an error if a shutdown has been requested.""" 73 | 74 | def shutdown(self, wait: bool, cancel_futures: bool) -> None: 75 | """Request that all tasks be shutdown.""" 76 | 77 | 78 | def log_exception( 79 | logger, 80 | action_desc, 81 | log_level=logging.ERROR, 82 | catch_exception=Exception, 83 | raise_on_error=True, 84 | exception_to_raise=None, 85 | ): 86 | def _log_exception(function): 87 | @functools.wraps(function) 88 | def wrapper(*args, **kwargs): 89 | wrapped = None 90 | try: 91 | wrapped = function(*args, **kwargs) 92 | except catch_exception as e: 93 | logger.log(log_level, "Failed when %s with exception %s, message: %s", action_desc, type(e).__name__, e) 94 | if raise_on_error: 95 | if exception_to_raise: 96 | # preserve the exception message if the exception to raise is the same type as the actual exception 97 | raise e if isinstance(e, exception_to_raise) else exception_to_raise 98 | else: 99 | raise 100 | 101 | return wrapped 102 | 103 | return wrapper 104 | 105 | return _log_exception 106 | 107 | 108 | def print_with_count(resource_list): 109 | """Print resource list with the len of the list.""" 110 | if isinstance(resource_list, str): 111 | return resource_list 112 | resource_list = [str(elem) for elem in resource_list] 113 | return f"(x{len(resource_list)}) {str(resource_list)}" 114 | 115 | 116 | def get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path): 117 | """Get clustermgtd's last heartbeat.""" 118 | # Use subprocess based method to read shared file to prevent hanging when NFS is down 119 | # Do not copy to local.
Different users need to access the file, but file should be writable by root only 120 | # Only use last line of output to avoid taking unexpected output in stdout 121 | 122 | # Validation to sanitize the input argument and make it safe to use the function affected by B604 123 | validate_absolute_path(clustermgtd_heartbeat_file_path) 124 | 125 | heartbeat = ( 126 | check_command_output( 127 | f"cat {clustermgtd_heartbeat_file_path}", 128 | timeout=DEFAULT_COMMAND_TIMEOUT, 129 | shell=True, # nosec B604 130 | ) 131 | .splitlines()[-1] 132 | .strip() 133 | ) 134 | # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str 135 | # datetime.strptime will not work with str(datetime) 136 | # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00 137 | return datetime.strptime(heartbeat, TIMESTAMP_FORMAT) 138 | 139 | 140 | def expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout): 141 | """Test if clustermgtd heartbeat is expired.""" 142 | if time_is_up(last_heartbeat, current_time, clustermgtd_timeout): 143 | logger.error( 144 | "Clustermgtd has been offline since %s. Current time is %s. 
Timeout of %s seconds has expired!", 145 | last_heartbeat, 146 | current_time, 147 | clustermgtd_timeout, 148 | ) 149 | return True 150 | return False 151 | 152 | 153 | def is_clustermgtd_heartbeat_valid(current_time, clustermgtd_timeout, clustermgtd_heartbeat_file_path): 154 | try: 155 | last_heartbeat = get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path) 156 | logger.info("Latest heartbeat from clustermgtd: %s", last_heartbeat) 157 | return not expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout) 158 | except Exception as e: 159 | logger.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e) 160 | return False 161 | -------------------------------------------------------------------------------- /src/slurm_plugin/computemgtd.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | import logging 13 | import os 14 | import time 15 | from configparser import ConfigParser 16 | from datetime import datetime, timezone 17 | from io import StringIO 18 | from logging.config import fileConfig 19 | 20 | # A nosec comment is appended to the following line in order to disable the B404 check. 21 | # In this file the input of the module subprocess is trusted. 
22 | from subprocess import CalledProcessError # nosec B404 23 | 24 | from botocore.config import Config 25 | from common.schedulers.slurm_commands import get_nodes_info 26 | from common.time_utils import seconds 27 | from common.utils import check_command_output, run_command, sleep_remaining_loop_time, validate_absolute_path 28 | from retrying import retry 29 | from slurm_plugin.common import ( 30 | DEFAULT_COMMAND_TIMEOUT, 31 | expired_clustermgtd_heartbeat, 32 | get_clustermgtd_heartbeat, 33 | log_exception, 34 | ) 35 | from slurm_plugin.slurm_resources import CONFIG_FILE_DIR 36 | 37 | LOOP_TIME = 60 38 | RELOAD_CONFIG_ITERATIONS = 10 39 | # Computemgtd config is under /opt/slurm/etc/pcluster/.slurm_plugin/; all compute nodes share a config 40 | SLURM_PLUGIN_DIR = "/opt/slurm/etc/pcluster/.slurm_plugin" 41 | COMPUTEMGTD_CONFIG_PATH = f"{SLURM_PLUGIN_DIR}/parallelcluster_computemgtd.conf" 42 | log = logging.getLogger(__name__) 43 | 44 | 45 | class ComputemgtdConfig: 46 | DEFAULTS = { 47 | # Basic configs 48 | "max_retry": 1, 49 | "loop_time": LOOP_TIME, 50 | "proxy": "NONE", 51 | "disable_computemgtd_actions": False, 52 | "clustermgtd_timeout": 600, 53 | "slurm_nodename_file": os.path.join(CONFIG_FILE_DIR, "slurm_nodename"), 54 | "logging_config": os.path.join( 55 | os.path.dirname(__file__), "logging", "parallelcluster_computemgtd_logging.conf" 56 | ), 57 | } 58 | 59 | def __init__(self, config_file_path): 60 | self._get_config(config_file_path) 61 | 62 | def __repr__(self): 63 | attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) 64 | return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs) 65 | 66 | @log_exception(log, "reading computemgtd config", catch_exception=Exception, raise_on_error=True) 67 | def _get_config(self, config_file_path): 68 | """Get computemgtd configuration.""" 69 | log.info("Reading %s", config_file_path) 70 | config = ConfigParser() 71 | try: 72 | # 
Validation to sanitize the input argument and make it safe to use the function affected by B604 73 | validate_absolute_path(config_file_path) 74 | # Use subprocess based method to copy shared file to local to prevent hanging when NFS is down 75 | config_str = check_command_output( 76 | f"cat {config_file_path}", 77 | timeout=DEFAULT_COMMAND_TIMEOUT, 78 | shell=True, # nosec B604 79 | ) 80 | config.read_file(StringIO(config_str)) 81 | except Exception: 82 | log.error("Cannot read computemgtd configuration file: %s", config_file_path) 83 | raise 84 | 85 | # Get config settings 86 | self.region = config.get("computemgtd", "region") 87 | self.cluster_name = config.get("computemgtd", "cluster_name") 88 | # Configure boto3 to retry 1 times by default 89 | self._boto3_retry = config.getint("clustermgtd", "boto3_retry", fallback=self.DEFAULTS.get("max_retry")) 90 | self._boto3_config = {"retries": {"max_attempts": self._boto3_retry, "mode": "standard"}} 91 | self.loop_time = config.getint("computemgtd", "loop_time", fallback=self.DEFAULTS.get("loop_time")) 92 | self.clustermgtd_timeout = config.getint( 93 | "computemgtd", 94 | "clustermgtd_timeout", 95 | fallback=self.DEFAULTS.get("clustermgtd_timeout"), 96 | ) 97 | self.disable_computemgtd_actions = config.getboolean( 98 | "computemgtd", 99 | "disable_computemgtd_actions", 100 | fallback=self.DEFAULTS.get("disable_computemgtd_actions"), 101 | ) 102 | self.clustermgtd_heartbeat_file_path = config.get("computemgtd", "clustermgtd_heartbeat_file_path") 103 | self._slurm_nodename_file = config.get( 104 | "computemgtd", "slurm_nodename_file", fallback=self.DEFAULTS.get("slurm_nodename_file") 105 | ) 106 | self.nodename = ComputemgtdConfig._read_nodename_from_file(self._slurm_nodename_file) 107 | 108 | proxy = config.get("computemgtd", "proxy", fallback=self.DEFAULTS.get("proxy")) 109 | if proxy != "NONE": 110 | self._boto3_config["proxies"] = {"https": proxy} 111 | self.boto3_config = Config(**self._boto3_config) 112 | 
self.logging_config = config.get("computemgtd", "logging_config", fallback=self.DEFAULTS.get("logging_config")) 113 | # Log configuration 114 | log.info(self.__repr__()) 115 | 116 | @staticmethod 117 | def _read_nodename_from_file(nodename_file_path): 118 | """Read self nodename from a file.""" 119 | try: 120 | with open(nodename_file_path, "r") as nodename_file: 121 | nodename = nodename_file.read() 122 | return nodename 123 | except Exception as e: 124 | log.error("Unable to read self nodename from %s with exception: %s\n", nodename_file_path, e) 125 | raise 126 | 127 | 128 | @log_exception(log, "self terminating compute instance", catch_exception=CalledProcessError, raise_on_error=False) 129 | def _self_terminate(): 130 | """Self terminate the instance.""" 131 | # Sleep for 10 seconds so termination log entries are uploaded to CW logs 132 | log.info("Preparing to self terminate the instance in 10 seconds!") 133 | time.sleep(10) 134 | log.info("Self terminating instance now!") 135 | run_command("sudo shutdown -h now") 136 | 137 | 138 | @retry(stop_max_attempt_number=3, wait_fixed=1500) 139 | def _get_nodes_info_with_retry(nodes): 140 | return get_nodes_info(nodes) 141 | 142 | 143 | def _is_self_node_down(self_nodename): 144 | """ 145 | Check if self node is healthy according to the scheduler. 146 | 147 | Node is considered healthy if: 148 | 1. Node is not in DOWN 149 | 2. Node is not in POWER_SAVE 150 | Note: node that is incorrectly attached to the scheduler will be in DOWN* after SlurmdTimeout. 
151 | """ 152 | try: 153 | self_node = _get_nodes_info_with_retry(self_nodename)[0] 154 | log.info("Current self node state %s", self_node.__repr__()) 155 | if self_node.is_down() or self_node.is_power(): 156 | log.warning("Node is incorrectly attached to scheduler, preparing for self termination...") 157 | return True 158 | log.info("Node is correctly attached to scheduler, not terminating...") 159 | return False 160 | except Exception as e: 161 | # This could happen if slurmctld is down completely; fail closed and treat the node as down 162 | log.error("Unable to retrieve current node state from slurm with exception: %s\nConsidering node as down!", e) 163 | 164 | return True 165 | 166 | 167 | def _load_daemon_config(config_file):  # returns a ComputemgtdConfig parsed from config_file 168 | # Get program config 169 | computemgtd_config = ComputemgtdConfig(config_file) 170 | # Configure root logger; fall back to the default logging setup on any error 171 | try: 172 | fileConfig(computemgtd_config.logging_config, disable_existing_loggers=False) 173 | except Exception as e: 174 | log.warning( 175 | "Unable to configure logging from %s, using default logging settings.\nException: %s", 176 | computemgtd_config.logging_config, 177 | e, 178 | ) 179 | return computemgtd_config 180 | 181 | 182 | def _run_computemgtd(config_file): 183 | """Run computemgtd actions.""" 184 | # Initial default heartbeat time as computemgtd startup time 185 | last_heartbeat = datetime.now(tz=timezone.utc) 186 | log.info("Initializing clustermgtd heartbeat to be computemgtd startup time: %s", last_heartbeat) 187 | computemgtd_config = _load_daemon_config(config_file) 188 | reload_config_counter = RELOAD_CONFIG_ITERATIONS 189 | while True: 190 | # Get current time 191 | current_time = datetime.now(tz=timezone.utc) 192 | 193 | if reload_config_counter <= 0: 194 | try: 195 | computemgtd_config = _load_daemon_config(config_file) 196 | reload_config_counter = RELOAD_CONFIG_ITERATIONS 197 | except Exception as e: 198 | log.warning("Unable to reload daemon config, using previous one.\nException: %s", e) 199 | else: 200 | reload_config_counter -= 1 
201 | 202 | # Check heartbeat 203 | try: 204 | last_heartbeat = get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path) 205 | log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat) 206 | except Exception as e: 207 | log.warning( 208 | "Unable to retrieve clustermgtd heartbeat. Using last known heartbeat: %s with exception: %s", 209 | last_heartbeat, 210 | e, 211 | ) 212 | if expired_clustermgtd_heartbeat(last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout): 213 | if computemgtd_config.disable_computemgtd_actions: 214 | log.info("All computemgtd actions currently disabled") 215 | elif _is_self_node_down(computemgtd_config.nodename): 216 | _self_terminate() 217 | 218 | sleep_remaining_loop_time(computemgtd_config.loop_time, current_time) 219 | 220 | 221 | @retry(wait_fixed=seconds(LOOP_TIME)) 222 | def main(): 223 | logging.basicConfig( 224 | level=logging.INFO, format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s" 225 | ) 226 | log.info("Computemgtd Startup") 227 | try: 228 | clustermgtd_config_file = os.environ.get("CONFIG_FILE", COMPUTEMGTD_CONFIG_PATH) 229 | _run_computemgtd(clustermgtd_config_file) 230 | except Exception as e: 231 | log.exception("An unexpected error occurred: %s", e) 232 | raise 233 | 234 | 235 | if __name__ == "__main__": 236 | main() 237 | -------------------------------------------------------------------------------- /src/slurm_plugin/console_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the 5 | # License. A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. 
This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 10 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import logging 14 | import re 15 | from typing import Any, Callable, Iterable 16 | 17 | import boto3 18 | from slurm_plugin.common import ComputeInstanceDescriptor, TaskController 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class ConsoleLogger: 24 | """Class for retrieving and logging instance console output.""" 25 | 26 | def __init__(self, enabled: bool, region: str, console_output_consumer: Callable[[str, str, str], None]): 27 | self._region = region 28 | self._console_logging_enabled = enabled 29 | self._console_output_consumer = console_output_consumer 30 | self._boto3_client_factory = lambda service_name: boto3.session.Session().client( 31 | service_name, region_name=region 32 | ) 33 | 34 | def report_console_output_from_nodes( 35 | self, 36 | compute_instances: Iterable[ComputeInstanceDescriptor], 37 | task_controller: TaskController, 38 | task_wait_function: Callable[[], None], 39 | ): 40 | """Queue a task that will retrieve the console output for failed compute nodes.""" 41 | if not self._console_logging_enabled: 42 | return None 43 | 44 | # Only schedule a task if we have any compute_instances to query. We also need to realize any lazy instance ID 45 | # lookups before we schedule the task since the instance ID mapping may change after we return from this 46 | # call but before the task is executed. 
47 | compute_instances = tuple(compute_instances) 48 | if len(compute_instances) < 1: 49 | return None 50 | 51 | task = self._get_console_output_task( 52 | raise_if_shutdown=task_controller.raise_if_shutdown, 53 | task_wait_function=task_wait_function, 54 | client_factory=self._boto3_client_factory, 55 | compute_instances=compute_instances, 56 | ) 57 | 58 | return task_controller.queue_task(task) 59 | 60 | def _get_console_output_task( 61 | self, 62 | task_wait_function: Callable[[], None], 63 | raise_if_shutdown: Callable[[], None], 64 | client_factory: Callable[[str], Any], 65 | compute_instances: Iterable[ComputeInstanceDescriptor], 66 | ): 67 | def console_collector(): 68 | try: 69 | # Sleep to allow EC2 time to publish the console output after the node terminates. 70 | task_wait_function() 71 | ec2client = client_factory("ec2") 72 | 73 | for output in ConsoleLogger._get_console_output_from_nodes(ec2client, compute_instances): 74 | # If shutdown, raise an exception so that any interested threads will know 75 | # this task was not completed. 
76 | raise_if_shutdown() 77 | self._console_output_consumer( 78 | output.get("Name"), 79 | output.get("InstanceId"), 80 | output.get("ConsoleOutput"), 81 | ) 82 | except Exception as e: 83 | logger.error("Encountered exception while retrieving compute console output: %s", e) 84 | raise 85 | 86 | return console_collector 87 | 88 | @staticmethod 89 | def _get_console_output_from_nodes(ec2client, compute_instances): 90 | pattern = re.compile(r"\r\n|\n") 91 | for instance in compute_instances: 92 | instance_name = instance.get("Name") 93 | instance_id = instance.get("InstanceId") 94 | logger.info("Retrieving Console Output for node %s (%s)", instance_id, instance_name) 95 | response = ec2client.get_console_output(InstanceId=instance_id) 96 | output = response.get("Output") 97 | yield { 98 | "Name": instance_name, 99 | "InstanceId": instance_id, 100 | "ConsoleOutput": pattern.sub("\r", output) if output else None, 101 | } 102 | -------------------------------------------------------------------------------- /src/slurm_plugin/fleet_status_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | 12 | 13 | import argparse 14 | import json 15 | import logging 16 | import os 17 | import sys 18 | from configparser import ConfigParser 19 | from logging.config import fileConfig 20 | 21 | from botocore.config import Config 22 | from common.schedulers.slurm_commands import resume_powering_down_nodes, update_all_partitions 23 | from slurm_plugin.clustermgtd import ComputeFleetStatus, ComputeFleetStatusManager 24 | from slurm_plugin.common import log_exception 25 | from slurm_plugin.instance_manager import InstanceManager 26 | from slurm_plugin.slurm_resources import CONFIG_FILE_DIR, PartitionStatus 27 | 28 | log = logging.getLogger(__name__) 29 | 30 | 31 | class SlurmFleetManagerConfig: 32 | DEFAULTS = { 33 | "max_retry": 5, 34 | "terminate_max_batch_size": 1000, 35 | "proxy": "NONE", 36 | "logging_config": os.path.join( 37 | os.path.dirname(__file__), "logging", "parallelcluster_fleet_status_manager_logging.conf" 38 | ), 39 | } 40 | 41 | def __init__(self, config_file_path): 42 | self._get_config(config_file_path) 43 | 44 | def __repr__(self): 45 | attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) 46 | return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs) 47 | 48 | @log_exception(log, "reading fleet status manager configuration file", catch_exception=IOError, raise_on_error=True) 49 | def _get_config(self, config_file_path): 50 | """Get fleetmanager configuration.""" 51 | log.info("Reading %s", config_file_path) 52 | 53 | config = ConfigParser() 54 | try: 55 | config.read_file(open(config_file_path, "r")) 56 | except IOError: 57 | log.error("Cannot read slurm fleet manager configuration file: %s", config_file_path) 58 | raise 59 | 60 | self.region = config.get("slurm_fleet_status_manager", "region") 61 | self.cluster_name = config.get("slurm_fleet_status_manager", "cluster_name") 62 | self.terminate_max_batch_size = config.getint( 63 | "slurm_fleet_status_manager", 
64 | "terminate_max_batch_size", 65 | fallback=self.DEFAULTS.get("terminate_max_batch_size"), 66 | ) 67 | self._boto3_retry = config.getint( 68 | "slurm_fleet_status_manager", "boto3_retry", fallback=self.DEFAULTS.get("max_retry") 69 | ) 70 | self._boto3_config = {"retries": {"max_attempts": self._boto3_retry, "mode": "standard"}} 71 | proxy = config.get("slurm_fleet_status_manager", "proxy", fallback=self.DEFAULTS.get("proxy")) 72 | if proxy != "NONE": 73 | self._boto3_config["proxies"] = {"https": proxy} 74 | self.boto3_config = Config(**self._boto3_config) 75 | 76 | self.logging_config = config.get( 77 | "slurm_fleet_status_manager", "logging_config", fallback=self.DEFAULTS.get("logging_config") 78 | ) 79 | 80 | log.debug(self.__repr__()) 81 | 82 | 83 | def _manage_fleet_status_transition(config, computefleet_status_data_path): 84 | computefleet_status = _get_computefleet_status(computefleet_status_data_path) 85 | 86 | if ComputeFleetStatus.is_stop_requested(computefleet_status): 87 | _stop_partitions(config) 88 | elif ComputeFleetStatus.is_start_requested(computefleet_status): 89 | _start_partitions() 90 | 91 | 92 | def _start_partitions(): 93 | log.info("Setting slurm partitions to UP and resuming nodes...") 94 | update_all_partitions(PartitionStatus.UP, reset_node_addrs_hostname=False) 95 | # TODO: This function was added due to Slurm ticket 12915. The bug is not reproducible and the ticket was then 96 | # closed. This operation may now be useless: we need to check this. 
97 | resume_powering_down_nodes() 98 | 99 | 100 | def _stop_partitions(config): 101 | log.info("Setting slurm partitions to INACTIVE and terminating all compute nodes...") 102 | update_all_partitions(PartitionStatus.INACTIVE, reset_node_addrs_hostname=True) 103 | instance_manager = InstanceManager( 104 | config.region, 105 | config.cluster_name, 106 | config.boto3_config, 107 | ) 108 | instance_manager.terminate_all_compute_nodes(config.terminate_max_batch_size) 109 | 110 | 111 | def _get_computefleet_status(computefleet_status_data_path): 112 | try: 113 | with open(computefleet_status_data_path, "r", encoding="utf-8") as computefleet_status_data_file: 114 | computefleet_status = ComputeFleetStatus( 115 | json.load(computefleet_status_data_file).get(ComputeFleetStatusManager.COMPUTE_FLEET_STATUS_ATTRIBUTE) 116 | ) 117 | log.info("ComputeFleet status is: %s", computefleet_status) 118 | except Exception as e: 119 | log.error("Cannot read compute fleet status data file: %s.\nException: %s", computefleet_status_data_path, e) 120 | raise 121 | 122 | return computefleet_status 123 | 124 | 125 | def main(): 126 | default_log_file = "/var/log/parallelcluster/slurm_fleet_status_manager.log" 127 | logging.basicConfig( 128 | filename=default_log_file, 129 | level=logging.INFO, 130 | format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s", 131 | ) 132 | log.info("FleetManager startup.") 133 | args = _parse_arguments() 134 | try: 135 | config_file = os.environ.get( 136 | "CONFIG_FILE", os.path.join(CONFIG_FILE_DIR, "parallelcluster_slurm_fleet_status_manager.conf") 137 | ) 138 | fleet_status_manager_config = SlurmFleetManagerConfig(config_file) 139 | try: 140 | # Configure root logger 141 | fileConfig(fleet_status_manager_config.logging_config, disable_existing_loggers=False) 142 | except Exception as e: 143 | log.warning( 144 | "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s", 145 | 
fleet_status_manager_config.logging_config, 146 | default_log_file, 147 | e, 148 | ) 149 | log.info("FleetManager config: %s", fleet_status_manager_config) 150 | _manage_fleet_status_transition(fleet_status_manager_config, args.computefleet_status_data) 151 | log.info("FleetManager finished.") 152 | except Exception as e: 153 | log.exception("Encountered exception when running fleet manager: %s", e) 154 | sys.exit(1) 155 | 156 | 157 | def _parse_arguments(): 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument("-cf", "--computefleet-status-data", help="Path to compute fleet status data", required=True) 160 | args = parser.parse_args() 161 | return args 162 | 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_clustermgtd_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,computeConsole,events 3 | 4 | [handlers] 5 | keys=streamHandler,computeConsoleHandler,eventsHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter,computeConsoleFormatter,eventsFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=streamHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_streamHandler] 18 | class=StreamHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=(sys.stdout,) 22 | 23 | [logger_computeConsole] 24 | level=INFO 25 | handlers=computeConsoleHandler 26 | propagate=0 27 | qualname=slurm_plugin.clustermgtd.console_output 28 | 29 | [formatter_computeConsoleFormatter] 30 | format=%(asctime)s - %(message)s 31 | 32 | [handler_computeConsoleHandler] 33 | class=FileHandler 34 | level=INFO 35 | formatter=computeConsoleFormatter 36 | args=('/var/log/parallelcluster/compute_console_output.log', 'a', None, False) 37 | 38 | [logger_events] 39 | level=INFO 40 | 
handlers=eventsHandler 41 | propagate=0 42 | qualname=slurm_plugin.clustermgtd.events 43 | 44 | [formatter_eventsFormatter] 45 | format=%(message)s 46 | 47 | [handler_eventsHandler] 48 | class=FileHandler 49 | level=INFO 50 | formatter=eventsFormatter 51 | args=('/var/log/parallelcluster/clustermgtd.events', 'a', None, False) 52 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_computemgtd_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=streamHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=streamHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_streamHandler] 18 | class=StreamHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=(sys.stdout,) 22 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_fleet_status_manager_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=fileHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=fileHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_fileHandler] 18 | class=FileHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=("/var/log/parallelcluster/slurm_fleet_status_manager.log",) 22 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_resume_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,events 3 | 4 | [handlers] 5 
| keys=fileHandler,eventsHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter,eventsFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=fileHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - %(process)d - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_fileHandler] 18 | class=FileHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=("/var/log/parallelcluster/slurm_resume.log",) 22 | 23 | [logger_events] 24 | level=WARNING 25 | handlers=eventsHandler 26 | propagate=0 27 | qualname=slurm_plugin.resume.events 28 | 29 | [formatter_eventsFormatter] 30 | format=%(message)s 31 | 32 | [handler_eventsHandler] 33 | class=FileHandler 34 | level=WARNING 35 | formatter=eventsFormatter 36 | args=('/var/log/parallelcluster/slurm_resume.events', 'a', None, False) 37 | -------------------------------------------------------------------------------- /src/slurm_plugin/logging/parallelcluster_suspend_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=fileHandler 6 | 7 | [formatters] 8 | keys=defaultFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=fileHandler 13 | 14 | [formatter_defaultFormatter] 15 | format=%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s 16 | 17 | [handler_fileHandler] 18 | class=FileHandler 19 | level=INFO 20 | formatter=defaultFormatter 21 | args=("/var/log/parallelcluster/slurm_suspend.log",) 22 | -------------------------------------------------------------------------------- /src/slurm_plugin/resume.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | 13 | import argparse 14 | import logging 15 | import os 16 | from configparser import ConfigParser 17 | from datetime import datetime, timezone 18 | from logging.config import fileConfig 19 | 20 | from botocore.config import Config 21 | from common.schedulers.slurm_commands import get_nodes_info, set_nodes_down 22 | from common.utils import read_json 23 | from slurm_plugin.cluster_event_publisher import ClusterEventPublisher 24 | from slurm_plugin.common import ScalingStrategy, is_clustermgtd_heartbeat_valid, print_with_count 25 | from slurm_plugin.instance_manager import InstanceManager 26 | from slurm_plugin.slurm_resources import CONFIG_FILE_DIR 27 | 28 | log = logging.getLogger(__name__) 29 | event_logger = log.getChild("events") 30 | 31 | 32 | class SlurmResumeConfig: 33 | DEFAULTS = { 34 | "max_retry": 1, 35 | "launch_max_batch_size": 500, 36 | "assign_node_max_batch_size": 500, 37 | "terminate_max_batch_size": 1000, 38 | "update_node_address": True, 39 | "clustermgtd_timeout": 300, 40 | "proxy": "NONE", 41 | "logging_config": os.path.join(os.path.dirname(__file__), "logging", "parallelcluster_resume_logging.conf"), 42 | "hosted_zone": None, 43 | "dns_domain": None, 44 | "use_private_hostname": False, 45 | "run_instances_overrides": "/opt/slurm/etc/pcluster/run_instances_overrides.json", 46 | "create_fleet_overrides": "/opt/slurm/etc/pcluster/create_fleet_overrides.json", 47 | "fleet_config_file": "/etc/parallelcluster/slurm_plugin/fleet-config.json", 48 | "job_level_scaling": True, 49 | "scaling_strategy": "all-or-nothing", 50 | } 51 | 52 | def __init__(self, config_file_path): 53 | 
self._get_config(config_file_path) 54 | 55 | def __repr__(self): 56 | attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) 57 | return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs) 58 | 59 | def _get_config(self, config_file_path): 60 | """Get resume program configuration.""" 61 | log.info("Reading %s", config_file_path) 62 | 63 | config = ConfigParser() 64 | try: 65 | with open(config_file_path, "r") as config_file: 66 | config.read_file(config_file) 67 | except IOError: 68 | log.error("Cannot read slurm cloud bursting scripts configuration file: %s", config_file_path) 69 | raise 70 | 71 | self.region = config.get("slurm_resume", "region") 72 | self.cluster_name = config.get("slurm_resume", "cluster_name") 73 | self.dynamodb_table = config.get("slurm_resume", "dynamodb_table") 74 | self.hosted_zone = config.get("slurm_resume", "hosted_zone", fallback=self.DEFAULTS.get("hosted_zone")) 75 | self.dns_domain = config.get("slurm_resume", "dns_domain", fallback=self.DEFAULTS.get("dns_domain")) 76 | self.use_private_hostname = config.getboolean( 77 | "slurm_resume", "use_private_hostname", fallback=self.DEFAULTS.get("use_private_hostname") 78 | ) 79 | self.head_node_private_ip = config.get("slurm_resume", "head_node_private_ip") 80 | self.head_node_hostname = config.get("slurm_resume", "head_node_hostname") 81 | self.launch_max_batch_size = config.getint( 82 | "slurm_resume", "launch_max_batch_size", fallback=self.DEFAULTS.get("launch_max_batch_size") 83 | ) 84 | self.assign_node_max_batch_size = config.getint( 85 | "slurm_resume", "assign_node_max_batch_size", fallback=self.DEFAULTS.get("assign_node_max_batch_size") 86 | ) 87 | self.terminate_max_batch_size = config.getint( 88 | "slurm_resume", "terminate_max_batch_size", fallback=self.DEFAULTS.get("terminate_max_batch_size") 89 | ) 90 | self.update_node_address = config.getboolean( 91 | "slurm_resume", "update_node_address", 
fallback=self.DEFAULTS.get("update_node_address") 92 | ) 93 | self.scaling_strategy = config.get( 94 | "slurm_resume", "scaling_strategy", fallback=self.DEFAULTS.get("scaling_strategy") 95 | ) # TODO: Check if it's a valid scaling strategy before calling expensive downstream APIs 96 | self.job_level_scaling = config.getboolean( 97 | "slurm_resume", "job_level_scaling", fallback=self.DEFAULTS.get("job_level_scaling") 98 | ) 99 | fleet_config_file = config.get( 100 | "slurm_resume", "fleet_config_file", fallback=self.DEFAULTS.get("fleet_config_file") 101 | ) 102 | self.fleet_config = read_json(fleet_config_file) 103 | 104 | # run_instances_overrides_file and create_fleet_overrides_file contain a json with the following format: 105 | # { 106 | # "queue_name": { 107 | # "compute_resource_name": { 108 | # 109 | # }, 110 | # ... 111 | # }, 112 | # ... 113 | # } 114 | run_instances_overrides_file = config.get( 115 | "slurm_resume", "run_instances_overrides", fallback=self.DEFAULTS.get("run_instances_overrides") 116 | ) 117 | self.run_instances_overrides = read_json(run_instances_overrides_file, default={}) 118 | create_fleet_overrides_file = config.get( 119 | "slurm_resume", "create_fleet_overrides", fallback=self.DEFAULTS.get("create_fleet_overrides") 120 | ) 121 | self.create_fleet_overrides = read_json(create_fleet_overrides_file, default={}) 122 | 123 | self.clustermgtd_timeout = config.getint( 124 | "slurm_resume", 125 | "clustermgtd_timeout", 126 | fallback=self.DEFAULTS.get("clustermgtd_timeout"), 127 | ) 128 | self.clustermgtd_heartbeat_file_path = config.get("slurm_resume", "clustermgtd_heartbeat_file_path") 129 | 130 | # Configure boto3 to retry 1 times by default 131 | self._boto3_retry = config.getint("slurm_resume", "boto3_retry", fallback=self.DEFAULTS.get("max_retry")) 132 | self._boto3_config = {"retries": {"max_attempts": self._boto3_retry, "mode": "standard"}} 133 | proxy = config.get("slurm_resume", "proxy", fallback=self.DEFAULTS.get("proxy")) 134 | if 
proxy != "NONE": 135 | self._boto3_config["proxies"] = {"https": proxy} 136 | self.boto3_config = Config(**self._boto3_config) 137 | self.logging_config = config.get("slurm_resume", "logging_config", fallback=self.DEFAULTS.get("logging_config")) 138 | self.head_node_instance_id = config.get("slurm_resume", "instance_id", fallback="unknown") 139 | 140 | log.debug(self.__repr__()) 141 | 142 | 143 | def _handle_failed_nodes(node_list, reason="Failure when resuming nodes"): 144 | """ 145 | Fall back mechanism to handle failure when launching instances. 146 | 147 | When encountering a failure, want slurm to deallocate current nodes, 148 | and re-queue job to be run automatically by new nodes. 149 | To do this, set node to DOWN, so slurm will automatically re-queue job. 150 | Then set node to POWER_DOWN so suspend program will be run and node can be reset back to power saving. 151 | 152 | If this process is not done explicitly, slurm will wait until ResumeTimeout, 153 | then execute this process of setting nodes to DOWN then POWER_DOWN. 154 | To save time, should explicitly set nodes to DOWN in ResumeProgram so clustermgtd can maintain failed nodes. 155 | Clustermgtd will be responsible for running full DOWN -> POWER_DOWN process. 
156 | """ 157 | if node_list: 158 | try: 159 | log.info( 160 | "Setting following failed nodes into DOWN state %s with reason: %s", print_with_count(node_list), reason 161 | ) 162 | set_nodes_down(node_list, reason=reason) 163 | except Exception as e: 164 | log.error( 165 | "Failed to place nodes %s into DOWN for reason %s with exception: %s", 166 | print_with_count(node_list), 167 | reason, 168 | e, 169 | ) 170 | 171 | 172 | def _resume(arg_nodes, resume_config, slurm_resume): 173 | """Launch new EC2 nodes according to nodes requested by slurm.""" 174 | # Check heartbeat 175 | current_time = datetime.now(tz=timezone.utc) 176 | if not is_clustermgtd_heartbeat_valid( 177 | current_time, resume_config.clustermgtd_timeout, resume_config.clustermgtd_heartbeat_file_path 178 | ): 179 | log.error( 180 | "No valid clustermgtd heartbeat detected, clustermgtd is down!\n" 181 | "Please check clustermgtd log for error.\n" 182 | "Not launching nodes %s", 183 | arg_nodes, 184 | ) 185 | _handle_failed_nodes(arg_nodes) 186 | return 187 | log.info("Launching EC2 instances for the following Slurm nodes: %s", arg_nodes) 188 | node_list = [] 189 | node_list_with_status = [] 190 | for node in get_nodes_info(arg_nodes): 191 | node_list.append(node.name) 192 | node_list_with_status.append((node.name, node.state_string)) 193 | log.info("Current state of Slurm nodes to resume: %s", node_list_with_status) 194 | 195 | instance_manager = InstanceManager( 196 | region=resume_config.region, 197 | cluster_name=resume_config.cluster_name, 198 | boto3_config=resume_config.boto3_config, 199 | table_name=resume_config.dynamodb_table, 200 | hosted_zone=resume_config.hosted_zone, 201 | dns_domain=resume_config.dns_domain, 202 | use_private_hostname=resume_config.use_private_hostname, 203 | head_node_private_ip=resume_config.head_node_private_ip, 204 | head_node_hostname=resume_config.head_node_hostname, 205 | fleet_config=resume_config.fleet_config, 206 | 
def _resume(arg_nodes, resume_config, slurm_resume):
    """Launch new EC2 nodes according to nodes requested by slurm."""
    # Refuse to launch anything if clustermgtd is not alive: instances started
    # without the management daemon would be left unmanaged.
    now = datetime.now(tz=timezone.utc)
    if not is_clustermgtd_heartbeat_valid(
        now, resume_config.clustermgtd_timeout, resume_config.clustermgtd_heartbeat_file_path
    ):
        log.error(
            "No valid clustermgtd heartbeat detected, clustermgtd is down!\n"
            "Please check clustermgtd log for error.\n"
            "Not launching nodes %s",
            arg_nodes,
        )
        _handle_failed_nodes(arg_nodes)
        return
    log.info("Launching EC2 instances for the following Slurm nodes: %s", arg_nodes)
    nodes = list(get_nodes_info(arg_nodes))
    node_list = [node.name for node in nodes]
    node_list_with_status = [(node.name, node.state_string) for node in nodes]
    log.info("Current state of Slurm nodes to resume: %s", node_list_with_status)

    instance_manager = InstanceManager(
        region=resume_config.region,
        cluster_name=resume_config.cluster_name,
        boto3_config=resume_config.boto3_config,
        table_name=resume_config.dynamodb_table,
        hosted_zone=resume_config.hosted_zone,
        dns_domain=resume_config.dns_domain,
        use_private_hostname=resume_config.use_private_hostname,
        head_node_private_ip=resume_config.head_node_private_ip,
        head_node_hostname=resume_config.head_node_hostname,
        fleet_config=resume_config.fleet_config,
        run_instances_overrides=resume_config.run_instances_overrides,
        create_fleet_overrides=resume_config.create_fleet_overrides,
        job_level_scaling=resume_config.job_level_scaling,
    )
    instance_manager.add_instances(
        slurm_resume=slurm_resume,
        node_list=node_list,
        launch_batch_size=resume_config.launch_max_batch_size,
        assign_node_batch_size=resume_config.assign_node_max_batch_size,
        terminate_batch_size=resume_config.terminate_max_batch_size,
        update_node_address=resume_config.update_node_address,
        scaling_strategy=ScalingStrategy(resume_config.scaling_strategy),
    )
    # failed_nodes maps error code -> set of node names; flatten for reporting.
    failed_nodes = set().union(*instance_manager.failed_nodes.values())
    success_nodes = [node for node in node_list if node not in failed_nodes]
    if success_nodes:
        log.info("Successfully launched nodes %s", print_with_count(success_nodes))

    if failed_nodes:
        log.error(
            "Failed to launch following nodes, setting nodes to DOWN: %s",
            print_with_count(failed_nodes),
        )
        # Distinct loop variable on purpose: do not shadow the outer node_list.
        for error_code, failed_node_list in instance_manager.failed_nodes.items():
            _handle_failed_nodes(failed_node_list, reason=f"(Code:{error_code})Failure when resuming nodes")

        event_publisher = ClusterEventPublisher.create_with_default_publisher(
            event_logger,
            resume_config.cluster_name,
            "HeadNode",
            "slurm-resume",
            resume_config.head_node_instance_id,
        )
        event_publisher.publish_node_launch_events(instance_manager.failed_nodes)


def main():
    """ResumeProgram entry point: parse the requested node list and launch backing instances."""
    default_log_file = "/var/log/parallelcluster/slurm_resume.log"
    logging.basicConfig(
        filename=default_log_file,
        level=logging.INFO,
        format="%(asctime)s - %(process)d - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
    )
    log.info("ResumeProgram startup.")
    parser = argparse.ArgumentParser()
    parser.add_argument("nodes", help="Nodes to burst")
    args = parser.parse_args()
    try:
        default_config = os.path.join(CONFIG_FILE_DIR, "parallelcluster_slurm_resume.conf")
        resume_config = SlurmResumeConfig(os.environ.get("CONFIG_FILE", default_config))
        try:
            # Configure root logger
            fileConfig(resume_config.logging_config, disable_existing_loggers=False)
        except Exception as err:
            log.warning(
                "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
                resume_config.logging_config,
                default_log_file,
                err,
            )
        log.info("ResumeProgram config: %s", resume_config)

        _resume(args.nodes, resume_config, _get_slurm_resume())
        log.info("ResumeProgram finished.")
    except Exception as err:
        # Catch-all boundary: fail the requested nodes so Slurm re-queues their jobs.
        log.exception("Encountered exception when requesting instances for %s: %s", args.nodes, err)
        _handle_failed_nodes(args.nodes)


def _get_slurm_resume():
    """Read the SLURM_RESUME_FILE payload; return {} (and log at ERROR) when missing or unreadable."""
    slurm_resume = read_json(os.environ.get("SLURM_RESUME_FILE"), default={})
    log.log(logging.INFO if slurm_resume else logging.ERROR, "Slurm Resume File content: %s", slurm_resume)
    return slurm_resume


if __name__ == "__main__":
    main()
class SlurmSuspendConfig:
    """Typed view over the [slurm_suspend] section of the suspend configuration file."""

    DEFAULTS = {
        "clustermgtd_timeout": 300,
        "logging_config": os.path.join(os.path.dirname(__file__), "logging", "parallelcluster_suspend_logging.conf"),
    }

    def __init__(self, config_file_path):
        """Load configuration from *config_file_path*; re-raise IOError when the file cannot be read."""
        config_parser = ConfigParser()
        try:
            with open(config_file_path, "r") as config_file:
                config_parser.read_file(config_file)
        except IOError:
            log.error("Cannot read slurm cloud bursting scripts configuration file: %s", config_file_path)
            raise

        # How stale the clustermgtd heartbeat may be before it is considered dead.
        self.clustermgtd_timeout = config_parser.getint(
            "slurm_suspend",
            "clustermgtd_timeout",
            fallback=self.DEFAULTS.get("clustermgtd_timeout"),
        )
        # Mandatory option: no fallback, a missing value raises from configparser.
        self.clustermgtd_heartbeat_file_path = config_parser.get("slurm_suspend", "clustermgtd_heartbeat_file_path")
        self.logging_config = config_parser.get(
            "slurm_suspend", "logging_config", fallback=self.DEFAULTS.get("logging_config")
        )
        log.info(self.__repr__())


def main():
    """SuspendProgram entry point: log released nodes and verify clustermgtd is alive."""
    default_log_file = "/var/log/parallelcluster/slurm_suspend.log"
    logging.basicConfig(
        filename=default_log_file,
        level=logging.INFO,
        format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
    )
    log.info("SuspendProgram startup.")
    parser = argparse.ArgumentParser()
    parser.add_argument("nodes", help="Nodes to release")
    args = parser.parse_args()
    config_file = os.environ.get("CONFIG_FILE", os.path.join(CONFIG_FILE_DIR, "parallelcluster_slurm_suspend.conf"))
    suspend_config = SlurmSuspendConfig(config_file)
    try:
        # Switch to the full logging configuration; keep basicConfig output on failure.
        fileConfig(suspend_config.logging_config, disable_existing_loggers=False)
    except Exception as err:
        log.warning(
            "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
            suspend_config.logging_config,
            default_log_file,
            err,
        )

    log.info("Suspending following nodes. Clustermgtd will cleanup orphaned instances: %s", args.nodes)
    current_time = datetime.now(tz=timezone.utc)
    heartbeat_ok = is_clustermgtd_heartbeat_valid(
        current_time, suspend_config.clustermgtd_timeout, suspend_config.clustermgtd_heartbeat_file_path
    )
    if heartbeat_ok:
        log.info("SuspendProgram finished. Nodes will be available after SuspendTimeout")
    else:
        log.error(
            "No valid clustermgtd heartbeat detected, clustermgtd is down! "
            "Please check clustermgtd log for error.\n"
            "Nodes will be reset to POWER_SAVE state after SuspendTimeout. "
            "The backing EC2 instances may not be correctly terminated.\n"
            "Please check and terminate any orphaned instances in EC2!"
        )


if __name__ == "__main__":
    main()
class TaskExecutor:
    """Class for managing execution of asynchronous tasks.

    Tasks run on a bounded thread pool; a counting semaphore caps the number
    of tasks that may be queued or in flight at once (the "backlog").
    """

    class MaximumBacklogExceededError(RuntimeError):
        """Exception raised when a task can't be queued due to backlog."""

        def __init__(self, task, maximum_backlog):
            # BUGFIX: populate the base RuntimeError so str(e) is meaningful
            # (previously the exception carried no message at all).
            super().__init__(f"Task backlog exceeded the maximum of {maximum_backlog}")
            self.failed_task = task
            self.maximum_backlog = maximum_backlog

    def __init__(self, worker_pool_size, max_backlog):
        """Create an executor with `worker_pool_size` threads and at most `max_backlog` outstanding tasks."""
        self._max_backlog = max_backlog
        # One permit per task that may be queued or running; released on completion.
        self._executor_limit = Semaphore(max_backlog)
        self._shutdown_event = Event()
        self._executor_pool = ThreadPoolExecutor(max_workers=worker_pool_size)

    def is_shutdown(self) -> bool:
        """Return True once shutdown() has been invoked."""
        return self._shutdown_event.is_set()

    def raise_if_shutdown(self) -> None:
        """Raise TaskController.TaskShutdownError if the executor is shutting down."""
        if self.is_shutdown():
            raise TaskController.TaskShutdownError()

    def wait_unless_shutdown(self, seconds_to_wait: float) -> None:
        """Sleep up to `seconds_to_wait`; raise TaskShutdownError if shutdown happens first."""
        shutdown = self._shutdown_event.wait(seconds_to_wait)
        if shutdown:
            raise TaskController.TaskShutdownError()

    def queue_task(self, task: Callable[[], None]) -> Optional[Future]:
        """Submit `task` to the worker pool.

        Returns the task's Future, or None when `task` is falsy.
        Raises MaximumBacklogExceededError when the backlog is full, and
        TaskController.TaskShutdownError when the executor is shutting down.
        """

        def queue_executor_task_callback(semaphore, *args):
            # Return the backlog permit once the task completes (or is cancelled).
            semaphore.release()

        if task:
            self.raise_if_shutdown()

            if self._executor_limit.acquire(blocking=False):
                future = self._executor_pool.submit(task)
                future.add_done_callback(partial(queue_executor_task_callback, self._executor_limit))

                return future
            else:
                # BUGFIX: use the module logger rather than the root logger
                # (was `logging.error`, bypassing this module's configuration).
                logger.error(
                    "Unable to queue task due to exceeding backlog limit of %d",
                    self._max_backlog,
                )
                raise TaskExecutor.MaximumBacklogExceededError(task=task, maximum_backlog=self._max_backlog)

        return None

    def shutdown(self, wait: bool = False, cancel_futures: bool = True) -> None:
        """Shut the pool down, waking any wait_unless_shutdown() callers first."""
        if self._executor_pool:
            # Notify any waiters that we are shutting down
            self._shutdown_event.set()

            # `cancel_futures` parameter does not exist in python pre-3.9
            can_cancel = "cancel_futures" in inspect.getfullargspec(self._executor_pool.shutdown).kwonlyargs
            if can_cancel:
                self._executor_pool.shutdown(wait=wait, cancel_futures=cancel_futures)
            else:
                # BUGFIX: honor the caller's `wait` flag on pre-3.9 interpreters
                # (previously hard-coded to wait=False, silently returning before
                # in-flight tasks finished even when wait=True was requested).
                self._executor_pool.shutdown(wait=wait)
            self._executor_pool = None
This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | import os 12 | from collections import namedtuple 13 | 14 | import pytest 15 | from assertpy import assert_that 16 | 17 | from aws.common import AWSClientError 18 | from aws.ec2 import CapacityReservationInfo, Ec2Client 19 | 20 | MockedBoto3Request = namedtuple( 21 | "MockedBoto3Request", ["method", "response", "expected_params", "generate_error", "error_code"] 22 | ) 23 | # Set defaults for attributes of the namedtuple. Since fields with a default value must come after any fields without 24 | # a default, the defaults are applied to the rightmost parameters. In this case generate_error = False and 25 | # error_code = None 26 | MockedBoto3Request.__new__.__defaults__ = (False, None) 27 | 28 | 29 | @pytest.fixture() 30 | def boto3_stubber_path(): 31 | # we need to set the region in the environment because the Boto3ClientFactory requires it. 
@pytest.mark.parametrize("generate_error", [True, False])
def test_describe_capacity_reservations(boto3_stubber, generate_error):
    """Verify that describe_capacity_reservations behaves as expected."""
    dummy_message = "dummy error message"
    # On the error path the stubber returns the raw message; otherwise a normal payload.
    stubbed_response = dummy_message if generate_error else {"CapacityReservations": [FAKE_CAPACITY_BLOCK_INFO]}
    boto3_stubber(
        "ec2",
        [
            MockedBoto3Request(
                method="describe_capacity_reservations",
                expected_params={"CapacityReservationIds": [FAKE_CAPACITY_BLOCK_ID]},
                response=stubbed_response,
                generate_error=generate_error,
                error_code=None,
            )
        ],
    )
    if generate_error:
        with pytest.raises(AWSClientError, match=dummy_message):
            Ec2Client().describe_capacity_reservations(capacity_reservation_ids=[FAKE_CAPACITY_BLOCK_ID])
    else:
        result = Ec2Client().describe_capacity_reservations(capacity_reservation_ids=[FAKE_CAPACITY_BLOCK_ID])
        assert_that(result).is_equal_to([CapacityReservationInfo(FAKE_CAPACITY_BLOCK_INFO)])
# Boto3 request stub consumed by the boto3_stubber fixture. The two rightmost
# fields default (generate_error=False, error_code=None) so most tests only
# need to provide method, response and expected_params.
MockedBoto3Request = namedtuple(
    "MockedBoto3Request",
    ["method", "response", "expected_params", "generate_error", "error_code"],
    defaults=(False, None),
)


def read_text(path):
    """Return the full text content of the file at *path* (a pathlib.Path)."""
    with path.open() as file_handle:
        return file_handle.read()


def client_error(error_code):
    """Build a botocore ClientError carrying *error_code* for a dummy operation."""
    return ClientError({"Error": {"Code": error_code}}, "failed_operation")
"create-fleet", 71 | "Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}], 72 | "AllocationStrategy": "lowest-price", 73 | "CapacityType": "on-demand", 74 | "Networking": SINGLE_SUBNET, 75 | }, 76 | }, 77 | "queue5": { 78 | "c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]}, 79 | "fleet1": { 80 | "Api": "create-fleet", 81 | "Instances": [{"InstanceType": "t2.medium"}], 82 | "AllocationStrategy": "lowest-price", 83 | "CapacityType": "on-demand", 84 | "Networking": MULTIPLE_SUBNETS, 85 | }, 86 | }, 87 | "queue6": { 88 | "c5xlarge": {"Api": "run-instances", "Instances": [{"InstanceType": "c5.xlarge"}]}, 89 | "fleet1": { 90 | "Api": "create-fleet", 91 | "Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}], 92 | "AllocationStrategy": "lowest-price", 93 | "CapacityType": "on-demand", 94 | "Networking": MULTIPLE_SUBNETS, 95 | }, 96 | }, 97 | "queue-cb": { 98 | "run-instances-capacity-block": { 99 | "Api": "run-instances", 100 | "Instances": [{"InstanceType": "c5.xlarge"}], 101 | "CapacityType": "capacity-block", 102 | "Networking": SINGLE_SUBNET, 103 | "CapacityReservationId": "cr-123456", 104 | }, 105 | "fleet-capacity-block": { 106 | "Api": "create-fleet", 107 | "Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}], 108 | "CapacityType": "capacity-block", 109 | "Networking": SINGLE_SUBNET, 110 | "CapacityReservationId": "cr-234567", 111 | }, 112 | }, 113 | } 114 | 115 | LAUNCH_OVERRIDES = {} 116 | -------------------------------------------------------------------------------- /tests/common/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | -------------------------------------------------------------------------------- /tests/common/schedulers/test_slurm_commands/TestPartitionNodelistMapping/test_get_partition_nodelist_mapping/slurm_dir/etc/pcluster/parallelcluster_partition_nodelist_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": "test-st-cr1-[1-10],test-dy-cr2-[1-2]", 3 | "test2": "test2-st-cr1-[1-10],test2-dy-cr2-[1-2]" 4 | } 5 | -------------------------------------------------------------------------------- /tests/common/schedulers/test_slurm_commands/TestPartitionNodelistMapping/test_get_partitions/slurm_dir/etc/pcluster/parallelcluster_partition_nodelist_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": "test-st-cr1-[1-10],test-dy-cr2-[1-2]", 3 | "test2": "test2-st-cr1-[1-10],test2-dy-cr2-[1-2]" 4 | } 5 | -------------------------------------------------------------------------------- /tests/common/test_ec2_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. 10 | # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
11 | # See the License for the specific language governing permissions and limitations under the License. 12 | import pytest 13 | from assertpy import assert_that 14 | from common.ec2_utils import get_private_ip_address_and_dns_name 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "instance_info, expected_private_ip, expected_private_dns_name, expected_all_private_ips", 19 | [ 20 | ( 21 | { 22 | "InstanceId": "i-12345", 23 | "InstanceType": "c5.xlarge", 24 | "PrivateIpAddress": "ip.1.0.0.1", 25 | "PrivateDnsName": "ip-1-0-0-1", 26 | "NetworkInterfaces": [ 27 | { 28 | "Attachment": { 29 | "DeviceIndex": 0, 30 | "NetworkCardIndex": 0, 31 | }, 32 | "PrivateIpAddress": "ip.1.0.0.1", 33 | "PrivateDnsName": "ip-1-0-0-1", 34 | }, 35 | ], 36 | }, 37 | "ip.1.0.0.1", 38 | "ip-1-0-0-1", 39 | {"ip.1.0.0.1"}, 40 | ), 41 | ( 42 | { 43 | "InstanceId": "i-12345", 44 | "InstanceType": "c5.xlarge", 45 | "PrivateIpAddress": "ip.1.0.0.1", 46 | "PrivateDnsName": "ip-1-0-0-1", 47 | "NetworkInterfaces": [ 48 | { 49 | "Attachment": { 50 | "DeviceIndex": 0, 51 | "NetworkCardIndex": 0, 52 | }, 53 | }, 54 | ], 55 | }, 56 | "ip.1.0.0.1", 57 | "ip-1-0-0-1", 58 | {"ip.1.0.0.1"}, 59 | ), 60 | ( 61 | { 62 | "InstanceId": "i-12345", 63 | "InstanceType": "c5.xlarge", 64 | "PrivateIpAddress": "ip.1.0.0.1", 65 | "PrivateDnsName": "ip-1-0-0-1", 66 | "NetworkInterfaces": [ 67 | { 68 | "Attachment": {}, 69 | }, 70 | ], 71 | }, 72 | "ip.1.0.0.1", 73 | "ip-1-0-0-1", 74 | {"ip.1.0.0.1"}, 75 | ), 76 | ( 77 | { 78 | "InstanceId": "i-12345", 79 | "InstanceType": "c5.xlarge", 80 | "PrivateIpAddress": "ip.1.0.0.1", 81 | "PrivateDnsName": "ip-1-0-0-1", 82 | "NetworkInterfaces": [ 83 | { 84 | "Attachment": { 85 | "DeviceIndex": 0, 86 | "NetworkCardIndex": 1, 87 | }, 88 | "PrivateIpAddress": "ip.1.0.0.1", 89 | "PrivateDnsName": "ip-1-0-0-1", 90 | }, 91 | { 92 | "Attachment": { 93 | "DeviceIndex": 0, 94 | "NetworkCardIndex": 0, 95 | }, 96 | "PrivateIpAddress": "ip.1.0.0.2", 97 | "PrivateDnsName": "ip-1-0-0-2", 98 | }, 
99 | ], 100 | }, 101 | "ip.1.0.0.2", 102 | "ip-1-0-0-2", 103 | {"ip.1.0.0.1", "ip.1.0.0.2"}, 104 | ), 105 | ( 106 | { 107 | "InstanceId": "i-12345", 108 | "InstanceType": "c5.xlarge", 109 | "PrivateIpAddress": "ip.1.0.0.1", 110 | "PrivateDnsName": "ip-1-0-0-1", 111 | "NetworkInterfaces": [ 112 | { 113 | "Attachment": { 114 | "DeviceIndex": 0, 115 | "NetworkCardIndex": 0, 116 | }, 117 | "PrivateIpAddress": "ip.1.0.0.1", 118 | "PrivateDnsName": "ip-1-0-0-1", 119 | }, 120 | { 121 | "Attachment": { 122 | "DeviceIndex": 0, 123 | "NetworkCardIndex": 1, 124 | }, 125 | "PrivateIpAddress": "ip.1.0.0.2", 126 | "PrivateDnsName": "ip-1-0-0-2", 127 | }, 128 | ], 129 | }, 130 | "ip.1.0.0.1", 131 | "ip-1-0-0-1", 132 | {"ip.1.0.0.1", "ip.1.0.0.2"}, 133 | ), 134 | ], 135 | ) 136 | def test_get_private_ip_address_and_dns_name( 137 | mocker, instance_info, expected_private_ip, expected_private_dns_name, expected_all_private_ips 138 | ): 139 | actual_private_ip, actual_private_dns_name, actual_all_private_ips = get_private_ip_address_and_dns_name( 140 | instance_info 141 | ) 142 | assert_that(actual_private_ip).is_equal_to(expected_private_ip) 143 | assert_that(actual_private_dns_name).is_equal_to(expected_private_dns_name) 144 | assert_that(actual_all_private_ips).is_equal_to(expected_all_private_ips) 145 | -------------------------------------------------------------------------------- /tests/common/test_time_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. 
# Each case is (input seconds, expected whole minutes): conversion truncates.
@pytest.mark.parametrize(
    "value_in_seconds, expected_output",
    [(0, 0), (12, 0), (60, 1), (66, 1), (1202, 20)],
)
def test_seconds_to_minutes(value_in_seconds, expected_output):
    """Verify seconds_to_minutes drops any partial minute."""
    result = seconds_to_minutes(value_in_seconds)
    assert_that(result).is_equal_to(expected_output)
@pytest.mark.parametrize(
    "source_object, chunk_size, expected_grouped_output",
    [
        ([1, 2, 3, 4, 5], 2, [(1, 2), (3, 4), (5,)]),
        ([1, 2, 3, 4, 5, 6], 3, [(1, 2, 3), (4, 5, 6)]),
        ({"A": 1, "B": 2, "C": 3}, 2, [("A", "B"), ("C",)]),
        ((1, 2, 3, 4, 5), 2, [(1, 2), (3, 4), (5,)]),
        ((1, 2, 3), 1, [(1,), (2,), (3,)]),
    ],
)
def test_grouper(source_object, chunk_size, expected_grouped_output):
    """grouper must yield chunk_size-sized tuples, with a shorter final chunk when needed."""
    grouped = list(utils.grouper(source_object, chunk_size))
    assert_that(grouped).is_equal_to(expected_grouped_output)
mocker.MagicMock(return_value=loop_end_time, spec=datetime.now) 87 | mocker.patch("common.utils.datetime", datetime_now_mock) 88 | 89 | utils.sleep_remaining_loop_time(loop_total_time, loop_start_time) 90 | 91 | if expected_sleep_time: 92 | sleep_mock.assert_called_with(expected_sleep_time) 93 | elif expected_sleep_time == 0: 94 | sleep_mock.assert_not_called() 95 | datetime_now_mock.now.assert_called_with(tz=timezone.utc) 96 | 97 | 98 | @pytest.mark.parametrize( 99 | "argument,raises_exception", 100 | [ 101 | ("standard parameter name", False), 102 | ("my/parameter", False), 103 | ("execute this & then this", True), 104 | ("redirect | my output", True), 105 | ("execute\nmultiline", True), 106 | ], 107 | ) 108 | def test_validate_subprocess_argument(argument, raises_exception): 109 | if raises_exception: 110 | with pytest.raises(ValueError): 111 | utils.validate_subprocess_argument(argument) 112 | else: 113 | assert_that(utils.validate_subprocess_argument(argument)).is_true() 114 | 115 | 116 | @pytest.mark.parametrize( 117 | "argument,raises_exception", 118 | [ 119 | ("/usr/my_path", False), 120 | ("./my_path", True), 121 | ("my_path", True), 122 | (".my_path", True), 123 | ], 124 | ) 125 | def test_validate_absolute_path(argument, raises_exception): 126 | if raises_exception: 127 | with pytest.raises(ValueError): 128 | utils.validate_absolute_path(argument) 129 | else: 130 | assert_that(utils.validate_absolute_path(argument)).is_true() 131 | 132 | 133 | @pytest.mark.parametrize( 134 | "raw_input, default, expected_output, expected_exception", 135 | [ 136 | ("", None, None, True), 137 | ("", {}, {}, True), 138 | ("{}", {}, {}, True), 139 | ("malformed", {}, {}, True), 140 | ("{malformed}", {}, {}, True), 141 | ( 142 | '{"jobs":[{"extra":null,"job_id":91,"features":null,"nodes_alloc":"q1-dy-c1-3","nodes_resume":"q1-dy-c1-3",' 143 | '"oversubscribe":"NO","partition":"q1","reservation":null}],"all_nodes_resume":"q1-dy-c1-3"}', 144 | {}, 145 | { 146 | 
"all_nodes_resume": "q1-dy-c1-3", 147 | "jobs": [ 148 | { 149 | "extra": None, 150 | "features": None, 151 | "job_id": 91, 152 | "nodes_alloc": "q1-dy-c1-3", 153 | "nodes_resume": "q1-dy-c1-3", 154 | "oversubscribe": "NO", 155 | "partition": "q1", 156 | "reservation": None, 157 | } 158 | ], 159 | }, 160 | False, 161 | ), 162 | ], 163 | ) 164 | def test_read_json(mocker, raw_input, default, expected_output, expected_exception, caplog): 165 | if default is not None: 166 | mocker.patch("builtins.open", mocker.mock_open(read_data=raw_input)) 167 | if expected_exception: 168 | assert_that(read_json(None, default=default)).is_equal_to(default) 169 | else: 170 | assert_that(read_json(None, default=default)).is_equal_to(expected_output) 171 | else: 172 | with pytest.raises(TypeError): 173 | read_json(None) 174 | assert_that(caplog.text).contains("Unable to read file") 175 | 176 | 177 | def test_custom_filter(caplog): 178 | logger = logging.getLogger(__name__) 179 | caplog.set_level(logging.INFO) 180 | 181 | logger.info("This is a log") 182 | assert_that(caplog.text).matches("This is a log") 183 | 184 | with utils.setup_logging_filter(logger, "CustomField") as custom_log_filter: 185 | custom_log_filter.set_custom_value("CustomValue") 186 | logger.info("This is a another log") 187 | assert_that(caplog.text).matches("CustomField CustomValue - This is a another log") 188 | 189 | caplog.clear() 190 | logger.info("This is another log with no filter") 191 | assert_that(caplog.text).matches("This is another log with no filter") 192 | assert_that(caplog.text).does_not_match("CustomField CustomValue") 193 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | import boto3 12 | import pytest 13 | from botocore.stub import Stubber 14 | 15 | 16 | @pytest.fixture() 17 | def test_datadir(request, datadir): 18 | """ 19 | Inject the datadir with resources for the specific test function. 20 | 21 | If the test function is declared in a class then datadir is ClassName/FunctionName 22 | otherwise it is only FunctionName. 23 | """ 24 | function_name = request.function.__name__ 25 | if not request.cls: 26 | return datadir / function_name 27 | 28 | class_name = request.cls.__name__ 29 | return datadir / "{0}/{1}".format(class_name, function_name) 30 | 31 | 32 | @pytest.fixture() 33 | def boto3_stubber(mocker, boto3_stubber_path): 34 | """ 35 | Create a function to easily mock boto3 clients. 36 | 37 | To mock a boto3 service simply pass the name of the service to mock and 38 | the mocked requests, where mocked_requests is an object containing the method to mock, 39 | the response to return and the expected params for the boto3 method that gets called. 40 | 41 | The function makes use of botocore.Stubber to mock the boto3 API calls. 42 | Multiple boto3 services can be mocked as part of the same test. 43 | 44 | :param boto3_stubber_path is the path of the boto3 import to mock. (e.g. pcluster.config.validators.boto3) 45 | """ 46 | __tracebackhide__ = True 47 | created_stubbers = [] 48 | mocked_clients = {} 49 | 50 | mocked_client_factory = mocker.patch(boto3_stubber_path, autospec=True) 51 | # use **kwargs to skip parameters passed to the boto3.client other than the "service" 52 | # e.g. 
boto3.client("ec2", region_name=region, ...) --> x = ec2 53 | mocked_client_factory.client.side_effect = lambda x, **kwargs: mocked_clients[x] 54 | 55 | def _boto3_stubber(service, mocked_requests): 56 | client = boto3.client(service) 57 | stubber = Stubber(client) 58 | # Save a ref to the stubber so that we can deactivate it at the end of the test. 59 | created_stubbers.append(stubber) 60 | 61 | # Attach mocked requests to the Stubber and activate it. 62 | if not isinstance(mocked_requests, list): 63 | mocked_requests = [mocked_requests] 64 | for mocked_request in mocked_requests: 65 | if mocked_request.generate_error: 66 | stubber.add_client_error( 67 | mocked_request.method, 68 | service_message=mocked_request.response, 69 | expected_params=mocked_request.expected_params, 70 | service_error_code=mocked_request.error_code, 71 | ) 72 | else: 73 | stubber.add_response( 74 | mocked_request.method, mocked_request.response, expected_params=mocked_request.expected_params 75 | ) 76 | stubber.activate() 77 | 78 | # Add stubber to the collection of mocked clients. This allows to mock multiple clients. 79 | # Mocking twice the same client will replace the previous one. 80 | mocked_clients[service] = client 81 | return client 82 | 83 | # yield allows to return the value and then continue the execution when the test is over. 84 | # Used for resources cleanup. 85 | yield _boto3_stubber 86 | 87 | # Assert that all mocked requests were consumed and deactivate all stubbers. 
88 | for stubber in created_stubbers: 89 | stubber.assert_no_pending_responses() 90 | stubber.deactivate() 91 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | assertpy 2 | pytest 3 | pytest-cov 4 | pytest-datadir 5 | pytest-html 6 | pytest-mock 7 | pytest-xdist 8 | retrying -------------------------------------------------------------------------------- /tests/slurm_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
11 | -------------------------------------------------------------------------------- /tests/slurm_plugin/slurm_resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-parallelcluster-node/3438c8fcc22bb818660c7d47875e8c3a8a52fa31/tests/slurm_plugin/slurm_resources/__init__.py -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_comparison/config.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_comparison/config_modified.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name-2 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_parsing/all_options.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-1 4 | heartbeat_file_path = /home/ubuntu/clustermgtd_heartbeat 5 | loop_time = 30 6 | boto3_retry = 10 7 | disable_all_cluster_management = true 8 | proxy = https://fake.proxy 9 | logging_config = /my/logging/config 10 | update_node_address = false 11 | launch_max_batch_size = 1 12 | 
terminate_max_batch_size = 500 13 | node_replacement_timeout = 10 14 | terminate_drain_nodes = false 15 | terminate_down_nodes = false 16 | orphaned_instance_timeout = 60 17 | disable_ec2_health_check = True 18 | disable_scheduled_event_health_check = True 19 | disable_all_health_checks = False 20 | health_check_timeout = 10 21 | dynamodb_table = table-name 22 | head_node_private_ip = head.node.ip 23 | head_node_hostname = head-node-hostname 24 | hosted_zone = hosted-zone 25 | dns_domain = dns.domain 26 | use_private_hostname = false 27 | protected_failure_count = 5 28 | insufficient_capacity_timeout = 50.5 29 | compute_console_logging_enabled = False 30 | compute_console_logging_max_sample_size = 50 31 | compute_console_wait_time: 10 32 | worker_pool_size: 2 33 | worker_pool_max_backlog: 5 34 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_parsing/default.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/TestClustermgtdConfig/test_config_parsing/health_check.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-1 4 | heartbeat_file_path = /home/ubuntu/clustermgtd_heartbeat 5 | loop_time = 30 6 | disable_all_cluster_management = true 7 | proxy = https://fake.proxy 8 | logging_config = /my/logging/config 9 | update_node_address = false 10 | launch_max_batch_size = 1 11 | terminate_max_batch_size = 500 12 | node_replacement_timeout = 10 13 | terminate_drain_nodes = false 14 | 
terminate_down_nodes = false 15 | orphaned_instance_timeout = 60 16 | disable_ec2_health_check = True 17 | disable_scheduled_event_health_check = True 18 | health_check_timeout = 10 19 | dynamodb_table = table-name 20 | head_node_private_ip = head.node.ip 21 | head_node_hostname = head-node-hostname 22 | hosted_zone = hosted-zone 23 | dns_domain = dns.domain 24 | use_private_hostname = false 25 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_clustermgtd/test_manage_cluster_boto3/default.conf: -------------------------------------------------------------------------------- 1 | [clustermgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | hosted_zone = hosted-zone 9 | dns_domain = dns.domain 10 | use_private_hostname = no 11 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
import logging
from datetime import datetime, timedelta, timezone

import pytest
from assertpy import assert_that
from common.utils import read_json, time_is_up
from slurm_plugin.common import TIMESTAMP_FORMAT, ScalingStrategy, get_clustermgtd_heartbeat


@pytest.mark.parametrize(
    "initial_time, current_time, grace_time, expected_result",
    [
        (datetime(2020, 1, 1, 0, 0, 0), datetime(2020, 1, 1, 0, 0, 29), 30, False),
        (datetime(2020, 1, 1, 0, 0, 0), datetime(2020, 1, 1, 0, 0, 30), 30, True),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour ahead of UTC, so this time stamp is actually 30 mins before initial_time
            datetime(2020, 1, 1, 0, 30, 0, tzinfo=timezone(timedelta(hours=1))),
            30 * 60,
            False,
        ),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour ahead of UTC, so this time stamp is actually 30 mins after initial_time
            datetime(2020, 1, 1, 1, 30, 0, tzinfo=timezone(timedelta(hours=1))),
            30 * 60,
            True,
        ),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour behind UTC, so this time stamp is actually 1.5 hrs after initial_time
            datetime(2020, 1, 1, 0, 30, 0, tzinfo=timezone(-timedelta(hours=1))),
            90 * 60,
            True,
        ),
        (
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
            # local timezone is one hour behind UTC, so this time stamp is actually 1 hr after initial_time
            datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone(-timedelta(hours=1))),
            90 * 60,
            False,
        ),
        (
            # a missing initial_time is treated as "time is up" — presumably the timer never started; confirm in utils
            None,
            datetime(2020, 1, 24, 23, 42, 12),
            180,
            True,
        ),
    ],
)
def test_time_is_up(initial_time, current_time, grace_time, expected_result):
    """time_is_up reports whether grace_time seconds have elapsed between the two timestamps."""
    assert_that(time_is_up(initial_time, current_time, grace_time)).is_equal_to(expected_result)


@pytest.mark.parametrize(
    "time, expected_parsed_time",
    [
        (
            datetime(2020, 7, 30, 19, 34, 2, 613338, tzinfo=timezone.utc),
            datetime(2020, 7, 30, 19, 34, 2, 613338, tzinfo=timezone.utc),
        ),
        (
            datetime(2020, 7, 30, 10, 1, 1, tzinfo=timezone(timedelta(hours=1))),
            datetime(2020, 7, 30, 10, 1, 1, tzinfo=timezone(timedelta(hours=1))),
        ),
    ],
)
def test_get_clustermgtd_heartbeat(time, expected_parsed_time, mocker):
    """The heartbeat is parsed from the last line of the command output, formatted with TIMESTAMP_FORMAT."""
    mocker.patch(
        "slurm_plugin.common.check_command_output",
        return_value=f"some_random_stdout\n{time.strftime(TIMESTAMP_FORMAT)}",
    )
    assert_that(get_clustermgtd_heartbeat("/some/file/path")).is_equal_to(expected_parsed_time)


@pytest.mark.parametrize(
    "json_file, default_value, raises_exception, message_in_log",
    [
        ("faulty.json", None, True, "Failed with exception"),
        ("faulty.json", {}, False, "due to an exception"),  # info message
        ("standard.json", None, False, None),
        ("non_existing.json", None, True, "Failed with exception"),
        ("non_existing.json", {}, False, None),  # info message not displayed
    ],
)
def test_read_json(test_datadir, caplog, json_file, default_value, raises_exception, message_in_log):
    """read_json raises without a default, falls back to the default otherwise, and logs accordingly."""
    caplog.set_level(logging.INFO)
    json_file_path = str(test_datadir.joinpath(json_file))
    if raises_exception:
        with pytest.raises((ValueError, FileNotFoundError)):
            read_json(json_file_path, default_value)
    else:
        read_json(json_file_path, default_value)

    if message_in_log:
        assert_that(caplog.text).matches(message_in_log)
    else:
        assert_that(caplog.text).does_not_match("exception")


@pytest.mark.parametrize(
    "strategy_as_value, expected_strategy_enum",
    [
        ("best-effort", ScalingStrategy.BEST_EFFORT),
        ("all-or-nothing", ScalingStrategy.ALL_OR_NOTHING),
        # Unknown or empty values fall back to the all-or-nothing strategy.
        ("", ScalingStrategy.ALL_OR_NOTHING),
        ("invalid-strategy",
ScalingStrategy.ALL_OR_NOTHING), 118 | ], 119 | ) 120 | def test_scaling_strategies_enum_from_value(strategy_as_value, expected_strategy_enum): 121 | strategy_enum = ScalingStrategy(strategy_as_value) 122 | assert_that(strategy_enum).is_equal_to(expected_strategy_enum) 123 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_common/test_read_json/faulty.json: -------------------------------------------------------------------------------- 1 | { 2 |     "test_property_1": { 3 |                     "test_property_2": "test_value" 4 |                 } 5 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_common/test_read_json/standard.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_property_1": { 3 | "test_property_2": "test_value" 4 | } 5 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_computemgtd.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 
import logging
import os

import pytest
import slurm_plugin
from assertpy import assert_that
from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate
from slurm_plugin.slurm_resources import DynamicNode


@pytest.mark.parametrize(
    ("config_file", "expected_attributes"),
    [
        (
            "default.conf",
            {
                "cluster_name": "hit",
                "region": "us-east-2",
                "_boto3_config": {"retries": {"max_attempts": 1, "mode": "standard"}},
                "clustermgtd_timeout": 600,
                "clustermgtd_heartbeat_file_path": "/home/ec2-user/clustermgtd_heartbeat",
                "disable_computemgtd_actions": False,
                "_slurm_nodename_file": "/etc/parallelcluster/slurm_plugin/slurm_nodename",
                "nodename": "some_nodename",
                "loop_time": 60,
                "logging_config": os.path.join(
                    os.path.dirname(slurm_plugin.__file__), "logging", "parallelcluster_computemgtd_logging.conf"
                ),
            },
        ),
        (
            "all_options.conf",
            {
                "cluster_name": "hit",
                "region": "us-east-2",
                "loop_time": 300,
                "clustermgtd_timeout": 30,
                "clustermgtd_heartbeat_file_path": "/home/ubuntu/clustermgtd_heartbeat",
                "_slurm_nodename_file": "/my/nodename/path",
                "nodename": "some_nodename",
                "disable_computemgtd_actions": True,
                "_boto3_config": {
                    "retries": {"max_attempts": 1, "mode": "standard"},
                    "proxies": {"https": "my.resume.proxy"},
                },
                "logging_config": "/path/to/logging/config",
            },
        ),
    ],
)
def test_computemgtd_config(config_file, expected_attributes, test_datadir, mocker):
    """ComputemgtdConfig exposes defaults and explicit options read from the config file."""
    mocker.patch("slurm_plugin.computemgtd.ComputemgtdConfig._read_nodename_from_file", return_value="some_nodename")
    mocker.patch("slurm_plugin.computemgtd.check_command_output", return_value=(test_datadir / config_file).read_text())
    compute_config = ComputemgtdConfig("/mocked/config/path")
    for key in expected_attributes:
        assert_that(compute_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key))


@pytest.mark.parametrize(
    "mock_node_info, expected_result",
    [
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "DOWN+CLOUD+NOT_RESPONDING", "queue1")],
            True,
        ),
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "IDLE+CLOUD+DRAIN", "queue1")],
            False,
        ),
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "DOWN+CLOUD+DRAIN", "queue1")],
            True,
        ),
        (
            [DynamicNode("queue1-st-c5xlarge-1", "ip-1", "host-1", "IDLE+CLOUD+POWERED_DOWN", "queue1")],
            True,
        ),
        (
            # when node info cannot be retrieved at all, the node is conservatively considered down
            Exception,
            True,
        ),
    ],
    ids=["node_down", "node_drained_idle", "node_drained_down", "node_power_save", "cant_get_node_info"],
)
def test_is_self_node_down(mock_node_info, expected_result, mocker):
    """A node counts as down when scontrol reports it DOWN/POWERED_DOWN or node info cannot be read."""
    if mock_node_info is Exception:
        mocker.patch("slurm_plugin.computemgtd._get_nodes_info_with_retry", side_effect=Exception())
    else:
        mocker.patch("slurm_plugin.computemgtd._get_nodes_info_with_retry", return_value=mock_node_info)

    assert_that(_is_self_node_down("queue1-st-c5xlarge-1")).is_equal_to(expected_result)


def test_self_terminate(mocker, caplog):
    """Verify self-termination is implemented via a shutdown command rather than calling TerminateInstances."""
    run_command_patch = mocker.patch("slurm_plugin.computemgtd.run_command")
    sleep_patch = mocker.patch("slurm_plugin.computemgtd.time.sleep")
    with caplog.at_level(logging.INFO):
        _self_terminate()
    assert_that(caplog.text).contains("Preparing to self terminate the instance in 10 seconds!")
    assert_that(caplog.text).contains("Self terminating instance now!")
    run_command_patch.assert_called_with("sudo shutdown -h now")
    sleep_patch.assert_called_with(10)
-------------------------------------------------------------------------------- /tests/slurm_plugin/test_computemgtd/test_computemgtd_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [computemgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | clustermgtd_heartbeat_file_path = /home/ubuntu/clustermgtd_heartbeat 5 | disable_computemgtd_actions = True 6 | slurm_nodename_file = /my/nodename/path 7 | loop_time = 300 8 | clustermgtd_timeout = 30 9 | logging_config = /path/to/logging/config 10 | proxy = my.resume.proxy 11 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_computemgtd/test_computemgtd_config/default.conf: -------------------------------------------------------------------------------- 1 | [computemgtd] 2 | cluster_name = hit 3 | region = us-east-2 4 | clustermgtd_heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 5 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_console_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the 5 | # License. A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 10 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | import base64 14 | import os 15 | import re 16 | from concurrent.futures import Future 17 | from typing import Callable, Optional 18 | 19 | import boto3 20 | import pytest 21 | from assertpy import assert_that 22 | from botocore.stub import Stubber 23 | from slurm_plugin.common import TaskController 24 | from slurm_plugin.console_logger import ConsoleLogger 25 | 26 | from tests.common import MockedBoto3Request 27 | 28 | 29 | class _TestController(TaskController): 30 | def __init__(self): 31 | self.tasks_queued: int = 0 32 | self._shutdown = False 33 | 34 | def queue_task(self, task: Callable[[], None]) -> Optional[Future]: 35 | self.tasks_queued += 1 36 | task() 37 | return None 38 | 39 | def is_shutdown(self) -> bool: 40 | return self._shutdown 41 | 42 | def raise_if_shutdown(self) -> None: 43 | if self._shutdown: 44 | raise TaskController.TaskShutdownError() 45 | 46 | def wait_unless_shutdown(self, seconds_to_wait: float) -> None: 47 | self.raise_if_shutdown() 48 | 49 | def shutdown(self, wait: bool = False, cancel_futures: bool = True) -> None: 50 | self._shutdown = True 51 | 52 | 53 | @pytest.fixture() 54 | def boto3_stubber_path(): 55 | # we need to set the region in the environment because the Boto3ClientFactory requires it. 
56 | os.environ["AWS_DEFAULT_REGION"] = "us-east-2" 57 | return "slurm_plugin.instance_manager.boto3" # FIXME 58 | 59 | 60 | @pytest.mark.parametrize( 61 | "compute_instances", 62 | [ 63 | [ 64 | { 65 | "Name": "node-0", 66 | "InstanceId": "i-005457f0c2beb9ad2", 67 | }, 68 | { 69 | "Name": "node-1", 70 | "InstanceId": "i-105457f0c2beb9ad2", 71 | }, 72 | ], 73 | [], 74 | ], 75 | ) 76 | def test_get_console_output_from_nodes(compute_instances): 77 | def console_callback(name, instance_id, output): 78 | actual_results.update({instance_id: output}) 79 | 80 | expected_instances = tuple(node.get("InstanceId") for node in compute_instances if node.get("InstanceId")) 81 | expected_results = {instance: f"{instance}:\rConsole output for you too." for instance in expected_instances} 82 | 83 | mocked_ec2_requests = [ 84 | MockedBoto3Request( 85 | method="get_console_output", 86 | response={ 87 | "InstanceId": instance, 88 | "Output": str(base64.b64encode(re.sub(r"\r", "\r\n", output).encode("utf-8")), "latin-1"), 89 | "Timestamp": "2022-11-18T23:37:25.000Z", 90 | }, 91 | expected_params={ 92 | "InstanceId": instance, 93 | }, 94 | generate_error=False, 95 | ) 96 | for instance, output in expected_results.items() 97 | ] 98 | 99 | actual_results = {} 100 | 101 | console_logger = ConsoleLogger( 102 | enabled=True, 103 | region="us-east-2", 104 | console_output_consumer=console_callback, 105 | ) 106 | 107 | ec2client = boto3.session.Session().client("ec2", "us-east-2") 108 | 109 | task_controller = _TestController() 110 | 111 | with Stubber(ec2client) as ec2_stub: 112 | for request in mocked_ec2_requests: 113 | ec2_stub.add_response(request.method, request.response, expected_params=request.expected_params) 114 | console_logger._boto3_client_factory = lambda service_name: ec2client 115 | console_logger.report_console_output_from_nodes( 116 | compute_instances=compute_instances, 117 | task_controller=task_controller, 118 | task_wait_function=lambda: None, 119 | ) 120 | 
ec2_stub.assert_no_pending_responses() 121 | 122 | ( 123 | assert_that(task_controller.tasks_queued).is_equal_to(1) 124 | if len(compute_instances) > 0 125 | else assert_that(task_controller.tasks_queued).is_zero() 126 | ) 127 | assert_that(actual_results).is_length(len(mocked_ec2_requests)) 128 | 129 | for instance, actual_output in actual_results.items(): 130 | assert_that(actual_output).is_equal_to(expected_results.get(instance)) 131 | 132 | 133 | def test_exception_handling(): 134 | class EC2Client: 135 | def get_console_output(*args, **kwargs): 136 | test_controller.shutdown() 137 | 138 | instance_id = kwargs.get("InstanceId") 139 | return {"Output": f"Output for {instance_id}"} 140 | 141 | def boto3_factory(*args, **kwargs): 142 | return EC2Client() 143 | 144 | def callback(*args): 145 | nonlocal call_count 146 | call_count += 1 147 | 148 | call_count = 0 149 | 150 | console_logger = ConsoleLogger(enabled=True, region="us-east-2", console_output_consumer=callback) 151 | 152 | console_logger._boto3_client_factory = boto3_factory 153 | 154 | test_controller = _TestController() 155 | 156 | assert_that(console_logger.report_console_output_from_nodes).raises( 157 | TaskController.TaskShutdownError 158 | ).when_called_with( 159 | compute_instances=[{"Name": "hello", "InstanceId": "instance-id"}], 160 | task_controller=test_controller, 161 | task_wait_function=lambda: None, 162 | ) 163 | 164 | assert_that(test_controller.tasks_queued).is_equal_to(1) 165 | assert_that(call_count).is_zero() 166 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue1-fleet-spot", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | 
{ 10 | "MaxPrice":"10", 11 | "InstanceType":"t2.medium", 12 | "SubnetId":"1234567" 13 | }, 14 | { 15 | "MaxPrice":"10", 16 | "InstanceType":"t2.large", 17 | "SubnetId":"1234567" 18 | } 19 | ] 20 | } 21 | ], 22 | "TargetCapacitySpecification":{ 23 | "TotalTargetCapacity":5, 24 | "DefaultTargetCapacityType":"spot" 25 | }, 26 | "Type":"instant", 27 | "SpotOptions":{ 28 | "AllocationStrategy":"capacity-optimized", 29 | "SingleInstanceType":false, 30 | "SingleAvailabilityZone":true, 31 | "MinTargetCapacity":5 32 | } 33 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it-all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs": [ 3 | { 4 | "LaunchTemplateSpecification": { 5 | "LaunchTemplateName": "hit-queue6-fleet1", 6 | "Version": "$Latest" 7 | }, 8 | "Overrides": [ 9 | { 10 | "InstanceType": "t2.medium", 11 | "SubnetId": "1234567" 12 | }, 13 | { 14 | "InstanceType": "t2.medium", 15 | "SubnetId": "7654321" 16 | }, 17 | { 18 | "InstanceType": "t2.large", 19 | "SubnetId": "1234567" 20 | }, 21 | { 22 | "InstanceType": "t2.large", 23 | "SubnetId": "7654321" 24 | } 25 | ] 26 | } 27 | ], 28 | "OnDemandOptions": { 29 | "AllocationStrategy": "lowest-price", 30 | "SingleInstanceType": false, 31 | "SingleAvailabilityZone": false, 32 | "CapacityReservationOptions": { 33 | "UsageStrategy": "use-capacity-reservations-first" 34 | } 35 | }, 36 | "TargetCapacitySpecification": { 37 | "TotalTargetCapacity": 5, 38 | "DefaultTargetCapacityType": "on-demand" 39 | }, 40 | "Type": "instant" 41 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it/expected_launch_params.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs": [ 3 | { 4 | "LaunchTemplateSpecification": { 5 | "LaunchTemplateName": "hit-queue6-fleet1", 6 | "Version": "$Latest" 7 | }, 8 | "Overrides": [ 9 | { 10 | "InstanceType": "t2.medium", 11 | "SubnetId": "1234567" 12 | }, 13 | { 14 | "InstanceType": "t2.medium", 15 | "SubnetId": "7654321" 16 | }, 17 | { 18 | "InstanceType": "t2.large", 19 | "SubnetId": "1234567" 20 | }, 21 | { 22 | "InstanceType": "t2.large", 23 | "SubnetId": "7654321" 24 | } 25 | ] 26 | } 27 | ], 28 | "OnDemandOptions": { 29 | "AllocationStrategy": "lowest-price", 30 | "SingleInstanceType": false, 31 | "SingleAvailabilityZone": false, 32 | "CapacityReservationOptions": { 33 | "UsageStrategy": "use-capacity-reservations-first" 34 | } 35 | }, 36 | "TargetCapacitySpecification": { 37 | "TotalTargetCapacity": 5, 38 | "DefaultTargetCapacityType": "on-demand" 39 | }, 40 | "Type": "instant" 41 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-single-it-all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue5-fleet1", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.medium", 15 | "SubnetId":"7654321" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions":{ 21 | "AllocationStrategy":"lowest-price", 22 | "SingleInstanceType":true, 23 | "SingleAvailabilityZone":false, 24 | "MinTargetCapacity":5, 25 | "CapacityReservationOptions":{ 26 | "UsageStrategy":"use-capacity-reservations-first" 27 | } 28 | }, 29 | "TargetCapacitySpecification":{ 30 | "TotalTargetCapacity":5, 31 | 
"DefaultTargetCapacityType":"on-demand" 32 | }, 33 | "Type":"instant" 34 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-single-az-multi-it-all_or_nothing/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs": [ 3 | { 4 | "LaunchTemplateSpecification": { 5 | "LaunchTemplateName": "hit-queue4-fleet1", 6 | "Version": "$Latest" 7 | }, 8 | "Overrides": [ 9 | { 10 | "InstanceType": "t2.medium", 11 | "SubnetId": "1234567" 12 | }, 13 | { 14 | "InstanceType": "t2.large", 15 | "SubnetId": "1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions": { 21 | "AllocationStrategy": "lowest-price", 22 | "SingleInstanceType": false, 23 | "SingleAvailabilityZone": true, 24 | "MinTargetCapacity": 5, 25 | "CapacityReservationOptions": { 26 | "UsageStrategy": "use-capacity-reservations-first" 27 | } 28 | }, 29 | "TargetCapacitySpecification": { 30 | "TotalTargetCapacity": 5, 31 | "DefaultTargetCapacityType": "on-demand" 32 | }, 33 | "Type": "instant" 34 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_capacity_block/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue-cb-fleet-capacity-block", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.large", 15 | "SubnetId":"1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions":{ 21 | "SingleInstanceType":false, 22 | "SingleAvailabilityZone":true, 23 | "MinTargetCapacity":1, 24 | "CapacityReservationOptions":{ 
25 | "UsageStrategy":"use-capacity-reservations-first" 26 | } 27 | }, 28 | "TargetCapacitySpecification":{ 29 | "TotalTargetCapacity":5, 30 | "DefaultTargetCapacityType":"capacity-block" 31 | }, 32 | "Type":"instant" 33 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_ondemand/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue2-fleet-ondemand", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.large", 15 | "SubnetId":"1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "OnDemandOptions":{ 21 | "AllocationStrategy":"lowest-price", 22 | "SingleInstanceType":false, 23 | "SingleAvailabilityZone":true, 24 | "MinTargetCapacity":1, 25 | "CapacityReservationOptions":{ 26 | "UsageStrategy":"use-capacity-reservations-first" 27 | } 28 | }, 29 | "TargetCapacitySpecification":{ 30 | "TotalTargetCapacity":5, 31 | "DefaultTargetCapacityType":"on-demand" 32 | }, 33 | "Type":"instant" 34 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_spot/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue1-fleet-spot", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "MaxPrice":"10", 11 | "InstanceType":"t2.medium", 12 | "SubnetId":"1234567" 13 | }, 14 | { 15 | "MaxPrice":"10", 16 | "InstanceType":"t2.large", 17 | "SubnetId":"1234567" 18 | } 19 | ] 20 | } 21 | ], 22 | "SpotOptions":{ 23 | 
"AllocationStrategy":"capacity-optimized", 24 | "SingleInstanceType":false, 25 | "SingleAvailabilityZone":true, 26 | "MinTargetCapacity":1 27 | }, 28 | "TargetCapacitySpecification":{ 29 | "TotalTargetCapacity":5, 30 | "DefaultTargetCapacityType":"spot" 31 | }, 32 | "Type":"instant" 33 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/launch_overrides/expected_launch_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "LaunchTemplateConfigs":[ 3 | { 4 | "LaunchTemplateSpecification":{ 5 | "LaunchTemplateName":"hit-queue2-fleet-ondemand", 6 | "Version":"$Latest" 7 | }, 8 | "Overrides":[ 9 | { 10 | "InstanceType":"t2.medium", 11 | "SubnetId":"1234567" 12 | }, 13 | { 14 | "InstanceType":"t2.large", 15 | "SubnetId":"1234567" 16 | } 17 | ] 18 | } 19 | ], 20 | "TargetCapacitySpecification":{ 21 | "TotalTargetCapacity":5, 22 | "DefaultTargetCapacityType":"on-demand" 23 | }, 24 | "Type":"instant", 25 | "OnDemandOptions":{ 26 | "AllocationStrategy":"lowest-price", 27 | "SingleInstanceType":false, 28 | "SingleAvailabilityZone":true, 29 | "MinTargetCapacity":1, 30 | "CapacityReservationOptions":{ 31 | "UsageStrategy":"use-capacity-reservations-first" 32 | } 33 | }, 34 | "TagSpecifications":[ 35 | { 36 | "ResourceType":"capacity-reservation", 37 | "Tags":[ 38 | { 39 | "Key":"string", 40 | "Value":"string" 41 | } 42 | ] 43 | } 44 | ] 45 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with 4 | # the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | 13 | import os 14 | from types import SimpleNamespace 15 | from unittest.mock import ANY 16 | 17 | import botocore 18 | import pytest 19 | import slurm_plugin 20 | from assertpy import assert_that 21 | from slurm_plugin.clustermgtd import ComputeFleetStatus 22 | from slurm_plugin.fleet_status_manager import ( 23 | SlurmFleetManagerConfig, 24 | _get_computefleet_status, 25 | _manage_fleet_status_transition, 26 | _start_partitions, 27 | _stop_partitions, 28 | ) 29 | from slurm_plugin.slurm_resources import PartitionStatus 30 | 31 | 32 | @pytest.fixture() 33 | def boto3_stubber_path(): 34 | # we need to set the region in the environment because the Boto3ClientFactory requires it. 
35 | os.environ["AWS_DEFAULT_REGION"] = "us-east-2" 36 | return "slurm_plugin.instance_manager.boto3" 37 | 38 | 39 | @pytest.mark.parametrize( 40 | ("config_file", "expected_attributes"), 41 | [ 42 | ( 43 | "default.conf", 44 | { 45 | "cluster_name": "test", 46 | "region": "us-east-2", 47 | "terminate_max_batch_size": 1000, 48 | "_boto3_config": {"retries": {"max_attempts": 5, "mode": "standard"}}, 49 | "logging_config": os.path.join( 50 | os.path.dirname(slurm_plugin.__file__), 51 | "logging", 52 | "parallelcluster_fleet_status_manager_logging.conf", 53 | ), 54 | }, 55 | ), 56 | ( 57 | "all_options.conf", 58 | { 59 | "cluster_name": "test_again", 60 | "region": "us-east-1", 61 | "terminate_max_batch_size": 50, 62 | "_boto3_config": { 63 | "retries": {"max_attempts": 10, "mode": "standard"}, 64 | "proxies": {"https": "my.resume.proxy"}, 65 | }, 66 | "logging_config": "/path/to/fleet_status_manager_logging/config", 67 | }, 68 | ), 69 | ], 70 | ) 71 | def test_fleet_status_manager_config(config_file, expected_attributes, test_datadir): 72 | resume_config = SlurmFleetManagerConfig(test_datadir / config_file) 73 | for key in expected_attributes: 74 | assert_that(resume_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key)) 75 | 76 | 77 | @pytest.mark.parametrize( 78 | ("computefleet_status_data_path", "status", "action"), 79 | [ 80 | ("path_to_file_1", ComputeFleetStatus.STOPPED, None), 81 | ("path_to_file_2", ComputeFleetStatus.RUNNING, None), 82 | ("path_to_file_3", ComputeFleetStatus.STOPPING, None), 83 | ("path_to_file_4", ComputeFleetStatus.STARTING, None), 84 | ("path_to_file_5", ComputeFleetStatus.STOP_REQUESTED, "stop"), 85 | ("path_to_file_6", ComputeFleetStatus.START_REQUESTED, "start"), 86 | ("path_to_file_7", ComputeFleetStatus.PROTECTED, None), 87 | ], 88 | ) 89 | def test_fleet_status_manager(mocker, test_datadir, computefleet_status_data_path, status, action): 90 | # mocks 91 | config = SimpleNamespace(some_key_1="some_value_1", 
some_key_2="some_value_2") 92 | get_computefleet_status_mocked = mocker.patch("slurm_plugin.fleet_status_manager._get_computefleet_status") 93 | get_computefleet_status_mocked.return_value = status 94 | stop_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager._stop_partitions") 95 | start_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager._start_partitions") 96 | 97 | # method to test 98 | _manage_fleet_status_transition(config, computefleet_status_data_path) 99 | 100 | # assertions 101 | get_computefleet_status_mocked.assert_called_once_with(computefleet_status_data_path) 102 | if action == "start": 103 | start_partitions_mocked.assert_called_once() 104 | stop_partitions_mocked.assert_not_called() 105 | elif action == "stop": 106 | stop_partitions_mocked.assert_called_once_with(config) 107 | start_partitions_mocked.assert_not_called() 108 | else: 109 | start_partitions_mocked.assert_not_called() 110 | stop_partitions_mocked.assert_not_called() 111 | 112 | 113 | @pytest.mark.parametrize( 114 | ("config_file", "expected_status"), 115 | [ 116 | ("correct_status.json", ComputeFleetStatus.RUNNING), 117 | ("no_status.json", ValueError), 118 | ("malformed_status.json", FileNotFoundError), 119 | ("wrong_status.json", ValueError), 120 | (None, TypeError), 121 | ], 122 | ) 123 | def test_get_computefleet_status(test_datadir, config_file, expected_status): 124 | if isinstance(expected_status, ComputeFleetStatus): 125 | status = _get_computefleet_status(test_datadir / config_file) 126 | assert_that(status).is_equal_to(expected_status) 127 | else: 128 | with pytest.raises(expected_status): 129 | _get_computefleet_status(test_datadir / config_file) 130 | 131 | 132 | def test_start_partitions(mocker): 133 | update_all_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions") 134 | resume_powering_down_nodes_mocked = mocker.patch("slurm_plugin.fleet_status_manager.resume_powering_down_nodes") 135 | 136 | 
_start_partitions() 137 | 138 | update_all_partitions_mocked.assert_called_once_with(PartitionStatus.UP, reset_node_addrs_hostname=False) 139 | resume_powering_down_nodes_mocked.assert_called_once() 140 | 141 | 142 | def test_stop_partitions(mocker): 143 | # mocks 144 | config = SimpleNamespace( 145 | terminate_max_batch_size="3", region="us-east-1", cluster_name="test", boto3_config=botocore.config.Config() 146 | ) 147 | update_all_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions") 148 | 149 | terminate_all_compute_nodes_mocked = mocker.patch.object( 150 | slurm_plugin.instance_manager.InstanceManager, "terminate_all_compute_nodes", autospec=True 151 | ) 152 | 153 | # method to test 154 | _stop_partitions(config) 155 | 156 | # assertions 157 | update_all_partitions_mocked.assert_called_once_with(PartitionStatus.INACTIVE, reset_node_addrs_hostname=True) 158 | terminate_all_compute_nodes_mocked.assert_called_once_with(ANY, config.terminate_max_batch_size) 159 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_fleet_status_manager_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [slurm_fleet_status_manager] 2 | cluster_name = test_again 3 | region = us-east-1 4 | proxy = my.resume.proxy 5 | boto3_retry = 10 6 | terminate_max_batch_size = 50 7 | logging_config = /path/to/fleet_status_manager_logging/config -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_fleet_status_manager_config/default.conf: -------------------------------------------------------------------------------- 1 | [slurm_fleet_status_manager] 2 | cluster_name = test 3 | region = us-east-2 4 | proxy = NONE -------------------------------------------------------------------------------- 
/tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/correct_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": "RUNNING", 3 | "lastStatusUpdatedTime": "2022-01-26T11:08:18.000Z" 4 | } 5 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/malformed_status: -------------------------------------------------------------------------------- 1 | RUNNING 2 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/no_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "lastStatusUpdatedTime": "2022-01-26T11:08:18.000Z" 3 | } 4 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_fleet_status_manager/test_get_computefleet_status/wrong_status.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": "NO_EXIST", 3 | "lastStatusUpdatedTime": "2022-01-26T11:08:18.000Z" 4 | } 5 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_get_slurm_resume/malformed.json: -------------------------------------------------------------------------------- 1 | malformed json -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_get_slurm_resume/resume.json: -------------------------------------------------------------------------------- 1 | { 2 | "all_nodes_resume" : "cloud[1-3]", 3 | "jobs" : [ 4 | { 5 | "extra" : "An arbitrary string from --extra", 6 | "features" : "c1,c2", 7 | "job_id" : 140814, 8 | "nodes_alloc" : "cloud[1-4]", 9 | "nodes_resume" : "cloud[1-3]", 10 | "oversubscribe" : "OK", 11 | "partition" : "cloud", 12 | 
"reservation" : "resv_1234" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_resume_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [slurm_resume] 2 | cluster_name = hit 3 | instance_id = i-instance-id 4 | region = us-east-2 5 | proxy = my.resume.proxy 6 | boto3_retry = 10 7 | launch_max_batch_size = 50 8 | update_node_address = False 9 | logging_config = /path/to/resume_logging/config 10 | dynamodb_table = table-name 11 | head_node_private_ip = head.node.ip 12 | head_node_hostname = head-node-hostname 13 | hosted_zone = hosted-zone 14 | dns_domain = dns.domain 15 | use_private_hostname = False 16 | all_or_nothing_batch = True 17 | clustermgtd_heartbeat_file_path = alternate/clustermgtd_heartbeat 18 | clustermgtd_timeout = 5 19 | job_level_scaling = False 20 | assign_node_max_batch_size = 400 21 | terminate_max_batch_size = 600 22 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_resume/test_resume_config/default.conf: -------------------------------------------------------------------------------- 1 | [slurm_resume] 2 | cluster_name = hit 3 | instance_id = i-instance-id 4 | region = us-east-2 5 | dynamodb_table = table-name 6 | head_node_private_ip = head.node.ip 7 | head_node_hostname = head-node-hostname 8 | clustermgtd_heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 9 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_suspend.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance 4 | # with the License. 
A copy of the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 9 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | import os 13 | 14 | import pytest 15 | import slurm_plugin 16 | from assertpy import assert_that 17 | from slurm_plugin.suspend import SlurmSuspendConfig 18 | 19 | 20 | @pytest.mark.parametrize( 21 | ("config_file", "expected_attributes"), 22 | [ 23 | ( 24 | "default.conf", 25 | { 26 | "logging_config": os.path.join( 27 | os.path.dirname(slurm_plugin.__file__), "logging", "parallelcluster_suspend_logging.conf" 28 | ), 29 | "clustermgtd_timeout": 300, 30 | "clustermgtd_heartbeat_file_path": "/home/ec2-user/clustermgtd_heartbeat", 31 | }, 32 | ), 33 | ( 34 | "all_options.conf", 35 | { 36 | "logging_config": "/path/to/suspend_logging/config", 37 | "clustermgtd_timeout": 5, 38 | "clustermgtd_heartbeat_file_path": "alternate/clustermgtd_heartbeat", 39 | }, 40 | ), 41 | ], 42 | ) 43 | def test_suspend_config(config_file, expected_attributes, test_datadir): 44 | suspend_config = SlurmSuspendConfig(test_datadir / config_file) 45 | for key in expected_attributes: 46 | assert_that(suspend_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key)) 47 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_suspend/test_suspend_config/all_options.conf: -------------------------------------------------------------------------------- 1 | [slurm_suspend] 2 | logging_config = /path/to/suspend_logging/config 3 | clustermgtd_heartbeat_file_path = alternate/clustermgtd_heartbeat 4 | clustermgtd_timeout = 5 5 | -------------------------------------------------------------------------------- 
/tests/slurm_plugin/test_suspend/test_suspend_config/default.conf: -------------------------------------------------------------------------------- 1 | [slurm_suspend] 2 | clustermgtd_heartbeat_file_path = /home/ec2-user/clustermgtd_heartbeat 3 | -------------------------------------------------------------------------------- /tests/slurm_plugin/test_task_executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the 5 | # License. A copy of the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 10 | # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and 11 | # limitations under the License. 
import time
from datetime import datetime, timezone

from assertpy import assert_that, soft_assertions
from slurm_plugin.common import TaskController
from slurm_plugin.task_executor import TaskExecutor


def test_task_executor():
    """Queue several tasks and verify each future resolves to its task's return value."""

    def get_task(value):
        def task():
            return value + 1

        return task

    task_executor = TaskExecutor(worker_pool_size=3, max_backlog=10)

    futures = {value: task_executor.queue_task(get_task(value)) for value in range(10, 20)}

    with soft_assertions():
        for value, future in futures.items():
            assert_that(future.result()).is_equal_to(value + 1)

    task_executor.shutdown()


def test_exceeding_max_backlog():
    """Verify that queueing beyond max_backlog raises MaximumBacklogExceededError."""

    def get_task(value):
        def task():
            time.sleep(value)
            return value + 1

        return task

    task_executor = TaskExecutor(worker_pool_size=1, max_backlog=1)

    # First task occupies the single worker; the second exceeds the backlog of 1.
    future = task_executor.queue_task(get_task(10))
    assert_that(task_executor.queue_task).raises(TaskExecutor.MaximumBacklogExceededError).when_called_with(
        get_task(20)
    )

    assert_that(future.result()).is_equal_to(11)

    task_executor.shutdown()


def test_that_shutdown_does_not_block():
    """Verify shutdown(wait=True) interrupts a long wait_unless_shutdown instead of blocking.

    The queued task would wait 600s; shutdown must return well before that and the
    task's future must carry a TaskShutdownError.
    """

    def get_task(value):
        def task():
            task_executor.wait_unless_shutdown(value)
            return value + 1

        return task

    def callback(*args):
        nonlocal callback_called
        callback_called = True

    task_executor = TaskExecutor(worker_pool_size=1, max_backlog=1)

    callback_called = False
    start_wait = datetime.now(tz=timezone.utc)
    future = task_executor.queue_task(get_task(600))
    future.add_done_callback(callback)

    task_executor.shutdown(wait=True)

    delta = (datetime.now(tz=timezone.utc) - start_wait).total_seconds()
    assert_that(delta).is_less_than(300)

    # BUG FIX: the original `assert_that(future.exception).raises(...)` asserted nothing:
    # assertpy's raises() only records the expected type and checks it when
    # `.when_called_with(...)` is invoked, which the original never did. Moreover
    # Future.exception() *returns* the exception rather than raising it. Future.result()
    # re-raises the task's exception, so assert on that callable instead.
    assert_that(future.result).raises(TaskController.TaskShutdownError).when_called_with()
| assert_that(callback_called).is_true() 85 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py{39,310}-cov 4 | code-linters 5 | 6 | # Default testenv. Used to run tests on all python versions. 7 | [testenv] 8 | passenv = 9 | CI 10 | GITHUB_* 11 | usedevelop = 12 | cov: true 13 | nocov: false 14 | allowlist_externals = 15 | bash 16 | deps = 17 | -r tests/requirements.txt 18 | commands = 19 | nocov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/ 20 | cov: python setup.py clean --all build_ext --force --inplace 21 | cov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --cov-report=xml --cov=src tests/ 22 | 23 | # Section used to define common variables used by multiple testenvs. 24 | [vars] 25 | code_dirs = 26 | setup.py \ 27 | src/ \ 28 | tests/ 29 | 30 | ############################## 31 | ### AUTO-FORMATTER ### 32 | ############################## 33 | 34 | # black is a code formatter for python: https://github.com/ambv/black. 35 | # The following target formats python files with black formatter. 36 | [testenv:black] 37 | basepython = python3 38 | skip_install = true 39 | deps = 40 | black 41 | commands = 42 | black -l 120 \ 43 | {[vars]code_dirs} \ 44 | {posargs} 45 | 46 | # Checks that python files are correctly formatted. 47 | [testenv:black-check] 48 | basepython = python3 49 | skip_install = true 50 | deps = 51 | {[testenv:black]deps} 52 | commands = 53 | {[testenv:black]commands} --check --diff 54 | 55 | # isort is an imports sorter for python: https://github.com/timothycrosley/isort 56 | # The following target sorts the import according to .isort.cfg file. 
57 | [testenv:isort] 58 | basepython = python3 59 | skip_install = true 60 | deps = 61 | isort 62 | seed-isort-config 63 | commands = 64 | isort -w 120 \ 65 | {[vars]code_dirs} \ 66 | {posargs} 67 | 68 | # Checks that python imports are correctly sorted. 69 | [testenv:isort-check] 70 | basepython = python3 71 | skip_install = true 72 | deps = {[testenv:isort]deps} 73 | commands = {[testenv:isort]commands} --check --diff 74 | 75 | # Reformats code with black and isort. 76 | [testenv:autoformat] 77 | basepython = python3 78 | skip_install = true 79 | deps = 80 | {[testenv:isort]deps} 81 | {[testenv:black]deps} 82 | commands = 83 | {[testenv:isort]commands} 84 | {[testenv:black]commands} 85 | 86 | 87 | ############################# 88 | ### LINTERS ### 89 | ############################# 90 | 91 | # flake8 python linter: https://github.com/PyCQA/flake8. 92 | # flake8 config is located in .flake8 file 93 | [testenv:flake8] 94 | basepython = python3 95 | skip_install = true 96 | deps = 97 | flake8 98 | flake8-docstrings 99 | flake8-bugbear 100 | # flake8-import-order # delegated to isort 101 | flake8-colors 102 | pep8-naming 103 | commands = 104 | flake8 \ 105 | {[vars]code_dirs} \ 106 | {posargs} 107 | 108 | # bandit security linter for python: https://github.com/PyCQA/bandit 109 | [testenv:bandit] 110 | basepython = python3 111 | skip_install = true 112 | deps = 113 | bandit 114 | commands = 115 | bandit -r \ 116 | -c .bandit.ini \ 117 | --exclude tests \ 118 | {[vars]code_dirs} \ 119 | {posargs} 120 | 121 | # checks that README file is well-formed. 122 | [testenv:readme] 123 | basepython = python3 124 | skip_install = true 125 | deps = 126 | readme_renderer 127 | commands = 128 | python setup.py check -r -s 129 | 130 | # Pylint linter for python: https://www.pylint.org/ 131 | # Pylint config is located in .pylintrc file. 
132 | [testenv:pylint] 133 | basepython = python3 134 | deps = 135 | pyflakes 136 | pylint 137 | commands = 138 | pylint \ 139 | {[vars]code_dirs} \ 140 | {posargs} 141 | 142 | # Vulture finds unused code in python: https://github.com/jendrikseipp/vulture 143 | [testenv:vulture] 144 | basepython = python3 145 | skip_install = true 146 | deps = 147 | vulture 148 | commands = 149 | vulture \ 150 | {[vars]code_dirs} \ 151 | {posargs} 152 | 153 | # Static type checker for Python: http://mypy-lang.org/ 154 | [testenv:mypy] 155 | basepython = python3 156 | deps = 157 | mypy 158 | commands = 159 | mypy \ 160 | {[vars]code_dirs} \ 161 | {posargs} 162 | 163 | # semgrep is used to check for security issues 164 | # https://semgrep.dev/ 165 | [testenv:semgrep] 166 | basepython = python3 167 | deps = 168 | semgrep>=1.8.0 169 | commands = 170 | semgrep \ 171 | --config p/r2c-security-audit \ 172 | --config p/secrets \ 173 | --exclude 'third-party/**' \ 174 | --error 175 | 176 | # Target that groups all code linters to run in Travis. 177 | [testenv:code-linters] 178 | basepython = python3 179 | skip_install = true 180 | deps = 181 | {[testenv:black-check]deps} 182 | {[testenv:isort-check]deps} 183 | {[testenv:flake8]deps} 184 | {[testenv:bandit]deps} 185 | {[testenv:semgrep]deps} 186 | # {[testenv:pylint]deps} 187 | # {[testenv:readme]deps} 188 | commands = 189 | {[testenv:black-check]commands} 190 | {[testenv:isort-check]commands} 191 | {[testenv:flake8]commands} 192 | {[testenv:bandit]commands} 193 | {[testenv:semgrep]commands} 194 | # {[testenv:pylint]commands} 195 | # {[testenv:readme]commands} 196 | -------------------------------------------------------------------------------- /util/bump-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # On Mac OS, the default implementation of sed is BSD sed, but this script requires GNU sed. 
# bump-version.sh (tail): rewrite the `version = "..."` line in setup.py.
# On macOS, require GNU sed (BSD sed's -i semantics differ) and prepend it to PATH.
if [ "$(uname)" == "Darwin" ]; then
  command -v gsed >/dev/null 2>&1 || { echo >&2 "[ERROR] Mac OS detected: please install GNU sed with 'brew install gnu-sed'"; exit 1; }
  PATH="/usr/local/opt/gnu-sed/libexec/gnubin:$PATH"
fi

if [ -z "$1" ]; then
  echo "New version not specified. Usage: bump-version.sh NEW_VERSION"
  exit 1
fi

NEW_VERSION=$1
# Extract the current version from the `version = "X.Y.Z"` line of setup.py.
CURRENT_VERSION=$(sed -ne "s/^version = \"\(.*\)\"/\1/p" setup.py)

sed -i "s/version = \"$CURRENT_VERSION\"/version = \"$NEW_VERSION\"/g" setup.py
-------------------------------------------------------------------------------- /util/create-attribution-doc.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Generate the THIRD-PARTY-LICENSES.txt attribution document inside a throwaway
# pyenv virtualenv, using pip-licenses plus manually appended entries.

set -e -o xtrace

append_package_details_to_final_license_file(){
  # Append one package's details to the THIRD-PARTY-LICENSES file.
  # Arguments: 1=package name, 2=package version, 3=license type, 4=package URL,
  #            5 (and optionally 6, 7 for dual/triple licenses)=license URL(s).
  # Header with package name, version, license type and package URL.
  echo -e "\n\n\n$1 \n$2 \n$3 \n$4" >> "$final_license_file"
  # -f: fail on HTTP errors instead of appending the server's error page to the
  # license file; -sS: quiet but still report failures; -L: follow redirects.
  curl -fsSL "$5" >> "$final_license_file"
  # Append the additional licenses when the package is dual/triple licensed.
  if [ $# -gt 5 ]
  then
    curl -fsSL "$6" >> "$final_license_file"
    curl -fsSL "$7" >> "$final_license_file"
  fi
}

function create_attribution_doc() {
  ATTR_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

  # Install the requested python version if it doesn't exist yet.
  if test ! -d "${PYENV_ROOT}/versions/${PYTHON_VERSION}"
  then
    env PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install "${PYTHON_VERSION}"
  fi

  pyenv virtualenv "${PYTHON_VERSION}" attribution-doc-env
  # Switch to the dedicated virtual env.
  source "${PYENV_ROOT}/versions/attribution-doc-env/bin/activate"

  # Update pip
  pip3 install --upgrade pip

  # Tools needed to build and inspect the package licenses.
  pip3 install pyinstaller
  pip3 install pip-licenses

  # Install this package from source (repo root is the parent of util/).
  pip3 install -e "$(dirname "$ATTR_SCRIPT_DIR")"

  final_license_file=$(dirname "$ATTR_SCRIPT_DIR")/THIRD-PARTY-LICENSES.txt

  # Create a pip license document; -i excludes our own package and
  # pip-licenses itself from the attribution list.
  pip-licenses -i aws-parallelcluster-node pip-licenses --format=plain-vertical --with-license-file --with-urls --no-license-path --with-authors --output-file="$final_license_file"

  # Extract MAJOR.MINOR (e.g. "3.9") — this matches a cpython branch name used
  # in the raw.githubusercontent.com URLs below.
  cpy_version=$(python -V | grep -Eo '([0-9]+)(\.?[0-9]+)' | head -1)

  # Python itself is not a pip package, so append it manually.
  append_package_details_to_final_license_file "Python" "$cpy_version" "PSF License Version 2; Zero-Clause BSD license" "https://raw.githubusercontent.com/python/cpython/$cpy_version/LICENSE" "https://raw.githubusercontent.com/python/cpython/$cpy_version/LICENSE"

  deactivate
  pyenv virtualenv-delete -f attribution-doc-env
}

_error_exit() {
  echo "$1"
  exit 1
}

_help() {
  local -- _cmd
  _cmd=$(basename "$0")

  # NOTE(review): this usage text was garbled in the dump (heredoc marker and
  # placeholders eaten); reconstructed from the options parse_options handles —
  # verify against the original script.
  cat <<EOF
Usage: ${_cmd} [OPTION]...

  --python-version <version>  Python version with which you want to create the attribution document.
  -h, --help                  Print this help message

Examples:
  ${_cmd}
  $_cmd --python-version 3.9.10
EOF
}

function parse_options () {

  while [ $# -gt 0 ] ; do
    case "$1" in
      --python-version) PYTHON_VERSION="$2"; shift;;
      -h|--help|help) _help; exit 0;;
      *) _help; _error_exit "[error] Unrecognized option '$1'";;
    esac
    shift
  done

}

function main() {
  parse_options "$@"
  # Fail fast with a clear message instead of letting `pyenv install ""` fail
  # cryptically when --python-version was not supplied.
  if [ -z "${PYTHON_VERSION}" ]; then
    _help
    _error_exit "[error] --python-version not specified"
  fi
  create_attribution_doc
}

main "$@"
-------------------------------------------------------------------------------- /util/upload-node.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Package the node sources (git archive of the working tree) and upload the
# tarball, its md5 and its LastModified date file to an S3 bucket.

_error_exit() {
  echo "$1"
  exit 1
}

_info() {
  echo "INFO: $1"
}

_help() {
  local -- _cmd
  _cmd=$(basename "$0")

  # NOTE(review): this usage text was garbled in the dump (heredoc marker and
  # placeholders eaten); reconstructed from the options main parses — verify
  # against the original script.
  cat <<EOF
Usage: ${_cmd} [OPTION]...

  --bucket <bucket>      Bucket where upload the package
  --srcdir <srcdir>      Root folder of the node package
  --profile <profile>    AWS profile name to use for the upload
                         (optional, default is AWS_PROFILE env variable or "default")
  --region <region>      Region to use for AWSCli commands (optional, default is "us-east-1")
  --scope <scope>        Disambiguation string used in the S3 path to avoid collisions (default is empty)
  -h, --help             Print this help message
EOF
}

main() {
  # parse input options
  while [ $# -gt 0 ] ; do
    case "$1" in
      --bucket) _bucket="$2"; shift;;
      --bucket=*) _bucket="${1#*=}";;
      --srcdir) _srcdir="$2"; shift;;
      --srcdir=*) _srcdir="${1#*=}";;
      --profile) _profile="$2"; shift;;
      --profile=*) _profile="${1#*=}";;
      --region) _region="$2"; shift;;
      --region=*) _region="${1#*=}";;
      --scope) _scope="$2"; shift;;
      --scope=*) _scope="${1#*=}";;
      -h|--help|help) _help; exit 0;;
      *) _help; echo "[error] Unrecognized option '$1'"; exit 1;;
    esac
    shift
  done

  # verify required parameters
  # (print the help BEFORE exiting: _error_exit never returns, so the original
  # `_error_exit ...; _help` order left the help unreachable dead code)
  if [ -z "${_bucket}" ]; then
    _help
    _error_exit "--bucket parameter not specified"
  fi
  if [ -z "${_srcdir}" ]; then
    _help
    _error_exit "--srcdir parameter not specified"
  fi

  # initialize optional parameters
  if [ -z "${AWS_PROFILE}" ] && [ -z "${_profile}" ]; then
    _info "--profile parameter not specified, using 'default'"
  elif [ -n "${_profile}" ]; then
    # Intentionally stored as "--profile <name>" and expanded UNQUOTED below so
    # it word-splits into two aws CLI arguments.
    _profile="--profile ${_profile}"
  fi
  if [ -z "${_region}" ]; then
    _info "--region parameter not specified, using 'us-east-1'"
    _region="us-east-1"
  fi
  if [ -z "${_scope}" ]; then
    _info "--scope parameter not specified, no scope will be used"
    _scope=""
  fi

  # check bucket or create it
  aws ${_profile} s3api head-bucket --bucket "${_bucket}" --region "${_region}"
  if [ $? -ne 0 ]; then
    _info "Bucket ${_bucket} do not exist, trying to create it"
    aws ${_profile} s3api create-bucket --bucket "${_bucket}" --region "${_region}"
    if [ $? -ne 0 ]; then
      _error_exit "Unable to create bucket ${_bucket}"
    fi
  fi

  # Detect the version from the `version = "X.Y.Z"` line of setup.py.
  _version=$(grep "version = \"" "${_srcdir}/setup.py" |awk '{print $3}' | tr -d \")
  if [ -z "${_version}" ]; then
    _error_exit "Unable to detect node version, are you in the right directory?"
  fi
  _info "Detected version ${_version}"

  # Create archive from a stash of the current working tree; `git stash create`
  # prints nothing on a clean tree, so fall back to HEAD.
  _cwd=$(pwd)
  pushd "${_srcdir}" > /dev/null || exit
  _stashName=$(git stash create)
  git archive --format tar --prefix="aws-parallelcluster-node-${_version}/" "${_stashName:-HEAD}" | gzip > "${_cwd}/aws-parallelcluster-node-${_version}.tgz"
  popd > /dev/null || exit
  md5sum "aws-parallelcluster-node-${_version}.tgz" > "aws-parallelcluster-node-${_version}.md5"

  # upload package
  _key_path="parallelcluster/${_version}/node"
  if [ -n "${_scope}" ]; then
    _key_path="${_key_path}/${_scope}"
  fi
  aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-node-${_version}.tgz s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.tgz || _error_exit 'Failed to push node to S3'
  aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-node-${_version}.md5 s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.md5 || _error_exit 'Failed to push node md5 to S3'
  aws ${_profile} --region "${_region}" s3api head-object --bucket ${_bucket} --key ${_key_path}/aws-parallelcluster-node-${_version}.tgz --output text --query LastModified > aws-parallelcluster-node-${_version}.tgz.date || _error_exit 'Failed to fetch LastModified date'
  aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-node-${_version}.tgz.date s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.tgz.date || _error_exit 'Failed to push node date'

  # Buckets in us-east-1 report LocationConstraint "None"; quote the expansion
  # so `[` does not break when the lookup returns an empty string.
  _bucket_region=$(aws ${_profile} s3api get-bucket-location --bucket ${_bucket} --output text)
  if [ "${_bucket_region}" = "None" ]; then
    _bucket_region=""
  else
    _bucket_region=".${_bucket_region}"
  fi

  echo ""
  echo "Done. Add the following configuration to the pcluster create config file:"
  echo ""
  echo "DevSettings:"
  echo " NodePackage: s3://${_bucket}/${_key_path}/aws-parallelcluster-node-${_version}.tgz"
}

main "$@"

# vim:syntax=sh
--------------------------------------------------------------------------------