├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── documentation-request.md │ └── feature-request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── build-image.yml │ ├── build-push-image-from-main.yml │ └── publish-release.yml ├── .gitmodules ├── HEALTH_CHECKS.md ├── LICENSE ├── Makefile ├── README.md ├── SETUP.md ├── USAGE.md ├── alertmanager ├── README.md ├── alertmanager.yaml ├── alerts │ └── healthchecks-alerts.yaml └── images │ ├── alertmanager.png │ ├── create-receiver.png │ ├── pciealert.png │ ├── slack-alert-example.png │ └── slack.png ├── autopilot-daemon ├── Dockerfile ├── go.mod ├── go.sum ├── gpu-bw │ ├── entrypoint.py │ └── gpuLocalBandwidthTest.sh ├── gpu-dcgm │ └── entrypoint.py ├── gpu-mem │ ├── entrypoint.py │ └── gpucheck.cu ├── gpu-power │ └── power-throttle.sh ├── gpu-remapped │ ├── entrypoint.py │ └── remapped-rows.sh ├── network │ ├── README.md │ ├── iperf3_entrypoint.py │ ├── iperf3_start_clients.py │ ├── iperf3_start_servers.py │ ├── iperf3_stop_servers.py │ ├── iperf3_utils.py │ ├── network_workload.py │ └── ping-entrypoint.py ├── pkg │ ├── cmd │ │ └── main.go │ ├── handler │ │ ├── handler.go │ │ └── messagestruct.go │ ├── healthcheck │ │ ├── functions.go │ │ ├── global.go │ │ └── healthcheck.go │ └── utils │ │ ├── functions.go │ │ ├── global.go │ │ ├── listwatch.go │ │ ├── nodelabels.go │ │ └── prometheus.go └── utils │ ├── briefings.sh │ └── runHealthchecks.py ├── figures ├── autopilot-daemon-pod.pdf ├── autopilot-daemon-pod.png ├── autopilot-logo.png ├── autopilot-main-loop.pdf ├── autopilot-main-loop.svg ├── big-picture.pdf ├── big-picture.svg ├── invasive-check-flow.pdf ├── invasive-check-flow.svg ├── periodic-check-flow.pdf └── periodic-check-flow.svg ├── grafana ├── autopilot-dashboard.json └── autopilot-dashboard.yaml └── helm-charts └── autopilot ├── .helmignore ├── Chart.yaml ├── README.md ├── templates ├── NOTES.txt ├── _helpers.tpl ├── autopilot.yaml ├── metrics_service.yaml ├── pullsecret.yaml ├── service.yaml ├── serviceaccount.yaml └── servicemonitor.yaml └── values.yaml /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Describe a problem or bug with Autopilot. 4 | title: "[Bug]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Summary 11 | 12 | _A clear and concise description of what the bug is._ 13 | 14 | ## Steps To Reproduce 15 | 16 | _A detailed list of actions to take in order to reproduce the problem or bug._ 17 | 18 | ## Expected behavior 19 | 20 | _A clear and concise description of what you expected to happen._ 21 | 22 | ## Evidence 23 | 24 | _Are there logs, screenshots, or helpful documentation to include? Otherwise N/A._ 25 | 26 | ## Proposed Solution 27 | 28 | _Include a brief description of a potential solution and method to verify the solution, if possible. Otherwise N/A._ 29 | 30 | **Additional context** 31 | _Add any other context about the problem here._ 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Request 3 | about: Propose new documentation for Autopilot. 
4 | title: "[Documentation]" 5 | labels: documentation 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Summary 11 | 12 | _Include a brief summary of the documentation changes you're proposing._ 13 | 14 | ## Impact 15 | 16 | - _What pages will need to be updated?_ 17 | - _Will there be broken links with these changes?_ 18 | - _Are there any images you're adding or external content that exists outside the repository?_ 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Describe a feature you'd like to see in Autopilot. 4 | title: "[Feature]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Summary 11 | 12 | - _Include a brief summary of the feature you'd like to see._ 13 | - _Is this feature motivated by existing gaps or existing issues with Autopilot?_ 14 | 15 | ## Impact 16 | 17 | - _How big of an impact would this feature have on Autopilot?_ 18 | - _Is this a new component or a total overhaul of an existing feature set?_ 19 | - _What are your thoughts, and how can we size this feature accordingly?_ 20 | 21 | ## Proposed Solution 22 | 23 | - _Include a brief description of a potential implementation, if possible. Otherwise just let us know what you'd like to see!_ 24 | - _The more links or snippets of knowledge you share, the better we'll all understand the contribution._ 25 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 8 | 9 | # Summary 10 | 11 | - _What changes are proposed in this pull request?_ 12 | 13 | ## Scope and Impact 14 | 15 | - _API Changes?_ 16 | - _Should any users or specific teams be notified of breaking changes?_ 17 | 18 | ## GitHub Issue 19 | - [#XYZ - Issue Name 1](https://github.com/IBM/autopilot/issues) 20 | 21 | ## How was this Pull-Request Tested and Validated? 22 | 23 | - _Steps used to test and validate the changes. Commands and additional content are welcome._ 24 | - _If not applicable mark as N/A._ 25 | 26 | ## Pull-Request Reminders 27 | 28 | - Does the [Autopilot Readme](https://github.com/IBM/autopilot?tab=readme-ov-file#ai-training-autopilot) require updates? 29 | - _Yes or No -- if yes, were they made?_ 30 | 31 | - Are there any new software dependencies introduced to this Pull-Request?
32 | - _Yes or No -- if yes, what are they?_ 33 | -------------------------------------------------------------------------------- /.github/workflows/build-image.yml: -------------------------------------------------------------------------------- 1 | name: Test Build Container Image on PR 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | - 'main' 8 | paths: 9 | - 'autopilot-daemon/**' 10 | 11 | jobs: 12 | docker: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Remove unnecessary files 16 | run: | 17 | sudo rm -rf /usr/share/dotnet 18 | sudo rm -rf /usr/local/lib/android 19 | 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Build and push 24 | uses: docker/build-push-action@v5 25 | with: 26 | context: autopilot-daemon 27 | push: false 28 | tags: test 29 | -------------------------------------------------------------------------------- /.github/workflows/build-push-image-from-main.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Latest Container Image 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - 'main' 8 | paths-ignore: 9 | - '.github/**' 10 | 11 | jobs: 12 | docker: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Remove unnecessary files 16 | run: | 17 | sudo rm -rf /usr/share/dotnet 18 | sudo rm -rf /usr/local/lib/android 19 | 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Docker meta 24 | id: meta 25 | uses: docker/metadata-action@v5 26 | with: 27 | images: quay.io/autopilot/autopilot 28 | tags: latest 29 | 30 | - name: Log into registry 31 | uses: docker/login-action@v3 32 | with: 33 | registry: quay.io 34 | username: ${{ secrets.QUAY_USERNAME }} 35 | password: ${{ secrets.QUAY_PASSWORD }} 36 | 37 | - name: Build and push 38 | uses: docker/build-push-action@v5 39 | with: 40 | context: autopilot-daemon 41 | push: true 42 | tags: ${{ steps.meta.outputs.tags }} 43 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Create New Release - Quay and Helm 4 | on: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | release: 9 | permissions: 10 | contents: write 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 17 | 18 | - name: Configure Git 19 | run: | 20 | git config user.name "$GITHUB_ACTOR" 21 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 22 | - name: Install Helm 23 | uses: azure/setup-helm@v3 24 | with: 25 | token: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | - name: Run chart-releaser 28 | uses: helm/chart-releaser-action@v1.6.0 29 | env: 30 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 31 | CR_SKIP_EXISTING: true 32 | with: 33 | pages_branch: gh-pages 34 | charts_dir: helm-charts 35 | skip_existing: true 36 | packages_with_index: true 37 | token: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | docker: 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Remove unnecessary files 43 | run: | 44 | sudo rm -rf /usr/share/dotnet 45 | sudo rm -rf /usr/local/lib/android 46 | 47 | - name: Checkout 48 | uses: actions/checkout@v3 49 | with: 50 | fetch-depth: 0 51 | 52 | - name: Read helm chart version 53 | run: echo "CHART_VERSION=$(grep '^version:' helm-charts/autopilot/Chart.yaml | cut -d ":" -f2 | tr -d ' ')" >> $GITHUB_ENV 54 | 55 | - name: Checkout 
56 | uses: actions/checkout@v4 57 | 58 | - name: Docker meta 59 | id: meta 60 | uses: docker/metadata-action@v5 61 | with: 62 | images: quay.io/autopilot/autopilot 63 | tags: ${{ env.CHART_VERSION }} 64 | 65 | - name: Log into registry 66 | uses: docker/login-action@v3 67 | with: 68 | registry: quay.io 69 | username: ${{ secrets.QUAY_USERNAME }} 70 | password: ${{ secrets.QUAY_PASSWORD }} 71 | 72 | - name: Build and push 73 | uses: docker/build-push-action@v5 74 | with: 75 | context: autopilot-daemon 76 | push: true 77 | tags: ${{ steps.meta.outputs.tags }} -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/.gitmodules -------------------------------------------------------------------------------- /HEALTH_CHECKS.md: -------------------------------------------------------------------------------- 1 | # Health Checks 2 | 3 | Here is a breakdown of the existing health checks: 4 | 5 | 1. **PCIe Bandwidth Check (pciebw)** 6 | - Description: Host-to-device connection speeds, one measurement per GPU. Codebase in tag [v12.4.1](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest) 7 | - Outputs: Pass/fail results based on PCIe bandwidth thresholds. 8 | - Implementation: Compares bandwidth results to a threshold (e.g., 8 GB/s). If the measured bandwidth falls below the threshold, it triggers a failure. 9 | - It is recommended to set a threshold that is 25% or lower of the expected peak PCIe bandwidth capability, which maps to the maximum peak dropping from 16 lanes to 4 lanes. For example, for a PCIe Gen4x16, the reported peak bandwidth is 63GB/s. A degradation at 25% is 15.75GB/s, which corresponds to PCIe Gen4x4. 10 | - The measured bandwidth is expected to be at least 80% of the expected peak PCIe generation bandwidth. 11 | 2. **GPU Memory Check (remapped)** 12 | - Description: Information from nvidia-smi regarding GPU memory remapped rows. 13 | - Outputs: Reports the state of GPU memory (normal/faulty). 14 | - Implementation: Analyzes remapped rows information to assess potential GPU memory issues. 15 | 3. **GPU Memory Bandwidth Performance (gpumem)** 16 | - Description: Memory bandwidth measurements using DAXPY and DGEMM. 17 | - Outputs: Performance metrics (e.g., TFlops, power). 18 | - Implementation: CUDA code that evaluates memory bandwidth and flags deviations from expected performance values. 19 | 4. **GPU Diagnostics (dcgm)** 20 | - Description: Runs NVidia DCGM diagnostics using dcgmi diag. 21 | - Outputs: Diagnostic results (pass/fail). 22 | - Implementation: Analyzes GPU health, including memory, power, and thermal performance. 23 | 5. **PVC Create/Delete (pvc)** 24 | - Description: Given a storage class, tests if a PVC can be created and deleted. 25 | - Output: Pass/fail depending on the success or failure of creation and deletion of a PVC. If either operation fails, the result is a failure. 26 | - Implementation: Creation of a PVC through K8s APIs. 27 | 6. **Network Reachability Check (ping)** 28 | - Description: Pings between nodes to assess connectivity. 29 | - Outputs: Pass/fail based on ping success. 30 | - Implementation: All-to-all reachability test. 31 | 7. **Network Bandwidth Check (iperf)** 32 | - Description: Tests network bandwidth by launching clients and servers on multiple interfaces through iperf3.
Results from the network tests are aggregated per interface. Further details can be found in [the dedicated page](autopilot-daemon/network/README.md). 33 | - Outputs: Aggregate bandwidth on each interface, per node (in Gb/s). 34 | - Implementation: Tests network bandwidth by launching clients and servers on multiple interfaces and by running a ring topology on all network interfaces found on the pod that are exposed by network controllers like multi-nic CNI, which exposes fast network interfaces in the pods requesting them. Does not run on `eth0`. 35 | 36 | These checks are configured to run periodically (e.g., hourly), and results are accessible via Prometheus, direct API queries or labels on the worker nodes. 37 | 38 | ![image](figures/periodic-check-flow.svg) 39 | 40 | ## Deep Diagnostics and Node Labeling 41 | 42 | Autopilot's periodic health checks will label the worker nodes according to the results obtained. 43 | Lightweight and invasive health checks may use different labeling systems. 44 | 45 | If the health checks, lightweight or invasive, report success, the node is marked with 46 | 47 | ```yaml 48 | autopilot.ibm.com/gpuhealth: PASS 49 | ``` 50 | 51 | When the lightweight health checks report an issue, the node is labelled with 52 | 53 | ```yaml 54 | autopilot.ibm.com/gpuhealth: WARN 55 | ``` 56 | 57 | ### Invasive health checks 58 | 59 | The invasive DCGM diagnostics level 3 health check is executed automatically only on nodes that have free GPUs. This deeper analysis is needed to reveal problems in the GPUs that can be found only by running the level 3 DCGM diagnostic. 60 | 61 | ![image](figures/invasive-check-flow.svg) 62 | 63 | This type of diagnostics can help decide whether the worker node should be used for running workloads. To facilitate this task, Autopilot will label nodes with key `autopilot.ibm.com/dcgm.level.3`. 64 | 65 | If a fatal error is found, the `gpuhealth` label is updated to `EVICT`. 66 | 67 | ```yaml 68 | autopilot.ibm.com/gpuhealth: EVICT 69 | ``` 70 | 71 | Only fatal errors should produce an `EVICT` label. We follow [NVIDIA recommendations](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3), although it is possible to customize the list of tests through the Helm chart. The default values are `[PCIe,NVLink,ECC,GPU Memory]`.
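Because these results are exposed as regular node labels, they can be inspected with standard `kubectl` commands, for example (label keys and values as described above):

```bash
# List nodes currently flagged with a fatal GPU error by the invasive checks
kubectl get nodes -l autopilot.ibm.com/gpuhealth=EVICT

# Show the overall health and DCGM level 3 labels for every node
kubectl get nodes -L autopilot.ibm.com/gpuhealth -L autopilot.ibm.com/dcgm.level.3
```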
72 | 73 | If errors are found during the level 3 diagnostics, the label `autopilot.ibm.com/dcgm.level.3` will contain the result and timestamp related to the latest run, while the annotation `autopilot.ibm.com/dcgm.level.3.output` will contain detailed information about the error in the following format: 74 | 75 | ```yaml 76 | labels: 77 | autopilot.ibm.com/dcgm.level.3: ERR_Year-Month-Date_Hour.Minute.UTC 78 | annotations: 79 | autopilot.ibm.com/dcgm.level.3.output: Diagnostic_Test.gpuID,Diagnostic_Test.gpuID,... 80 | ``` 81 | 82 | - `ERR`: An indicator that an error has occurred 83 | - `Year-Month-Date_Hour.Minute.UTC`: Timestamp of completed diagnostics 84 | - `Diagnostic_Test`: Name of the test that has failed (formatted to replace spaces with underscores) 85 | - `gpuID`: ID of GPU where the failure has occurred 86 | 87 | **Example:** 88 | ``` 89 | labels: 90 | autopilot.ibm.com/dcgm.level.3=ERR_2024-10-10_19.12.03UTC 91 | annotations: 92 | autopilot.ibm.com/dcgm.level.3.output=memory_bandwidth.0.1.2.3 93 | 94 | ``` 95 | 96 | If there are no errors, the value of `autopilot.ibm.com/dcgm.level.3` is set to `PASS_Year-Month-Date_Hour.Minute.UTC` while `autopilot.ibm.com/dcgm.level.3.output` will be empty. 97 | 98 | ### Logs and Metrics 99 | 100 | All health check results are exported through Prometheus, but they can also be found in each pod's logs. 101 | 102 | All metrics are accessible through Prometheus and Grafana dashboards. The gauge exposed is `autopilot_health_checks` and can be customized with the following filters: 103 | 104 | - `check`, select one or more specific health checks 105 | - `node`, filter by node name 106 | - `cpumodel` and `gpumodel`, for heterogeneous clusters 107 | - `deviceid` to select specific GPUs, when available 108 | 109 | For more information on how to set up alerts based on metrics, please refer to the [alert manager folder](alertmanager/README.md). 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TAG=dev 2 | IMAGE=containerregistry:5000/autopilot 3 | 4 | image-build: 5 | @docker build -t ${IMAGE}:v${TAG} -f autopilot-daemon/Dockerfile autopilot-daemon/ 6 | 7 | image-push: 8 | @docker push ${IMAGE}:v${TAG} 9 | 10 | all: image-build image-push 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # AI Training Autopilot 4 | 5 | Autopilot is a Kubernetes-native daemon that continuously monitors and evaluates GPU, network and storage health, designed to detect and report infrastructure-level issues during the lifetime of AI workloads. It is an open-source project developed by IBM Research. 6 | 7 | In AI training jobs, which may run for weeks or months, anomalies in the GPUs and network can happen anytime and often go undetected. In this case, performance degrades suddenly and a deep diagnostic is needed to identify the root cause, delaying or deleting the current job. Similarly, hardware anomalies can greatly disrupt the throughput and latency of an AI inference server. 8 | 9 | The role of Autopilot is to detect and report any problems its health checks uncover, both during the lifetime of a job and throughout the life of the cluster. 10 | 11 | It implements a set of health checks evaluating the status of the system. These health checks focus mainly on subtle software-level issues (e.g., row remapping or PCIe link degradation), but also run connectivity tests (e.g., ping, iperf) to verify that secondary NICs are reachable. It can also verify that persistent volume claim (PVC) creation is functional for a given storage class. 12 | 13 | ![image](figures/autopilot-daemon-pod.png) 14 | 15 | Autopilot is deployed as a Kubernetes DaemonSet on all worker nodes that have GPUs. Each pod exposes a Service that can be accessed through a RESTful API to request the execution of health checks. Therefore, each health check has its own entry point, but a generic “status” entry point is also provided. 16 | 17 | The DaemonSet does not run as privileged and requires access to GPUs without requesting them as resources. Therefore, the GPUs are seen as available by the scheduler. 18 | 19 | The main code is written in Go, while health checks are written in a combination of Python, Go, bash and CUDA. Each Autopilot pod runs health checks only on the node on which it resides. A pod can request other pods to run health checks on their nodes, and in that case, results are gathered and shown by the requestor pod. 20 | 21 | If Autopilot requires full access to GPUs to run more invasive workloads, it will spawn a separate job with resource requests and limits set.
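As a rough illustration, such an invasive run amounts to a Job of the following shape; the metadata and GPU count below are hypothetical, the point being that the GPUs are explicitly requested so the scheduler sees them as busy:

```yaml
# Hypothetical sketch only: the Job name and GPU count are illustrative,
# not the exact spec that Autopilot generates.
apiVersion: batch/v1
kind: Job
metadata:
  name: dcgm-level-3-check   # hypothetical name
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: dcgm-diag
          image: quay.io/autopilot/autopilot:latest
          resources:
            requests:
              nvidia.com/gpu: 8   # reserving the GPUs marks them as busy for the scheduler
            limits:
              nvidia.com/gpu: 8
```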
22 | 23 | ![image](figures/autopilot-main-loop.svg) 24 | 25 | ## Health Checks 26 | 27 | The current status of Autopilot includes: 28 | 29 | - **GPU PCIe Link Bandwidth**: The PCIe NVidia bandwidth test to check host-to-device connection on each node 30 | - **GPU Memory**: GPU remapped rows evaluation through `nvidia-smi` 31 | - **GPU Memory Bandwidth Performance**: GPU memory bandwidth evaluation through DAXPY and DGEMM 32 | - **GPU Diagnostics**: NVidia DCGM (Data Center GPU Manager) diagnostics through `dcgmi diag` 33 | - **GPU Power Slowdown**: verify if power throttle is active through `nvidia-smi` 34 | - **Network Reachability**: `ping` to evaluate host reachability 35 | - **Network Bandwidth**: `iperf3` to evaluate network bandwidth and host connectivity 36 | - **PVC Create/Delete**: given a storage class, test the ability to successfully provision a Persistent Volume Claim 37 | - **DCGM level 3**: deep diagnostics through the NVidia DCGM tool. This test runs as a separate Job that reserves all the GPUs in the node if they are free 38 | 39 | A subset of the tests is enabled by default, and they run every hour. Both the list of health checks and the timer can be customized at initialization time. 40 | 41 | By default, the periodic checks list contains PCIe, row remapping, GPU power, DCGM level 1 and ping. 42 | 43 | Results from health checks are exported as Prometheus Gauges, so that users and admins can easily check the status of the system on Grafana. 44 | 45 | A detailed description of all the health checks can be found in [HEALTH_CHECKS.md](HEALTH_CHECKS.md). 46 | 47 | ### Diagnostics and Node Labeling 48 | 49 | Autopilot's periodic and invasive health checks will label the worker nodes according to the results obtained. 50 | Lightweight and invasive health checks may use different labeling systems. Refer to [HEALTH_CHECKS.md](HEALTH_CHECKS.md) for more details about the label formats. 51 | 52 | The information saved in the labels can be used by admins, kube-scheduler or other workload management systems like [CodeFlare](https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/) to steer the execution of workloads for enhanced fault tolerance. 53 | 54 | ![image](figures/big-picture.svg) 55 | 56 | ## Install 57 | 58 | To learn how to install Autopilot, please refer to [SETUP.md](SETUP.md). 59 | 60 | ## Usage 61 | 62 | To learn how to invoke health checks, please refer to [USAGE.md](USAGE.md). 63 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | 2 | # Install Autopilot 3 | 4 | Autopilot can be installed through Helm and needs enough privileges to create objects like services, serviceaccounts, namespaces and relevant RBAC. 5 | 6 | ## Helm Chart customization 7 | 8 | Helm chart values and a how-to for customization can be found [here](helm-charts/autopilot/README.md). 9 | 10 | ## Install 11 | 12 | 1) Add the autopilot repo 13 | 14 | ```bash 15 | helm repo add autopilot https://ibm.github.io/autopilot/ 16 | ``` 17 | 18 | 2) Install autopilot (idempotent command). The config file is for customizing the helm values. It is not mandatory. If the default values work for you, omit the `-f`.
The `--namespace` parameter specifies where the helm chart will be deployed. 19 | 20 | ```bash 21 | helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f your-config.yml 22 | ``` 23 | 24 | The controllers should show up in the selected namespace 25 | 26 | ```bash 27 | kubectl get po -n autopilot 28 | ``` 29 | 30 | ```bash 31 | NAME                               READY   STATUS    RESTARTS   AGE 32 | autopilot-daemon-autopilot-g7j6h   1/1     Running   0          70m 33 | autopilot-daemon-autopilot-g822n   1/1     Running   0          70m 34 | autopilot-daemon-autopilot-x6h8d   1/1     Running   0          70m 35 | autopilot-daemon-autopilot-xhntv   1/1     Running   0          70m 36 | ``` 37 | 38 | ## Uninstall 39 | 40 | ```bash 41 | helm uninstall autopilot -n autopilot 42 | kubectl delete namespace autopilot 43 | ``` 44 | 45 | ## Enabling Prometheus 46 | 47 | ### Kubernetes Users 48 | 49 | The ServiceMonitor object is the one that enables Prometheus to scrape the metrics produced by Autopilot. 50 | In order for Prometheus to find the right objects, the `ServiceMonitor` needs to be labeled with the Prometheus release name. It is usually `prometheus`, and that's the default added in the Autopilot release. 51 | If that is not the case in your cluster, the correct release label can be found by checking the `ServiceMonitor` of Prometheus itself, or the name of the Prometheus helm chart. 52 | Then, Autopilot's `ServiceMonitor` can be labeled with the following command 53 | 54 | ```bash 55 | kubectl label servicemonitors.monitoring.coreos.com -n autopilot autopilot-metrics-monitor release=<prometheus-release-name> 56 | ``` 57 | 58 | ### OpenShift Users 59 | 60 | **If on OpenShift**, after completing the installation, manually label the namespace to enable metrics to be scraped by Prometheus with the following command. 61 | The `ServiceMonitor` labeling is not required on OpenShift. 62 | 63 | ```bash 64 | kubectl label ns autopilot openshift.io/cluster-monitoring=true 65 | ``` 66 | 67 | ## Enabling Grafana Dashboard 68 | 69 | To deploy the autopilot Grafana dashboard, you need a Grafana instance on your cluster. For instance, Grafana and Prometheus can be installed via the `prometheus-community/kube-prometheus-stack` helm charts. 70 | 71 | The dashboard can be installed by: 72 | 73 | - Importing the `autopilot-dashboard.json` file in the Grafana web console; 74 | - Importing the dashboard id `23123` in the Grafana web console. The dashboard is published in the [Grafana dashboards](https://grafana.com/grafana/dashboards/23123-autopilot-metrics/) website under the name of Autopilot Metrics; 75 | - Applying the `GrafanaDashboard` object provided by running the following command: 76 | 77 | ```bash 78 | kubectl create -f grafana/autopilot-dashboard.yaml [-n <namespace>] 79 | ``` 80 | 81 | The dashboard has some default values, for instance `3.4Gb/s` for the PCIe bandwidth alert threshold, but each value can be customized. 82 | -------------------------------------------------------------------------------- /USAGE.md: -------------------------------------------------------------------------------- 1 | # Manually Query the Autopilot Service 2 | 3 | Autopilot provides a `/status` handler that can be queried to get the entire system status, meaning that it will run all the tests on all the nodes. Autopilot is reachable by service name `autopilot-healthchecks.autopilot.svc` in-cluster only, meaning it can be reached from a pod running in the cluster, or through port forwarding (see below). 4 | 5 | Health check names are `pciebw`, `dcgm`, `remapped`, `ping`, `iperf`, `pvc`, `gpumem`.
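From any pod inside the cluster, the service can be queried directly by its DNS name; for example (assuming the default namespace and service port used throughout this guide):

```bash
# From a pod in the cluster, no port forwarding required
curl "http://autopilot-healthchecks.autopilot.svc:3333/status?check=pciebw&host=nodename1"
```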
6 | 7 | For example, use port forwarding to localhost or expose the service 8 | 9 | ```bash 10 | kubectl port-forward service/autopilot-healthchecks 3333:3333 -n autopilot 11 | # or oc expose service autopilot-healthchecks -n autopilot in OpenShift 12 | ``` 13 | 14 | If using port forwarding, launch `curl` in another terminal 15 | 16 | ```bash 17 | curl "http://localhost:3333/status?check=pciebw&host=nodename1" 18 | ``` 19 | 20 | Alternatively, retrieve the route with `kubectl get routes autopilot-healthchecks -n autopilot`. 21 | When using routes, it is recommended to [increase the timeout](https://docs.openshift.com/container-platform/4.10/networking/routes/route-configuration.html#nw-configuring-route-timeouts_route-configuration) with the following command 22 | 23 | ```bash 24 | oc annotate route autopilot-healthchecks -n autopilot --overwrite haproxy.router.openshift.io/timeout=30m 25 | ``` 26 | 27 | Then: 28 | 29 | ```bash 30 | curl "http://<route>/status?check=pciebw&host=nodename1" 31 | ``` 32 | 33 | All tests can be tailored by a combination of: 34 | 35 | - `host=<hostname1,hostname2,...>`, to run all tests on a specific node or on a comma separated list of nodes. 36 | - `check=<healthcheck>`, to run a single test (`pciebw`, `dcgm`, `remapped`, `gpumem`, `ping`, `iperf` or `all`) or a list of comma separated tests. When no parameters are specified, only `pciebw`, `dcgm`, `remapped`, `ping` tests are run. 37 | - `job=<namespace:key=value>`, run tests on nodes running a job labeled with `key=value` in a specific namespace. 38 | - `nodelabel=<key=value>`, run tests on nodes having the `key=value` label. 39 | - `batch=<#hosts>`, how many hosts to check at a single moment. Requests to the batch are run in parallel asynchronously. Batching is done to avoid running too many requests in parallel when the number of worker nodes increases. Defaults to all nodes. 40 | 41 | Some health checks provide further customization. More details on all the tests can be found in [HEALTH_CHECKS.md](https://github.com/IBM/autopilot/blob/main/HEALTH_CHECKS.md). 42 | 43 | Note that if multiple node selection parameters (`host`, `job`, `nodelabel`) are provided together, Autopilot will run tests on nodes that match _any_ of the specified parameters (set union). For example, the following command will run the `pciebw` test on all nodes that either have the label `label1` OR are running the job `jobKey=job2` because both `nodelabel` and `job` parameters are provided in the input: 44 | 45 | ```bash 46 | curl "http://<route>/status?check=pciebw&nodelabel=label1&job=default:jobKey=job2" 47 | ``` 48 | 49 | ## DCGM 50 | 51 | This test runs `dcgmi diag`, and we support only `r` as a [parameter](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#command-line-options). 52 | 53 | The default is `1`, but you can customize it, e.g., `/status?check=dcgm&r=2`. 54 | 55 | ## Network Bandwidth Validation with IPERF 56 | 57 | As part of this workload, Autopilot will generate the Ring Workload and then start `iperf3 servers` on each interface on each Autopilot pod based on the configuration options provided by the user. Only after the `iperf3 servers` are started will Autopilot begin executing the workload by starting `iperf3 clients` based on the configuration options provided by the user. All results are logged back to the user. 58 | 59 | - For each network interface on each node, an `iperf3 server` is started. The number of `iperf3 servers` is dependent on the `number of clients` intended to be run.
For example, if the `number of clients` is `8`, then there will be `8` `iperf3 servers` started per interface, each on a unique `port`. 60 | 61 | - Invocation from the exposed Autopilot API is as follows: 62 | 63 | ```bash 64 | # Invoked via the `status` handle: 65 | curl "http://127.0.0.1:3333/status?check=iperf&workload=ring&pclients=<number-of-clients>&startport=<server-start-port>" 66 | 67 | # Invoked via the `status` with defaults (iperf clients = 8, starting server port = 5200, workload = ring): 68 | curl "http://127.0.0.1:3333/status?check=iperf" 69 | 70 | # Invoked via the `iperf` handle directly: 71 | curl "http://127.0.0.1:3333/iperf?workload=ring&pclients=<number-of-clients>&startport=<server-start-port>" 72 | 73 | # Invoked via the `iperf` handle directly (iperf clients = 8, starting server port = 5200, workload = ring): 74 | curl "http://127.0.0.1:3333/iperf" 75 | ``` 76 | 77 | ## Concrete Example 78 | 79 | In this example, we target one node, check the PCIe bandwidth, and use the port-forwarding method. 80 | In this scenario, we have a value lower than `8GB/s`, which results in an alert. This error will be exported to the OpenShift web console and to Slack, if that is enabled by admins. 81 | 82 | ```bash 83 | curl "http://127.0.0.1:3333/status?check=pciebw" 84 | ``` 85 | 86 | The output of the command above will be similar to the following (edited to save space): 87 | 88 | ```bash 89 | Checking status on all nodes 90 | Autopilot Endpoint: 10.128.6.187 91 | Node: hostname 92 | url(s): http://10.128.6.187:3333/status?host=hostname&check=pciebw 93 | Response: 94 | Checking system status of host hostname (localhost) 95 | 96 | [[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. 97 | [[ PCIEBW ]] FAIL 98 | Host hostname 99 | 12.3 12.3 12.3 12.3 5.3 12.3 12.3 12.3 100 | 101 | Node Status: PCIE Failed 102 | ------------------------------------- 103 | 104 | 105 | Autopilot Endpoint: 10.131.4.93 106 | Node: hostname2 107 | url(s): http://10.131.4.93:3333/status?host=hostname2&check=pciebw 108 | Response: 109 | Checking system status of host hostname2 (localhost) 110 | 111 | [[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. 112 | [[ PCIEBW ]] SUCCESS 113 | Host hostname2 114 | 12.1 12.0 12.3 12.3 11.9 11.5 12.1 12.1 115 | 116 | Node Status: Ok 117 | ------------------------------------- 118 | 119 | Node Summary: 120 | 121 | {'hostname': ['PCIE Failed'], 122 | 'hostname2': ['Ok']} 123 | 124 | runtime: 31.845192193984985 sec 125 | ``` 126 | -------------------------------------------------------------------------------- /alertmanager/README.md: -------------------------------------------------------------------------------- 1 | # Alerting for autopilot tests on OpenShift clusters 2 | 3 | Autopilot can issue alerts when: 4 | 5 | 1) any health check reports an issue 6 | 2) a node is labeled with `ERR` as a result of a health check 7 | 3) any of the Autopilot pods fail. 8 | 9 | This folder contains the files needed to enable the above alerts and to set up Slack notifications using Prometheus and AlertManager on OpenShift. 10 | 11 | There are 3 main steps to set it up: 12 | 13 | 1) Create `PrometheusRules` (alerting rules) 14 | 2) Create a Slack webhook application 15 | 3) Create an `AlertManager` Receiver 16 | 17 | These steps are explained in more detail below.
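Once the first step is done, you can verify that the rules were created with a standard `oc` query (the object name `autopilot-metrics` comes from the metadata in `healthchecks-alerts.yaml`):

```console
oc get prometheusrules -n openshift-monitoring autopilot-metrics
```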
18 | 19 | ## Create alerting rules for Prometheus 20 | 21 | ```console 22 | oc project openshift-monitoring 23 | oc create -f healthchecks-alerts.yaml 24 | ``` 25 | 26 | Note the following in the example below: 27 | 28 | - The `PrometheusRule` is created in the `openshift-monitoring` namespace - this is the namespace where Prometheus and Alert Manager are deployed on the OpenShift cluster. 29 | - The `alert: autopilot` label is added to match the alert with an Alert Manager receiver that we will create in the last step. This is how Prometheus knows which Alert Manager receiver to send the alert to. 30 | 31 | For example: `sum (autopilot_health_checks{health="pciebw"}<=4) by (node, deviceid, value) > 0` is the PromQL query used to count how many nodes have a GPU device with a PCIe bandwidth of 4 or less. 32 | 33 | ```yaml 34 | - alert: LowPCIeBandwidth 35 | annotations: 36 | description: | 37 | GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has a PCIE bandwidth of {{ $value }} {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 38 | summary: GPU with a PCIe bandwidth of 4 or less 39 | expr: | 40 | sum (autopilot_health_checks{health="pciebw"}<=4) by (node, deviceid, value) > 0 41 | for: 1m 42 | labels: 43 | severity: warning 44 | alert: autopilot 45 | ``` 46 | 47 | ## Observe OpenShift dashboard notifications 48 | 49 | Once you have deployed the above `PrometheusRules`, you should start seeing alerts in the OpenShift dashboard when one of the autopilot tests fails. For example, this alert below warns about low PCIe bandwidth on a GPU device on a node: 50 | ![PCIE Alert](images/pciealert.png) 51 | 52 | ## Create a Slack incoming webhook application 53 | 54 | - Create a Slack workspace using your personal Slack account 55 | - Go to https://slack.com/apps and select your workspace from the dropdown menu in the top right of the page 56 | - Click on `Get Essential Apps` and search the App Directory for `Incoming WebHooks` 57 | 58 | You should see a page like this: 59 | ![Slack Webhook](images/slack.png) 60 | 61 | If there is no existing webhook, you can create one by following the official documentation [here](https://api.slack.com/messaging/webhooks). 62 | 63 | - Click on `Add to Slack` and choose which Slack channel to post messages to from the dropdown menu or create a new channel. 64 | 65 | - Click on `Add Incoming Webhooks Integration` 66 | 67 | - Copy and paste the `Webhook URL`. We will use this when we configure the `AlertManager` Receiver in the next step.
68 | It should look something like this: 69 | 70 | ```bash 71 | https://hooks.slack.com/services/ 72 | ``` 73 | 74 | ## Create an `AlertManager` receiver using Slack through the OpenShift Web UI 75 | 76 | - Log into the OpenShift WebUI as an admin 77 | - Click on Administration -> Cluster Settings -> Configuration -> Alertmanager 78 | 79 | You should see this page: 80 | ![Alert Manager](images/alertmanager.png) 81 | 82 | Click on `Create Receiver` 83 | 84 | - Choose a Receiver name and set the Receiver type as Slack 85 | - Click on `Create` and fill out the following fields: 86 | 87 | - Paste the Slack Webhook URL you copied in the previous step into the `Slack API URL` field 88 | - Write the Slack channel name to send notifications to in the `Channel` field 89 | - Click on `Show advanced configuration` 90 | - We suggest setting the title as follows: 91 | 92 | ```console 93 | [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} 94 | Autopilot Health Check Report 95 | ``` 96 | 97 | - We suggest setting the text as follows: 98 | 99 | ```console 100 | {{ range .Alerts -}} 101 | *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} 102 | *Description:* {{ .Annotations.description }} 103 | *Details:* 104 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 105 | {{ end }} 106 | {{ end }} 107 | ``` 108 | 109 | - In the Routing Labels section, provide the label that we used in the Prometheus `AlertingRule` in the first step. 110 | - Set it to `alert=autopilot`, which is a label added in the various Prometheus Rules in `healthchecks-alerts.yaml`. 111 | This ensures that Prometheus will route the `AlertingRule` we created to this specific `AlertManager` receiver. 112 | 113 | - Click on `Save` 114 | 115 | This will generate a yaml file like `alertmanager.yaml` in this folder and will update the `AlertManager` pod configuration to add your new receiver. Now we will start receiving alerts from the Prometheus `AlertingRule` we created. Note that in `alertmanager.yaml`, there is a `critical` receiver to catch all the `critical` alerts. Some of the Autopilot alerts also have the `severity=critical` label (for instance, `dcgm level 3` or `ping`), but those will be captured by the `alert=autopilot` label anyway. 116 | 117 | You can check the status of the `AlertManager` pod with this command: 118 | 119 | ```console 120 | oc -n openshift-monitoring logs -l 'alertmanager=main' 121 | ``` 122 | 123 | That's it! Now you can get notifications in Slack every time an autopilot test fails or if any of the pods fail. If there is something else you wish to get notifications for, you simply need to create a new `PrometheusRule` with a new `expr` and label, and create a new `AlertManager` Slack receiver with a matching label. 124 | 125 | Below is an example of a Slack alert of a firing and then resolved rule.
126 | ![Slack Messages](images/slack-alert-example.png) 127 | -------------------------------------------------------------------------------- /alertmanager/alertmanager.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | slack_api_url: >- 4 | 5 | inhibit_rules: 6 | - equal: 7 | - namespace 8 | - alertname 9 | source_matchers: 10 | - severity = critical 11 | - alert = slack 12 | target_matchers: 13 | - severity =~ warning|info 14 | - equal: 15 | - namespace 16 | - alertname 17 | source_matchers: 18 | - severity = warning 19 | target_matchers: 20 | - severity = info 21 | - equal: 22 | - namespace 23 | source_matchers: 24 | - alertname = InfoInhibitor 25 | target_matchers: 26 | - severity = info 27 | receivers: 28 | - name: Autopilot 29 | slack_configs: 30 | - channel: 31 | send_resolved: true 32 | text: |- 33 | {{ range .Alerts -}} 34 | *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} 35 | *Description:* {{ .Annotations.description }} 36 | *Details:* 37 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 38 | {{ end }} 39 | {{ end }} 40 | title: >- 41 | [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing 42 | | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ 43 | .CommonLabels.job }} 44 | Autopilot Health Check Report 45 | - name: Critical 46 | slack_configs: 47 | - channel: 48 | link_names: true 49 | send_resolved: true 50 | text: |- 51 | {{ range .Alerts -}} 52 | *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} 53 | *Description:* {{ .Annotations.description }} 54 | *Details:* 55 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 56 | {{ end }} 57 | {{ end }} 58 | title: >- 59 | [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing 60 | | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ 61 | .CommonLabels.job }} 62 | - name: Default 63 | - name: 'null' 64 | - name: Watchdog 65 | route: 66 | group_by: 67 | - namespace 68 | group_interval: 5m 69 | group_wait: 30s 70 | receiver: Default 71 | repeat_interval: 12h 72 | routes: 73 | - matchers: 74 | - alertname = Watchdog 75 | receiver: Watchdog 76 | - matchers: 77 | - alertname = InfoInhibitor 78 | receiver: 'null' 79 | - receiver: Critical 80 | matchers: 81 | - severity = critical 82 | - receiver: Autopilot 83 | matchers: 84 | - alert = autopilot 85 | -------------------------------------------------------------------------------- /alertmanager/alerts/healthchecks-alerts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | name: autopilot-metrics 5 | namespace: openshift-monitoring 6 | labels: 7 | app: autopilot 8 | spec: 9 | groups: 10 | - name: Alerts on GPU related issues 11 | rules: 12 | - alert: AutopilotLowPCIeBandwidth 13 | annotations: 14 | description: | 15 | GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has a PCIE bandwidth of {{ $value }}{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 
16 | summary: GPU with a PCIe bandwidth of 4 or less 17 | expr: | 18 | sum (autopilot_health_checks{health="pciebw"}<=4) by (node, deviceid, value) > 0 19 | for: 1m 20 | labels: 21 | severity: warning 22 | alert: autopilot 23 | - alert: AutopilotDCGMErrors 24 | annotations: 25 | description: | 26 | GPUs on node {{ $labels.node }} have DCGM failures{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 27 | summary: GPUs have DCGM failures 28 | expr: | 29 | sum (autopilot_health_checks{health="dcgm"}==1) by (node) 30 | for: 1m 31 | labels: 32 | severity: warning 33 | alert: autopilot 34 | - alert: AutopilotGPUPowerSlowdownEnabled 35 | annotations: 36 | description: | 37 | GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has power slowdown enabled 38 | summary: A GPU has power slowdown enabled{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 39 | expr: | 40 | sum (autopilot_health_checks{health="power-slowdown"}==1) by (node, deviceid) 41 | for: 1m 42 | labels: 43 | severity: warning 44 | alert: autopilot 45 | - alert: AutopilotRemappedRowsActive 46 | annotations: 47 | description: | 48 | GPU device {{ $labels.deviceid}} on node {{ $labels.node }} with incorrect remapped rows in memory{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 49 | summary: A GPU device has incorrect remapped rows 50 | expr: | 51 | sum (autopilot_health_checks{health="remapped"}==1) by (node, deviceid) 52 | for: 1m 53 | labels: 54 | severity: warning 55 | alert: autopilot 56 | - alert: AutopilotDCGMLevel3Errors 57 | annotations: 58 | description: | 59 | A node reported errors after running DCGM level 3 - check health of nodes{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 60 | summary: Node {{ $labels.node }} has GPU errors 61 | expr: | 62 | kube_node_labels{label_autopilot_ibm_com_dcgm_level_3=~".*ERR.*"} and kube_node_labels{label_autopilot_ibm_com_dcgm_level_3!~""} 63 | for: 5m 64 | labels: 65 | severity: critical 66 | alert: autopilot 67 | - name: Alerts on network related issues 68 | rules: 69 | - alert: AutopilotPingFailures 70 | annotations: 71 | description: | 72 | Node {{ $labels.node }} cannot reach IP {{ $labels.deviceid }}{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 73 | summary: Node has unreachable IPs 74 | expr: | 75 | sum (autopilot_health_checks{health="ping"} > 0) by (deviceid) 76 | for: 10m 77 | labels: 78 | severity: critical 79 | alert: autopilot 80 | - name: Alerts on PVC related issues 81 | rules: 82 | - alert: AutopilotPVCAlert 83 | annotations: 84 | description: | 85 | PVC creation by Autopilot on node {{ $labels.node }} failed{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 
86 | summary: PVC cannot be created 87 | expr: | 88 | sum (autopilot_health_checks{health="pvc"}==1) by (node) 89 | for: 5m 90 | labels: 91 | severity: critical 92 | alert: autopilot 93 | - name: Generic alert on periodic check failure 94 | rules: 95 | - alert: AutopilotGPUNodeHealth 96 | annotations: 97 | description: | 98 | Node {{ $labels.node }} reported errors after running Autopilot's periodic health checks{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 99 | summary: Node {{ $labels.node }} has errors 100 | expr: | 101 | kube_node_labels{label_autopilot_ibm_com_gpuhealth=~".*ERR.*"} and kube_node_labels{label_autopilot_ibm_com_gpuhealth!~""} 102 | for: 1m 103 | labels: 104 | severity: warning 105 | alert: autopilot 106 | - name: Alerts on Autopilot pods not ready 107 | rules: 108 | - alert: AutopilotPodsNotReady 109 | annotations: 110 | description: Autopilot pod on node {{ $labels.node }} is not ready{{ with $console_url := "console_url" | query }}{{ if ne 111 | (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url 112 | ) }}{{ end }}{{ end }}. 113 | summary: Autopilot pod on node {{ $labels.node }} is not ready 114 | expr: count by (namespace) (kube_pod_info and on (pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"autopilot.*"} > 0 or kube_pod_container_status_terminated_reason{reason=~"Error", namespace=~"autopilot.*"} > 0)) 115 | for: 15m 116 | labels: 117 | severity: critical 118 | alert: autopilot -------------------------------------------------------------------------------- /alertmanager/images/alertmanager.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/alertmanager.png -------------------------------------------------------------------------------- /alertmanager/images/create-receiver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/create-receiver.png -------------------------------------------------------------------------------- /alertmanager/images/pciealert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/pciealert.png -------------------------------------------------------------------------------- /alertmanager/images/slack-alert-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/slack-alert-example.png -------------------------------------------------------------------------------- /alertmanager/images/slack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/slack.png -------------------------------------------------------------------------------- /autopilot-daemon/Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel as cudabuild 2 | FROM 
pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as cudabuild 3 | 4 | RUN apt -y update && apt -y upgrade && apt -y clean && apt -y autoremove \ 5 | && apt install -y --no-install-recommends build-essential git wget openssh-server && \ 6 | apt -y clean && apt -y autoremove 7 | 8 | RUN git clone --depth 1 --branch v12.4.1 https://github.com/NVIDIA/cuda-samples.git 9 | WORKDIR cuda-samples/Samples/1_Utilities/bandwidthTest 10 | 11 | RUN make SMS="80 86 90" 12 | 13 | WORKDIR /workspace 14 | 15 | COPY gpu-mem/gpucheck.cu . 16 | 17 | RUN nvcc -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 gpucheck.cu -o gpucheck -lcublas --linker-options -lnvidia-ml -O3 18 | 19 | FROM golang:1.21 AS gobuild 20 | 21 | ENV GOOS=linux 22 | ENV GOARCH=amd64 23 | ENV CGO_ENABLED=0 24 | 25 | WORKDIR /workspace 26 | COPY . /workspace/ 27 | 28 | RUN go build -o bin/autopilot pkg/cmd/main.go 29 | 30 | ####################### Final Image 31 | 32 | # FROM python:3.9.15-slim 33 | FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04 34 | RUN apt -y update && apt -y upgrade && DEBIAN_FRONTEND="noninteractive" TZ="America/New_York" apt install -y --no-install-recommends \ 35 | build-essential iperf3 iputils-ping \ 36 | python3 \ 37 | pip \ 38 | pciutils \ 39 | wget \ 40 | net-tools \ 41 | software-properties-common \ 42 | git \ 43 | && apt -y clean && apt -y autoremove 44 | 45 | RUN add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && apt -y update && apt install -y datacenter-gpu-manager 46 | # add ca-certificates (Alpine commands, previous base image) 47 | # RUN apk update && apk --no-cache add ca-certificates 48 | # RUN adduser -s /bin/bash -D -h /home/autopilot autopilot -G root 49 | 50 | # RDMA ping utils 51 | # RUN apt -y update && apt -y upgrade && apt -y install build-essential automake autoconf libtool libibverbs-dev librdmacm-dev libibumad-dev pciutils libpci-dev 52 | # RUN git clone https://github.com/linux-rdma/perftest.git && \ 53 | # cd perftest && \ 54 | # ./autogen.sh && ./configure && \ 55 | # make && make install 56 | # Add capabilities for ping 57 | RUN setcap cap_net_raw,cap_net_admin+p /bin/ping 58 | 59 | RUN useradd -ms /bin/bash autopilot && usermod -g root autopilot 60 | 61 | # set working directory 62 | WORKDIR /home/autopilot 63 | 64 | COPY --from=gobuild /workspace/bin/autopilot /usr/local/bin/autopilot 65 | 66 | # PCIe tests files 67 | COPY --from=cudabuild /workspace/cuda-samples/Samples/1_Utilities/bandwidthTest/bandwidthTest /home/autopilot/gpu-bw/bandwidthTest 68 | COPY gpu-bw/gpuLocalBandwidthTest.sh /home/autopilot/gpu-bw/gpuLocalBandwidthTest.sh 69 | COPY gpu-bw/entrypoint.py /home/autopilot/gpu-bw/entrypoint.py 70 | 71 | # DGEMM DAXPY test files 72 | 73 | COPY --from=cudabuild /workspace/gpucheck /home/autopilot/gpu-mem/gpucheck 74 | COPY gpu-mem/entrypoint.py /home/autopilot/gpu-mem/entrypoint.py 75 | 76 | 77 | # Network tests files 78 | # COPY network/metrics-entrypoint.py /home/autopilot/network/metrics-entrypoint.py 79 | COPY network/ping-entrypoint.py /home/autopilot/network/ping-entrypoint.py 80 | COPY network/iperf3_entrypoint.py /home/autopilot/network/iperf3_entrypoint.py 81 | COPY network/iperf3_utils.py /home/autopilot/network/iperf3_utils.py 82 | COPY network/network_workload.py /home/autopilot/network/network_workload.py 83 | COPY network/iperf3_start_servers.py /home/autopilot/network/iperf3_start_servers.py 84 | COPY network/iperf3_stop_servers.py 
/home/autopilot/network/iperf3_stop_servers.py 85 | COPY network/iperf3_start_clients.py /home/autopilot/network/iperf3_start_clients.py 86 | 87 | # Remapped Rows test files 88 | COPY gpu-remapped/entrypoint.py /home/autopilot/gpu-remapped/entrypoint.py 89 | COPY gpu-remapped/remapped-rows.sh /home/autopilot/gpu-remapped/remapped-rows.sh 90 | 91 | COPY utils /home/autopilot/utils 92 | 93 | # DGCM test files and dependencies 94 | COPY gpu-dcgm/entrypoint.py /home/autopilot/gpu-dcgm/entrypoint.py 95 | # RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/datacenter-gpu-manager_3.1.8_amd64.deb && dpkg --install datacenter-gpu-manager_3.1.8_amd64.deb 96 | 97 | # GPU Power cap 98 | COPY gpu-power/power-throttle.sh /home/autopilot/gpu-power/power-throttle.sh 99 | 100 | # Last touches 101 | RUN pip install --upgrade pip && pip install kubernetes netifaces aiohttp[speedups] 102 | RUN apt -y update && apt install -y vim curl && apt -y clean && apt -y autoremove 103 | RUN chmod 755 /usr/local/bin/autopilot && chown -hR autopilot /home/autopilot && chmod -R g=u /home/autopilot 104 | RUN chmod 777 /tmp 105 | 106 | 107 | 108 | CMD ["/usr/local/bin/autopilot"] 109 | -------------------------------------------------------------------------------- /autopilot-daemon/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/IBM/autopilot 2 | 3 | go 1.21 4 | 5 | toolchain go1.21.1 6 | 7 | require ( 8 | github.com/prometheus/client_golang v1.15.0 9 | github.com/thanhpk/randstr v1.0.6 10 | k8s.io/api v0.29.2 11 | k8s.io/apimachinery v0.29.2 12 | k8s.io/client-go v0.29.2 13 | k8s.io/klog/v2 v2.110.1 14 | k8s.io/kubectl v0.29.2 15 | ) 16 | 17 | require ( 18 | github.com/beorn7/perks v1.0.1 // indirect 19 | github.com/cespare/xxhash/v2 v2.2.0 // indirect 20 | github.com/davecgh/go-spew v1.1.1 // indirect 21 | github.com/emicklei/go-restful/v3 v3.11.0 // indirect 22 | github.com/go-logr/logr v1.3.0 // indirect 23 | github.com/go-openapi/jsonpointer v0.19.6 // indirect 24 | github.com/go-openapi/jsonreference v0.20.2 // indirect 25 | github.com/go-openapi/swag v0.22.3 // indirect 26 | github.com/gogo/protobuf v1.3.2 // indirect 27 | github.com/golang/protobuf v1.5.3 // indirect 28 | github.com/google/gnostic-models v0.6.8 // indirect 29 | github.com/google/go-cmp v0.6.0 // indirect 30 | github.com/google/gofuzz v1.2.0 // indirect 31 | github.com/google/uuid v1.3.0 // indirect 32 | github.com/josharian/intern v1.0.0 // indirect 33 | github.com/json-iterator/go v1.1.12 // indirect 34 | github.com/mailru/easyjson v0.7.7 // indirect 35 | github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect 36 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 37 | github.com/modern-go/reflect2 v1.0.2 // indirect 38 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 39 | github.com/prometheus/client_model v0.3.0 // indirect 40 | github.com/prometheus/common v0.42.0 // indirect 41 | github.com/prometheus/procfs v0.9.0 // indirect 42 | golang.org/x/net v0.23.0 // indirect 43 | golang.org/x/oauth2 v0.10.0 // indirect 44 | golang.org/x/sys v0.18.0 // indirect 45 | golang.org/x/term v0.18.0 // indirect 46 | golang.org/x/text v0.14.0 // indirect 47 | golang.org/x/time v0.3.0 // indirect 48 | google.golang.org/appengine v1.6.7 // indirect 49 | google.golang.org/protobuf v1.33.0 // indirect 50 | gopkg.in/inf.v0 v0.9.1 // indirect 51 | gopkg.in/yaml.v2 v2.4.0 // indirect 52 | 
gopkg.in/yaml.v3 v3.0.1 // indirect 53 | k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect 54 | k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect 55 | sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect 56 | sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect 57 | sigs.k8s.io/yaml v1.3.0 // indirect 58 | ) 59 | -------------------------------------------------------------------------------- /autopilot-daemon/gpu-bw/entrypoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | def main(): 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-t', '--threshold', type=str, default='4') 9 | args = parser.parse_args() 10 | output = os.popen('bash ./utils/briefings.sh') 11 | result = output.read() 12 | # print(result) 13 | 14 | if "ABORT" not in result: 15 | print("[[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation.") 16 | output = os.popen('./gpu-bw/gpuLocalBandwidthTest.sh -t ' + args.threshold) 17 | result = output.read() 18 | 19 | if "ABORT" in result or "SKIP" in result: 20 | print("[[ PCIEBW ]] ABORT") 21 | print(result) 22 | exit() 23 | 24 | print("SUCCESS") 25 | print("Host ", os.getenv("NODE_NAME")) 26 | splitres = result.split("\n") 27 | bws = "" 28 | for line in splitres: 29 | if "Bandwidth =" in line: 30 | x = line.split("= ", 2) 31 | y = x[1].split(" GB/s") 32 | bws += y[0] + " " 33 | print(bws.strip()) 34 | else: 35 | print("[[ PCIEBW ]] ABORT") 36 | print(result) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() -------------------------------------------------------------------------------- /autopilot-daemon/gpu-bw/gpuLocalBandwidthTest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # MH: 4 | # This file is supposed to be used for GPU instance in LLM cluster with PXB topology. Test the localhost only 5 | # This version can detect more than 8 GPUs but may not correctly work on systems with more than 8 GPUs 6 | # 7 | # Requirement: pre-compiled bandwidthTest from cuda_samples on instances. 8 | # 9 | # Usage: 10 | # A. Update PROG and FN in the script. Threshold T is set to 7 by default 11 | # B. run gpuLocalBandwidthTest.sh 12 | # C. Check the last line of output: SUCCESS or FAIL 13 | # 14 | # Note: some parameters are hard coded. You may want to change them for different environment. 15 | # 16 | # Find me at minghungchen@ibm.com if any questions 17 | # 18 | # Ver. 1.3 19 | 20 | PROG="/home/autopilot/gpu-bw/bandwidthTest" 21 | 22 | 23 | while getopts t:f: flag 24 | do 25 | case "${flag}" in 26 | t) T=${OPTARG};; 27 | esac 28 | done 29 | echo "Threshold: $T"; 30 | 31 | RES=$(ls -d /dev/nvidia* 2>1) 32 | numre='^[0-9]+$' 33 | D=-1 34 | for d in $RES; do 35 | d=${d#*"nvidia"*} 36 | if [[ "$d" =~ $numre ]]; then 37 | D=0 38 | break 39 | fi 40 | done 41 | if [[ $D -eq 0 ]]; then 42 | echo -n "Detected NVIDIA GPU: " 43 | for d in $RES; do 44 | d=${d#*"nvidia"*} 45 | if [[ "$d" =~ $numre ]]; then 46 | echo -n "$d " 47 | D=$((D+1)) 48 | fi 49 | done 50 | echo "Total: $D" 51 | else 52 | echo "No NVIDIA GPU detected. Skipping the bandwidth test." 
53 | echo "SKIP" 54 | exit 0 55 | fi 56 | 57 | D=$((D-1)) 58 | for i in $(seq 0 1 $D) ; do 59 | EXEC+="$($PROG --htod --memory=pinned --device=$i --csv 2>&1)" 60 | EXEC+="\n" 61 | done 62 | errors="$(echo ${EXEC} | grep -i '802\|error')" 63 | if [[ -n $errors ]]; then 64 | echo "CRITICAL ERROR WITH GPUs" 65 | echo "ABORT" 66 | echo -e $EXEC 67 | else 68 | echo -e $EXEC 69 | echo "SUCCESS" 70 | fi -------------------------------------------------------------------------------- /autopilot-daemon/gpu-dcgm/entrypoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | import os 4 | import argparse 5 | import re 6 | import datetime 7 | from kubernetes import client, config 8 | from kubernetes.client.rest import ApiException 9 | 10 | config.load_incluster_config() 11 | v1 = client.CoreV1Api() 12 | nodename = os.getenv("NODE_NAME") 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-r', '--run', type=str, default='1') 16 | parser.add_argument('-l', '--label_node', action='store_true') 17 | parser.add_argument('-v', '--verbose', action='store_true') 18 | args = parser.parse_args() 19 | 20 | def main(): 21 | output = os.popen('bash ./utils/briefings.sh') 22 | result = output.read() 23 | print(result) 24 | 25 | if "ABORT" not in result: 26 | print("[[ DCGM ]] Briefings completed. Continue with dcgm evaluation.") 27 | command = ['dcgmi', 'diag', '-j', '-r', args.run] 28 | try_dcgm(command,args.run) 29 | else: 30 | print("[[ DCGM ]] ABORT") 31 | print(result) 32 | 33 | 34 | # translate key-strings into lowercase and strip spaces 35 | def unify_string_format(key: str) -> str: 36 | to_lower = key.strip().lower() 37 | res, _ = re.subn('[\/|\s]', '_', to_lower) 38 | return res 39 | 40 | def parse_all_results(result: str): 41 | dcgm_dict = json.loads(result) 42 | tests_dict = dcgm_dict['DCGM GPU Diagnostic']['test_categories'] 43 | success = True 44 | output = "" 45 | for category in tests_dict: 46 | for test in category['tests']: 47 | test_failing=False 48 | for result in test['results']: 49 | if result['status'] == 'Fail': 50 | success = False 51 | if test_failing is False: 52 | output += f'{unify_string_format(test["name"])}' 53 | test_failing = True 54 | output += f'{"." + str(result["gpu_id"]) if "gpu_id" in result else "NoGPUid"}' 55 | return success, output 56 | 57 | 58 | # parsing the json result string based on a comma-separated list of paths (levels separated by '.') 59 | def parse_selected_results(result: str, testpaths: str): 60 | ''' 61 | follow the list of selected paths down the dcgm json tree 62 | 63 | the specification of the paths: .. 64 | 65 | to walk down this example json snippet below your path should be: 66 | 67 | 'DCGM GPU Diagnostic.Hardware.GPU Memory' 68 | 69 | for the search, all strings are turned to lowercase and spaces are replaced with '_' 70 | therefore the following path would achieve the same: 71 | 72 | 'dcgm_gpu_diagnostic.HARDWare.gpu Memory' 73 | 74 | "DCGM GPU Diagnostic" : { 75 | "test_categories" : [ { 76 | ... 77 | "category" : "Hardware", 78 | "tests" : [ { 79 | "name" : "GPU Memory", 80 | "results" : [ { 81 | "gpu_id" : "0", 82 | "status" : "Fail", 83 | ... 
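as a concrete illustration, exporting AUTOPILOT_DCGM_RESULT_PATHS='dcgm_gpu_diagnostic.hardware.gpu_memory' would select exactly the snippet above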
84 | 85 | 86 | The paths need to be specified in env variable AUTOPILOT_DCGM_RESULT_PATHS as a comma-separated list 87 | If the variable is not set, then the regular scan is performed 88 | ''' 89 | _dcgm_json_levels = [ 90 | ("top_level","dcgm_gpu_diagnostic"), 91 | ("category","tests"), 92 | ("name","results") 93 | ] 94 | 95 | 96 | # scan the dictionary and recursively transform all keys using key_update 97 | def normalize_json_keys(data) -> dict: 98 | ndata = {} 99 | if not isinstance(data, dict) and not isinstance(data, list): 100 | return data 101 | for key,val in data.items(): 102 | key_n = unify_string_format(key) 103 | 104 | if isinstance(val, dict): 105 | val_n = normalize_json_keys(data[key]) 106 | elif isinstance(val, list): 107 | val_n = [ normalize_json_keys(v) for v in val ] 108 | else: 109 | val_n = data[key] 110 | 111 | ndata[ key_n ] = val_n 112 | 113 | # unfortunately, the top level of dcgm dict is structured differently from the rest, 114 | # adjusting by inserting/moving it's sub-dict into top-level and rename 115 | if _dcgm_json_levels[0][1] in ndata: 116 | ndata[_dcgm_json_levels[0][0]] = _dcgm_json_levels[0][1] # replace old dcgm_gpu_diagnostics with 'top_level' as a name 117 | ndata[_dcgm_json_levels[0][1]] = ndata[_dcgm_json_levels[0][1]].pop("test_categories") # move test_categories entry to new 'top_level' 118 | return ndata 119 | 120 | 121 | # recursively dive into the json tree by following a given path 122 | def dive_to_test(data, jpath: list[str], depth: int): 123 | assert( 3-len(jpath) == depth ) 124 | assert( depth < 3 ) 125 | 126 | jlevel_spec = _dcgm_json_levels[depth] 127 | 128 | if not isinstance(data, list): 129 | data = [data] 130 | for entry in data: 131 | if jlevel_spec[0] in entry and jpath[0] == unify_string_format( entry[jlevel_spec[0]] ): 132 | if depth == 2: 133 | return entry[ jlevel_spec[1] ] 134 | else: 135 | return dive_to_test( entry[ jlevel_spec[1] ], jpath[1:], depth+1 ) 136 | return 137 | 138 | # browses the result section of a single test and extracts info 139 | def parse_single_test_result(data) -> tuple[bool, str]: 140 | if not data: 141 | return False, "No Data" 142 | if not isinstance(data, list): 143 | data = [data] 144 | 145 | success = True 146 | output = [] 147 | for entry in data: 148 | if "status" in entry: 149 | good = (unify_string_format(entry['status']) == 'pass') 150 | success &= good 151 | if not good: 152 | output.append( ( 153 | entry["gpu_id"] if "gpu_id" in entry else "NoGPU_ID", 154 | entry["info"] if "info" in entry else "NoInfo" 155 | )) 156 | else: 157 | success &= False 158 | output.append( ("No Status") ) 159 | return success,output 160 | 161 | # create output from the parsed results (can be adjusted to whatever) 162 | def build_output(output_list: tuple[str, str]) -> str: 163 | print(output_list) 164 | output = "" 165 | for test,result in output_list: 166 | if len(output): 167 | output += ";" 168 | output += f'{unify_string_format(test)}:' 169 | for result_data in result: 170 | for r in result_data: 171 | output += f'{unify_string_format(r)},' 172 | return output 173 | 174 | jdata = json.load(result) 175 | norm_d = normalize_json_keys(jdata) 176 | 177 | result_list = [] 178 | overall_success = True 179 | for path in testpaths.split(','): 180 | single_test_result = dive_to_test( norm_d, [ unify_string_format(p) for p in path.split('.') ], 0 ) 181 | test_success,output = parse_single_test_result(single_test_result) 182 | overall_success &= test_success 183 | if not test_success: 184 | result_list.append( (path, 
output) ) 185 | return overall_success, build_output(result_list) 186 | 187 | 188 | 189 | def try_dcgm(command,run_level): 190 | result = subprocess.run(command, text=True, capture_output=True) 191 | return_code = result.returncode # 0 for success 192 | if return_code != 0: 193 | print("[[ DCGM ]] DCGM process terminated with errors. Other processes might be running on GPUs. ABORT") 194 | command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv'] 195 | try: 196 | proc = subprocess.run(command, check=True, text=True, capture_output=True) 197 | except subprocess.CalledProcessError: 198 | print("[[ DCGM ]] nvidia-smi check terminated with errors. ABORT") 199 | exit() 200 | if proc.stdout: 201 | print("[[ DCGM ]] GPUs currently utilized:\n", proc.stdout) 202 | 203 | if result.stderr: 204 | print(result.stderr) 205 | print("[[ DCGM ]] exited with error: " + result.stderr + " ERR") 206 | else: 207 | testpaths = os.getenv("AUTOPILOT_DCGM_RESULT_PATHS") 208 | if args.verbose: 209 | print(result.stdout) 210 | if testpaths == None: 211 | success, output = parse_all_results(result.stdout) 212 | if success: 213 | print("[[ DCGM ]] SUCCESS") 214 | else: 215 | print("Host", nodename) 216 | print("[[ DCGM ]] FAIL") 217 | if args.label_node: 218 | patch_node(success, output,run_level) 219 | 220 | 221 | def patch_node(success, output,run_level): 222 | now = datetime.datetime.now(datetime.timezone.utc) 223 | timestamp = now.strftime("%Y-%m-%d_%H.%M.%SUTC") 224 | result = "" 225 | general_health = "PASS" 226 | try: 227 | k8s_node = v1.read_node(nodename) 228 | except ApiException as e: 229 | print("Exception when calling corev1api->read_node: %s\n" % e) 230 | exit() 231 | 232 | node_labels = k8s_node.metadata.labels 233 | if os.getenv("DCGM_FATAL_ERRORS") == "": 234 | # Only fatal errors should produce an EVICT label. Based on https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3 235 | dcgm_fatal_errors = ['PCIe','NVLink','ECC','GPU Memory'] 236 | else: 237 | dcgm_fatal_errors = os.getenv("DCGM_FATAL_ERRORS") 238 | 239 | if success and node_labels["autopilot.ibm.com/gpuhealth"] in ["PASS", "TESTING"]: 240 | # If there is some other warning coming from other tests, i.e., ping or storage, we would overwrite this information. Let's play it safe at this point. 241 | result = "PASS_"+timestamp 242 | elif not success: 243 | result = "ERR_"+timestamp 244 | general_health = "WARN" 245 | for error in dcgm_fatal_errors: 246 | unified = unify_string_format(error) 247 | if unified in output: 248 | general_health = "EVICT" 249 | 250 | label = { 251 | "metadata": { 252 | "labels": { 253 | f"autopilot.ibm.com/dcgm.level.{run_level}": result, 254 | "autopilot.ibm.com/gpuhealth": general_health}, 255 | "annotations": { 256 | f"autopilot.ibm.com/dcgm.level.{run_level}.output": str(output) 257 | } 258 | } 259 | } 260 | try: 261 | api_response = v1.patch_node(nodename, label) 262 | except ApiException as e: 263 | print("Exception when calling corev1api->patch_node: %s\n" % e) 264 | exit() 265 | 266 | if __name__ == '__main__': 267 | main() 268 | -------------------------------------------------------------------------------- /autopilot-daemon/gpu-mem/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def main(): 4 | output = os.popen('bash ./utils/briefings.sh') 5 | result = output.read() 6 | 7 | if "ABORT" not in result: 8 | print("[[ GPU-MEM ]] Briefings completed. 
Continue with memory evaluation.") 9 | output = os.popen('./gpu-mem/gpucheck') 10 | result = output.read() 11 | if "NONE" in result: 12 | print("[[ GPU-MEM ]] Health Check successful") 13 | exit() 14 | 15 | print("[[ GPU-MEM ]] Health Check unsuccessful. FAIL.") 16 | print(result) 17 | exit() 18 | 19 | if __name__ == '__main__': 20 | main() -------------------------------------------------------------------------------- /autopilot-daemon/gpu-mem/gpucheck.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define MAX_BLOCKS 512 11 | #define THREADS_PER_BLOCK 256 12 | #define btoa(x) ((x)?"true":"false") 13 | 14 | double cuda_dgemm(const char *, const char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); 15 | void cuda_dgemm_free(); 16 | 17 | #define CUDA_RC(rc) if( (rc) != cudaSuccess ) \ 18 | {printf("Error %s at %s line %d\n", cudaGetErrorString(cudaGetLastError()), __FILE__,__LINE__); exit(1);} 19 | 20 | #define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \ 21 | {printf("Error %s at %s line %d\n", cudaGetErrorString(cudaGetLastError()), __FILE__,__LINE__-1); exit(1);} 22 | 23 | double walltime(void); 24 | 25 | __global__ void daxpy(const double alpha, const double * x, double * y, int npts) 26 | { 27 | for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < npts; i += blockDim.x * gridDim.x) y[i] = alpha*x[i] + y[i]; 28 | } 29 | 30 | static nvmlDevice_t nvmldevice; 31 | static unsigned int temperature, power, smMHz; 32 | 33 | int main(int argc, char * argv[]) 34 | { 35 | int i, d, npts, iter, maxiter, mydevice, numDevices; 36 | double * __restrict__ x, * __restrict__ y; 37 | double * dev_x, * dev_y; 38 | double * Amat, * Bmat, * Cmat; 39 | int m, n, k, lda, ldb, ldc; 40 | double alpha, beta; 41 | double BW_pinned_h2d, BW_pageable_h2d, BW_pinned_d2h, BW_pageable_d2h, BW_daxpy, TFlops; 42 | double time1, time2; 43 | cudaDeviceProp prop; 44 | double * metrics; 45 | nvmlDevice_t *device; 46 | unsigned int device_count; 47 | 48 | npts = 1024*1024*(1024/8); 49 | 50 | 51 | // initialize nvml 52 | if (NVML_SUCCESS != nvmlInit()) { 53 | fprintf(stderr, "failed to initialize NVML ... exiting\n"); 54 | } 55 | 56 | if (NVML_SUCCESS != nvmlDeviceGetCount(&device_count)) { 57 | fprintf(stderr, "nvmlDeviceGetCount failed ... exiting\n"); 58 | } 59 | 60 | device = (nvmlDevice_t *) malloc(device_count*sizeof(nvmlDevice_t)); 61 | 62 | for (i = 0; i < device_count; i++) { 63 | if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &device[i])) { 64 | fprintf(stderr, "nvmlDeviceGetHandleByIndex failed ... 
exiting\n"); 65 | } 66 | } 67 | 68 | // set matrix dimensions large enough to reach close to peak Flops 69 | m = 8192; n = 8192; k = 8192; 70 | Amat = (double *) malloc(m*k*sizeof(double)); 71 | Bmat = (double *) malloc(k*n*sizeof(double)); 72 | Cmat = (double *) malloc(m*n*sizeof(double)); 73 | 74 | #pragma omp parallel for 75 | for (i=0; i<(m*k); i++) Amat[i] = 1.2e-2*((double) (i%100)); 76 | #pragma omp parallel for 77 | for (i=0; i<(k*n); i++) Bmat[i] = 1.5e-3*((double) ((i + 100)%1000)); 78 | #pragma omp parallel for 79 | for (i=0; i<(m*n); i++) Cmat[i] = 1.5e-3*((double) ((i + 500)%1000)); 80 | 81 | CUDA_RC(cudaGetDeviceCount(&numDevices)); 82 | 83 | 84 | metrics = (double *) malloc(numDevices*9*sizeof(double)); 85 | y = (double *) malloc(npts*sizeof(double)); 86 | 87 | bool* faulty = (bool*) malloc(numDevices*sizeof(bool)); 88 | for (i = 0; i < numDevices; ++i) 89 | faulty[i] = false; 90 | 91 | 92 | for (d = 0; d < numDevices; d++) { 93 | mydevice = d; /*local_rank % numDevices;*/ 94 | 95 | // assign nvmldevice to this rank's GPU 96 | nvmldevice = device[mydevice]; 97 | 98 | CUDA_RC(cudaSetDevice(mydevice)); 99 | CUDA_RC(cudaGetDeviceProperties(&prop, mydevice)); 100 | 101 | // use pinned memory for x, pageable memory for y 102 | CUDA_RC(cudaMallocHost((void **)&x, npts*sizeof(double))); 103 | // y = (double *) malloc(npts*sizeof(double)); 104 | 105 | CUDA_RC(cudaMalloc((void **)&dev_x, npts*sizeof(double))); 106 | CUDA_RC(cudaMalloc((void **)&dev_y, npts*sizeof(double))); 107 | 108 | #pragma omp parallel for 109 | for (i=0; i MAX_BLOCKS) numBlocks = MAX_BLOCKS; 157 | 158 | time1 = walltime(); 159 | for (iter=0; iter>>(alpha, dev_x, dev_y, npts); 161 | CUDA_CHECK(); 162 | } 163 | CUDA_RC(cudaDeviceSynchronize()); 164 | time2 = walltime(); 165 | 166 | BW_daxpy = 3.0*8.0e-9*((double) npts)*((double) maxiter)/(time2 - time1); 167 | metrics[9*d+4] = BW_daxpy; 168 | if(BW_daxpy < 1300) 169 | faulty[d] = true; 170 | 171 | // free(y); 172 | CUDA_RC(cudaFreeHost(x)); 173 | CUDA_RC(cudaFree(dev_x)); 174 | CUDA_RC(cudaFree(dev_y)); 175 | 176 | beta = 0.0; lda = m; ldb = k; ldc = m; 177 | TFlops = cuda_dgemm("N", "N", &m, &n, &k, &alpha, Amat, &lda, Bmat, &ldb, &beta, Cmat, &ldc); 178 | cuda_dgemm_free(); 179 | metrics[9*d+5] = TFlops; 180 | if(TFlops < 16) 181 | faulty[d] = true; 182 | 183 | metrics[9*d+6] = (double) temperature; 184 | metrics[9*d+7] = 1.0e-3*((double) power); // convert to Watts 185 | metrics[9*d+8] = (double) smMHz; 186 | } 187 | printf(" GPU H2D(p) H2D D2H(p) D2H daxpy dgemm temp power smMHz\n"); 188 | for (d = 0; d < numDevices; d++) { 189 | printf("%3d %6.2lf %6.2lf %6.2lf %6.2lf %7.2lf %6.2lf %6.0lf %8.0lf %8.0lf\n", 190 | d, metrics[9*d], metrics[9*d+1], metrics[9*d+2], metrics[9*d+3], metrics[9*d+4], metrics[9*d+5], metrics[9*d+6], metrics[9*d+7], metrics[9*d+8]); 191 | } 192 | printf("Summary of GPU errors:"); 193 | bool allgood = true; 194 | for (d = 0; d < numDevices; d++) { 195 | if (faulty[d]) { 196 | allgood = false; 197 | printf("GPU %d -- H2D(p): %f; daxpy: %f; dgemm: %f", d, metrics[9*d+0], metrics[9*d+4], metrics[9*d+5]); 198 | } 199 | } 200 | if (allgood) { 201 | printf(" NONE "); 202 | } 203 | free(y); 204 | free(metrics); 205 | free(faulty); 206 | return 0; 207 | } 208 | 209 | double walltime(void) 210 | { 211 | double elapsed; 212 | struct timeval tv; 213 | gettimeofday(&tv,NULL); 214 | elapsed = ((double) tv.tv_sec) + 1.0e-6*((double) tv.tv_usec); 215 | return elapsed; 216 | } 217 | 218 | 219 | // variables for cublas dgemm wrapper 220 | static double * 
d_A, * d_B, * d_C; 221 | static cublasHandle_t handle; 222 | 223 | // use the Fortran dgemm argument list 224 | double cuda_dgemm(const char * transa, const char * transb, int * m, int * n, int * k, 225 | double * alpha, double * A, int * lda, double * B, int * ldb, 226 | double * beta, double * C, int * ldc) 227 | { 228 | int M, N, K, LDA, LDB, LDC; 229 | int asize, bsize, csize; 230 | double time1, time2, TFlops; 231 | cublasOperation_t opA, opB; 232 | int iter, maxiter = 400, sample_iter = 350; 233 | 234 | M = *m; N = *n; K = *k; 235 | LDA = *lda; LDB = *ldb; LDC = *ldc; 236 | 237 | asize = M*K; 238 | bsize = K*N; 239 | csize = M*N; 240 | 241 | cublasCreate(&handle); 242 | cudaMalloc((void **)&d_A, asize*sizeof(double)); 243 | cudaMalloc((void **)&d_B, bsize*sizeof(double)); 244 | cudaMalloc((void **)&d_C, csize*sizeof(double)); 245 | 246 | cublasSetVector(asize, sizeof(double), A, 1, d_A, 1); 247 | cublasSetVector(bsize, sizeof(double), B, 1, d_B, 1); 248 | cublasSetVector(csize, sizeof(double), C, 1, d_C, 1); 249 | 250 | if (transa[0] == 'n' || transa[0] == 'N') opA = CUBLAS_OP_N; 251 | else if (transa[0] == 't' || transa[0] == 'T') opA = CUBLAS_OP_T; 252 | 253 | if (transb[0] == 'n' || transb[0] == 'N') opB = CUBLAS_OP_N; 254 | else if (transb[0] == 't' || transb[0] == 'T') opB = CUBLAS_OP_T; 255 | 256 | 257 | // call one time outside the timers, then time it 258 | cublasDgemm(handle, opA, opB, M, N, K, alpha, d_A, LDA, d_B, LDB, beta, d_C, LDC); 259 | cudaDeviceSynchronize(); 260 | 261 | time1 = walltime(); 262 | for (iter = 0; iter < maxiter; iter++) { 263 | cublasDgemm(handle, opA, opB, M, N, K, alpha, d_A, LDA, d_B, LDB, beta, d_C, LDC); 264 | if (iter == sample_iter) { 265 | if (NVML_SUCCESS != nvmlDeviceGetTemperature(nvmldevice, NVML_TEMPERATURE_GPU, &temperature)) temperature = 0; 266 | if (NVML_SUCCESS != nvmlDeviceGetPowerUsage(nvmldevice, &power)) power = 0; 267 | if (NVML_SUCCESS != nvmlDeviceGetClockInfo(nvmldevice, NVML_CLOCK_SM, &smMHz)) smMHz = 0; 268 | } 269 | cudaDeviceSynchronize(); 270 | } 271 | time2 = walltime(); 272 | TFlops = 2.0e-12*((double) maxiter)*((double) M)*((double) N)*((double) K)/(time2 - time1); 273 | 274 | cudaMemcpy(C, d_C, csize*sizeof(double), cudaMemcpyDeviceToHost); 275 | 276 | return TFlops; 277 | } 278 | 279 | void cuda_dgemm_free() 280 | { 281 | cudaFree(d_A); 282 | cudaFree(d_B); 283 | cudaFree(d_C); 284 | cublasDestroy(handle); 285 | return; 286 | } 287 | -------------------------------------------------------------------------------- /autopilot-daemon/gpu-power/power-throttle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUT="$(bash /home/autopilot/utils/briefings.sh | grep ABORT)" 3 | echo ${OUT} 4 | if [[ ! -z $OUT ]]; then 5 | echo "[[GPU POWER]] ABORT" 6 | exit 0 7 | fi 8 | echo "[[GPU POWER]] Briefings completed. Continue with power cap evaluation." 9 | 10 | RES=$(ls -d /dev/nvidia* 2>1) 11 | numre='^[0-9]+$' 12 | D=-1 13 | for d in $RES; do 14 | d=${d#*"nvidia"*} 15 | if [[ "$d" =~ $numre ]]; then 16 | D=0 17 | break 18 | fi 19 | done 20 | if [[ $D -eq 0 ]]; then 21 | echo -n "[GPU POWER] Detected NVIDIA GPU: " 22 | for d in $RES; do 23 | d=${d#*"nvidia"*} 24 | if [[ "$d" =~ $numre ]]; then 25 | echo -n "$d " 26 | D=$((D+1)) 27 | fi 28 | done 29 | echo "Total: $D" 30 | else 31 | echo "[GPU POWER] No NVIDIA GPU detected. Skipping the Power Throttle check." 
32 | echo "ABORT" 33 | exit 0 34 | fi 35 | RESULT="" 36 | FAIL=0 37 | for i in $(seq 0 1 $((D-1))) ; do 38 | OUT=$(nvidia-smi --format=csv -i $i --query-gpu=clocks_event_reasons.hw_slowdown) 39 | NOTACTIVE=$(echo $OUT | grep "Not Active") 40 | if [[ ! -z "$NOTACTIVE" ]]; then 41 | RESULT+="0 " 42 | else 43 | RESULT+="1 " 44 | FAIL=1 45 | fi 46 | done 47 | if [[ $FAIL -ne 0 ]]; then 48 | echo "[GPU POWER] FAIL" 49 | else 50 | echo "[GPU POWER] SUCCESS" 51 | fi 52 | echo $RESULT -------------------------------------------------------------------------------- /autopilot-daemon/gpu-remapped/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def main(): 5 | output = os.popen('bash ./utils/briefings.sh') 6 | result = output.read() 7 | print(result) 8 | 9 | if "ABORT" not in result: 10 | print("[[ REMAPPED ROWS ]] Briefings completed. Continue with remapped rows evaluation.") 11 | output = os.popen('./gpu-remapped/remapped-rows.sh') 12 | result = output.read() 13 | if "FAIL" not in result: 14 | print("[[ REMAPPED ROWS ]] SUCCESS") 15 | else: 16 | print("[[ REMAPPED ROWS ]] FAIL") 17 | print("Host ", os.getenv("NODE_NAME")) 18 | print(result.strip()) 19 | return 0 20 | print("Host ", os.getenv("NODE_NAME")) 21 | print(result.strip()) 22 | else: 23 | print("[[ REMAPPED ROWS ]] ABORT") 24 | print(result.strip()) 25 | 26 | if __name__ == '__main__': 27 | main() -------------------------------------------------------------------------------- /autopilot-daemon/gpu-remapped/remapped-rows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RES=$(ls -d /dev/nvidia* 2>/dev/null) 3 | numre='^[0-9]+$' 4 | D=-1 5 | for d in $RES; do 6 | d=${d#*"nvidia"*} 7 | if [[ "$d" =~ $numre ]]; then 8 | D=0 9 | break 10 | fi 11 | done 12 | if [[ $D -eq 0 ]]; then 13 | echo -n "Detected NVIDIA GPU: " 14 | for d in $RES; do 15 | d=${d#*"nvidia"*} 16 | if [[ "$d" =~ $numre ]]; then 17 | echo -n "$d " 18 | D=$((D+1)) 19 | fi 20 | done 21 | echo "Total: $D" 22 | else 23 | echo "No NVIDIA GPU detected. Skipping the Remapped Rows check." 24 | echo "SKIP" 25 | exit 0 26 | fi 27 | RESULT="" 28 | FAIL=0 29 | for i in $(seq 0 1 $((D-1))) ; do 30 | OUT=$(nvidia-smi -q -i $i | grep -A 10 "Remapped Rows") 31 | REMAPPED=$(echo $OUT | egrep "Pending\s*:\s+Yes") 32 | if [[ -z "$REMAPPED" ]]; then 33 | RESULT+="0 " 34 | else 35 | RESULT+="1 " 36 | FAIL=1 37 | fi 38 | done 39 | if [[ $FAIL -eq 1 ]]; then 40 | echo FAIL 41 | fi 42 | echo $RESULT -------------------------------------------------------------------------------- /autopilot-daemon/network/README.md: -------------------------------------------------------------------------------- 1 | # Network Validation Tests 2 | 3 | Autopilot provides two network validation tests: 4 | 5 | - Reachability: runs `ping` against all network interfaces available in all the Autopilot pods 6 | - Bandwidth: runs `iperf3` to validate the network bandwidth available. 7 | 8 | ## Iperf 9 | 10 | This test, in its current form, primarily runs `TCP` `data plane` `port-to-port` network workloads to gather key performance statistics. It performs a `Ring Traversal` (or, as we call it, a `ring workload`) through all network interfaces (net1-X interfaces) at varying intensity (number of simultaneous clients & servers per interface). Future versions of Autopilot may provide more workloads and further customization of the workloads. 
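In short, the schedule is the set of `n-1` rotations of the node list: at timestep `k`, node `i` sends to node `(i + k) mod n`. A minimal Python sketch of how such a schedule can be derived (illustrative only; `network_workload.py` in this directory is the actual generator, and `ring_timesteps` here is a hypothetical helper name):

```python
# Illustrative sketch: derive the n-1 "timesteps" of a ring workload.
# At timestep k, node i targets node (i + k) mod n, so every node acts
# as a client exactly once per timestep and no node targets itself.
def ring_timesteps(nodes):
    n = len(nodes)
    return {
        f"timestep_{k}": [{nodes[i]: nodes[(i + k) % n]} for i in range(n)]
        for k in range(1, n)
    }

# For ["A", "B", "C"]:
#   timestep_1 -> A->B, B->C, C->A
#   timestep_2 -> A->C, B->A, C->B
print(ring_timesteps(["A", "B", "C"]))
```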
11 | 12 | ### Ring workload 13 | A "Ring Workload", in our case, is similar to the commonly known "Ring Topology": the execution calls flow sequentially in a particular _direction_, forming a "ring"-like pattern. _Most importantly, none of the compute infrastructure is actually configured in a ring; we merely develop workloads that resemble a ring pattern._ The motivation for these workloads is to achieve full line-rate throughput on a port-by-port basis (in our case, network interfaces net1-X) for a single logical cluster. 14 | 15 | Assume we have the following set of nodes `[A,B,C]`. We can create a `ring` starting from node `A` that flows in the direction of `C`: 16 | 17 | ```console 18 | A -> B 19 | B -> C 20 | C -> A 21 | ``` 22 | 23 | In our case, a "Ring Workload" will exhaust all starting points. We call these iterations `timesteps`. In a compute infrastructure with `n` nodes, there will be `n-1` total timesteps. Said differently, there are `n-1` possible starting points that form a ring such that no node flows to itself. All pairs in a given timestep execute in parallel. 24 | 25 | ```console 26 | Timestep 1: 27 | ------------ 28 | A -> B 29 | B -> C 30 | C -> A 31 | 32 | Timestep 2: 33 | ------------ 34 | A -> C 35 | B -> A 36 | C -> B 37 | ``` 38 | 39 | As part of this workload, Autopilot first generates the Ring Workload and then starts `iperf3 servers` on each interface of each Autopilot pod, based on the configuration options provided by the user. Only after the `iperf3 servers` are started does Autopilot begin executing the workload by starting `iperf3 clients`, again based on the configuration options provided by the user. All results are logged back to the user. 40 | 41 | For each network interface on each node, an `iperf3 server` is started. The number of `iperf3 servers` depends on the `number of clients` intended to be run. For example, if the `number of clients` is `8`, then `8` `iperf3 servers` are started per interface, each on a unique `port`. 42 | 43 | For each timestep, all `pairs` are executed simultaneously. For each pair, some `number of clients` are started in parallel and run for `5 seconds` using `zero-copy` mode against the respective `iperf3 server`. 44 | 45 | Metrics such as `minimum`, `maximum`, `mean`, and `aggregate` bitrates and transfers are tracked for both the `sender` and the `receiver` for each `client -> server` execution. The results are stored both as `JSON` in the respective `pod` and summarized in the `pod logs`. 46 | 47 | Invocation through the exposed Autopilot API is shown below: 48 | 49 | ```bash 50 | # Invoked via the `status` handle: 51 | curl "http://127.0.0.1:3333/status?check=iperf&workload=ring&pclients=&startport=" 52 | 53 | # Invoked via the `iperf` handle directly: 54 | curl "http://127.0.0.1:3333/iperf?workload=ring&pclients=&startport=" 55 | ``` 56 | -------------------------------------------------------------------------------- /autopilot-daemon/network/iperf3_entrypoint.py: -------------------------------------------------------------------------------- 1 | from iperf3_utils import * 2 | from network_workload import NetworkWorkload 3 | 4 | parser = argparse.ArgumentParser() 5 | 6 | parser.add_argument( 7 | "--workload", 8 | type=str, 9 | default="ring", 10 | help=('The type of network workload. 
Supported workload values: "ring"'), 11 | ) 12 | 13 | parser.add_argument( 14 | "--pclients", 15 | type=str, 16 | default="8", 17 | help=( 18 | 'The number of clients to run in parallel. Note. This is not using the iperf3 "-P" option. ' 19 | 'This spawns "N" number of iperf3 client instances in parallel to a target server. For each client, ' 20 | 'a respective port on the target server will be pinned. For instance, if there are 3 "pclients" ' 21 | "specified, then there will be 3 instances of a particular network interface on 3 different ports." 22 | ), 23 | ) 24 | 25 | parser.add_argument( 26 | "--startport", 27 | type=str, 28 | default="5200", 29 | help=( 30 | 'The default port value. In the event that "pclients" is greater than 1, the default port value used ' 31 | "to generate servers will automatically increase to accomdate the clients running in parallel." 32 | ), 33 | ) 34 | 35 | parser.add_argument( 36 | "--cleanup", 37 | action="store_true", 38 | help=("When provided, this will kill ALL iperf servers on every node."), 39 | ) 40 | 41 | args = vars(parser.parse_args()) 42 | 43 | 44 | async def make_server_connection(event, address, handle): 45 | """ 46 | Handles connections to the target autopilot pod on a different worker-node. 47 | Attempts to ensure synchronization via asyncio events... 48 | 49 | Args: 50 | address (str): The address of the autopilot pod. 51 | handle (str): The endpoint handle for the connection. 52 | 53 | """ 54 | try: 55 | if event != None: 56 | await event.wait() 57 | url = f"http://{address}:{AUTOPILOT_PORT}{handle}" 58 | total_timeout = aiohttp.ClientTimeout(total=60 * 10) 59 | async with aiohttp.ClientSession(timeout=total_timeout) as session: 60 | async with session.get(url) as resp: 61 | reply = await resp.text() 62 | except Exception as e: 63 | # If we can't create servers we'll need to exit...something has gone wrong 64 | # with the network. 65 | log.error(f"Error when creating server on {address} at {handle}: {e}") 66 | sys.exit(1) 67 | 68 | 69 | async def make_client_connection(event, iface, src, dst, address, handle): 70 | # Task waits for the event to be set before starting its work. 71 | try: 72 | if event != None: 73 | await event.wait() 74 | url = f"http://{address}:{AUTOPILOT_PORT}{handle}" 75 | total_timeout = aiohttp.ClientTimeout(total=60 * 10) 76 | async with aiohttp.ClientSession(timeout=total_timeout) as session: 77 | async with session.get(url) as resp: 78 | reply = await resp.text() 79 | reply = "".join(reply.split()) 80 | try: 81 | json_reply = json.loads(reply) 82 | except json.JSONDecodeError as e: 83 | log.error( 84 | f"Failed to decode JSON from response: {e}. Response: {reply}" 85 | ) 86 | return {"src": src, "dst": dst, "iface": iface, "data": {}} 87 | 88 | return {"src": src, "dst": dst, "iface": iface, "data": json_reply} 89 | except Exception as e: 90 | log.error(f"Error during client connection to {address} at {handle}: {e}") 91 | log.error(f"Failure occured with from src {src} to dst {dst} on iface {iface}") 92 | return {"src": src, "dst": dst, "iface": iface, "data": {}} 93 | 94 | 95 | async def iperf_start_servers(node_map, num_servers, port_start): 96 | """ 97 | Starts iperf3 servers on each node by sending requests to the corresponding endpoints 98 | derived in the node_map. Each server will be launched from the corresponding autopilot 99 | pod that the endpoint represents on the worker-node. 100 | 101 | Args: 102 | node_map (dict): A dictionary mapping worker-nodes to representation data. 
103 | num_servers (str): The number of iperf3 servers to start on each node. 104 | port_start (str) The port to start launching servers from on each node. 105 | """ 106 | tasks = [ 107 | make_server_connection( 108 | None, 109 | node_map[node]["endpoint"], 110 | f"/iperfservers?numservers={num_servers}&startport={port_start}", 111 | ) 112 | for node in node_map 113 | ] 114 | await asyncio.gather(*tasks) 115 | 116 | 117 | async def run_workload(workload_type, nodemap, workload, num_clients, port_start): 118 | """ 119 | Starts network tests according to the specified workload. 120 | 121 | Args: 122 | workload_type (str): A workload type to run. 123 | node_map (dict): A dictionary mapping node names to their endpoints, pods, and network interfaces. 124 | workload (dict): A dictionary specifying the workload and steps for the network tests. 125 | num_clients (str): The number of parallel clients to test against the server (used to also increase port val.) 126 | port_start (str): A port associated to the server, 127 | """ 128 | if SupportedWorkload.RING.value == workload_type: 129 | event = asyncio.Event() 130 | # All the nodes "should have" the same amount of interfaces...let's just get the first node and check how many there are... 131 | # This is also assuming that the ordering of the ifaces in this list are accurate...i.e., starting with net1-0 and so forth 132 | netifaces_count = len(nodemap[next(iter(nodemap))]["netifaces"]) 133 | results = [] 134 | for iface in range(netifaces_count): 135 | interface_results = [] 136 | log.info(f"Running Interface net1-{iface}") 137 | for step in workload: 138 | tasks = [] 139 | for pair in workload[step]: 140 | for source, target in pair.items(): 141 | task = make_client_connection( 142 | event, 143 | f"net1-{iface}", 144 | f"{nodemap[source]['pod']}_on_{source}", 145 | f"{nodemap[target]['pod']}_on_{target}", 146 | nodemap[source]["endpoint"], 147 | f"/iperfclients?dstip={nodemap[target]['netifaces'][iface]}&dstport={port_start}&numclients={num_clients}", 148 | ) 149 | tasks.append(task) 150 | await asyncio.sleep(1) 151 | event.set() 152 | res = await asyncio.gather(*tasks) 153 | interface_results.append(res) 154 | results.append(interface_results) 155 | 156 | grids = [] 157 | summary_avg = [] 158 | for i, el in enumerate(results): 159 | grid = {} 160 | total_bitrate = 0 161 | count = 0 162 | for l in el: 163 | for host in l: 164 | src = host["src"] 165 | dst = host["dst"] 166 | if host["data"] == {}: 167 | # Failure had occured resulting in a 0.0 bitrate. 
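# (keeping the zero means the src/dst cell still shows up in the printed throughput grid, so dead links stay visible)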
168 | bitrate = 0.0 169 | else: 170 | bitrate = float( 171 | host["data"]["receiver"]["aggregate"]["bitrate"] 172 | ) 173 | count = count + 1 174 | total_bitrate = total_bitrate + bitrate 175 | if src not in grid: 176 | grid[src] = {} 177 | grid[src][dst] = bitrate 178 | avg = str(round(Decimal(total_bitrate / count), 2)) 179 | summary_avg.append(f"net1-{i} Average Bandwidth Gb/s: {avg}") 180 | grids.append(grid) 181 | 182 | for i, grid in enumerate(grids): 183 | print(f"Network Throughput net1-{i}:") 184 | pods = sorted(grid.keys()) 185 | print(f"{'src/dst':<40}" + "".join(f"{dst:<40}" for pod in pods)) 186 | for src_pod in pods: 187 | row = [f"{grid[src_pod].get(dst_pod, 'N/A'):<40}" for dst_pod in pods] 188 | print(f"{src_pod:<40}" + "".join(row)) 189 | print() 190 | 191 | print("Overall Network Interface Average Bandwidth:") 192 | for i in summary_avg: 193 | print(i) 194 | 195 | else: 196 | log.error("Unsupported Workload Attempted") 197 | sys.exit(1) 198 | 199 | 200 | async def cleanup_iperf_servers(node_map): 201 | """ 202 | Removes all started iperf servers across all nodes. 203 | 204 | Args: 205 | node_map (dict): A dictionary mapping worker-nodes to representation data. 206 | """ 207 | tasks = [ 208 | make_server_connection( 209 | None, 210 | node_map[node]["endpoint"], 211 | f"/iperfstopservers", 212 | ) 213 | for node in node_map 214 | ] 215 | await asyncio.gather(*tasks) 216 | 217 | 218 | async def main(): 219 | type_of_workload = args["workload"].upper() 220 | num_parallel_clients = args["pclients"] 221 | port_start = args["startport"] 222 | cleanup_iperf = args["cleanup"] 223 | 224 | wl = NetworkWorkload() 225 | autopilot_node_map = wl.gen_autopilot_node_map_json() 226 | if type_of_workload in (workload.value for workload in SupportedWorkload): 227 | if SupportedWorkload.RING.value == type_of_workload: 228 | ring_workload = wl.generate_ring_topology_json(autopilot_node_map) 229 | await iperf_start_servers( 230 | autopilot_node_map, num_parallel_clients, port_start 231 | ) 232 | await run_workload( 233 | type_of_workload, 234 | autopilot_node_map, 235 | ring_workload, 236 | num_parallel_clients, 237 | port_start, 238 | ) 239 | 240 | else: 241 | # 242 | # TODO: Build other workloads... 
243 | # 244 | log.error("Unsupported Workload Attempted") 245 | sys.exit(1) 246 | else: 247 | log.error("Unsupported Workload Attempted") 248 | sys.exit(1) 249 | 250 | if cleanup_iperf: 251 | await cleanup_iperf_servers(autopilot_node_map) 252 | 253 | 254 | if __name__ == "__main__": 255 | asyncio.run(main()) 256 | 257 | -------------------------------------------------------------------------------- /autopilot-daemon/network/iperf3_start_clients.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import json 4 | from decimal import Decimal 5 | from iperf3_utils import * 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--dstip", type=str, default="", help="IP for iperf3 server") 9 | parser.add_argument("--dstport", type=int, default=5200, help="Port for iperf3 server") 10 | parser.add_argument("--numclients", type=int, default=1, help="Number of clients") 11 | args = parser.parse_args() 12 | 13 | 14 | async def run_iperf_client(dstip, dstport, iteration, duration_seconds): 15 | dstport += iteration 16 | command = [ 17 | "iperf3", 18 | "-c", 19 | dstip, 20 | "-p", 21 | str(dstport), 22 | "-t", 23 | duration_seconds, 24 | "-i", 25 | "1.0", 26 | "-Z", 27 | ] 28 | 29 | default_res = { 30 | "interface": {"ip": dstip, "port": dstport}, 31 | "results": { 32 | "sender": { 33 | "transfer": {"rate": 0.0, "units": "n/a"}, 34 | "bitrate": {"rate": 0.0, "units": "n/a"}, 35 | }, 36 | "receiver": { 37 | "transfer": {"rate": 0.0, "units": "n/a"}, 38 | "bitrate": {"rate": 0.0, "units": "n/a"}, 39 | }, 40 | }, 41 | } 42 | 43 | try: 44 | process = await asyncio.wait_for( 45 | asyncio.create_subprocess_exec( 46 | *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE 47 | ), 48 | timeout=60, 49 | ) 50 | stdout, stderr = await process.communicate() 51 | output_filename = f"{dstip}_{dstport}_client.log" 52 | with open(output_filename, "w") as f: 53 | f.write(stdout.decode()) 54 | except Exception as e: 55 | return default_res 56 | 57 | # In theory this should not occur since we catch this above...but just to be safe let's ensure 58 | # the return code is zero... 
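# (iperf3 exits non-zero when it cannot connect to the server or times out; default_res already carries the full zero-filled {interface, results} shape, so it is returned directly)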
59 | if process.returncode != 0: 60 | return default_res 61 | 62 | result = { 63 | "sender": { 64 | "transfer": {"rate": 0.0, "units": "n/a"}, 65 | "bitrate": {"rate": 0.0, "units": "n/a"}, 66 | }, 67 | "receiver": { 68 | "transfer": {"rate": 0.0, "units": "n/a"}, 69 | "bitrate": {"rate": 0.0, "units": "n/a"}, 70 | }, 71 | } 72 | iperf3_stdout = stdout.decode().strip().splitlines() 73 | for line in iperf3_stdout: 74 | line = line.lower() 75 | if "sender" in line: 76 | parts = line.split() 77 | result["sender"]["transfer"] = {"rate": parts[4], "units": parts[5]} 78 | result["sender"]["bitrate"] = {"rate": parts[6], "units": parts[7]} 79 | elif "receiver" in line: 80 | parts = line.split() 81 | result["receiver"]["transfer"] = {"rate": parts[4], "units": parts[5]} 82 | result["receiver"]["bitrate"] = {"rate": parts[6], "units": parts[7]} 83 | return {"interface": {"ip": dstip, "port": dstport}, "results": result} 84 | 85 | 86 | def calculate_stats(values, num_clients): 87 | return { 88 | "aggregate": { 89 | "transfer": str(round(Decimal(sum(values["transfer"])), 2)), 90 | "bitrate": str(round(Decimal(sum(values["bitrate"])), 2)), 91 | }, 92 | "mean": { 93 | "transfer": str(round(Decimal(sum(values["transfer"]) / num_clients), 2)), 94 | "bitrate": str(round(Decimal(sum(values["bitrate"]) / num_clients), 2)), 95 | }, 96 | "min": { 97 | "transfer": str(round(Decimal(min(values["transfer"])), 2)), 98 | "bitrate": str(round(Decimal(min(values["bitrate"])), 2)), 99 | }, 100 | "max": { 101 | "transfer": str(round(Decimal(max(values["transfer"])), 2)), 102 | "bitrate": str(round(Decimal(max(values["bitrate"])), 2)), 103 | }, 104 | } 105 | 106 | 107 | async def main(): 108 | dstip, dstport, numclients = args.dstip, args.dstport, args.numclients 109 | duration_seconds = "5" 110 | 111 | tasks = [ 112 | asyncio.create_task(run_iperf_client(dstip, dstport, i, duration_seconds)) 113 | for i in range(numclients) 114 | ] 115 | results = await asyncio.gather(*tasks) 116 | 117 | sender_values = {"transfer": [], "bitrate": []} 118 | receiver_values = {"transfer": [], "bitrate": []} 119 | 120 | total_results = {} 121 | for idx, result in enumerate(results): 122 | total_results[str(idx)] = result 123 | sender_values["transfer"].append( 124 | float(result["results"]["sender"]["transfer"]["rate"]) 125 | ) 126 | sender_values["bitrate"].append( 127 | float(result["results"]["sender"]["bitrate"]["rate"]) 128 | ) 129 | receiver_values["transfer"].append( 130 | float(result["results"]["receiver"]["transfer"]["rate"]) 131 | ) 132 | receiver_values["bitrate"].append( 133 | float(result["results"]["receiver"]["bitrate"]["rate"]) 134 | ) 135 | 136 | stats = { 137 | "sender": calculate_stats(sender_values, numclients), 138 | "receiver": calculate_stats(receiver_values, numclients), 139 | } 140 | 141 | total_results["stats"] = stats 142 | summary_file = f"{dstip}_summary.json" 143 | with open(summary_file, "w") as f: 144 | json.dump(total_results, f, indent=4) 145 | print(json.dumps(stats, indent=4)) 146 | 147 | 148 | if __name__ == "__main__": 149 | asyncio.run(main()) 150 | 151 | -------------------------------------------------------------------------------- /autopilot-daemon/network/iperf3_start_servers.py: -------------------------------------------------------------------------------- 1 | from iperf3_utils import * 2 | 3 | parser = argparse.ArgumentParser() 4 | parser.add_argument( 5 | "--numservers", 6 | type=int, 7 | default=1, 8 | help=( 9 | 'The number of 
servers (on different ports) to have running on a single IP. Note. For "numservers" values greater than 1 ' 10 | 'the "startport" value will be adjusted for each subsequently started server by a factor of 1.' 11 | ), 12 | ) 13 | 14 | parser.add_argument( 15 | "--startport", 16 | type=int, 17 | default=5200, 18 | help=( 19 | 'The default port value. In the event that "numservers" is greater than 1, the default port value used ' 20 | "to generate servers will automatically increase to accomdate the clients running in parallel." 21 | ), 22 | ) 23 | args = vars(parser.parse_args()) 24 | 25 | 26 | def main(): 27 | num_server = args["numservers"] 28 | port = args["startport"] 29 | interfaces = [] 30 | entrylist = json.loads('{}') 31 | 32 | try: 33 | config.load_incluster_config() 34 | v1 = client.CoreV1Api() 35 | except: 36 | log.error("Failed to load Kubernetes CoreV1API.") 37 | exit(1) 38 | try: 39 | autopilot_pods = v1.list_namespaced_pod( 40 | namespace=AUTOPILOT_NAMESPACE, field_selector="metadata.name="+CURR_POD_NAME 41 | ) 42 | except ApiException as e: 43 | log.error( 44 | "Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e 45 | ) 46 | exit(1) 47 | 48 | pod = autopilot_pods.items[0] 49 | try: 50 | entrylist = json.loads( 51 | pod.metadata.annotations["k8s.v1.cni.cncf.io/network-status"] 52 | ) 53 | except KeyError: 54 | log.info( 55 | f'Key k8s.v1.cni.cncf.io/network-status not found on pod "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}"') 56 | if len(entrylist) > 0: 57 | interfaces = [ 58 | iface 59 | for iface in netifaces.interfaces() 60 | if "net" in iface and iface not in ("lo", "eth0", "tunl0") 61 | ] 62 | else: 63 | interfaces = [ 64 | iface 65 | for iface in netifaces.interfaces() 66 | if iface not in ("lo", "tunl0") 67 | ] 68 | 69 | 70 | if not interfaces: 71 | log.error( 72 | f'Secondary nics not found for "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}".' 73 | ) 74 | sys.exit(1) 75 | 76 | for iface in interfaces: 77 | for i in range(num_server): 78 | try: 79 | address = netifaces.ifaddresses(iface) 80 | ip = address[netifaces.AF_INET][0]["addr"] 81 | command = ["iperf3", "-s", "-B", ip, "-p", str(port + i), "-D"] 82 | log.info( 83 | f"Starting iperf3 server {ip}:{port + i} using {iface} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}..." 84 | ) 85 | subprocess.run(command, text=True, capture_output=True, check=True) 86 | except subprocess.CalledProcessError as e: 87 | log.error( 88 | f"Server failed to start on {ip}:{port + i} using {iface} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}.\n " 89 | f"Exited with error: {e.stderr}" 90 | ) 91 | sys.exit(1) 92 | except KeyError: 93 | log.error( 94 | f"No AF_INET (IPv4) address found for interface {iface} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}." 
--------------------------------------------------------------------------------
/autopilot-daemon/network/iperf3_stop_servers.py:
--------------------------------------------------------------------------------
1 | from iperf3_utils import *
2 | 
3 | 
4 | def kill_all_iperf_servers():
5 |     try:
6 |         result = subprocess.run(
7 |             ["ps", "aux"], text=True, capture_output=True, check=True
8 |         )
9 |     except subprocess.CalledProcessError as e:
10 |         log.error(f"Error occurred while listing processes: {e}")
11 |         sys.exit(1)
12 | 
13 |     processes = result.stdout.splitlines()
14 | 
15 |     for process in processes:
16 |         try:
17 |             # Match "iperf3" and "-s" separately; a combined "iperf3 -s" match breaks if "-s" sits elsewhere in the command line.
18 |             if "iperf3" in process and "-s" in process:
19 |                 parts = process.split()
20 |                 if len(parts) > 1:
21 |                     pid = int(parts[1])
22 |                     if pid > 1:
23 |                         try:
24 |                             os.kill(pid, signal.SIGTERM)
25 |                         except PermissionError:
26 |                             log.error(
27 |                                 f"Permission denied: Could not kill process with PID {pid} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}."
28 |                             )
29 |                             sys.exit(1)
30 |                         except ProcessLookupError:
31 |                             log.error(
32 |                                 f"Process with PID {pid} does not exist in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}."
33 |                             )
34 |                             sys.exit(1)
35 |                         except Exception as e:
36 |                             log.error(
37 |                                 f"Failed to kill process with PID {pid}: {e} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
38 |                             )
39 |                             sys.exit(1)
40 |                 else:
41 |                     log.error(
42 |                         f"Unexpected format in process line: {process} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
43 |                     )
44 |                     sys.exit(1)
45 |         except ValueError:
46 |             log.error(
47 |                 f"Could not convert PID to an integer: {process} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
48 |             )
49 |             sys.exit(1)
50 |         except Exception as e:
51 |             log.error(
52 |                 f"An unexpected error occurred: {e} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
53 |             )
54 |             sys.exit(1)
55 |     log.info("All iperf servers have been removed (not deleting default iperf server)")
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     kill_all_iperf_servers()
60 | 
--------------------------------------------------------------------------------
/autopilot-daemon/network/iperf3_utils.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from decimal import Decimal
3 | import argparse
4 | import asyncio
5 | import logging
6 | import aiohttp
7 | import os
8 | import json
9 | import requests
10 | import netifaces
11 | import subprocess
12 | import sys
13 | import signal
14 | 
15 | from kubernetes import client, config
16 | from kubernetes.client.rest import ApiException
17 | 
18 | log = logging.getLogger(__name__)
19 | logging.basicConfig(
20 |     format="[NETWORK] - [IPERF] - [%(levelname)s] : %(message)s",
21 |     level=logging.INFO,
22 | )
23 | 
24 | 
25 | #
26 | # TODO: Add this to network_workload.py
27 | #
28 | class SupportedWorkload(Enum):
29 |     RING = "RING"
30 | 
31 | 
32 | CURR_POD_NAME = os.getenv("POD_NAME")
33 | CURR_WORKER_NODE_NAME = os.getenv("NODE_NAME")
34 | AUTOPILOT_NAMESPACE = os.getenv("NAMESPACE")
35 | AUTOPILOT_PORT = os.getenv("AUTOPILOT_HEALTHCHECKS_SERVICE_PORT")
--------------------------------------------------------------------------------
/autopilot-daemon/network/network_workload.py:
--------------------------------------------------------------------------------
1 | from iperf3_utils import *
2 | 
3 | 
4 | #
5 | # TODO: Make this an abstract class...
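# A minimal sketch of that refactor, for illustration only (the Workload base
# class and method names below are assumptions, not part of this code):
#
#     from abc import ABC, abstractmethod
#
#     class Workload(ABC):
#         @abstractmethod
#         def gen_autopilot_node_map_json(self):
#             """Map worker nodes to their Autopilot pod, endpoint, and NICs."""
#
#         @abstractmethod
#         def generate_topology_json(self, worker_nodes_map):
#             """Produce the per-step pairing schedule for this topology."""
#
#     class RingWorkload(Workload):
#         def generate_topology_json(self, worker_nodes_map):
#             ...  # the ring pairing logic below would move here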
6 | #
7 | #
8 | 
9 | 
10 | class NetworkWorkload:
11 |     def __init__(self, namespace=None, workload_name="Ring Topology"):
12 |         self.namespace = namespace or os.getenv("NAMESPACE")
13 |         self.workload = workload_name
14 |         self.log = logging.getLogger(__name__)
15 |         logging.basicConfig(
16 |             format="[NETWORK] - [WORKLOAD-GEN] - [%(levelname)s] : %(message)s",
17 |             level=logging.INFO,
18 |         )
19 | 
20 |         try:
21 |             config.load_incluster_config()
22 |             self.v1 = client.CoreV1Api()
23 |         except Exception:
24 |             self.log.error("Failed to load Kubernetes CoreV1API.")
25 |             exit(1)
26 | 
27 |     def get_all_ifaces(self):
28 |         address_map = {}
29 | 
30 |         try:
31 |             autopilot_pods = self.v1.list_namespaced_pod(
32 |                 namespace=self.namespace, label_selector="app=autopilot"
33 |             )
34 |         except ApiException as e:
35 |             self.log.error(
36 |                 "Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e
37 |             )
38 |             exit(1)
39 |         entrylist = json.loads('{}')
40 |         for pod in autopilot_pods.items:
41 |             try:
42 |                 entrylist = json.loads(
43 |                     pod.metadata.annotations["k8s.v1.cni.cncf.io/network-status"]
44 |                 )
45 |             except KeyError:
46 |                 self.log.info(
47 |                     f'Key k8s.v1.cni.cncf.io/network-status not found on pod "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}"')
48 |             if len(entrylist) > 0:
49 |                 for entry in entrylist:
50 |                     try:
51 |                         iface = entry["interface"]
52 |                     except KeyError:
53 |                         self.log.info("Interface key name not found, assigning 'k8s-pod-network'.")
54 |                         iface = "k8s-pod-network"
55 |                     if address_map.get(iface) is None:
56 |                         address_map[iface] = []
57 |                     address_map.get(iface).append((pod.spec.node_name, entry["ips"]))
58 |             else:
59 |                 pod_ips = pod.status.pod_i_ps
60 |                 if pod_ips is not None:
61 |                     iface = "default"
62 |                     if address_map.get(iface) is None:
63 |                         address_map[iface] = []
64 |                     ips = []
65 |                     for pod_ip in pod_ips:
66 |                         ips.append(pod_ip.ip)
67 |                     address_map.get(iface).append((pod.spec.node_name, ips))
68 | 
69 |         if len(address_map) == 0:
70 |             self.log.error("No interfaces found. FAIL.")
71 |         return address_map
72 | 
73 |     def gen_autopilot_node_map_json(self):
74 |         #
75 |         # TODO: This queries all endpoints, but what happens if a failing
76 |         # worker doesn't have any Autopilot pods?
77 |         #
78 |         # We silently skip it, which is bad: the user won't know.
79 |         #
80 |         # Proposal: at least warn the user that NOT ALL worker nodes will be tested.
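        # One possible shape for that warning, sketched under the assumption
        # that listing nodes is acceptable here (illustrative only, not wired in):
        #
        #     all_workers = {n.metadata.name for n in self.v1.list_node().items}
        #     untested = all_workers - set(autopilot_node_map.keys())
        #     for name in sorted(untested):
        #         self.log.warning(f"Node {name} has no Autopilot endpoint and will NOT be tested.")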
81 | # 82 | try: 83 | endpoints = self.v1.list_namespaced_endpoints( 84 | self.namespace, 85 | field_selector="metadata.name=autopilot-healthchecks", 86 | ) 87 | except ApiException as e: 88 | self.log.error( 89 | "Exception when calling Kubernetes CoreV1Api->list_namespaced_endpoints: %s\n" 90 | % e 91 | ) 92 | exit(1) 93 | 94 | autopilot_node_map = {} 95 | for endpointslice in endpoints.items: 96 | addresses = endpointslice.subsets[0].addresses 97 | for item in addresses: 98 | node_name = item.node_name 99 | if node_name not in autopilot_node_map: 100 | pod_name = item.target_ref.name 101 | ip_address = item.ip 102 | autopilot_node_map[node_name] = { 103 | "pod": pod_name, 104 | "endpoint": ip_address, 105 | } 106 | 107 | addresses = self.get_all_ifaces() 108 | for add in addresses: 109 | if add != "eth0": 110 | for entry in addresses.get(add): 111 | worker_node_name = entry[0] 112 | net_interfaces = entry[1] 113 | if worker_node_name in autopilot_node_map: 114 | autopilot_node_map[worker_node_name][ 115 | "netifaces" 116 | ] = net_interfaces 117 | 118 | return autopilot_node_map 119 | 120 | def generate_ring_topology_json(self, worker_nodes_map): 121 | pair_links = {} 122 | node_count = len(worker_nodes_map) 123 | if node_count > 1: 124 | worker_nodes = list(worker_nodes_map.keys()) 125 | for t in range(1, node_count): 126 | step_pairs = [] 127 | for i in range(node_count): 128 | source = worker_nodes[i] 129 | target = worker_nodes[(i + t) % node_count] 130 | step_pairs.append({source: target}) 131 | pair_links[t] = step_pairs 132 | return pair_links 133 | 134 | def print_autopilot_node_map_json(self, worker_node_map): 135 | self.log.info(f"\n{json.dumps(worker_node_map, indent=4)}") 136 | 137 | def print_ring_topology_json(self, ring_workload): 138 | output = "" 139 | for step in ring_workload: 140 | output += f"Time Step {step}:\n" 141 | for pair in ring_workload[step]: 142 | for source, target in pair.items(): 143 | output += f" {source} -> {target}\n" 144 | self.log.info(f"\n{output}") 145 | 146 | def print_ring_workload(self): 147 | autopilot_node_map_json = self.gen_autopilot_node_map_json() 148 | ring_workload_pairs_json = self.generate_ring_topology_json( 149 | autopilot_node_map_json 150 | ) 151 | output = "" 152 | for step in ring_workload_pairs_json: 153 | output += f"Time Step {step}\n" 154 | for pair in ring_workload_pairs_json[step]: 155 | for source, dest in pair.items(): 156 | output += ( 157 | f" Pod-to-Pod: {autopilot_node_map_json[source]['pod']} " 158 | f"-> {autopilot_node_map_json[dest]['pod']}\n" 159 | f" Endpoint-to-Endpoint: {autopilot_node_map_json[source]['endpoint']} -> " 160 | f"{autopilot_node_map_json[dest]['endpoint']}\n" 161 | ) 162 | output += f"\n" 163 | self.log.info(f"\n{output}") 164 | -------------------------------------------------------------------------------- /autopilot-daemon/network/ping-entrypoint.py: -------------------------------------------------------------------------------- 1 | from kubernetes import client, config 2 | from kubernetes.client.rest import ApiException 3 | import os 4 | import json 5 | import argparse 6 | import asyncio 7 | import subprocess 8 | import time 9 | import netifaces 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--job', type=str, default='None', help='Workload node discovery w/ given namespace and label. Ex: \"--job=namespace:label-key=label-value\". Default is set to None.') 13 | parser.add_argument('--nodelabel', type=str, default='None', help='Node label to select nodes. 
Ex: \"label-key=label-value\". Default is set to None.') 14 | parser.add_argument('--nodes', type=str, default='all', help='Node(s) running autopilot that will be reached out by ping. Can be a comma separated list. Default is \"all\". Servers are reached out sequentially') 15 | args = vars(parser.parse_args()) 16 | 17 | job = args['job'] 18 | nodemap = {} 19 | namespace_self = os.getenv("NAMESPACE") 20 | nodename_self = os.getenv("NODE_NAME") 21 | config.load_incluster_config() 22 | kubeapi = client.CoreV1Api() 23 | 24 | async def main(): 25 | nodelist = args['nodes'].replace(' ', '').split(',') # list of nodes 26 | job = args['job'] 27 | nodelabel = args['nodelabel'] 28 | nodemap = {} 29 | allnodes = False 30 | check_local_ifaces() 31 | if 'all' in nodelist and job == 'None' and nodelabel == 'None': 32 | allnodes = True 33 | else: 34 | nodemap = get_job_nodes(nodelist) 35 | 36 | nodes={} 37 | ifaces=set() 38 | print("[PING] Pod running ping: ", os.getenv("POD_NAME")) 39 | print("[PING] Starting: collecting node list") 40 | try: 41 | retries = 0 42 | daemonset_size = expectedPods() 43 | autopilot_pods = kubeapi.list_namespaced_pod(namespace=namespace_self, label_selector="app=autopilot") 44 | while len(autopilot_pods.items) < daemonset_size or retries > 100: 45 | print("[PING] Waiting for all Autopilot pods to run") 46 | time.sleep(5) 47 | autopilot_pods = kubeapi.list_namespaced_pod(namespace=namespace_self, label_selector="app=autopilot") 48 | retries +=1 49 | if retries > 100 and len(autopilot_pods.items) < daemonset_size: 50 | print("[PING] Reached max retries of 100. ABORT") 51 | exit() 52 | 53 | except ApiException as e: 54 | print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e) 55 | exit() 56 | 57 | # run through all pods and create a map of all interfaces 58 | print("Creating a list of interfaces and IPs") 59 | entrylist = json.loads('{}') 60 | for pod in autopilot_pods.items: 61 | if pod.spec.node_name != nodename_self and (allnodes or (pod.spec.node_name in nodemap.keys())): 62 | try: 63 | entrylist = json.loads(pod.metadata.annotations['k8s.v1.cni.cncf.io/network-status']) 64 | except KeyError: 65 | print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod.metadata.name, "- node", pod.spec.node_name) 66 | if len(entrylist) > 0 : 67 | node={} 68 | nodes[pod.spec.node_name] = node 69 | for entry in entrylist: 70 | try: 71 | iface=entry['interface'] 72 | except KeyError: 73 | print("Interface key name not found, assigning 'k8s-pod-network'.") 74 | iface = "k8s-pod-network" 75 | ifaces = ifaces | {iface} 76 | node[iface] = { 77 | 'ips': entry['ips'], 78 | 'pod': pod.metadata.name 79 | } 80 | else: 81 | node={} 82 | nodes[pod.spec.node_name] = node 83 | pod_ips = pod.status.pod_i_ps 84 | if pod_ips != None: 85 | iface = "default" 86 | ifaces = ifaces | {iface} 87 | iplist = [] 88 | for pod_ip in pod_ips: 89 | iplist.append(pod_ip.ip) 90 | node[iface] = { 91 | 'ips': iplist, 92 | 'pod': pod.metadata.name 93 | } 94 | 95 | 96 | 97 | if len(nodes.keys()) == 0: 98 | print("[PING] No nodes found. 
ABORT") 99 | exit(0) 100 | # run ping tests to each pod on each interface 101 | print("[PING] Running ping tests for every interface") 102 | conn_dict = dict() 103 | clients = [] 104 | for nodename in nodes.keys(): 105 | conn_dict[nodename] = {} 106 | for iface in ifaces: 107 | try: 108 | ips = nodes[nodename][iface]['ips'] 109 | except KeyError: 110 | print("Interface", iface, "not found, skipping.") 111 | continue 112 | for index, ip in enumerate(ips): 113 | command = ['ping',ip,'-t','45','-c','10'] 114 | indexed_iface = iface+("-"+str(index) if len(ips)>1 else "") 115 | clients.append((subprocess.Popen(command, start_new_session=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE), nodename, ip, indexed_iface)) 116 | for c in clients: 117 | try: 118 | c[0].wait(50) 119 | except: 120 | print("Timeout while waiting for", c[2], "on node", c[1]) 121 | continue 122 | fail = False 123 | for c in clients: 124 | stdout, stderr = c[0].communicate() 125 | if stderr: 126 | print("[PING] output parse exited with error: " + stderr) 127 | fail = True 128 | else: 129 | if "Unreachable" in stdout or "100% packet loss" in stdout: 130 | print("Node", c[1], c[2], c[3], "1") 131 | fail = True 132 | else: 133 | print("Node", c[1], c[2], c[3], "0") 134 | if fail: 135 | print("[PING] At least one node unreachable. FAIL") 136 | else: 137 | print("[PING] all nodes reachable. success") 138 | 139 | def check_local_ifaces(): 140 | podname = os.getenv("POD_NAME") 141 | pod_list = kubeapi.list_namespaced_pod(namespace=namespace_self, field_selector="metadata.name="+podname) 142 | ips = [] 143 | iface_count = 0 144 | pod_self = pod_list.items[0] 145 | entrylist = json.loads('{}') 146 | ip_addresses = [netifaces.ifaddresses(iface)[netifaces.AF_INET][0]['addr'] for iface in netifaces.interfaces() if netifaces.AF_INET in netifaces.ifaddresses(iface)] 147 | try: 148 | entrylist = json.loads(pod_self.metadata.annotations['k8s.v1.cni.cncf.io/network-status']) 149 | except KeyError: 150 | print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod_self.metadata.name, "- node", pod_self.spec.node_name) 151 | if len(entrylist) > 0: 152 | for entry in entrylist: 153 | try: 154 | iface=entry['interface'] 155 | except KeyError: 156 | continue 157 | for ip in entry['ips']: 158 | if ip not in ip_addresses: 159 | print("[PING] IFACES count inconsistent. Pod annotation reports", entry['ips'], ", not found in the pod among", ip_addresses, "ABORT") 160 | exit() 161 | ips.append(entry['ips']) 162 | iface_count += len(entry['ips']) 163 | else: 164 | pod_ips = pod_self.status.pod_i_ps 165 | if pod_ips != None: 166 | for pod_ip in pod_ips: 167 | if pod_ip.ip not in ip_addresses: 168 | print("[PING] IFACES count inconsistent. 
Pod annotation reports", pod_ip.ip, ", not found in the pod among", ip_addresses, "ABORT") 169 | exit() 170 | ips.append(pod_ip.ip) 171 | iface_count += len(pod_ips) 172 | 173 | 174 | 175 | def get_job_nodes(nodelist): 176 | v1 = client.CoreV1Api() 177 | # get nodes from job is specified 178 | nodemap = {} 179 | node_name_self = os.getenv("NODE_NAME") 180 | job = args['job'] 181 | if job != 'None': 182 | job = args['job'].split(':') 183 | job_ns = job[0] # ex: "default" 184 | job_label = job[1] # ex: "job-name=my-job" or "app=my-app"] 185 | try: 186 | job_pods = v1.list_namespaced_pod(namespace=job_ns, label_selector=job_label) 187 | except ApiException as e: 188 | print("[PING] Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e) 189 | 190 | print('[PING] Workload:', ': '.join(job)) 191 | for pod in job_pods.items: 192 | if pod.spec.node_name != node_name_self: 193 | nodemap[pod.spec.node_name] = True 194 | 195 | nodelabel = args['nodelabel'] 196 | if nodelabel != 'None': 197 | try: 198 | labeled_nodes = v1.list_node(label_selector=nodelabel) 199 | except ApiException as e: 200 | print("Exception when calling CoreV1Api->list_node: %s\n" % e) 201 | exit() 202 | if len(labeled_nodes.items) == 0: 203 | print ("No node is labeled with", nodelabel, " - ABORT.") 204 | exit() 205 | for labeled_node in labeled_nodes.items: 206 | if labeled_node.metadata.name != node_name_self: 207 | nodemap[labeled_node.metadata.name] = True 208 | # get nodes from input list, if any 209 | if 'all' not in nodelist: 210 | for i in nodelist: 211 | if i != node_name_self: 212 | nodemap[i] = True 213 | return nodemap 214 | 215 | 216 | def expectedPods(): 217 | v1 = client.AppsV1Api() 218 | try: 219 | autopilot = v1.list_namespaced_daemon_set(namespace=namespace_self, label_selector="app=autopilot") 220 | except ApiException as e: 221 | print("[PING] Exception when calling fetching Autopilot by corev1api->list_namespaced_daemon_set", e) 222 | return 0 223 | return autopilot.items[0].status.desired_number_scheduled 224 | 225 | if __name__ == '__main__': 226 | asyncio.run(main()) -------------------------------------------------------------------------------- /autopilot-daemon/pkg/cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "flag" 6 | "fmt" 7 | "net/http" 8 | "os" 9 | "time" 10 | 11 | "github.com/IBM/autopilot/pkg/handler" 12 | "github.com/IBM/autopilot/pkg/healthcheck" 13 | "github.com/IBM/autopilot/pkg/utils" 14 | "github.com/prometheus/client_golang/prometheus" 15 | "github.com/prometheus/client_golang/prometheus/promhttp" 16 | "k8s.io/klog/v2" 17 | ) 18 | 19 | func main() { 20 | port := flag.String("port", "3333", "Port for the webhook to listen to. Defaulted to 3333") 21 | bwThreshold := flag.Int("bw", 4, "Sets bandwidth threshold for the init container") 22 | logFile := flag.String("logfile", "", "File where to save all the events") 23 | v := flag.String("loglevel", "2", "Log level") 24 | repeat := flag.Int("w", 24, "Run all tests periodically on each node. Time set in hours. Defaults to 24h") 25 | invasive := flag.Int("invasive-check-timer", 4, "Run invasive checks (e.g., dcgmi level 3) on each node when GPUs are free. Time set in hours. Defaults to 4h. 
Set to 0 to avoid invasive checks") 26 | 27 | flag.Parse() 28 | 29 | klog.InitFlags(nil) 30 | flag.Set("alsologtostderr", "true") 31 | if *logFile != "" { 32 | flag.Set("log_file", *logFile) 33 | } 34 | flag.Set("v", *v) 35 | flag.Set("logtostderr", "false") 36 | klog.OsExit = func(exitCode int) { 37 | fmt.Printf("os.Exit(%d)\n", exitCode) 38 | } 39 | 40 | utils.UserConfig = utils.InitConfig{ 41 | BWThreshold: *bwThreshold, 42 | } 43 | 44 | reg := prometheus.NewRegistry() 45 | utils.InitMetrics(reg) 46 | 47 | utils.InitHardwareMetrics() 48 | 49 | // Init the node status map 50 | healthcheck.InitNodeStatusMap() 51 | 52 | pMux := http.NewServeMux() 53 | promHandler := promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) 54 | pMux.Handle("/metrics", promHandler) 55 | 56 | go func() { 57 | klog.Info("Serving metrics on :8081") 58 | err := http.ListenAndServe(":8081", pMux) 59 | if err != nil { 60 | klog.Error(err.Error()) 61 | os.Exit(1) 62 | } 63 | }() 64 | 65 | readinessMux := http.NewServeMux() 66 | readinessMux.Handle("/readinessprobe", handler.ReadinessProbeHandler()) 67 | 68 | go func() { 69 | klog.Info("Serving Readiness Probe on :8080") 70 | err := http.ListenAndServe(":8080", readinessMux) 71 | if err != nil { 72 | klog.Error(err.Error()) 73 | os.Exit(1) 74 | } 75 | }() 76 | 77 | hcMux := http.NewServeMux() 78 | 79 | hcMux.Handle("/dcgm", handler.DCGMHandler()) 80 | hcMux.Handle("/gpumem", handler.GpuMemHandler()) 81 | hcMux.Handle("/gpupower", handler.GpuPowerHandler()) 82 | hcMux.Handle("/iperf", handler.IperfHandler()) 83 | hcMux.Handle("/iperfservers", handler.StartIperfServersHandler()) 84 | hcMux.Handle("/iperfstopservers", handler.StopAllIperfServersHandler()) 85 | hcMux.Handle("/iperfclients", handler.StartIperfClientsHandler()) 86 | hcMux.Handle("/invasive", handler.InvasiveCheckHandler()) 87 | hcMux.Handle("/pciebw", handler.PCIeBWHandler()) 88 | hcMux.Handle("/ping", handler.PingHandler()) 89 | hcMux.Handle("/pvc", handler.PVCHandler()) 90 | hcMux.Handle("/remapped", handler.RemappedRowsHandler()) 91 | hcMux.Handle("/status", handler.SystemStatusHandler()) 92 | 93 | s := &http.Server{ 94 | Addr: ":" + *port, 95 | Handler: hcMux, 96 | ReadTimeout: 30 * time.Minute, 97 | WriteTimeout: 30 * time.Minute, 98 | IdleTimeout: 30 * time.Minute, 99 | } 100 | 101 | go func() { 102 | klog.Info("Serving Health Checks on port :", *port) 103 | err := s.ListenAndServe() 104 | if errors.Is(err, http.ErrServerClosed) { 105 | klog.Info("Server Closed") 106 | } else if errors.Is(err, http.ErrAbortHandler) { 107 | klog.Info("Server Aborted") 108 | } else if errors.Is(err, http.ErrContentLength) { 109 | klog.Info("Response size too large") 110 | } else if errors.Is(err, http.ErrBodyReadAfterClose) { 111 | klog.Info("Read after close") 112 | } else if errors.Is(err, http.ErrHandlerTimeout) { 113 | klog.Info("Handler timed out") 114 | } 115 | if err != nil { 116 | klog.Info("EXITING") 117 | klog.Error(err.Error()) 118 | os.Exit(1) 119 | } 120 | }() 121 | 122 | // Create a Watcher over nodes. 
Needed to export metrics from data created by external jobs (i.e., dcgm Jobs) 123 | go utils.WatchNode() 124 | 125 | // Run the health checks at startup, then start the timer 126 | healthcheck.PeriodicCheck() 127 | 128 | periodicChecksTicker := time.NewTicker(time.Duration(*repeat) * time.Hour) 129 | defer periodicChecksTicker.Stop() 130 | invasiveChecksTicker := time.NewTicker(time.Duration(*invasive) * time.Hour) 131 | defer invasiveChecksTicker.Stop() 132 | for { 133 | select { 134 | case <-periodicChecksTicker.C: 135 | healthcheck.PeriodicCheck() 136 | case <-invasiveChecksTicker.C: 137 | if *invasive > 0 { 138 | healthcheck.InvasiveCheck() 139 | } 140 | } 141 | } 142 | 143 | // cert := "/etc/admission-webhook/tls/tls.crt" 144 | // key := "/etc/admission-webhook/tls/tls.key" 145 | 146 | // err := http.ListenAndServeTLS(":"+*port, cert, key, mux) 147 | // if errors.Is(err, http.ErrServerClosed) { 148 | // klog.Error("Server closed") 149 | // } else if err != nil { 150 | // klog.Error("error starting server: %s\n", err) 151 | // os.Exit(1) 152 | // } 153 | } 154 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/handler/handler.go: -------------------------------------------------------------------------------- 1 | package handler 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | "strconv" 7 | "strings" 8 | 9 | "github.com/IBM/autopilot/pkg/healthcheck" 10 | "github.com/IBM/autopilot/pkg/utils" 11 | "k8s.io/klog/v2" 12 | ) 13 | 14 | func SystemStatusHandler() http.Handler { 15 | fn := func(w http.ResponseWriter, r *http.Request) { 16 | nodelabel := r.URL.Query().Get("nodelabel") 17 | if nodelabel == "" { 18 | nodelabel = "None" 19 | } 20 | hosts := r.URL.Query().Get("host") 21 | if hosts == "" { 22 | hosts = "all" 23 | } 24 | checks := r.URL.Query().Get("check") 25 | if checks == "" { 26 | checks = "all" 27 | } 28 | batch := r.URL.Query().Get("batch") 29 | if batch == "" { 30 | batch = "0" 31 | } 32 | jobName := r.URL.Query().Get("job") 33 | if jobName == "" { 34 | jobName = "None" 35 | } 36 | dcgmR := r.URL.Query().Get("r") 37 | if dcgmR == "" { 38 | dcgmR = "1" 39 | } 40 | if strings.Contains(checks, string(healthcheck.Iperf)) { 41 | klog.Info("Running iperf3 on hosts ", hosts, " or job ", jobName) 42 | w.Write([]byte("Running iperf3 on hosts " + hosts + " or job " + jobName + "\n\n")) 43 | checks = strings.Trim(checks, "iperf") 44 | workload := r.URL.Query().Get("workload") 45 | if workload == "" { 46 | workload = "ring" 47 | } 48 | pclients := r.URL.Query().Get("pclients") 49 | if pclients == "" { 50 | pclients = "8" 51 | } 52 | startport := r.URL.Query().Get("startport") 53 | if startport == "" { 54 | startport = "5200" 55 | } 56 | cleanup := "" 57 | if r.URL.Query().Has("cleanup") { 58 | cleanup = "--cleanup" 59 | } 60 | out, err := healthcheck.RunIperf(workload, pclients, startport, cleanup) 61 | if err != nil { 62 | klog.Error(err.Error()) 63 | } 64 | if out != nil { 65 | w.Write(*out) 66 | } 67 | } 68 | if checks != "" { 69 | if hosts == utils.NodeName { 70 | utils.HealthcheckLock.Lock() 71 | defer utils.HealthcheckLock.Unlock() 72 | out, err := healthcheck.RunHealthLocalNode(checks, dcgmR, jobName, nodelabel, r) 73 | if err != nil { 74 | klog.Error(err.Error()) 75 | } 76 | w.Write(*out) 77 | hasFailures := healthcheck.GetNodeStatus() 78 | klog.Info("Errors after running local, on demand health checks: ", hasFailures) 79 | if hasFailures { 80 | utils.PatchNode(utils.GPUHealthWarnLabel, utils.NodeName, false) 81 | } else 
{ 82 | utils.PatchNode(utils.GPUHealthPassLabel, utils.NodeName, false) 83 | } 84 | 85 | } else { 86 | klog.Info("Asking to run on remote node(s) ", hosts, " or with node label ", nodelabel) 87 | w.Write([]byte("Asking to run on remote node(s) " + hosts + " or with node label " + nodelabel + "\n\n")) 88 | out, err := healthcheck.RunHealthRemoteNodes(hosts, checks, batch, jobName, dcgmR, nodelabel) 89 | if err != nil { 90 | klog.Error(err.Error()) 91 | } 92 | w.Write(*out) 93 | } 94 | } 95 | 96 | } 97 | return http.HandlerFunc(fn) 98 | } 99 | 100 | func PCIeBWHandler() http.Handler { 101 | fn := func(w http.ResponseWriter, r *http.Request) { 102 | w.Write([]byte("Requesting pcie test with bw: " + strconv.Itoa(utils.UserConfig.BWThreshold) + "\n")) 103 | out, err := healthcheck.RunPCIeBW() 104 | if err != nil { 105 | klog.Error(err.Error()) 106 | } 107 | if out != nil { 108 | w.Write(*out) 109 | } 110 | 111 | } 112 | return http.HandlerFunc(fn) 113 | } 114 | 115 | func RemappedRowsHandler() http.Handler { 116 | fn := func(w http.ResponseWriter, r *http.Request) { 117 | w.Write([]byte("Requesting Remapped Rows check on all GPUs\n")) 118 | out, err := healthcheck.RunRemappedRows() 119 | if err != nil { 120 | klog.Error(err.Error()) 121 | } 122 | if out != nil { 123 | w.Write(*out) 124 | } 125 | 126 | } 127 | return http.HandlerFunc(fn) 128 | } 129 | 130 | func PingHandler() http.Handler { 131 | fn := func(w http.ResponseWriter, r *http.Request) { 132 | w.Write([]byte("Ping test")) 133 | hosts := r.URL.Query().Get("host") 134 | if hosts == "" { 135 | hosts = "all" 136 | } 137 | jobName := r.URL.Query().Get("job") 138 | if jobName == "" { 139 | jobName = "None" 140 | } 141 | nodelabel := r.URL.Query().Get("nodelabel") 142 | if nodelabel == "" { 143 | nodelabel = "None" 144 | } 145 | out, err := healthcheck.RunPing(hosts, jobName, nodelabel) 146 | if err != nil { 147 | klog.Error(err.Error()) 148 | } 149 | if out != nil { 150 | w.Write(*out) 151 | } 152 | } 153 | return http.HandlerFunc(fn) 154 | } 155 | 156 | func InvasiveCheckHandler() http.Handler { 157 | fn := func(w http.ResponseWriter, r *http.Request) { 158 | w.Write([]byte("Launching invasive health checks. 
Results will be added to 'autopilot.ibm.com/gpuhealth' and 'autopilot.ibm.com/dcgm.level.3' node labels\n"))
159 |         healthcheck.InvasiveCheck()
160 |     }
161 |     return http.HandlerFunc(fn)
162 | }
163 | 
164 | func IperfHandler() http.Handler {
165 |     fn := func(w http.ResponseWriter, r *http.Request) {
166 | 
167 |         workload := r.URL.Query().Get("workload")
168 |         if workload == "" {
169 |             workload = "ring"
170 |         }
171 |         pclients := r.URL.Query().Get("pclients")
172 |         if pclients == "" {
173 |             pclients = "8"
174 |         }
175 |         startport := r.URL.Query().Get("startport")
176 |         if startport == "" {
177 |             startport = "5200"
178 |         }
179 |         cleanup := ""
180 |         if r.URL.Query().Has("cleanup") {
181 |             cleanup = "--cleanup"
182 |         }
183 |         out, err := healthcheck.RunIperf(workload, pclients, startport, cleanup)
184 |         if err != nil {
185 |             klog.Error(err.Error())
186 |         }
187 |         if out != nil {
188 |             w.Write(*out)
189 |         }
190 |     }
191 |     return http.HandlerFunc(fn)
192 | }
193 | 
194 | func StartIperfServersHandler() http.Handler {
195 |     fn := func(w http.ResponseWriter, r *http.Request) {
196 |         numservers := r.URL.Query().Get("numservers")
197 |         if numservers == "" {
198 |             numservers = "8"
199 |         }
200 |         startport := r.URL.Query().Get("startport")
201 |         if startport == "" {
202 |             startport = "5200"
203 |         }
204 |         out, err := healthcheck.StartIperfServers(numservers, startport)
205 | 
206 |         if err != nil {
207 |             klog.Error(err.Error())
208 |         }
209 |         if out != nil {
210 |             w.Write(*out)
211 |         }
212 |     }
213 |     return http.HandlerFunc(fn)
214 | }
215 | 
216 | func StopAllIperfServersHandler() http.Handler {
217 |     fn := func(w http.ResponseWriter, r *http.Request) {
218 |         out, err := healthcheck.StopAllIperfServers()
219 |         if err != nil {
220 |             klog.Error(err.Error())
221 |         }
222 |         if out != nil {
223 |             w.Write(*out)
224 |         }
225 |     }
226 |     return http.HandlerFunc(fn)
227 | }
228 | 
229 | func StartIperfClientsHandler() http.Handler {
230 |     fn := func(w http.ResponseWriter, r *http.Request) {
231 |         dstip := r.URL.Query().Get("dstip")
232 |         dstport := r.URL.Query().Get("dstport")
233 |         numclients := r.URL.Query().Get("numclients")
234 |         out, err := healthcheck.StartIperfClients(dstip, dstport, numclients)
235 |         if err != nil {
236 |             klog.Error(err.Error())
237 |         }
238 |         if out != nil {
239 |             w.Write(*out)
240 |         }
241 |     }
242 |     return http.HandlerFunc(fn)
243 | }
244 | 
245 | func DCGMHandler() http.Handler {
246 |     fn := func(w http.ResponseWriter, r *http.Request) {
247 |         w.Write([]byte("DCGM test"))
248 |         dcgmR := r.URL.Query().Get("r")
249 |         if dcgmR == "" {
250 |             dcgmR = "1"
251 |         }
252 |         out, err := healthcheck.RunDCGM(dcgmR)
253 |         if err != nil {
254 |             klog.Error(err.Error())
255 |         }
256 |         if out != nil {
257 |             w.Write(*out)
258 |         }
259 |     }
260 |     return http.HandlerFunc(fn)
261 | }
262 | 
263 | func GpuPowerHandler() http.Handler {
264 |     fn := func(w http.ResponseWriter, r *http.Request) {
265 |         w.Write([]byte("GPU Power Measurement test"))
266 |         out, err := healthcheck.RunGPUPower()
267 |         if err != nil {
268 |             klog.Error(err.Error())
269 |         }
270 |         if out != nil {
271 |             w.Write(*out)
272 |         }
273 |     }
274 |     return http.HandlerFunc(fn)
275 | }
276 | 
277 | func GpuMemHandler() http.Handler {
278 |     fn := func(w http.ResponseWriter, r *http.Request) {
279 |         w.Write([]byte("GPU Memory DGEMM+DAXPY test"))
280 |         out, err := healthcheck.RunGPUMem()
281 |         if err != nil {
282 |             klog.Error(err.Error())
283 |         }
284 |         if out != nil {
285 |             w.Write(*out)
286 |         }
287 |     }
288 |     return http.HandlerFunc(fn)
289 | }
290 | 
291 | func PVCHandler() 
http.Handler { 292 | fn := func(w http.ResponseWriter, r *http.Request) { 293 | w.Write([]byte("PVC create-delete test\n")) 294 | out, err := healthcheck.RunCreateDeletePVC() 295 | if err != nil { 296 | klog.Error(err.Error()) 297 | } 298 | if out != nil { 299 | w.Write(*out) 300 | } 301 | } 302 | return http.HandlerFunc(fn) 303 | } 304 | 305 | func ReadinessProbeHandler() http.Handler { 306 | fn := func(w http.ResponseWriter, r *http.Request) { 307 | data := HealthResult{"readinessProbe", "ready"} 308 | w.Header().Set("Content-Type", "application/json") 309 | w.WriteHeader(http.StatusCreated) 310 | json.NewEncoder(w).Encode(data) 311 | } 312 | return http.HandlerFunc(fn) 313 | } 314 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/handler/messagestruct.go: -------------------------------------------------------------------------------- 1 | package handler 2 | 3 | type HealthResult struct { 4 | Name string 5 | Body string 6 | } 7 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/healthcheck/functions.go: -------------------------------------------------------------------------------- 1 | package healthcheck 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "time" 7 | 8 | "github.com/IBM/autopilot/pkg/utils" 9 | corev1 "k8s.io/api/core/v1" 10 | "k8s.io/apimachinery/pkg/api/resource" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | "k8s.io/klog/v2" 13 | ) 14 | 15 | func ListPVC() (string, error) { 16 | pvc, err := utils.GetClientsetInstance().Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Get(context.Background(), utils.PodName, metav1.GetOptions{}) 17 | if err != nil { 18 | klog.Error("Error in creating the lister", err.Error()) 19 | return "ABORT", err 20 | } 21 | switch pvc.Status.Phase { 22 | case "Bound": 23 | { 24 | klog.Info("[PVC Create-Delete] PVC Bound: SUCCESS") 25 | klog.Info("Observation: ", utils.NodeName, " 0") 26 | utils.HchecksGauge.WithLabelValues("pvc", utils.NodeName, utils.CPUModel, utils.GPUModel, "").Set(0) 27 | } 28 | case "Pending": 29 | { 30 | waitonpvc := time.NewTicker(time.Minute) 31 | defer waitonpvc.Stop() 32 | <-waitonpvc.C 33 | pvc, err := utils.GetClientsetInstance().Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Get(context.Background(), utils.PodName, metav1.GetOptions{}) 34 | if err != nil { 35 | klog.Error("[PVC Create-Delete] Error in creating the lister: ", err.Error()) 36 | return "[PVC Create-Delete] PVC not found. ABORT ", err 37 | } 38 | phase := pvc.Status.Phase 39 | if pvc.Status.Phase == "Pending" { 40 | klog.Info("[PVC Create-Delete] Timer is up with PVC Pending. Force delete. FAIL") 41 | klog.Info("Observation: ", utils.NodeName, " 1") 42 | utils.HchecksGauge.WithLabelValues("pvc", utils.NodeName, utils.CPUModel, utils.GPUModel, "").Set(1) 43 | err := deletePVC(utils.PodName) 44 | if err != nil { 45 | return "[PVC Create-Delete] Error in deleting the PVC. ABORT ", err 46 | } 47 | HealthCheckStatus[PVC] = true 48 | return "[PVC Create-Delete] FAIL", nil 49 | } 50 | if phase == "Bound" { 51 | klog.Info("[PVC Create-Delete] PVC Bound: SUCCESS") 52 | klog.Info("Observation: ", utils.NodeName, " 0") 53 | utils.HchecksGauge.WithLabelValues("pvc", utils.NodeName, utils.CPUModel, utils.GPUModel, "").Set(0) 54 | } 55 | } 56 | } 57 | err = deletePVC(utils.PodName) 58 | if err != nil { 59 | return "Error in deleting the PVC. 
ABORT ", err 60 | } 61 | return "[PVC Create-Delete] PVC SUCCESS", nil 62 | } 63 | 64 | func deletePVC(pvc string) error { 65 | cset := utils.GetClientsetInstance() 66 | err := cset.Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Delete(context.TODO(), pvc, metav1.DeleteOptions{}) 67 | if err != nil { 68 | klog.Info("[PVC Delete] Failed. ABORT. ", err.Error()) 69 | } 70 | return err 71 | } 72 | 73 | func createPVC() error { 74 | cset := utils.GetClientsetInstance() 75 | storageclass := os.Getenv("PVC_TEST_STORAGE_CLASS") 76 | pvcTemplate := corev1.PersistentVolumeClaim{ 77 | ObjectMeta: metav1.ObjectMeta{ 78 | Name: utils.PodName, 79 | }, 80 | Spec: corev1.PersistentVolumeClaimSpec{ 81 | StorageClassName: &storageclass, 82 | AccessModes: []corev1.PersistentVolumeAccessMode{ 83 | corev1.ReadWriteMany, 84 | }, 85 | Resources: corev1.VolumeResourceRequirements{ 86 | Requests: corev1.ResourceList{ 87 | "storage": resource.MustParse("100Mi"), 88 | }, 89 | }, 90 | }, 91 | } 92 | // Check if any previous instance exists, cleanup if so 93 | pvc, _ := utils.GetClientsetInstance().Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Get(context.Background(), utils.PodName, metav1.GetOptions{}) 94 | 95 | if pvc.Name != "" { 96 | klog.Info("[PVC Create] Found pre-existing instance. Cleanup ", pvc.Name) 97 | deletePVC(utils.PodName) 98 | waitDelete := time.NewTimer(30 * time.Second) 99 | <-waitDelete.C 100 | } 101 | 102 | _, err := cset.Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Create(context.TODO(), &pvcTemplate, metav1.CreateOptions{}) 103 | 104 | if err != nil { 105 | klog.Info("[PVC Create] Failed. ABORT. ", err.Error()) 106 | } 107 | return err 108 | } 109 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/healthcheck/global.go: -------------------------------------------------------------------------------- 1 | package healthcheck 2 | 3 | import ( 4 | "os" 5 | "strings" 6 | 7 | "k8s.io/klog/v2" 8 | ) 9 | 10 | type HealthCheck string 11 | 12 | // Holding each test current status to facilitate node labeling 13 | var HealthCheckStatus map[HealthCheck]bool 14 | var defaultPeriodicChecks string = "pciebw,remapped,dcgm,ping,gpupower" 15 | 16 | const ( 17 | Undefined HealthCheck = "" 18 | DCGM HealthCheck = "dcgm" 19 | GPUMem HealthCheck = "gpumem" 20 | GPUPower HealthCheck = "gpupower" 21 | Iperf HealthCheck = "iperf" 22 | PCIeBW HealthCheck = "pciebw" 23 | Ping HealthCheck = "ping" 24 | PVC HealthCheck = "pvc" 25 | RowRemap HealthCheck = "remapped" 26 | ) 27 | 28 | func GetPeriodicChecks() string { 29 | checks, exists := os.LookupEnv("PERIODIC_CHECKS") 30 | if !exists { 31 | klog.Info("Run all periodic health checks\n") 32 | return defaultPeriodicChecks 33 | } 34 | return checks 35 | } 36 | 37 | func InitNodeStatusMap() { 38 | HealthCheckStatus = make(map[HealthCheck]bool) 39 | checklist := GetPeriodicChecks() 40 | for _, v := range strings.Split(checklist, ",") { 41 | klog.Info("Init entry map ", v) 42 | HealthCheckStatus[HealthCheck(v)] = false 43 | } 44 | } 45 | 46 | func GetNodeStatus() bool { 47 | hasFailures := false 48 | for v := range HealthCheckStatus { 49 | hasFailures = hasFailures || HealthCheckStatus[v] 50 | } 51 | return hasFailures 52 | } 53 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/functions.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "errors" 5 | "os" 6 | "strconv" 7 
| 8 | "context" 9 | 10 | "github.com/thanhpk/randstr" 11 | batchv1 "k8s.io/api/batch/v1" 12 | corev1 "k8s.io/api/core/v1" 13 | "k8s.io/apimachinery/pkg/api/resource" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 16 | "k8s.io/apimachinery/pkg/fields" 17 | "k8s.io/apimachinery/pkg/types" 18 | "k8s.io/client-go/kubernetes" 19 | "k8s.io/client-go/rest" 20 | "k8s.io/klog/v2" 21 | resourcehelper "k8s.io/kubectl/pkg/util/resource" 22 | ) 23 | 24 | func GetClientsetInstance() *K8sClientset { 25 | csetLock.Lock() 26 | if k8sClientset == nil { 27 | if k8sClientset == nil { 28 | k8sClientset = &K8sClientset{} 29 | config, err := rest.InClusterConfig() 30 | if err != nil { 31 | panic(err.Error()) 32 | } 33 | k8sClientset.Cset, err = kubernetes.NewForConfig(config) 34 | if err != nil { 35 | panic(err.Error()) 36 | } 37 | } 38 | 39 | } 40 | csetLock.Unlock() 41 | return k8sClientset 42 | } 43 | 44 | func GetNode(nodename string) (*corev1.Node, error) { 45 | cset := GetClientsetInstance() 46 | fieldselector, err := fields.ParseSelector("metadata.name=" + nodename) 47 | if err != nil { 48 | klog.Info("Error in creating the field selector ", err.Error()) 49 | return nil, err 50 | } 51 | instance, err := cset.Cset.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{FieldSelector: fieldselector.String()}) 52 | if err != nil { 53 | klog.Info("Error in creating the watcher ", err.Error()) 54 | return nil, err 55 | } 56 | return &instance.Items[0], nil 57 | } 58 | 59 | // Returns true if GPUs are not currently requested by any workload 60 | func GPUsAvailability() bool { 61 | node, _ := GetNode(NodeName) 62 | nodelabels := node.Labels 63 | if _, found := nodelabels["nvidia.com/gpu.present"]; !found { 64 | klog.Info("At least one GPU busy on node ", NodeName, ". Cannot run invasive health checks.") 65 | return false 66 | } 67 | // Once cleared, list pods using gpus and abort the check if gpus are in use 68 | fieldselector, err := fields.ParseSelector("spec.nodeName=" + NodeName + ",status.phase!=" + string(corev1.PodSucceeded)) 69 | if err != nil { 70 | klog.Info("Error in creating the field selector ", err.Error()) 71 | return false 72 | } 73 | cset := GetClientsetInstance() 74 | pods, err := cset.Cset.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ 75 | FieldSelector: fieldselector.String(), 76 | }) 77 | if err != nil { 78 | klog.Info("Cannot list pods:", err.Error()) 79 | return false 80 | } 81 | for _, pod := range pods.Items { 82 | podReqs, podLimits := resourcehelper.PodRequestsAndLimits(&pod) 83 | gpuReq := podReqs["nvidia.com/gpu"] 84 | gpuLim := podLimits["nvidia.com/gpu"] 85 | if gpuReq.Value() > 0 || gpuLim.Value() > 0 { 86 | klog.Info("Pod ", pod.Name, " with requests ", gpuReq.Value(), " and limits ", gpuLim.Value(), ". Cannot run invasive health checks.") 87 | return false 88 | } 89 | } 90 | klog.Info("GPUs are free. 
Will run invasive health checks.") 91 | return true 92 | } 93 | 94 | func CreateJob(healthcheck string) error { 95 | var args []string 96 | var cmd []string 97 | switch healthcheck { 98 | case "dcgm": 99 | cmd = []string{"python3"} 100 | args = []string{"gpu-dcgm/entrypoint.py", "-r", "3", "-l", "-v"} 101 | } 102 | cset := GetClientsetInstance() 103 | 104 | fieldselector, err := fields.ParseSelector("metadata.name=" + PodName) 105 | if err != nil { 106 | klog.Info("Error in creating the field selector", err.Error()) 107 | return err 108 | } 109 | pods, err := cset.Cset.CoreV1().Pods("autopilot").List(context.TODO(), metav1.ListOptions{ 110 | FieldSelector: fieldselector.String(), 111 | }) 112 | if err != nil { 113 | klog.Info("Cannot get pod:", err.Error()) 114 | return err 115 | } 116 | autopilotPod := pods.Items[0] 117 | // setting TTL to 30 sec, but looking for used defined value 118 | ttlsec := int32(30) 119 | if os.Getenv("INVASIVE_JOB_TTLSEC") != "" { 120 | val, _ := strconv.Atoi(os.Getenv("INVASIVE_JOB_TTLSEC")) 121 | ttlsec = int32(val) 122 | } 123 | 124 | backofflimits := int32(0) 125 | job := &batchv1.Job{ 126 | ObjectMeta: metav1.ObjectMeta{ 127 | Name: healthcheck + "-" + randstr.Hex(6), 128 | Namespace: autopilotPod.Namespace, 129 | }, 130 | Spec: batchv1.JobSpec{ 131 | TTLSecondsAfterFinished: &ttlsec, 132 | BackoffLimit: &backofflimits, 133 | Template: corev1.PodTemplateSpec{ 134 | Spec: corev1.PodSpec{ 135 | RestartPolicy: "Never", 136 | ServiceAccountName: "autopilot", 137 | NodeName: NodeName, 138 | InitContainers: []corev1.Container{ 139 | { 140 | Name: "init", 141 | Image: autopilotPod.Spec.InitContainers[0].DeepCopy().Image, 142 | ImagePullPolicy: "IfNotPresent", 143 | Command: autopilotPod.Spec.InitContainers[0].DeepCopy().Command, 144 | Args: autopilotPod.Spec.InitContainers[0].DeepCopy().Args, 145 | }, 146 | }, 147 | Containers: []corev1.Container{ 148 | { 149 | Name: "main", 150 | Image: autopilotPod.Spec.Containers[0].DeepCopy().Image, 151 | ImagePullPolicy: "IfNotPresent", 152 | Command: cmd, 153 | Args: args, 154 | Resources: corev1.ResourceRequirements{ 155 | Limits: corev1.ResourceList{ 156 | "nvidia.com/gpu": resource.MustParse("8"), 157 | }, 158 | Requests: corev1.ResourceList{ 159 | "nvidia.com/gpu": resource.MustParse("8"), 160 | }, 161 | }, 162 | Env: []corev1.EnvVar{ 163 | { 164 | Name: "NODE_NAME", 165 | Value: NodeName, 166 | }, 167 | }, 168 | }, 169 | }, 170 | }, 171 | }, 172 | }, 173 | } 174 | klog.Info("Try create Job") 175 | _, err = cset.Cset.BatchV1().Jobs(Namespace).Create(context.TODO(), job, 176 | metav1.CreateOptions{}) 177 | if err != nil { 178 | klog.Info("Couldn't create Job ", err.Error()) 179 | return err 180 | } 181 | klog.Info("Created") 182 | return nil 183 | } 184 | 185 | func PatchNode(label string, nodename string, force bool) error { 186 | cset := GetClientsetInstance() 187 | 188 | // Should not patch the gpuhealth label if it's currently in TESTING or EVICT 189 | node, err := cset.Cset.CoreV1().Nodes().Get(context.TODO(), nodename, v1.GetOptions{}) 190 | if err != nil { 191 | klog.Info("[Node Patch] Failed read node ", err.Error()) 192 | return err 193 | } 194 | labels := node.GetLabels() 195 | if current, found := labels["autopilot.ibm.com/gpuhealth"]; found { 196 | klog.Info("Node ", nodename, " label found ", current) 197 | if current == "TESTING" || current == "EVICT" { 198 | if !force { 199 | klog.Info("Cannot patch node's label, value found: ", current) 200 | return errors.New("Node status " + current) 201 | } else { 202 | 
klog.Info("Force patch for completed testing") 203 | } 204 | } 205 | } else { 206 | klog.Info("No label found, will go ahead patching the node") 207 | } 208 | _, err = cset.Cset.CoreV1().Nodes().Patch(context.TODO(), nodename, types.StrategicMergePatchType, []byte(label), v1.PatchOptions{}) 209 | if err != nil { 210 | klog.Info("[Node Patch] Failed. ", err.Error()) 211 | return err 212 | } 213 | klog.Info("Node patched with label ", label) 214 | return nil 215 | } 216 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/global.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os" 5 | "sync" 6 | 7 | "k8s.io/client-go/kubernetes" 8 | ) 9 | 10 | type InitConfig struct { 11 | BWThreshold int 12 | } 13 | 14 | var UserConfig InitConfig 15 | 16 | type K8sClientset struct { 17 | Cset *kubernetes.Clientset 18 | } 19 | 20 | var k8sClientset *K8sClientset 21 | var csetLock sync.Mutex 22 | 23 | var HealthcheckLock sync.Mutex 24 | 25 | var CPUModel string 26 | var GPUModel string 27 | 28 | var NodeName string = os.Getenv("NODE_NAME") 29 | var Namespace string = os.Getenv("NAMESPACE") 30 | var PodName string = os.Getenv("POD_NAME") 31 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/listwatch.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "strings" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "k8s.io/apimachinery/pkg/fields" 10 | "k8s.io/apimachinery/pkg/watch" 11 | "k8s.io/client-go/tools/cache" 12 | toolswatch "k8s.io/client-go/tools/watch" 13 | "k8s.io/klog/v2" 14 | ) 15 | 16 | func WatchNode() { 17 | watchFunc := func(options metav1.ListOptions) (watch.Interface, error) { 18 | timeout := int64(60) 19 | fieldselector, err := fields.ParseSelector("metadata.name=" + NodeName) 20 | if err != nil { 21 | klog.Info("Error in creating the field selector", err.Error()) 22 | return nil, err 23 | } 24 | instance, err := GetClientsetInstance().Cset.CoreV1().Nodes().Watch(context.Background(), metav1.ListOptions{TimeoutSeconds: &timeout, FieldSelector: fieldselector.String()}) 25 | if err != nil { 26 | klog.Info("Error in creating the watcher", err.Error()) 27 | return nil, err 28 | } 29 | return instance, err 30 | } 31 | 32 | watcher, _ := toolswatch.NewRetryWatcher("1", &cache.ListWatch{WatchFunc: watchFunc}) 33 | 34 | for event := range watcher.ResultChan() { 35 | item := event.Object.(*corev1.Node) 36 | 37 | switch event.Type { 38 | case watch.Modified: 39 | { 40 | key := "autopilot.ibm.com/dcgm.level.3" 41 | labels := item.GetLabels() 42 | if val, found := labels[key]; found { 43 | var res float64 44 | res = 0 45 | if strings.Contains(val, "EVICT") { 46 | res = 1 47 | klog.Info("[DCGM level 3] Update observation: ", NodeName, " Fatal error found") 48 | } 49 | HchecksGauge.WithLabelValues("dcgm", NodeName, CPUModel, GPUModel, "").Set(res) 50 | } 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/nodelabels.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // All GPU tests pass 4 | var GPUHealthPassLabel string = ` 5 | { 6 | "metadata": { 7 | "labels": { 8 | "autopilot.ibm.com/gpuhealth": "PASS" 9 | } 10 | } 11 | } 12 | ` 13 | 14 | // 
At least one GPU test fails. No info about the severity of the failure 15 | var GPUHealthWarnLabel string = ` 16 | { 17 | "metadata": { 18 | "labels": { 19 | "autopilot.ibm.com/gpuhealth": "WARN" 20 | } 21 | } 22 | } 23 | ` 24 | 25 | var GPUHealthEmptyLabel string = ` 26 | { 27 | "metadata": { 28 | "labels": { 29 | "autopilot.ibm.com/gpuhealth": "" 30 | } 31 | } 32 | } 33 | ` 34 | 35 | var GPUHealthTestingLabel string = ` 36 | { 37 | "metadata": { 38 | "labels": { 39 | "autopilot.ibm.com/gpuhealth": "TESTING" 40 | } 41 | } 42 | } 43 | ` 44 | 45 | // Some health check failed. Can be any health check 46 | var NodeHealthWarnLabel string = ` 47 | { 48 | "metadata": { 49 | "labels": { 50 | "autopilot.ibm.com/nodehealth": "WARN" 51 | } 52 | } 53 | } 54 | ` 55 | 56 | var NodeHealthEmptyLabel string = ` 57 | { 58 | "metadata": { 59 | "labels": { 60 | "autopilot.ibm.com/nodehealth": "" 61 | } 62 | } 63 | } 64 | ` 65 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/prometheus.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os/exec" 5 | "strings" 6 | 7 | "github.com/prometheus/client_golang/prometheus" 8 | "k8s.io/klog/v2" 9 | ) 10 | 11 | var ( 12 | Requests = prometheus.NewCounter( 13 | prometheus.CounterOpts{ 14 | Namespace: "autopilot", 15 | Name: "health_checks_req_total", 16 | Help: "Number of invocations to Autopilot", 17 | }, 18 | ) 19 | 20 | HchecksGauge = prometheus.NewGaugeVec( 21 | prometheus.GaugeOpts{ 22 | Namespace: "autopilot", 23 | Name: "health_checks", 24 | Help: "Summary of the health checks measurements on compute nodes. Gauge Vector version", 25 | }, 26 | []string{"health", "node", "cpumodel", "gpumodel", "deviceid"}, 27 | ) 28 | ) 29 | 30 | func InitMetrics(reg prometheus.Registerer) { 31 | // Register custom metrics with the global prometheus registry 32 | reg.MustRegister(HchecksGauge) 33 | } 34 | 35 | func InitHardwareMetrics() { 36 | // Define CPUModel global variable 37 | cpu := "N/A" 38 | 39 | cmd := "cat /proc/cpuinfo | egrep '^model name' | uniq | awk '{print substr($0, index($0,$4))}'| sed 's/(//; s/)//'" 40 | out, err := exec.Command("bash", "-c", cmd).CombinedOutput() 41 | if err != nil { 42 | klog.Info("Error retrieving cpu model info", err.Error()) 43 | } else { 44 | cpu = strings.TrimSpace(string(out[:])) 45 | } 46 | klog.Info("CPU_MODEL: ", cpu) 47 | CPUModel = cpu 48 | 49 | // Define GPUModel global variable 50 | gpu := "N/A" 51 | 52 | cmd2 := exec.Command("nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader") 53 | out, err = cmd2.CombinedOutput() 54 | if err != nil { 55 | klog.Info("Error retrieving gpu model info", err.Error()) 56 | } else { 57 | tmp := strings.TrimSpace(string(out[:])) 58 | gpu = strings.Split(tmp, "\n")[0] 59 | } 60 | klog.Info("GPU_MODEL: ", gpu) 61 | GPUModel = gpu 62 | } 63 | -------------------------------------------------------------------------------- /autopilot-daemon/utils/briefings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exists=`which nvidia-smi` 3 | if [[ -z $exists ]] 4 | then 5 | echo !! nvidia-smi not present. ABORT. 6 | killall5 7 | fi 8 | 9 | CMD="$(nvidia-smi)" 10 | errors="$(echo ${CMD} | grep -i err)" 11 | if [[ -n $errors ]] 12 | then 13 | echo !! nvidia-smi failed to start. ABORT. 
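# killall5 signals every process in the container (a hard stop, unlike the
# plain `exit` used below), so the pod terminates and the failure surfaces
# to Kubernetes. This reading of the intent is an assumption.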
14 |     killall5
15 | fi
16 | 
17 | CMD="$(nvidia-smi --query-gpu=mig.mode.current --format=csv)"
18 | mig="$(echo ${CMD} | grep Enabled)"
19 | if [[ -n $mig ]]
20 | then
21 |     echo !! MIG enabled. ABORT.
22 |     exit
23 | fi
24 | 
25 | CMD="$(dcgmi --version)"
26 | errors="$(echo ${CMD} | grep -iE 'fail|error')"
27 | if [[ -n $errors ]]
28 | then
29 |     echo !! dcgmi failed to start. ABORT.
30 |     exit
31 | fi
--------------------------------------------------------------------------------
/autopilot-daemon/utils/runHealthchecks.py:
--------------------------------------------------------------------------------
1 | ##################################################################################
2 | # Python program that uses the Python Client Library for Kubernetes to
3 | # run autopilot health checks on all nodes or a specific node(s).
4 | # Health checks include PCIEBW and GPU REMAPPED ROWS.
5 | # Image: us.icr.io/cil15-shared-registry/gracek/run-healthchecks:3.0.1
6 | ##################################################################################
7 | import argparse
8 | import os
9 | import time
10 | import asyncio
11 | import aiohttp
12 | from itertools import islice
13 | import pprint
14 | from kubernetes import client, config
15 | from kubernetes.client.rest import ApiException
16 | from multiprocessing import Pool
17 | 
18 | # load in cluster kubernetes config for access to cluster
19 | config.load_incluster_config()
20 | v1 = client.CoreV1Api()
21 | 
22 | # get arguments for service, namespace, node(s), and check (test type)
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument('--service', type=str, default='autopilot-healthchecks', help='Autopilot healthchecks service name. Default is \"autopilot-healthchecks\".')
25 | 
26 | parser.add_argument('--namespace', type=str, default='autopilot', help='Namespace where autopilot DaemonSet is deployed. Default is \"autopilot\".')
27 | 
28 | parser.add_argument('--nodes', type=str, default='all', help='Node(s) that will run a healthcheck. Can be a comma separated list. Default is \"all\" unless --wkload is provided, then set to None. Specific nodes can be provided in addition to --wkload.')
29 | 
30 | parser.add_argument('--check', type=str, default='all', help='The specific test(s) that will run: \"all\", \"pciebw\", \"dcgm\", \"remapped\", \"ping\", \"gpumem\", \"pvc\" or \"gpupower\". Default is \"all\". Can be a comma separated list.')
31 | 
32 | parser.add_argument('--batchSize', default='0', type=str, help='Number of nodes to check in parallel. Default is set to the number of the worker nodes.')
33 | 
34 | parser.add_argument('--wkload', type=str, default='None', help='Workload node discovery w/ given namespace and label. Ex: \"--wkload=namespace:label-key=label-value\". Default is set to None.')
35 | 
36 | parser.add_argument('--dcgmR', type=str, default='1', help='Run a dcgmi diagnostic. (Note: higher numbered tests include all beneath.)\n\t1 - Quick (System Validation ~ seconds)\n\t2 - Medium (Extended System Validation ~ 2 minutes)\n\t3 - Long (System HW Diagnostics ~ 15 minutes)\n\t4 - Extended (Longer-running System HW Diagnostics)')
37 | 
38 | parser.add_argument('--nodelabel', type=str, default='None', help='Node label to select nodes. Ex: \"label-key=label-value\". Default is set to None.')
39 | 
40 | args = vars(parser.parse_args())
41 | service = args['service']
42 | namespace = args['namespace']
43 | node = args['nodes'].replace(' ', '').split(',') # list of nodes
44 | checks = args['check'].replace(' ', '').split(',') # list of checks
45 | batch_size = int(args['batchSize'])
46 | nodelabel = args['nodelabel']
47 | wkload = args['wkload']
48 | if wkload != 'None':
49 |     wkload = args['wkload'].split(':')
50 |     if '' in wkload:
51 |         print("Invalid job definition, must be namespace:label=value. Got", wkload)
52 |         exit()
53 | 
54 | if ((wkload != "None") or (nodelabel != "None")) and (args['nodes'] == 'all'):
55 |     node = []
56 | 
57 | # debug: runtime
58 | start_time = time.time()
59 | 
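# Example invocations (illustrative; node names and label values are placeholders):
#
#   python3 runHealthchecks.py --check=pciebw,remapped --nodes=worker-1,worker-2
#   python3 runHealthchecks.py --check=dcgm --dcgmR=3 --nodelabel=gpu.model=a100
#   python3 runHealthchecks.py --check=ping --wkload=default:job-name=my-job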
60 | def find_labeled_nodes():
61 |     try:
62 |         labeled_nodes = v1.list_node(label_selector=nodelabel)
63 |     except ApiException as e:
64 |         print("Exception when calling CoreV1Api->list_node: %s\n" % e)
65 |         exit()
66 |     if len(labeled_nodes.items) == 0:
67 |         print("No node is labeled with", nodelabel, "- ABORT.")
68 |         exit()
69 |     for labeled_node in labeled_nodes.items:
70 |         node_name = labeled_node.metadata.name
71 |         if node_name not in node:
72 |             node.append(node_name)
73 | 
74 | # find workload addresses
75 | def find_wkload():
76 |     node_len = len(node)
77 |     copy = False
78 |     wkload_ns = wkload[0] # ex: "default"
79 |     wkload_label = wkload[1] # ex: "job-name=my-job" or "app=my-app"
80 |     try:
81 |         wkload_pods = v1.list_namespaced_pod(namespace=wkload_ns, label_selector=wkload_label)
82 |     except ApiException as e:
83 |         print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e)
84 |         exit()
85 |     print('Workload:', ': '.join(wkload))
86 |     if len(wkload_pods.items) == 0:
87 |         print("No workload labeled with", wkload_label, "- ABORT.")
88 |         exit()
89 |     for pod in wkload_pods.items:
90 |         node_name = pod.spec.node_name
91 |         if node_name not in node:
92 |             node.append(node_name)
93 |         else:
94 |             copy = True
95 |     if (len(node) == node_len) and not copy:
96 |         print('Error: Issue with --wkload parameter.\nMake sure your workload is spelled correctly and exists in the cluster. 
ABORT') 97 | exit() 98 | 99 | 100 | # get addresses in desired endpointslice (autopilot-healthchecks) based on which node(s) the user chooses 101 | def get_addresses(): 102 | global server_address 103 | server_address = '' 104 | try: 105 | endpoints = v1.list_namespaced_endpoints(namespace=namespace) 106 | except ApiException as e: 107 | print("Exception when calling CoreV1Api->list_namespaced_endpoints: %s\n" % e) 108 | exit() 109 | for endpointslice in endpoints.items: 110 | if endpointslice.metadata.name == service: 111 | # print("EndpointSlice: " + str(endpointslice.metadata.name)) 112 | addresses = endpointslice.subsets[0].addresses 113 | if node[0] == 'all': 114 | # server_address = [addresses[0], addresses[len(addresses)-1]] 115 | return addresses 116 | else: 117 | address_list = [] 118 | for address in addresses: 119 | if address.node_name in node: 120 | address_list.append(address) 121 | else: 122 | server_address = address 123 | if len(address_list) > 0: 124 | return address_list 125 | # if server_address == '': # when all nodes are being tested / there's only one node 126 | # print('Iperf test cannot be completed') 127 | 128 | # create url for test 129 | def create_url(address, daemon_node): 130 | urls = [] 131 | for check in checks: 132 | if check == 'all': 133 | urls.append('http://' + str(address.ip) + ':3333/status?host=' + daemon_node) 134 | return urls 135 | extra_params = "" 136 | if "ping" in args['check']: 137 | if args['wkload'] != 'None': 138 | extra_params += "&job=" + args['wkload'] 139 | if nodelabel != 'None': 140 | extra_params += "&nodelabel=" + nodelabel 141 | if args['nodes'] != 'all' : 142 | extra_params += "&pingnodes=" + args['nodes'] 143 | if "dcgm" in args['check']: 144 | extra_params += "&r=" + args['dcgmR'] 145 | urls.append('http://' + str(address.ip) + ':3333/status?host=' + daemon_node + '&check=' + args['check'] + extra_params) 146 | return urls 147 | 148 | # check and print status of each node 149 | def get_node_status(responses): 150 | node_status_list = [] 151 | for response in responses: 152 | response_list = response.split('\n') 153 | for line in response_list: 154 | if (('FAIL' in line) or ('ABORT' in line)): 155 | if ('PCIE' in line): 156 | node_status_list.append('PCIE Failed') 157 | elif('REMAPPED ROWS' in line): 158 | node_status_list.append('REMAPPED ROWS Failed') 159 | elif('DCGM' in line): 160 | node_status_list.append('DCGM Failed') 161 | elif('GPU POWER' in line): 162 | node_status_list.append('GPU POWER Failed') 163 | elif('PING' in line): 164 | node_status_list.append('PING Failed') 165 | elif('GPU-MEM' in line): 166 | node_status_list.append("GPU MEM Test Failed") 167 | elif('PVC' in line): 168 | node_status_list.append("PVC Create-Delete Test Failed") 169 | elif('Disconnected' in line): 170 | node_status_list.append('Connection to Server Failed') 171 | 172 | if len(node_status_list) < 1: 173 | node_status_list.append('OK') 174 | return node_status_list 175 | 176 | async def makeconnection(address): 177 | daemon_node = str(address.node_name) 178 | pid = os.getpid() 179 | url = create_url(address, daemon_node) 180 | output = '\nAutopilot Endpoint: {ip}\nNode: {daemon_node}\nurl(s): {url}'.format(ip=address.ip, daemon_node=daemon_node, url='\n '.join(url)) 181 | print(f"Initiated connection to {url}.") 182 | total_timeout=aiohttp.ClientTimeout(total=60*60*24) 183 | try: 184 | async with aiohttp.ClientSession(timeout=total_timeout) as session: 185 | async with session.get(url[0]) as resp: 186 | reply = await resp.text() 187 | except 
aiohttp.client_exceptions.ServerDisconnectedError: 188 | print("Server Disconnected") 189 | reply = "Server Disconnected. ABORT" 190 | 191 | response=[reply] 192 | node_status_list = get_node_status(response) 193 | output += '\nResponse:\n{response}\nNode Status: {status}\n-------------------------------------\n'.format(response='~~\n'.join(response), status=', '.join(node_status_list)) 194 | # output += "\n-------------------------------------\n" # separator 195 | return output, pid, daemon_node, node_status_list 196 | 197 | 198 | async def main(addresses): 199 | res = await asyncio.gather(*(makeconnection(addr) for addr in addresses)) 200 | return res 201 | 202 | def batch_of_nodes(nodelist, batch_size): 203 | it = iter(nodelist) 204 | while True: 205 | batch = list(islice(it, batch_size)) 206 | if not batch: 207 | break 208 | yield batch 209 | 210 | # start program 211 | if __name__ == "__main__": 212 | # initializing some variables 213 | if wkload != 'None': 214 | find_wkload() 215 | if nodelabel != 'None': 216 | find_labeled_nodes() 217 | addresses = get_addresses() 218 | total_nodes = len(addresses) 219 | node_status = {} # updates after each node is tested 220 | pids_tups = [] # debug: process list 221 | pids_dict = {} # debug: process list 222 | 223 | if batch_size == 0 or batch_size > total_nodes: 224 | batch_size = total_nodes 225 | asyncres = [] 226 | 227 | for b in batch_of_nodes(addresses, batch_size): 228 | asyncres.extend(asyncio.run(main(b))) 229 | 230 | for result, pid, daemon_node, node_status_list in asyncres: 231 | pids_tups.append((pid, daemon_node)) 232 | node_status[daemon_node] = node_status_list 233 | print(result) 234 | 235 | print("Node Summary:\n") 236 | pprint.pprint(node_status) 237 | 238 | # debug: print each process with the nodes they ran 239 | # for p, n in pids_tups: 240 | # pids_dict.setdefault(p, []).append(n) 241 | # print("\n~~~DEBUGGING BELOW~~~\nProcesses (randomly ordered) and the nodes they ran (process:[nodes]):") 242 | # pprint.pprint(pids_dict, width=1) 243 | 244 | # print runtime 245 | print('\nruntime:', str(time.time() - start_time), 'sec') 246 | -------------------------------------------------------------------------------- /figures/autopilot-daemon-pod.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.pdf -------------------------------------------------------------------------------- /figures/autopilot-daemon-pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.png -------------------------------------------------------------------------------- /figures/autopilot-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-logo.png -------------------------------------------------------------------------------- /figures/autopilot-main-loop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-main-loop.pdf -------------------------------------------------------------------------------- /figures/autopilot-main-loop.svg: 
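A typical invocation of the script above, as a minimal sketch (the flag names mirror the `argparse` options read into `args` above; the service and namespace values are the defaults used elsewhere in this repository, and `--batchSize=1` is an illustrative choice, not a required value):

```bash
# Run every health check on every node, contacting one Autopilot pod at a time.
# Assumes a reachable cluster and a configured kubernetes Python client.
python3 utils/runHealthchecks.py \
    --service=autopilot-healthchecks \
    --namespace=autopilot \
    --nodes=all \
    --check=all \
    --batchSize=1
```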
--------------------------------------------------------------------------------
/figures/autopilot-daemon-pod.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.pdf
--------------------------------------------------------------------------------
/figures/autopilot-daemon-pod.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.png
--------------------------------------------------------------------------------
/figures/autopilot-logo.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-logo.png
--------------------------------------------------------------------------------
/figures/autopilot-main-loop.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-main-loop.pdf
--------------------------------------------------------------------------------
/figures/autopilot-main-loop.svg:
--------------------------------------------------------------------------------
 [SVG diagram; recoverable text labels only: "Autopilot Main Loop"; "Run Health Checks"; "Check GPUs Availability"; "Periodic / On Demand"; "Invasive"; "Label nodes"; "Via Job"; "allows"; "Look at node resource request/limit"; "Via Autopilot Pod"]
--------------------------------------------------------------------------------
/figures/big-picture.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/big-picture.pdf
--------------------------------------------------------------------------------
/figures/big-picture.svg:
--------------------------------------------------------------------------------
 [SVG diagram; recoverable text labels only: "On-demand Evaluation using node/Job label through Autopilot"; "Workload lifetime"; "Continuous system evaluation through Autopilot"; "Migration decision"; "Placement decision"; "Scheduler or opinionated tool (e.g., CodeFlare)"]
--------------------------------------------------------------------------------
/figures/invasive-check-flow.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/invasive-check-flow.pdf
--------------------------------------------------------------------------------
/figures/periodic-check-flow.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/periodic-check-flow.pdf
--------------------------------------------------------------------------------
/helm-charts/autopilot/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: autopilot
3 | description: A Helm chart for Kubernetes
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: v2.1.3
19 |
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | appVersion: latest
25 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/README.md:
--------------------------------------------------------------------------------
1 | # Helm Chart Customization
2 |
3 | ## Latest tag
4 |
5 | At every PR merge, we automatically build the `latest` tag, which can be pulled by using `quay.io/autopilot/autopilot:latest`.
6 |
7 | This tag contains the latest changes and must be considered a dev image. For stable releases, always refer to the published ones.
8 |
9 | ## Customize Helm chart
10 |
11 | Autopilot is set to run on NVIDIA GPU nodes. It can also run on heterogeneous clusters (i.e., a mix of CPU-only and GPU nodes), on GPU nodes only, or on CPU nodes only.
12 |
13 | ```yaml
14 | onlyOnGPUNodes: true
15 | ```
16 |
17 | Running on GPU nodes only will:
18 |
19 | 1) add the `nvidia.com/gpu.present: 'true'` label, and
20 | 2) enable the init container, which checks that the NVIDIA device plug-in is set up
21 |
22 | Alternatively, `onlyOnGPUNodes` can be set to false and Autopilot will run on all worker nodes, regardless of their accelerators.
23 | Notice that, in this heterogeneous case, the GPU health checks will error out on the non-GPU nodes.
24 |
25 | - Autopilot runs tests periodically. The default is every hour for regular diagnostics and every 4 hours for deep diagnostics, but these can be customized by changing the following:
26 |
27 | ```yaml
28 | repeat: # periodic health checks timer (default 1h)
29 | invasive: # deeper diagnostic timer (default 4h, 0 to disable)
30 | ```
31 |
32 | - The list of GPU errors considered fatal as a result of a dcgmi run can be customized through the `DCGM_FATAL_ERRORS` environment variable. This is used to label nodes with extra WARN/EVICT labels. The list defaults to [PCIe,NVLink,ECC,GPU Memory] and refers to https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3
33 |
34 | ```yaml
35 | - name: "DCGM_FATAL_ERRORS"
36 | value: ""
37 | ```
38 |
39 | - Invasive checks (e.g., dcgm level 3) are executed as a separate Job. The Job deletes itself by default after 30s. This can be customized through the environment variable below:
40 |
41 | ```yaml
42 | - name: "INVASIVE_JOB_TTLSEC"
43 | value: ""
44 | ```
45 |
46 | - The PCIe bandwidth critical value defaults to 4GB/s. It is recommended to set a threshold that is 25% or lower of the expected peak PCIe bandwidth capability, which maps to the maximum peak dropping from 16 lanes to 4 lanes. For example, for a PCIe Gen4x16, the reported peak bandwidth is 63GB/s. A degradation to 25% is 15.75GB/s, which corresponds to PCIe Gen4x4. The measured bandwidth is expected to be at least 80% of the expected peak PCIe generation bandwidth.
47 |
48 | ```yaml
49 | PCIeBW:
50 | ```
51 |
52 | - If secondary NICs are available through, for instance, Multus or the Multi-NIC Operator, they can be enabled in Autopilot by setting the following:
53 |
54 | ```yaml
55 | annotations:
56 | k8s.v1.cni.cncf.io/networks:
57 | ```
58 |
59 | - The list of periodic health checks can be customized through an environment variable. In the example below, we select all health checks and specify the storage class for the `pvc` test.
60 |
61 | If running on CPU nodes only, `pciebw`, `remapped`, `dcgm`, and `gpupower` can be removed.
62 |
63 | ```yaml
64 | env:
65 | - name: "PERIODIC_CHECKS"
66 | value: "pciebw,remapped,dcgm,ping,gpupower,pvc"
67 | - name: "PVC_TEST_STORAGE_CLASS"
68 | value: "example-storage-class"
69 | ```
70 |
71 | All these values can be saved in a `config.yaml` file (a sketch is shown at the end of this README).
72 |
73 | ## Install
74 |
75 | If you have your own configuration file, it can be passed to the `helm` install command with the `-f` parameter. If you want to install with the default values, just omit the parameter.
76 |
77 | ```bash
78 | helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace <-f your-config.yml>
79 | ```
80 |
81 | For more customization, please refer to `values.yaml`.
82 |
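For convenience, here is a minimal sketch of such a `config.yaml`, written and installed from the shell. The values shown simply restate the defaults documented in this README and in `values.yaml`; the file name is arbitrary:

```bash
# Sketch only: collect a few of the values described above into config.yaml,
# then hand it to helm with -f. The values shown are the documented defaults.
cat > config.yaml <<'EOF'
onlyOnGPUNodes: true
repeat: 1     # periodic health checks timer, in hours
invasive: 4   # deeper diagnostics timer, in hours (0 to disable)
PCIeBW: 4     # PCIe bandwidth threshold, in GB/s
env:
  - name: "PERIODIC_CHECKS"
    value: "pciebw,remapped,dcgm,ping,gpupower"
EOF
helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f config.yaml
```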
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Autopilot DaemonSet deployed.
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "mutating-webhook.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 |
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "mutating-webhook.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 |
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "mutating-webhook.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 |
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "mutating-webhook.labels" -}}
37 | helm.sh/chart: {{ include "mutating-webhook.chart" . }}
38 | {{ include "mutating-webhook.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 |
45 | {{/*
46 | Selector labels
47 | */}}
48 | {{- define "mutating-webhook.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "mutating-webhook.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 |
53 | {{/*
54 | Create the name of the service account to use
55 | */}}
56 | {{- define "mutating-webhook.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "mutating-webhook.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 |
64 | {{/*
65 | Create the name of the namespace to use
66 | */}}
67 | {{- define "mutating-webhook.namespaceName" -}}
68 | {{- if .Values.namespace.create }}
69 | {{- default (include "mutating-webhook.fullname" .) .Values.namespace.name }}
70 | {{- end }}
71 | {{- end }}
72 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/autopilot.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 |   labels:
5 |     app: autopilot
6 |   name: {{ printf "%s" .Chart.Name }}
7 | spec:
8 |   selector:
9 |     matchLabels:
10 |       app: autopilot
11 |   template:
12 |     metadata:
13 |       annotations:
14 |         {{- toYaml .Values.annotations | nindent 8 }}
15 |       labels:
16 |         app: autopilot
17 |     spec:
18 |       {{- if .Values.affinity }}
19 |       affinity:
20 |         {{- toYaml .Values.affinity | nindent 8 }}
21 |       {{- end}}
22 |       nodeSelector:
23 |       {{- if .Values.nodeSelector }}
24 |         {{- toYaml .Values.nodeSelector | nindent 8 }}
25 |       {{- end}}
26 |       {{- if .Values.onlyOnGPUNodes }}
27 |         nvidia.com/gpu.present: 'true'
28 |       {{- end}}
29 |       serviceAccountName: autopilot
30 |       {{- if .Values.pullSecrets.create }}
31 |       imagePullSecrets:
32 |       - name: {{ .Values.pullSecrets.name }}
33 |       {{- end}}
34 |       {{- if .Values.onlyOnGPUNodes }}
35 |       initContainers:
36 |       - args:
37 |         - until [ -f /usr/bin/nvidia-smi ]; do echo waiting for nvidia device plug-in to be setup; sleep 5 && exit -1; done
38 |         command:
39 |         - sh
40 |         - -c
41 |         image: {{ .Values.image.repository }}:{{ default .Chart.AppVersion .Values.image.tag }}
42 |         imagePullPolicy: Always
43 |         name: device-plugin-validation
44 |         securityContext:
45 |           runAsNonRoot: true
46 |           runAsUser: 1000910000
47 |       {{- end}}
48 |       containers:
49 |       - image: {{ .Values.image.repository }}:{{ default .Chart.AppVersion .Values.image.tag }}
50 |         command:
51 |         - sh
52 |         - -c
53 |         - |
54 |           /usr/local/bin/autopilot --port {{ .Values.service.port }} --loglevel={{ .Values.loglevel }} --bw {{ .Values.PCIeBW }} --w {{ .Values.repeat }} --invasive-check-timer {{ .Values.invasive }}
55 |         imagePullPolicy: {{ .Values.image.pullPolicy }}
56 |         name: autopilot
57 |         securityContext:
58 |           runAsNonRoot: true
59 |           runAsUser: 1000910000
60 |           capabilities:
61 |             add:
62 |             - NET_RAW
63 |             - NET_ADMIN
64 |         env:
65 |         {{- range .Values.env }}
66 |         - name: {{ .name }}
67 |           value: {{ .value | quote}}
68 |         {{- end }}
69 |         - name: "NODE_NAME"
70 |           valueFrom:
71 |             fieldRef:
72 |               fieldPath: spec.nodeName
73 |         - name: "NAMESPACE"
74 |           valueFrom:
75 |             fieldRef:
76 |               fieldPath: metadata.namespace
77 |         - name: "POD_NAME"
78 |           valueFrom:
79 |             fieldRef:
80 |               fieldPath: metadata.name
81 |         ports:
82 |         - containerPort: {{ .Values.service.port }}
83 |           name: healthcheck
84 |         - containerPort: 8081
85 |           name: http
86 |         - containerPort: 8080
87 |           name: readinessprobe
88 |         readinessProbe:
89 |           httpGet:
90 |             path: /readinessprobe
91 |             port: 8080
92 |           initialDelaySeconds: 15
93 |           periodSeconds: 120
94 |           timeoutSeconds: 10
95 |         livenessProbe:
96 |           initialDelaySeconds: 15
97 |           periodSeconds: 120
98 |           timeoutSeconds: 15
99 |           {{- if .Values.onlyOnGPUNodes }}
100 |           exec:
101 |             command:
102 |             - nvidia-smi
103 |           {{- else }}
104 |           httpGet:
105 |             path: /readinessprobe
106 |             port: 8080
107 |           {{- end}}
108 |         resources:
109 |           {{- toYaml .Values.resources | nindent 12 }}
110 |         volumeMounts:
111 |         {{- if .Values.additionalVolumeMounts }}
112 |           {{- toYaml .Values.additionalVolumeMounts | nindent 12 }}
113 |         {{- end }}
114 |       volumes:
115 |       {{- if .Values.additionalVolumeClaimTemplates }}
116 |         {{- toYaml .Values.additionalVolumeClaimTemplates | nindent 8 }}
117 |       {{- end}}
118 |
119 |
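Once the chart is installed (see the README above), a quick, hypothetical way to verify that this DaemonSet is running: the DaemonSet takes the chart name (`autopilot`) and its pods carry the `app: autopilot` label, so something like the following should work:

```bash
# Confirm the DaemonSet exists and its pods are scheduled on the expected nodes.
kubectl get daemonset autopilot -n autopilot
kubectl get pods -n autopilot -l app=autopilot -o wide
```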
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/metrics_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 |   labels:
5 |     app: autopilot
6 |   name: autopilot-metrics-service
7 | spec:
8 |   ports:
9 |   - name: http
10 |     port: 8081
11 |     protocol: TCP
12 |     targetPort: http
13 |   selector:
14 |     app: autopilot
15 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/pullsecret.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.pullSecrets.create -}}
2 | apiVersion: v1
3 | data:
4 |   .dockerconfigjson: {{ .Values.pullSecrets.imagePullSecretData }}
5 | kind: Secret
6 | metadata:
7 |   name: {{ .Values.pullSecrets.name }}
8 | type: kubernetes.io/dockerconfigjson
9 | {{- end}}
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 |   labels:
5 |     app: autopilot
6 |   name: autopilot-healthchecks
7 |   annotations:
8 |     {{- toYaml .Values.serviceAnnotations | nindent 4 }}
9 | spec:
10 |   ports:
11 |   - port: {{ .Values.service.port }}
12 |     protocol: TCP
13 |     name: healthcheck
14 |   selector:
15 |     app: autopilot
16 | ---
17 | apiVersion: v1
18 | kind: Service
19 | metadata:
20 |   labels:
21 |     app: autopilot
22 |   name: autopilot-readinessprobe
23 | spec:
24 |   ports:
25 |   - port: 8080
26 |     protocol: TCP
27 |     name: readinessprobe
28 |   selector:
29 |     app: autopilot
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | {{ if .Capabilities.APIVersions.Has "security.openshift.io/v1" -}}
2 | kind: SecurityContextConstraints
3 | apiVersion: security.openshift.io/v1
4 | metadata:
5 |   name: scc-autopilot
6 | allowPrivilegedContainer: true
7 | runAsUser:
8 |   type: RunAsAny
9 | seLinuxContext:
10 |   type: RunAsAny
11 | fsGroup:
12 |   type: RunAsAny
13 | supplementalGroups:
14 |   type: RunAsAny
15 | users:
16 | - system:serviceaccount:{{ .Release.Namespace }}:autopilot
17 | allowedCapabilities:
18 | - 'NET_RAW'
19 | - 'NET_ADMIN'
20 | volumes:
21 | - configMap
22 | - csi
23 | - downwardAPI
24 | - emptyDir
25 | - ephemeral
26 | - hostPath
27 | - persistentVolumeClaim
28 | - projected
29 | - secret
30 | {{ end -}}
31 | ---
32 | apiVersion: v1
33 | kind: ServiceAccount
34 | metadata:
35 |   name: autopilot
36 | ---
37 | apiVersion: rbac.authorization.k8s.io/v1
38 | kind: ClusterRole
39 | metadata:
40 |   name: autopilot
41 | rules:
42 | - apiGroups: [""]
43 |   resources: ["endpoints"]
44 |   verbs: ["get", "list"]
45 | - apiGroups: [""]
46 |   resources: ["pods"]
47 |   verbs: ["get", "list"]
48 | - apiGroups: ["batch"]
49 |   resources: ["jobs"]
50 |   verbs: ["get", "list", "create"]
51 | - apiGroups: [""]
52 |   resources: ["nodes"]
53 |   verbs: ["list", "get", "patch", "watch"]
54 | - apiGroups: ["apps"]
55 |   resources: ["daemonsets"]
56 |   verbs: ["list", "get"]
57 | - apiGroups: [""]
58 |   resources: ["persistentvolumeclaims"]
59 |   verbs: ["list", "get", "create", "delete"]
60 | ---
61 | apiVersion: rbac.authorization.k8s.io/v1
62 | kind: ClusterRoleBinding
63 | metadata:
64 |   name: autopilot
65 | subjects:
66 | - kind: ServiceAccount
67 |   namespace: {{ .Release.Namespace }}
68 |   name: autopilot
69 | roleRef:
70 |   kind: ClusterRole
71 |   name: autopilot
72 |   apiGroup: rbac.authorization.k8s.io
73 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/servicemonitor.yaml:
--------------------------------------------------------------------------------
1 | # Prometheus Monitor Service (Metrics)
2 | {{ if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" -}}
3 | apiVersion: monitoring.coreos.com/v1
4 | kind: ServiceMonitor
5 | metadata:
6 |   labels:
7 |     app: autopilot
8 |     app.kubernetes.io/name: servicemonitor
9 |     app.kubernetes.io/component: metrics
10 |     release: prometheus
11 |   name: autopilot-metrics-monitor
12 | spec:
13 |   endpoints:
14 |   - path: /metrics
15 |     port: http
16 |     scheme: http
17 |   selector:
18 |     matchLabels:
19 |       app: autopilot
20 | {{ end -}}
21 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for the Autopilot DaemonSet.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 | image:
5 |   repository: quay.io/autopilot/autopilot
6 |   pullPolicy: Always
7 |
8 | # Bandwidth threshold below which PCIe links are considered defective (GB/s)
9 | # It is recommended to set a threshold that is 25% or lower of the expected peak PCIe bandwidth capability, which maps to the maximum peak dropping from 16 lanes to 4 lanes. For example, for a PCIe Gen4x16, the reported peak bandwidth is 63GB/s. A degradation to 25% is 15.75GB/s, which corresponds to PCIe Gen4x4. The measured bandwidth is expected to be at least 80% of the expected peak PCIe generation bandwidth.
10 | PCIeBW: 4
11 |
12 | # Timer for periodic checks, in hours
13 | repeat: 1
14 |
15 | # Timer for periodic invasive checks, in hours (e.g., dcgmi diag -r 3). Set to 0 to disable (for non-NVIDIA GPU systems)
16 | invasive: 4
17 |
18 | # Image pull secret if the image is in a private repository
19 | pullSecrets:
20 |   create: false
21 |   name: autopilot-pull-secret
22 |   imagePullSecretData:
23 |
24 | env:
25 |   # List of periodic checks to be executed every `repeat` hours.
26 |   # If not running on GPU nodes, pciebw, remapped, dcgm, and gpupower can be removed
27 |   - name: "PERIODIC_CHECKS"
28 |     value: "pciebw,remapped,dcgm,ping,gpupower"
29 |   # Storage class name to test
30 |   - name: "PVC_TEST_STORAGE_CLASS"
31 |     value: ""
32 |   # List of GPU errors considered fatal, as a result of a dcgmi run. This is used to label nodes with extra WARN/EVICT labels. The list defaults to [PCIe,NVLink,ECC,GPU Memory] and refers to https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3
33 |   - name: "DCGM_FATAL_ERRORS"
34 |     value: ""
35 |   # Invasive checks (e.g., dcgm level 3) are executed as a separate Job. The Job deletes itself by default after 30s. This can be customized through the env variable below
36 |   - name: "INVASIVE_JOB_TTLSEC"
37 |     value: ""
38 |
39 | service:
40 |   port: 3333
41 |
42 | annotations:
43 |   # k8s.v1.cni.cncf.io/networks: multi-nic-network
44 |
45 | nodeSelector:
46 |   # nvidia.com/gpu.present: 'true'
47 |   # nvidia.com/mig.config: 'all-disabled'
48 |
49 | affinity:
50 |
51 | # Running on GPU nodes only, will:
52 | # 1) add the `nvidia.com/gpu.present: 'true'` label and
53 | # 2) enable the init container, which checks that the NVIDIA device plug-in is set up
54 | onlyOnGPUNodes: true
55 |
56 | resources:
57 |   # We advise not to set CPU and memory limits. DCGM requires several GB of memory to run, and limits may cause the pod to be OOMKilled.
58 |   limits:
59 |     nvidia.com/gpu: 0
60 |   requests:
61 |     nvidia.com/gpu: 0
62 |
63 | # klog configuration
64 | loglevel: 2
65 | # logfile: "/home/autopilot/data/report.log"
66 |
67 | # optional remote storage. A PVC and secret must exist
68 | additionalVolumeClaimTemplates:
69 |   # - name: logdir
70 |   #   persistentVolumeClaim:
71 |   #     claimName: my-pvc
72 |   # - name: autopilot-tls-secret
73 |   #   secret:
74 |   #     secretName: autopilot-webhook
75 | additionalVolumeMounts:
76 |   # - name: autopilot-tls-secret
77 |   #   mountPath: "/etc/autopilot-tls-secret/tls"
78 |   #   readOnly: true
79 |   # - mountPath: /data
80 |   #   name: logdir
81 |
--------------------------------------------------------------------------------
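As a final sketch, the health-check endpoint exposed on `service.port` (3333) can also be probed manually. The `check` and `host` query parameters mirror `create_url()` in `utils/runHealthchecks.py` above; the node name below is a placeholder:

```bash
# Port-forward the autopilot-healthchecks Service locally, then request a single check.
kubectl -n autopilot port-forward svc/autopilot-healthchecks 3333:3333 &
curl "http://127.0.0.1:3333/status?check=pciebw&host=<node-name>"
```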