├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── documentation-request.md │ └── feature-request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── build-image.yml │ ├── build-push-image-from-main.yml │ └── publish-release.yml ├── .gitmodules ├── HEALTH_CHECKS.md ├── LICENSE ├── Makefile ├── README.md ├── SETUP.md ├── USAGE.md ├── alertmanager ├── README.md ├── alertmanager.yaml ├── alerts │ └── healthchecks-alerts.yaml └── images │ ├── alertmanager.png │ ├── create-receiver.png │ ├── pciealert.png │ ├── slack-alert-example.png │ └── slack.png ├── autopilot-daemon ├── Dockerfile ├── go.mod ├── go.sum ├── gpu-bw │ ├── entrypoint.py │ └── gpuLocalBandwidthTest.sh ├── gpu-dcgm │ └── entrypoint.py ├── gpu-mem │ ├── entrypoint.py │ └── gpucheck.cu ├── gpu-power │ └── power-throttle.sh ├── gpu-remapped │ ├── entrypoint.py │ └── remapped-rows.sh ├── network │ ├── README.md │ ├── iperf3_entrypoint.py │ ├── iperf3_start_clients.py │ ├── iperf3_start_servers.py │ ├── iperf3_stop_servers.py │ ├── iperf3_utils.py │ ├── network_workload.py │ └── ping-entrypoint.py ├── pkg │ ├── cmd │ │ └── main.go │ ├── handler │ │ ├── handler.go │ │ └── messagestruct.go │ ├── healthcheck │ │ ├── functions.go │ │ ├── global.go │ │ └── healthcheck.go │ └── utils │ │ ├── functions.go │ │ ├── global.go │ │ ├── listwatch.go │ │ ├── nodelabels.go │ │ └── prometheus.go └── utils │ ├── briefings.sh │ └── runHealthchecks.py ├── figures ├── autopilot-daemon-pod.pdf ├── autopilot-daemon-pod.png ├── autopilot-logo.png ├── autopilot-main-loop.pdf ├── autopilot-main-loop.svg ├── big-picture.pdf ├── big-picture.svg ├── invasive-check-flow.pdf ├── invasive-check-flow.svg ├── periodic-check-flow.pdf └── periodic-check-flow.svg ├── grafana ├── autopilot-dashboard.json └── autopilot-dashboard.yaml └── helm-charts └── autopilot ├── .helmignore ├── Chart.yaml ├── README.md ├── templates ├── NOTES.txt ├── _helpers.tpl ├── autopilot.yaml ├── metrics_service.yaml ├── pullsecret.yaml ├── service.yaml ├── serviceaccount.yaml └── servicemonitor.yaml └── values.yaml /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Describe a problem or bug with Autopilot. 4 | title: "[Bug]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Summary 11 | 12 | _A clear and concise description of what the bug is._ 13 | 14 | ## Steps To Reproduce 15 | 16 | _A detailed list of actions to take in order to reproduce the problem or bug._ 17 | 18 | ## Expected behavior 19 | 20 | _A clear and concise description of what you expected to happen._ 21 | 22 | ## Evidence 23 | 24 | _Are there logs, screenshots, or helpful documentation to include? Otherwise N/A._ 25 | 26 | ## Proposed Solution 27 | 28 | _Include a brief description of a potential solution and method to verify the solution, if possible. Otherwise N/A._ 29 | 30 | **Additional context** 31 | _Add any other context about the problem here._ 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Request 3 | about: Propose new documentation for Autopilot. 
4 | title: "[Documentation]" 5 | labels: documentation 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Summary 11 | 12 | _Include a brief summary of the documentation changes you're proposing._ 13 | 14 | ## Impact 15 | 16 | - _What pages will need to be updated?_ 17 | - _Will there be broken links with these changes?_ 18 | - _Are there any images you're adding or external content that exists outside the repository?_ 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Describe a feature you'd like to see in Autopilot. 4 | title: "[Feature]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Summary 11 | 12 | - _Include a brief summary of the feature you'd like to see._ 13 | - _Is this feature motivated by existing gaps or existing issues with Autopilot?_ 14 | 15 | ## Impact 16 | 17 | - _How big of an impact would this feature have on Autopilot?_ 18 | - _Is this a new component or a total overhaul of an existing feature set?_ 19 | - _What are your thoughts, and how can we size this feature accordingly?_ 20 | 21 | ## Proposed Solution 22 | 23 | - _Include a brief description of a potential implementation, if possible. Otherwise just let us know what you'd like to see!_ 24 | - _The more links or snippets of knowledge you share, the better we'll all understand the contribution._ 25 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 8 | 9 | # Summary 10 | 11 | - _What changes are proposed in this pull request?_ 12 | 13 | ## Scope and Impact 14 | 15 | - _API Changes?_ 16 | - _Should any users or specific teams be notified of breaking changes?_ 17 | 18 | ## GitHub Issue 19 | - [#XYZ - Issue Name 1](https://github.com/IBM/autopilot/issues) 20 | 21 | ## How was this Pull-Request Tested and Validated? 22 | 23 | - _Steps used to test and validate the changes. Commands and additional content are welcome._ 24 | - _If not applicable mark as N/A._ 25 | 26 | ## Pull-Request Reminders 27 | 28 | - Does the [Autopilot Readme](https://github.com/IBM/autopilot?tab=readme-ov-file#ai-training-autopilot) require updates? 29 | - _Yes or No -- if yes, were they made?_ 30 | 31 | - Are there any new software dependencies introduced to this Pull-Request?
32 | - _Yes or No -- if yes, what are they?_ 33 | -------------------------------------------------------------------------------- /.github/workflows/build-image.yml: -------------------------------------------------------------------------------- 1 | name: Test Build Container Image on PR 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | - 'main' 8 | paths: 9 | - 'autopilot-daemon/**' 10 | 11 | jobs: 12 | docker: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Remove unnecessary files 16 | run: | 17 | sudo rm -rf /usr/share/dotnet 18 | sudo rm -rf /usr/local/lib/android 19 | 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Build and push 24 | uses: docker/build-push-action@v5 25 | with: 26 | context: autopilot-daemon 27 | push: false 28 | tags: test 29 | -------------------------------------------------------------------------------- /.github/workflows/build-push-image-from-main.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Latest Container Image 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - 'main' 8 | paths-ignore: 9 | - '.github/**' 10 | 11 | jobs: 12 | docker: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Remove unnecessary files 16 | run: | 17 | sudo rm -rf /usr/share/dotnet 18 | sudo rm -rf /usr/local/lib/android 19 | 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Docker meta 24 | id: meta 25 | uses: docker/metadata-action@v5 26 | with: 27 | images: quay.io/autopilot/autopilot 28 | tags: latest 29 | 30 | - name: Log into registry 31 | uses: docker/login-action@v3 32 | with: 33 | registry: quay.io 34 | username: ${{ secrets.QUAY_USERNAME }} 35 | password: ${{ secrets.QUAY_PASSWORD }} 36 | 37 | - name: Build and push 38 | uses: docker/build-push-action@v5 39 | with: 40 | context: autopilot-daemon 41 | push: true 42 | tags: ${{ steps.meta.outputs.tags }} 43 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Create New Release - Quay and Helm 4 | on: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | release: 9 | permissions: 10 | contents: write 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 17 | 18 | - name: Configure Git 19 | run: | 20 | git config user.name "$GITHUB_ACTOR" 21 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 22 | - name: Install Helm 23 | uses: azure/setup-helm@v3 24 | with: 25 | token: ${{ secrets.GITHUB_TOKEN }} 26 | 27 | - name: Run chart-releaser 28 | uses: helm/chart-releaser-action@v1.6.0 29 | env: 30 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 31 | CR_SKIP_EXISTING: true 32 | with: 33 | pages_branch: gh-pages 34 | charts_dir: helm-charts 35 | skip_existing: true 36 | packages_with_index: true 37 | token: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | docker: 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Remove unnecessary files 43 | run: | 44 | sudo rm -rf /usr/share/dotnet 45 | sudo rm -rf /usr/local/lib/android 46 | 47 | - name: Checkout 48 | uses: actions/checkout@v3 49 | with: 50 | fetch-depth: 0 51 | 52 | - name: Read helm chart version 53 | run: echo "CHART_VERSION=$(grep '^version:' helm-charts/autopilot/Chart.yaml | cut -d ":" -f2 | tr -d ' ')" >> $GITHUB_ENV 54 | 55 | - name: Checkout 
56 | uses: actions/checkout@v4 57 | 58 | - name: Docker meta 59 | id: meta 60 | uses: docker/metadata-action@v5 61 | with: 62 | images: quay.io/autopilot/autopilot 63 | tags: ${{ env.CHART_VERSION }} 64 | 65 | - name: Log into registry 66 | uses: docker/login-action@v3 67 | with: 68 | registry: quay.io 69 | username: ${{ secrets.QUAY_USERNAME }} 70 | password: ${{ secrets.QUAY_PASSWORD }} 71 | 72 | - name: Build and push 73 | uses: docker/build-push-action@v5 74 | with: 75 | context: autopilot-daemon 76 | push: true 77 | tags: ${{ steps.meta.outputs.tags }} -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/.gitmodules -------------------------------------------------------------------------------- /HEALTH_CHECKS.md: -------------------------------------------------------------------------------- 1 | # Health Checks 2 | 3 | Here is a breakdown of the existing health checks: 4 | 5 | 1. **PCIe Bandwidth Check (pciebw)** 6 | - Description: Host-to-device connection speeds, one measurement per GPU. Codebase in tag [v12.4.1](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest) 7 | - Outputs: Pass/fail results based on PCIe bandwidth thresholds. 8 | - Implementation: Compares bandwidth results to a threshold (e.g., 8 GB/s). If the measured bandwidth falls below the threshold, it triggers a failure. 9 | - It is recommended to set a threshold that is 25% or lower of the expected peak PCIe bandwidth capability, which maps to the maximum peak dropping from 16 lanes to 4 lanes. For example, for a PCIe Gen4x16, the reported peak bandwidth is 63GB/s. A degradation at 25% is 15.75GB/s, which corresponds to PCIe Gen4x4. 10 | - The measured bandwidth is expected to be at least 80% of the expected peak PCIe generation bandwidth. 11 | 2. **GPU Memory Check (remapped)** 12 | - Description: Information from nvidia-smi regarding GPU memory remapped rows. 13 | - Outputs: Reports the state of GPU memory (normal/faulty). 14 | - Implementation: Analyzes remapped rows information to assess potential GPU memory issues. 15 | 3. **GPU Memory Bandwidth Performance (gpumem)** 16 | - Description: Memory bandwidth measurements using DAXPY and DGEMM. 17 | - Outputs: Performance metrics (e.g., TFlops, power). 18 | - Implementation: CUDA code that evaluates memory bandwidth and flags deviations from expected performance values. 19 | 4. **GPU Diagnostics (dcgm)** 20 | - Description: Runs NVidia DCGM diagnostics using dcgmi diag. 21 | - Outputs: Diagnostic results (pass/fail). 22 | - Implementation: Analyzes GPU health, including memory, power, and thermal performance. 23 | 5. **PVC Create/Delete (pvc)** 24 | - Description: Given a storage class, tests if a PVC can be created and deleted. 25 | - Output: Pass/fail depending on the success or failure of creation and deletion of a PVC. If either operation fails, the result is a failure. 26 | - Implementation: Creation of a PVC through K8s APIs. 27 | 6. **Network Reachability Check (ping)** 28 | - Description: Pings between nodes to assess connectivity. 29 | - Outputs: Pass/fail based on ping success. 30 | - Implementation: All-to-all reachability test. 31 | 7. **Network Bandwidth Check (iperf)** 32 | - Description: Tests network bandwidth by launching clients and servers on multiple interfaces through iperf3.
Results from the network tests are aggregated per interface. Further details can be found in [the dedicated page](autopilot-daemon/network/README.md). 33 | - Outputs: Aggregate bandwidth on each interface, per node (in Gb/s). 34 | - Implementation: Tests network bandwidth by launching clients and servers on multiple interfaces and by running a ring topology on all network interfaces found on the pod that are exposed by network controllers like multi-nic CNI, which exposes fast network interfaces in the pods requesting them. Does not run on `eth0`. 35 | 36 | These checks are configured to run periodically (e.g., hourly), and results are accessible via Prometheus, direct API queries or labels on the worker nodes. 37 | 38 | ![image](figures/periodic-check-flow.svg) 39 | 40 | ## Deep Diagnostics and Node Labeling 41 | 42 | Autopilot's periodic health checks will label the worker nodes according to the results obtained. 43 | Lightweight and invasive health checks may use different labeling systems. 44 | 45 | If the health checks, lightweight or invasive, report success, the node is marked with 46 | 47 | ```yaml 48 | autopilot.ibm.com/gpuhealth: PASS 49 | ``` 50 | 51 | When the lightweight health checks report an issue, the node is labelled with 52 | 53 | ```yaml 54 | autopilot.ibm.com/gpuhealth: WARN 55 | ``` 56 | 57 | ### Invasive health checks 58 | 59 | The invasive DCGM diagnostics level 3 health check is executed automatically only on nodes that have free GPUs. This deeper analysis is needed to reveal problems in the GPUs that can be found only by running the level 3 DCGM diagnostic. 60 | 61 | ![image](figures/invasive-check-flow.svg) 62 | 63 | This type of diagnostics can help decide whether the worker node should be used for running workloads. To facilitate this task, Autopilot will label nodes with key `autopilot.ibm.com/dcgm.level.3`. 64 | 65 | If a fatal error is found, the `gpuhealth` label is updated to `EVICT`. 66 | 67 | ```yaml 68 | autopilot.ibm.com/gpuhealth: EVICT 69 | ``` 70 | 71 | Only fatal errors should produce an `EVICT` label. We follow [NVIDIA recommendations](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3), although it is possible to customize the list of tests through the Helm chart. The default values are `[PCIe,NVLink,ECC,GPU Memory]`.
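Because these results are exposed as regular node labels, they can be inspected with standard `kubectl` commands, for example (label keys and values as described above):

```bash
# List nodes currently flagged with a fatal GPU error by the invasive checks
kubectl get nodes -l autopilot.ibm.com/gpuhealth=EVICT

# Show the overall health and DCGM level 3 labels for every node
kubectl get nodes -L autopilot.ibm.com/gpuhealth -L autopilot.ibm.com/dcgm.level.3
```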
72 | 73 | If errors are found during the level 3 diagnostics, the label `autopilot.ibm.com/dcgm.level.3` will contain the result and timestamp related to the latest run, while the annotation `autopilot.ibm.com/dcgm.level.3.output` will contain detailed information about the error in the following format: 74 | 75 | ```yaml 76 | labels: 77 | autopilot.ibm.com/dcgm.level.3: ERR_Year-Month-Date_Hour.Minute.UTC 78 | annotations: 79 | autopilot.ibm.com/dcgm.level.3.output: Diagnostic_Test.gpuID,Diagnostic_Test.gpuID,... 80 | ``` 81 | 82 | - `ERR`: An indicator that an error has occurred 83 | - `Year-Month-Date_Hour.Minute.UTC`: Timestamp of completed diagnostics 84 | - `Diagnostic_Test`: Name of the test that has failed (formatted to replace spaces with underscores) 85 | - `gpuID`: ID of GPU where the failure has occurred 86 | 87 | **Example:** 88 | ``` 89 | labels: 90 | autopilot.ibm.com/dcgm.level.3=ERR_2024-10-10_19.12.03UTC 91 | annotations: 92 | autopilot.ibm.com/dcgm.level.3.output=memory_bandwidth.0.1.2.3 93 | 94 | ``` 95 | 96 | If there are no errors, the value of `autopilot.ibm.com/dcgm.level.3` is set to `PASS_Year-Month-Date_Hour.Minute.UTC` while `autopilot.ibm.com/dcgm.level.3.output` will be empty. 97 | 98 | ### Logs and Metrics 99 | 100 | All health check results are exported through Prometheus, but they can also be found in each pod's logs. 101 | 102 | All metrics are accessible through Prometheus and Grafana dashboards. The gauge exposed is `autopilot_health_checks` and can be customized with the following filters: 103 | 104 | - `check`, select one or more specific health checks 105 | - `node`, filter by node name 106 | - `cpumodel` and `gpumodel`, for heterogeneous clusters 107 | - `deviceid` to select specific GPUs, when available 108 | 109 | For more information on how to set up alerts based on metrics, please refer to the [alert manager folder](alertmanager/README.md). 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TAG=dev 2 | IMAGE=containerregistry:5000/autopilot 3 | 4 | image-build: 5 | @docker build -t ${IMAGE}:v${TAG} -f autopilot-daemon/Dockerfile autopilot-daemon/ 6 | 7 | image-push: 8 | @docker push ${IMAGE}:v${TAG} 9 | 10 | all: image-build image-push 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # AI Training Autopilot 4 | 5 | Autopilot is a Kubernetes-native daemon that continuously monitors and evaluates GPU, network and storage health, designed to detect and report infrastructure-level issues during the lifetime of AI workloads. It is an open-source project developed by IBM Research. 6 | 7 | In AI training jobs, which may run for weeks or months, anomalies in the GPUs and network can happen anytime and often go undetected. In this case, performance degrades suddenly and a deep diagnostic is needed to identify the root cause, delaying or deleting the current job. Similarly, hardware anomalies can greatly disrupt the throughput and latency of an AI inference server. 8 | 9 | The role of Autopilot is to detect and report any problems its health checks uncover, both during the lifetime of a job and throughout the life of the cluster. 10 | 11 | It implements a set of health checks evaluating the status of the system. These health checks focus mainly on subtle software-level issues (e.g., row remapping or PCIe link degradation), but also run connectivity tests (e.g., ping, iperf) to verify that secondary NICs are reachable. It can also verify that persistent volume claim (PVC) creation is functional for a given storage class. 12 | 13 | ![image](figures/autopilot-daemon-pod.png) 14 | 15 | Autopilot is deployed as a Kubernetes DaemonSet on all worker nodes that have GPUs. Each pod exposes a Service that can be accessed through a RESTful API to request the execution of health checks. Therefore, each health check has its own entry point, but a generic “status” entry point is also provided. 16 | 17 | The DaemonSet does not run as privileged and requires access to GPUs without requesting them as resources. Therefore, the GPUs are seen as available by the scheduler. 18 | 19 | The main code is written in Go, while health checks are written in a combination of Python, Go, bash and CUDA. Each Autopilot pod runs health checks only on the node on which it resides. A pod can request other pods to run health checks on their nodes, and in that case, results are gathered and shown by the requestor pod. 20 | 21 | If Autopilot requires full access to GPUs to run more invasive workloads, it will spawn a separate job with resource requests and limits set.
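As a rough illustration, such an invasive run amounts to a Job of the following shape; the metadata and GPU count below are hypothetical, the point being that the GPUs are explicitly requested so the scheduler sees them as busy:

```yaml
# Hypothetical sketch only: the Job name and GPU count are illustrative,
# not the exact spec that Autopilot generates.
apiVersion: batch/v1
kind: Job
metadata:
  name: dcgm-level-3-check   # hypothetical name
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: dcgm-diag
          image: quay.io/autopilot/autopilot:latest
          resources:
            requests:
              nvidia.com/gpu: 8   # reserving the GPUs marks them as busy for the scheduler
            limits:
              nvidia.com/gpu: 8
```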
22 | 23 | ![image](figures/autopilot-main-loop.svg) 24 | 25 | ## Health Checks 26 | 27 | The current status of Autopilot includes: 28 | 29 | - **GPU PCIe Link Bandwidth**: The PCIe NVidia bandwidth test to check host-to-device connection on each node 30 | - **GPU Memory**: GPU remapped rows evaluation through `nvidia-smi` 31 | - **GPU Memory Bandwidth Performance**: GPU memory bandwidth evaluation through DAXPY and DGEMM 32 | - **GPU Diagnostics**: NVidia DCGM (Data Center GPU Manager) diagnostics through `dcgmi diag` 33 | - **GPU Power Slowdown**: verify if power throttle is active through `nvidia-smi` 34 | - **Network Reachability**: `ping` to evaluate host reachability 35 | - **Network Bandwidth**: `iperf3` to evaluate network bandwidth and host connectivity 36 | - **PVC Create/Delete**: given a storage class, test the ability to successfully provision a Persistent Volume Claim 37 | - **DCGM level 3**: deep diagnostics through the NVidia DCGM tool. This test runs as a separate Job that reserves all the GPUs in the node if they are free 38 | 39 | A subset of the tests is enabled by default, and they run every hour. Both the list of health checks and the timer can be customized at initialization time. 40 | 41 | By default, the periodic checks list contains PCIe, row remapping, GPU power, DCGM level 1 and ping. 42 | 43 | Results from health checks are exported as Prometheus Gauges, so that users and admins can easily check the status of the system on Grafana. 44 | 45 | A detailed description of all the health checks can be found in [HEALTH_CHECKS.md](HEALTH_CHECKS.md). 46 | 47 | ### Diagnostics and Node Labeling 48 | 49 | Autopilot's periodic and invasive health checks will label the worker nodes according to the results obtained. 50 | Lightweight and invasive health checks may use different labeling systems. Refer to [HEALTH_CHECKS.md](HEALTH_CHECKS.md) for more details about the label formats. 51 | 52 | The information saved in the labels can be used by admins, kube-scheduler or other workload management systems like [CodeFlare](https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/) to steer the execution of workloads for enhanced fault tolerance. 53 | 54 | ![image](figures/big-picture.svg) 55 | 56 | ## Install 57 | 58 | To learn how to install Autopilot, please refer to [SETUP.md](SETUP.md). 59 | 60 | ## Usage 61 | 62 | To learn how to invoke health checks, please refer to [USAGE.md](USAGE.md). 63 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | 2 | # Install Autopilot 3 | 4 | Autopilot can be installed through Helm and needs enough privileges to create objects like services, serviceaccounts, namespaces and relevant RBAC. 5 | 6 | ## Helm Chart customization 7 | 8 | Helm chart values and a how-to for customization can be found [here](helm-charts/autopilot/README.md). 9 | 10 | ## Install 11 | 12 | 1) Add the autopilot repo 13 | 14 | ```bash 15 | helm repo add autopilot https://ibm.github.io/autopilot/ 16 | ``` 17 | 18 | 2) Install autopilot (idempotent command). The config file is for customizing the helm values. It is not mandatory. If the default values work for you, omit the `-f`.
The `--namespace` parameter specifies where the helm chart will be deployed. 19 | 20 | ```bash 21 | helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f your-config.yml 22 | ``` 23 | 24 | The controllers should show up in the selected namespace 25 | 26 | ```bash 27 | kubectl get po -n autopilot 28 | ``` 29 | 30 | ```bash 31 | NAME                               READY   STATUS    RESTARTS   AGE 32 | autopilot-daemon-autopilot-g7j6h   1/1     Running   0          70m 33 | autopilot-daemon-autopilot-g822n   1/1     Running   0          70m 34 | autopilot-daemon-autopilot-x6h8d   1/1     Running   0          70m 35 | autopilot-daemon-autopilot-xhntv   1/1     Running   0          70m 36 | ``` 37 | 38 | ## Uninstall 39 | 40 | ```bash 41 | helm uninstall autopilot -n autopilot 42 | kubectl delete namespace autopilot 43 | ``` 44 | 45 | ## Enabling Prometheus 46 | 47 | ### Kubernetes Users 48 | 49 | The ServiceMonitor object is the one that enables Prometheus to scrape the metrics produced by Autopilot. 50 | In order for Prometheus to find the right objects, the `ServiceMonitor` needs to be labeled with the Prometheus release name. It is usually `prometheus`, and that's the default added in the Autopilot release. 51 | If that is not the case in your cluster, the correct release label can be found by checking the `ServiceMonitor` of Prometheus itself, or the name of the Prometheus helm chart. 52 | Then, Autopilot's `ServiceMonitor` can be labeled with the following command 53 | 54 | ```bash 55 | kubectl label servicemonitors.monitoring.coreos.com -n autopilot autopilot-metrics-monitor release=<prometheus-release-name> 56 | ``` 57 | 58 | ### OpenShift Users 59 | 60 | **If on OpenShift**, after completing the installation, manually label the namespace to enable metrics to be scraped by Prometheus with the following command. 61 | The `ServiceMonitor` labeling is not required on OpenShift. 62 | 63 | ```bash 64 | kubectl label ns autopilot openshift.io/cluster-monitoring=true 65 | ``` 66 | 67 | ## Enabling Grafana Dashboard 68 | 69 | To deploy the autopilot Grafana dashboard, you need a Grafana instance on your cluster. For instance, Grafana and Prometheus can be installed via the `prometheus-community/kube-prometheus-stack` helm charts. 70 | 71 | The dashboard can be installed by: 72 | 73 | - Importing the `autopilot-dashboard.json` file in the Grafana web console; 74 | - Importing the dashboard id `23123` in the Grafana web console. The dashboard is published in the [Grafana dashboards](https://grafana.com/grafana/dashboards/23123-autopilot-metrics/) website under the name of Autopilot Metrics; 75 | - Applying the `GrafanaDashboard` object provided by running the following command: 76 | 77 | ```bash 78 | kubectl create -f grafana/autopilot-dashboard.yaml [-n <namespace>] 79 | ``` 80 | 81 | The dashboard has some default values, for instance `3.4Gb/s` for the PCIe bandwidth alert threshold, but each value can be customized. 82 | -------------------------------------------------------------------------------- /USAGE.md: -------------------------------------------------------------------------------- 1 | # Manually Query the Autopilot Service 2 | 3 | Autopilot provides a `/status` handler that can be queried to get the entire system status, meaning that it will run all the tests on all the nodes. Autopilot is reachable by service name `autopilot-healthchecks.autopilot.svc` in-cluster only, meaning it can be reached from a pod running in the cluster, or through port forwarding (see below). 4 | 5 | Health check names are `pciebw`, `dcgm`, `remapped`, `ping`, `iperf`, `pvc`, `gpumem`.
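From any pod inside the cluster, the service can be queried directly by its DNS name; for example (assuming the default namespace and service port used throughout this guide):

```bash
# From a pod in the cluster, no port forwarding required
curl "http://autopilot-healthchecks.autopilot.svc:3333/status?check=pciebw&host=nodename1"
```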
6 | 7 | For example, use port forwarding to localhost or expose the service 8 | 9 | ```bash 10 | kubectl port-forward service/autopilot-healthchecks 3333:3333 -n autopilot 11 | # or oc expose service autopilot-healthchecks -n autopilot in OpenShift 12 | ``` 13 | 14 | If using port forwarding, launch `curl` in another terminal 15 | 16 | ```bash 17 | curl "http://localhost:3333/status?check=pciebw&host=nodename1" 18 | ``` 19 | 20 | Alternatively, retrieve the route with `kubectl get routes autopilot-healthchecks -n autopilot`. 21 | When using routes, it is recommended to [increase the timeout](https://docs.openshift.com/container-platform/4.10/networking/routes/route-configuration.html#nw-configuring-route-timeouts_route-configuration) with the following command 22 | 23 | ```bash 24 | oc annotate route autopilot-healthchecks -n autopilot --overwrite haproxy.router.openshift.io/timeout=30m 25 | ``` 26 | 27 | Then: 28 | 29 | ```bash 30 | curl "http://<route>/status?check=pciebw&host=nodename1" 31 | ``` 32 | 33 | All tests can be tailored by a combination of: 34 | 35 | - `host=<hostname1,hostname2,...>`, to run all tests on a specific node or on a comma separated list of nodes. 36 | - `check=<healthcheck>`, to run a single test (`pciebw`, `dcgm`, `remapped`, `gpumem`, `ping`, `iperf` or `all`) or a list of comma separated tests. When no parameters are specified, only `pciebw`, `dcgm`, `remapped`, `ping` tests are run. 37 | - `job=<namespace:key=value>`, run tests on nodes running a job labeled with `key=value` in a specific namespace. 38 | - `nodelabel=<key=value>`, run tests on nodes having the `key=value` label. 39 | - `batch=<#hosts>`, how many hosts to check at a single moment. Requests to the batch are run in parallel asynchronously. Batching is done to avoid running too many requests in parallel when the number of worker nodes increases. Defaults to all nodes. 40 | 41 | Some health checks provide further customization. More details on all the tests can be found in [HEALTH_CHECKS.md](https://github.com/IBM/autopilot/blob/main/HEALTH_CHECKS.md). 42 | 43 | Note that if multiple node selection parameters (`host`, `job`, `nodelabel`) are provided together, Autopilot will run tests on nodes that match _any_ of the specified parameters (set union). For example, the following command will run the `pciebw` test on all nodes that either have the label `label1` OR are running the job `jobKey=job2` because both `nodelabel` and `job` parameters are provided in the input: 44 | 45 | ```bash 46 | curl "http://<route>/status?check=pciebw&nodelabel=label1&job=default:jobKey=job2" 47 | ``` 48 | 49 | ## DCGM 50 | 51 | This test runs `dcgmi diag`, and we support only `r` as a [parameter](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#command-line-options). 52 | 53 | The default is `1`, but you can customize it, e.g., `/status?check=dcgm&r=2`. 54 | 55 | ## Network Bandwidth Validation with IPERF 56 | 57 | As part of this workload, Autopilot will generate the Ring Workload and then start `iperf3 servers` on each interface on each Autopilot pod based on the configuration options provided by the user. Only after the `iperf3 servers` are started will Autopilot begin executing the workload by starting `iperf3 clients` based on the configuration options provided by the user. All results are logged back to the user. 58 | 59 | - For each network interface on each node, an `iperf3 server` is started. The number of `iperf3 servers` is dependent on the `number of clients` intended to be run.
For example, if the `number of clients` is `8`, then there will be `8` `iperf3 servers` started per interface, each on a unique `port`. 60 | 61 | - Invocation from the exposed Autopilot API is as follows: 62 | 63 | ```bash 64 | # Invoked via the `status` handle: 65 | curl "http://127.0.0.1:3333/status?check=iperf&workload=ring&pclients=<number-of-clients>&startport=<server-start-port>" 66 | 67 | # Invoked via the `status` with defaults (iperf clients = 8, starting server port = 5200, workload = ring): 68 | curl "http://127.0.0.1:3333/status?check=iperf" 69 | 70 | # Invoked via the `iperf` handle directly: 71 | curl "http://127.0.0.1:3333/iperf?workload=ring&pclients=<number-of-clients>&startport=<server-start-port>" 72 | 73 | # Invoked via the `iperf` handle directly (iperf clients = 8, starting server port = 5200, workload = ring): 74 | curl "http://127.0.0.1:3333/iperf" 75 | ``` 76 | 77 | ## Concrete Example 78 | 79 | In this example, we target one node, check the PCIe bandwidth, and use the port-forwarding method. 80 | In this scenario, we have a value lower than `8GB/s`, which results in an alert. This error will be exported to the OpenShift web console and to Slack, if that is enabled by admins. 81 | 82 | ```bash 83 | curl "http://127.0.0.1:3333/status?check=pciebw" 84 | ``` 85 | 86 | The output of the command above will be similar to the following (edited to save space): 87 | 88 | ```bash 89 | Checking status on all nodes 90 | Autopilot Endpoint: 10.128.6.187 91 | Node: hostname 92 | url(s): http://10.128.6.187:3333/status?host=hostname&check=pciebw 93 | Response: 94 | Checking system status of host hostname (localhost) 95 | 96 | [[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. 97 | [[ PCIEBW ]] FAIL 98 | Host hostname 99 | 12.3 12.3 12.3 12.3 5.3 12.3 12.3 12.3 100 | 101 | Node Status: PCIE Failed 102 | ------------------------------------- 103 | 104 | 105 | Autopilot Endpoint: 10.131.4.93 106 | Node: hostname2 107 | url(s): http://10.131.4.93:3333/status?host=hostname2&check=pciebw 108 | Response: 109 | Checking system status of host hostname2 (localhost) 110 | 111 | [[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. 112 | [[ PCIEBW ]] SUCCESS 113 | Host hostname2 114 | 12.1 12.0 12.3 12.3 11.9 11.5 12.1 12.1 115 | 116 | Node Status: Ok 117 | ------------------------------------- 118 | 119 | Node Summary: 120 | 121 | {'hostname': ['PCIE Failed'], 122 | 'hostname2': ['Ok']} 123 | 124 | runtime: 31.845192193984985 sec 125 | ``` 126 | -------------------------------------------------------------------------------- /alertmanager/README.md: -------------------------------------------------------------------------------- 1 | # Alerting for autopilot tests on OpenShift clusters 2 | 3 | Autopilot can issue alerts when: 4 | 5 | 1) any health check reports an issue 6 | 2) a node is labeled with `ERR` as a result of a health check 7 | 3) any of the Autopilot pods fail. 8 | 9 | This folder contains the files needed to enable the above alerts and to set up Slack notifications using Prometheus and AlertManager on OpenShift. 10 | 11 | There are 3 main steps to set it up: 12 | 13 | 1) Create `PrometheusRules` (alerting rules) 14 | 2) Create a Slack webhook application 15 | 3) Create an `AlertManager` Receiver 16 | 17 | These steps are explained in more detail below.
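Once the first step is done, you can verify that the rules were created with a standard `oc` query (the object name `autopilot-metrics` comes from the metadata in `healthchecks-alerts.yaml`):

```console
oc get prometheusrules -n openshift-monitoring autopilot-metrics
```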
18 | 19 | ## Create alerting rules for Prometheus 20 | 21 | ```console 22 | oc project openshift-monitoring 23 | oc create -f healthchecks-alerts.yaml 24 | ``` 25 | 26 | Note the following in the example below: 27 | 28 | - The `PrometheusRule` is created in the `openshift-monitoring` namespace - this is the namespace where Prometheus and Alert Manager are deployed on the OpenShift cluster. 29 | - The `alert: autopilot` label is added to match the alert with an Alert Manager receiver that we will create in the last step. This is how Prometheus knows which Alert Manager receiver to send the alert to. 30 | 31 | For example: `sum (autopilot_health_checks{health="pciebw"}<=4) by (node, deviceid, value) > 0` is the PromQL query used to count how many nodes have a GPU device with a PCIe bandwidth of 4 or less. 32 | 33 | ```yaml 34 | - alert: LowPCIeBandwidth 35 | annotations: 36 | description: | 37 | GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has a PCIE bandwidth of {{ $value }} {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 38 | summary: GPU with a PCIe bandwidth of 4 or less 39 | expr: | 40 | sum (autopilot_health_checks{health="pciebw"}<=4) by (node, deviceid, value) > 0 41 | for: 1m 42 | labels: 43 | severity: warning 44 | alert: autopilot 45 | ``` 46 | 47 | ## Observe OpenShift dashboard notifications 48 | 49 | Once you have deployed the above `PrometheusRules`, you should start seeing alerts in the OpenShift dashboard when one of the autopilot tests fails. For example, this alert below warns about low PCIe bandwidth on a GPU device on a node: 50 | ![PCIE Alert](images/pciealert.png) 51 | 52 | ## Create a Slack incoming webhook application 53 | 54 | - Create a Slack workspace using your personal Slack account 55 | - Go to https://slack.com/apps and select your workspace from the dropdown menu in the top right of the page 56 | - Click on `Get Essential Apps` and search the App Directory for `Incoming WebHooks` 57 | 58 | You should see a page like this: 59 | ![Slack Webhook](images/slack.png) 60 | 61 | If there is no existing webhook, you can create one by following the official documentation [here](https://api.slack.com/messaging/webhooks). 62 | 63 | - Click on `Add to Slack` and choose which Slack channel to post messages to from the dropdown menu or create a new channel. 64 | 65 | - Click on `Add Incoming Webhooks Integration` 66 | 67 | - Copy and paste the `Webhook URL`. We will use this when we configure the `AlertManager` Receiver in the next step.
68 | It should look something like this: 69 | 70 | ```bash 71 | https://hooks.slack.com/services/ 72 | ``` 73 | 74 | ## Create an `AlertManager` receiver using Slack through the OpenShift Web UI 75 | 76 | - Log into the OpenShift WebUI as an admin 77 | - Click on Administration -> Cluster Settings -> Configuration -> Alertmanager 78 | 79 | You should see this page: 80 | ![Alert Manager](images/alertmanager.png) 81 | 82 | Click on `Create Receiver` 83 | 84 | - Choose a Receiver name and set the Receiver type as Slack 85 | - Click on `Create` and fill out the following fields: 86 | 87 | - Paste the Slack Webhook URL you copied in the previous step into the `Slack API URL` field 88 | - Write the Slack channel name to send notifications to in the `Channel` field 89 | - Click on `Show advanced configuration` 90 | - We suggest setting the title as follows: 91 | 92 | ```console 93 | [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} 94 | Autopilot Health Check Report 95 | ``` 96 | 97 | - We suggest setting the text as follows: 98 | 99 | ```console 100 | {{ range .Alerts -}} 101 | *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} 102 | *Description:* {{ .Annotations.description }} 103 | *Details:* 104 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 105 | {{ end }} 106 | {{ end }} 107 | ``` 108 | 109 | - In the Routing Labels section, provide the label that we used in the Prometheus `AlertingRule` in the first step. 110 | - Set it to `alert=autopilot`, which is a label added in the various Prometheus Rules in `healthchecks-alerts.yaml`. 111 | This ensures that Prometheus will route the `AlertingRule` we created to this specific `AlertManager` receiver. 112 | 113 | - Click on `Save` 114 | 115 | This will generate a yaml file like `alertmanager.yaml` in this folder and will update the `AlertManager` pod configuration to add your new receiver. Now we will start receiving alerts from the Prometheus `AlertingRule` we created. Note that in `alertmanager.yaml`, there is a `critical` receiver to catch all the `critical` alerts. Some of the Autopilot alerts also have the `severity=critical` label (for instance, `dcgm level 3` or `ping`), but those will be captured by the `alert=autopilot` label anyway. 116 | 117 | You can check the status of the `AlertManager` pod with this command: 118 | 119 | ```console 120 | oc -n openshift-monitoring logs -l 'alertmanager=main' 121 | ``` 122 | 123 | That's it! Now you can get notifications in Slack every time an autopilot test fails or if any of the pods fail. If there is something else you wish to get notifications for, you simply need to create a new `PrometheusRule` with a new `expr` and label, and create a new `AlertManager` Slack receiver with a matching label. 124 | 125 | Below is an example of a Slack alert of a firing and then resolved rule.
126 | ![Slack Messages](images/slack-alert-example.png) 127 | -------------------------------------------------------------------------------- /alertmanager/alertmanager.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 5m 3 | slack_api_url: >- 4 | 5 | inhibit_rules: 6 | - equal: 7 | - namespace 8 | - alertname 9 | source_matchers: 10 | - severity = critical 11 | - alert = slack 12 | target_matchers: 13 | - severity =~ warning|info 14 | - equal: 15 | - namespace 16 | - alertname 17 | source_matchers: 18 | - severity = warning 19 | target_matchers: 20 | - severity = info 21 | - equal: 22 | - namespace 23 | source_matchers: 24 | - alertname = InfoInhibitor 25 | target_matchers: 26 | - severity = info 27 | receivers: 28 | - name: Autopilot 29 | slack_configs: 30 | - channel: 31 | send_resolved: true 32 | text: |- 33 | {{ range .Alerts -}} 34 | *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} 35 | *Description:* {{ .Annotations.description }} 36 | *Details:* 37 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 38 | {{ end }} 39 | {{ end }} 40 | title: >- 41 | [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing 42 | | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ 43 | .CommonLabels.job }} 44 | Autopilot Health Check Report 45 | - name: Critical 46 | slack_configs: 47 | - channel: 48 | link_names: true 49 | send_resolved: true 50 | text: |- 51 | {{ range .Alerts -}} 52 | *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} 53 | *Description:* {{ .Annotations.description }} 54 | *Details:* 55 | {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` 56 | {{ end }} 57 | {{ end }} 58 | title: >- 59 | [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing 60 | | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ 61 | .CommonLabels.job }} 62 | - name: Default 63 | - name: 'null' 64 | - name: Watchdog 65 | route: 66 | group_by: 67 | - namespace 68 | group_interval: 5m 69 | group_wait: 30s 70 | receiver: Default 71 | repeat_interval: 12h 72 | routes: 73 | - matchers: 74 | - alertname = Watchdog 75 | receiver: Watchdog 76 | - matchers: 77 | - alertname = InfoInhibitor 78 | receiver: 'null' 79 | - receiver: Critical 80 | matchers: 81 | - severity = critical 82 | - receiver: Autopilot 83 | matchers: 84 | - alert = autopilot 85 | -------------------------------------------------------------------------------- /alertmanager/alerts/healthchecks-alerts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | name: autopilot-metrics 5 | namespace: openshift-monitoring 6 | labels: 7 | app: autopilot 8 | spec: 9 | groups: 10 | - name: Alerts on GPU related issues 11 | rules: 12 | - alert: AutopilotLowPCIeBandwidth 13 | annotations: 14 | description: | 15 | GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has a PCIE bandwidth of {{ $value }}{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 
16 | summary: GPU with a PCIe bandwidth of 4 or less 17 | expr: | 18 | sum (autopilot_health_checks{health="pciebw"}<=4) by (node, deviceid, value) > 0 19 | for: 1m 20 | labels: 21 | severity: warning 22 | alert: autopilot 23 | - alert: AutopilotDCGMErrors 24 | annotations: 25 | description: | 26 | GPUs on node {{ $labels.node }} have DCGM failures{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 27 | summary: GPUs have DCGM failures 28 | expr: | 29 | sum (autopilot_health_checks{health="dcgm"}==1) by (node) 30 | for: 1m 31 | labels: 32 | severity: warning 33 | alert: autopilot 34 | - alert: AutopilotGPUPowerSlowdownEnabled 35 | annotations: 36 | description: | 37 | GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has power slowdown enabled 38 | summary: A GPU has power slowdown enabled{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 39 | expr: | 40 | sum (autopilot_health_checks{health="power-slowdown"}==1) by (node, deviceid) 41 | for: 1m 42 | labels: 43 | severity: warning 44 | alert: autopilot 45 | - alert: AutopilotRemappedRowsActive 46 | annotations: 47 | description: | 48 | GPU device {{ $labels.deviceid}} on node {{ $labels.node }} with incorrect remapped rows in memory{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 49 | summary: A GPU device has incorrect remapped rows 50 | expr: | 51 | sum (autopilot_health_checks{health="remapped"}==1) by (node, deviceid) 52 | for: 1m 53 | labels: 54 | severity: warning 55 | alert: autopilot 56 | - alert: AutopilotDCGMLevel3Errors 57 | annotations: 58 | description: | 59 | A node reported errors after running DCGM level 3 - check health of nodes{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 60 | summary: Node {{ $labels.node }} has GPU errors 61 | expr: | 62 | kube_node_labels{label_autopilot_ibm_com_dcgm_level_3=~".*ERR.*"} and kube_node_labels{label_autopilot_ibm_com_dcgm_level_3!~""} 63 | for: 5m 64 | labels: 65 | severity: critical 66 | alert: autopilot 67 | - name: Alerts on network related issues 68 | rules: 69 | - alert: AutopilotPingFailures 70 | annotations: 71 | description: | 72 | Node {{ $labels.node }} cannot reach IP {{ $labels.deviceid }}{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 73 | summary: Node has unreachable IPs 74 | expr: | 75 | sum (autopilot_health_checks{health="ping"} > 0) by (deviceid) 76 | for: 10m 77 | labels: 78 | severity: critical 79 | alert: autopilot 80 | - name: Alerts on PVC related issues 81 | rules: 82 | - alert: AutopilotPVCAlert 83 | annotations: 84 | description: | 85 | PVC creation by Autopilot on node {{ $labels.node }} failed{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 
86 | summary: PVC cannot be created 87 | expr: | 88 | sum (autopilot_health_checks{health="pvc"}==1) by (node) 89 | for: 5m 90 | labels: 91 | severity: critical 92 | alert: autopilot 93 | - name: Generic alert on periodic check failure 94 | rules: 95 | - alert: AutopilotGPUNodeHealth 96 | annotations: 97 | description: | 98 | Node {{ $labels.node }} reported errors after running Autopilot's periodic health checks{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 99 | summary: Node {{ $labels.node }} has errors 100 | expr: | 101 | kube_node_labels{label_autopilot_ibm_com_gpuhealth=~".*ERR.*"} and kube_node_labels{label_autopilot_ibm_com_gpuhealth!~""} 102 | for: 1m 103 | labels: 104 | severity: warning 105 | alert: autopilot 106 | - name: Alerts on Autopilot pods not ready 107 | rules: 108 | - alert: AutopilotPodsNotReady 109 | annotations: 110 | description: Autopilot pod on node {{ $labels.node }} is not ready{{ with $console_url := "console_url" | query }}{{ if ne 111 | (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url 112 | ) }}{{ end }}{{ end }}. 113 | summary: Autopilot pod on node {{ $labels.node }} is not ready 114 | expr: count by (namespace) (kube_pod_info and on (pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"autopilot.*"} > 0 or kube_pod_container_status_terminated_reason{reason=~"Error", namespace=~"autopilot.*"} > 0)) 115 | for: 15m 116 | labels: 117 | severity: critical 118 | alert: autopilot -------------------------------------------------------------------------------- /alertmanager/images/alertmanager.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/alertmanager.png -------------------------------------------------------------------------------- /alertmanager/images/create-receiver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/create-receiver.png -------------------------------------------------------------------------------- /alertmanager/images/pciealert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/pciealert.png -------------------------------------------------------------------------------- /alertmanager/images/slack-alert-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/slack-alert-example.png -------------------------------------------------------------------------------- /alertmanager/images/slack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/alertmanager/images/slack.png -------------------------------------------------------------------------------- /autopilot-daemon/Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel as cudabuild 2 | FROM 
pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as cudabuild 3 | 4 | RUN apt -y update && apt -y upgrade && apt -y clean && apt -y autoremove \ 5 | && apt install -y --no-install-recommends build-essential git wget openssh-server && \ 6 | apt -y clean && apt -y autoremove 7 | 8 | RUN git clone --depth 1 --branch v12.4.1 https://github.com/NVIDIA/cuda-samples.git 9 | WORKDIR cuda-samples/Samples/1_Utilities/bandwidthTest 10 | 11 | RUN make SMS="80 86 90" 12 | 13 | WORKDIR /workspace 14 | 15 | COPY gpu-mem/gpucheck.cu . 16 | 17 | RUN nvcc -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 gpucheck.cu -o gpucheck -lcublas --linker-options -lnvidia-ml -O3 18 | 19 | FROM golang:1.21 AS gobuild 20 | 21 | ENV GOOS=linux 22 | ENV GOARCH=amd64 23 | ENV CGO_ENABLED=0 24 | 25 | WORKDIR /workspace 26 | COPY . /workspace/ 27 | 28 | RUN go build -o bin/autopilot pkg/cmd/main.go 29 | 30 | ####################### Final Image 31 | 32 | # FROM python:3.9.15-slim 33 | FROM nvidia/cuda:12.1.1-runtime-ubuntu20.04 34 | RUN apt -y update && apt -y upgrade && DEBIAN_FRONTEND="noninteractive" TZ="America/New_York" apt install -y --no-install-recommends \ 35 | build-essential iperf3 iputils-ping \ 36 | python3 \ 37 | pip \ 38 | pciutils \ 39 | wget \ 40 | net-tools \ 41 | software-properties-common \ 42 | git \ 43 | && apt -y clean && apt -y autoremove 44 | 45 | RUN add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && apt -y update && apt install -y datacenter-gpu-manager 46 | # add ca-certificates (Alpine commands, previous base image) 47 | # RUN apk update && apk --no-cache add ca-certificates 48 | # RUN adduser -s /bin/bash -D -h /home/autopilot autopilot -G root 49 | 50 | # RDMA ping utils 51 | # RUN apt -y update && apt -y upgrade && apt -y install build-essential automake autoconf libtool libibverbs-dev librdmacm-dev libibumad-dev pciutils libpci-dev 52 | # RUN git clone https://github.com/linux-rdma/perftest.git && \ 53 | # cd perftest && \ 54 | # ./autogen.sh && ./configure && \ 55 | # make && make install 56 | # Add capabilities for ping 57 | RUN setcap cap_net_raw,cap_net_admin+p /bin/ping 58 | 59 | RUN useradd -ms /bin/bash autopilot && usermod -g root autopilot 60 | 61 | # set working directory 62 | WORKDIR /home/autopilot 63 | 64 | COPY --from=gobuild /workspace/bin/autopilot /usr/local/bin/autopilot 65 | 66 | # PCIe tests files 67 | COPY --from=cudabuild /workspace/cuda-samples/Samples/1_Utilities/bandwidthTest/bandwidthTest /home/autopilot/gpu-bw/bandwidthTest 68 | COPY gpu-bw/gpuLocalBandwidthTest.sh /home/autopilot/gpu-bw/gpuLocalBandwidthTest.sh 69 | COPY gpu-bw/entrypoint.py /home/autopilot/gpu-bw/entrypoint.py 70 | 71 | # DGEMM DAXPY test files 72 | 73 | COPY --from=cudabuild /workspace/gpucheck /home/autopilot/gpu-mem/gpucheck 74 | COPY gpu-mem/entrypoint.py /home/autopilot/gpu-mem/entrypoint.py 75 | 76 | 77 | # Network tests files 78 | # COPY network/metrics-entrypoint.py /home/autopilot/network/metrics-entrypoint.py 79 | COPY network/ping-entrypoint.py /home/autopilot/network/ping-entrypoint.py 80 | COPY network/iperf3_entrypoint.py /home/autopilot/network/iperf3_entrypoint.py 81 | COPY network/iperf3_utils.py /home/autopilot/network/iperf3_utils.py 82 | COPY network/network_workload.py /home/autopilot/network/network_workload.py 83 | COPY network/iperf3_start_servers.py /home/autopilot/network/iperf3_start_servers.py 84 | COPY network/iperf3_stop_servers.py 
/home/autopilot/network/iperf3_stop_servers.py 85 | COPY network/iperf3_start_clients.py /home/autopilot/network/iperf3_start_clients.py 86 | 87 | # Remapped Rows test files 88 | COPY gpu-remapped/entrypoint.py /home/autopilot/gpu-remapped/entrypoint.py 89 | COPY gpu-remapped/remapped-rows.sh /home/autopilot/gpu-remapped/remapped-rows.sh 90 | 91 | COPY utils /home/autopilot/utils 92 | 93 | # DGCM test files and dependencies 94 | COPY gpu-dcgm/entrypoint.py /home/autopilot/gpu-dcgm/entrypoint.py 95 | # RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/datacenter-gpu-manager_3.1.8_amd64.deb && dpkg --install datacenter-gpu-manager_3.1.8_amd64.deb 96 | 97 | # GPU Power cap 98 | COPY gpu-power/power-throttle.sh /home/autopilot/gpu-power/power-throttle.sh 99 | 100 | # Last touches 101 | RUN pip install --upgrade pip && pip install kubernetes netifaces aiohttp[speedups] 102 | RUN apt -y update && apt install -y vim curl && apt -y clean && apt -y autoremove 103 | RUN chmod 755 /usr/local/bin/autopilot && chown -hR autopilot /home/autopilot && chmod -R g=u /home/autopilot 104 | RUN chmod 777 /tmp 105 | 106 | 107 | 108 | CMD ["/usr/local/bin/autopilot"] 109 | -------------------------------------------------------------------------------- /autopilot-daemon/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/IBM/autopilot 2 | 3 | go 1.21 4 | 5 | toolchain go1.21.1 6 | 7 | require ( 8 | github.com/prometheus/client_golang v1.15.0 9 | github.com/thanhpk/randstr v1.0.6 10 | k8s.io/api v0.29.2 11 | k8s.io/apimachinery v0.29.2 12 | k8s.io/client-go v0.29.2 13 | k8s.io/klog/v2 v2.110.1 14 | k8s.io/kubectl v0.29.2 15 | ) 16 | 17 | require ( 18 | github.com/beorn7/perks v1.0.1 // indirect 19 | github.com/cespare/xxhash/v2 v2.2.0 // indirect 20 | github.com/davecgh/go-spew v1.1.1 // indirect 21 | github.com/emicklei/go-restful/v3 v3.11.0 // indirect 22 | github.com/go-logr/logr v1.3.0 // indirect 23 | github.com/go-openapi/jsonpointer v0.19.6 // indirect 24 | github.com/go-openapi/jsonreference v0.20.2 // indirect 25 | github.com/go-openapi/swag v0.22.3 // indirect 26 | github.com/gogo/protobuf v1.3.2 // indirect 27 | github.com/golang/protobuf v1.5.3 // indirect 28 | github.com/google/gnostic-models v0.6.8 // indirect 29 | github.com/google/go-cmp v0.6.0 // indirect 30 | github.com/google/gofuzz v1.2.0 // indirect 31 | github.com/google/uuid v1.3.0 // indirect 32 | github.com/josharian/intern v1.0.0 // indirect 33 | github.com/json-iterator/go v1.1.12 // indirect 34 | github.com/mailru/easyjson v0.7.7 // indirect 35 | github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect 36 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 37 | github.com/modern-go/reflect2 v1.0.2 // indirect 38 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 39 | github.com/prometheus/client_model v0.3.0 // indirect 40 | github.com/prometheus/common v0.42.0 // indirect 41 | github.com/prometheus/procfs v0.9.0 // indirect 42 | golang.org/x/net v0.23.0 // indirect 43 | golang.org/x/oauth2 v0.10.0 // indirect 44 | golang.org/x/sys v0.18.0 // indirect 45 | golang.org/x/term v0.18.0 // indirect 46 | golang.org/x/text v0.14.0 // indirect 47 | golang.org/x/time v0.3.0 // indirect 48 | google.golang.org/appengine v1.6.7 // indirect 49 | google.golang.org/protobuf v1.33.0 // indirect 50 | gopkg.in/inf.v0 v0.9.1 // indirect 51 | gopkg.in/yaml.v2 v2.4.0 // indirect 52 | 
gopkg.in/yaml.v3 v3.0.1 // indirect 53 | k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect 54 | k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect 55 | sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect 56 | sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect 57 | sigs.k8s.io/yaml v1.3.0 // indirect 58 | ) 59 | -------------------------------------------------------------------------------- /autopilot-daemon/gpu-bw/entrypoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | def main(): 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-t', '--threshold', type=str, default='4') 9 | args = parser.parse_args() 10 | output = os.popen('bash ./utils/briefings.sh') 11 | result = output.read() 12 | # print(result) 13 | 14 | if "ABORT" not in result: 15 | print("[[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation.") 16 | output = os.popen('./gpu-bw/gpuLocalBandwidthTest.sh -t ' + args.threshold) 17 | result = output.read() 18 | 19 | if "ABORT" in result or "SKIP" in result: 20 | print("[[ PCIEBW ]] ABORT") 21 | print(result) 22 | exit() 23 | 24 | print("SUCCESS") 25 | print("Host ", os.getenv("NODE_NAME")) 26 | splitres = result.split("\n") 27 | bws = "" 28 | for line in splitres: 29 | if "Bandwidth =" in line: 30 | x = line.split("= ", 2) 31 | y = x[1].split(" GB/s") 32 | bws += y[0] + " " 33 | print(bws.strip()) 34 | else: 35 | print("[[ PCIEBW ]] ABORT") 36 | print(result) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() -------------------------------------------------------------------------------- /autopilot-daemon/gpu-bw/gpuLocalBandwidthTest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # MH: 4 | # This file is supposed to be used for GPU instance in LLM cluster with PXB topology. Test the localhost only 5 | # This version can detect more than 8 GPUs but may not correctly work on systems with more than 8 GPUs 6 | # 7 | # Requirement: pre-compiled bandwidthTest from cuda_samples on instances. 8 | # 9 | # Usage: 10 | # A. Update PROG and FN in the script. Threshold T is set to 7 by default 11 | # B. run gpuLocalBandwidthTest.sh 12 | # C. Check the last line of output: SUCCESS or FAIL 13 | # 14 | # Note: some parameters are hard coded. You may want to change them for different environment. 15 | # 16 | # Find me at minghungchen@ibm.com if any questions 17 | # 18 | # Ver. 1.3 19 | 20 | PROG="/home/autopilot/gpu-bw/bandwidthTest" 21 | 22 | 23 | while getopts t:f: flag 24 | do 25 | case "${flag}" in 26 | t) T=${OPTARG};; 27 | esac 28 | done 29 | echo "Threshold: $T"; 30 | 31 | RES=$(ls -d /dev/nvidia* 2>1) 32 | numre='^[0-9]+$' 33 | D=-1 34 | for d in $RES; do 35 | d=${d#*"nvidia"*} 36 | if [[ "$d" =~ $numre ]]; then 37 | D=0 38 | break 39 | fi 40 | done 41 | if [[ $D -eq 0 ]]; then 42 | echo -n "Detected NVIDIA GPU: " 43 | for d in $RES; do 44 | d=${d#*"nvidia"*} 45 | if [[ "$d" =~ $numre ]]; then 46 | echo -n "$d " 47 | D=$((D+1)) 48 | fi 49 | done 50 | echo "Total: $D" 51 | else 52 | echo "No NVIDIA GPU detected. Skipping the bandwidth test." 
53 | echo "SKIP" 54 | exit 0 55 | fi 56 | 57 | D=$((D-1)) 58 | for i in $(seq 0 1 $D) ; do 59 | EXEC+="$($PROG --htod --memory=pinned --device=$i --csv 2>&1)" 60 | EXEC+="\n" 61 | done 62 | errors="$(echo ${EXEC} | grep -i '802\|error')" 63 | if [[ -n $errors ]]; then 64 | echo "CRITICAL ERROR WITH GPUs" 65 | echo "ABORT" 66 | echo -e $EXEC 67 | else 68 | echo -e $EXEC 69 | echo "SUCCESS" 70 | fi -------------------------------------------------------------------------------- /autopilot-daemon/gpu-dcgm/entrypoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | import os 4 | import argparse 5 | import re 6 | import datetime 7 | from kubernetes import client, config 8 | from kubernetes.client.rest import ApiException 9 | 10 | config.load_incluster_config() 11 | v1 = client.CoreV1Api() 12 | nodename = os.getenv("NODE_NAME") 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-r', '--run', type=str, default='1') 16 | parser.add_argument('-l', '--label_node', action='store_true') 17 | parser.add_argument('-v', '--verbose', action='store_true') 18 | args = parser.parse_args() 19 | 20 | def main(): 21 | output = os.popen('bash ./utils/briefings.sh') 22 | result = output.read() 23 | print(result) 24 | 25 | if "ABORT" not in result: 26 | print("[[ DCGM ]] Briefings completed. Continue with dcgm evaluation.") 27 | command = ['dcgmi', 'diag', '-j', '-r', args.run] 28 | try_dcgm(command,args.run) 29 | else: 30 | print("[[ DCGM ]] ABORT") 31 | print(result) 32 | 33 | 34 | # translate key-strings into lowercase and strip spaces 35 | def unify_string_format(key: str) -> str: 36 | to_lower = key.strip().lower() 37 | res, _ = re.subn('[\/|\s]', '_', to_lower) 38 | return res 39 | 40 | def parse_all_results(result: str): 41 | dcgm_dict = json.loads(result) 42 | tests_dict = dcgm_dict['DCGM GPU Diagnostic']['test_categories'] 43 | success = True 44 | output = "" 45 | for category in tests_dict: 46 | for test in category['tests']: 47 | test_failing=False 48 | for result in test['results']: 49 | if result['status'] == 'Fail': 50 | success = False 51 | if test_failing is False: 52 | output += f'{unify_string_format(test["name"])}' 53 | test_failing = True 54 | output += f'{"." + str(result["gpu_id"]) if "gpu_id" in result else "NoGPUid"}' 55 | return success, output 56 | 57 | 58 | # parsing the json result string based on a comma-separated list of paths (levels separated by '.') 59 | def parse_selected_results(result: str, testpaths: str): 60 | ''' 61 | follow the list of selected paths down the dcgm json tree 62 | 63 | the specification of the paths: .. 64 | 65 | to walk down this example json snippet below your path should be: 66 | 67 | 'DCGM GPU Diagnostic.Hardware.GPU Memory' 68 | 69 | for the search, all strings are turned to lowercase and spaces are replaced with '_' 70 | therefore the following path would achieve the same: 71 | 72 | 'dcgm_gpu_diagnostic.HARDWare.gpu Memory' 73 | 74 | "DCGM GPU Diagnostic" : { 75 | "test_categories" : [ { 76 | ... 77 | "category" : "Hardware", 78 | "tests" : [ { 79 | "name" : "GPU Memory", 80 | "results" : [ { 81 | "gpu_id" : "0", 82 | "status" : "Fail", 83 | ... 
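as a concrete illustration, exporting AUTOPILOT_DCGM_RESULT_PATHS='dcgm_gpu_diagnostic.hardware.gpu_memory' would select exactly the snippet above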
84 | 85 | 86 | The paths need to be specified in env variable AUTOPILOT_DCGM_RESULT_PATHS as a comma-separated list 87 | If the variable is not set, then the regular scan is performed 88 | ''' 89 | _dcgm_json_levels = [ 90 | ("top_level","dcgm_gpu_diagnostic"), 91 | ("category","tests"), 92 | ("name","results") 93 | ] 94 | 95 | 96 | # scan the dictionary and recursively transform all keys using key_update 97 | def normalize_json_keys(data) -> dict: 98 | ndata = {} 99 | if not isinstance(data, dict) and not isinstance(data, list): 100 | return data 101 | for key,val in data.items(): 102 | key_n = unify_string_format(key) 103 | 104 | if isinstance(val, dict): 105 | val_n = normalize_json_keys(data[key]) 106 | elif isinstance(val, list): 107 | val_n = [ normalize_json_keys(v) for v in val ] 108 | else: 109 | val_n = data[key] 110 | 111 | ndata[ key_n ] = val_n 112 | 113 | # unfortunately, the top level of dcgm dict is structured differently from the rest, 114 | # adjusting by inserting/moving it's sub-dict into top-level and rename 115 | if _dcgm_json_levels[0][1] in ndata: 116 | ndata[_dcgm_json_levels[0][0]] = _dcgm_json_levels[0][1] # replace old dcgm_gpu_diagnostics with 'top_level' as a name 117 | ndata[_dcgm_json_levels[0][1]] = ndata[_dcgm_json_levels[0][1]].pop("test_categories") # move test_categories entry to new 'top_level' 118 | return ndata 119 | 120 | 121 | # recursively dive into the json tree by following a given path 122 | def dive_to_test(data, jpath: list[str], depth: int): 123 | assert( 3-len(jpath) == depth ) 124 | assert( depth < 3 ) 125 | 126 | jlevel_spec = _dcgm_json_levels[depth] 127 | 128 | if not isinstance(data, list): 129 | data = [data] 130 | for entry in data: 131 | if jlevel_spec[0] in entry and jpath[0] == unify_string_format( entry[jlevel_spec[0]] ): 132 | if depth == 2: 133 | return entry[ jlevel_spec[1] ] 134 | else: 135 | return dive_to_test( entry[ jlevel_spec[1] ], jpath[1:], depth+1 ) 136 | return 137 | 138 | # browses the result section of a single test and extracts info 139 | def parse_single_test_result(data) -> tuple[bool, str]: 140 | if not data: 141 | return False, "No Data" 142 | if not isinstance(data, list): 143 | data = [data] 144 | 145 | success = True 146 | output = [] 147 | for entry in data: 148 | if "status" in entry: 149 | good = (unify_string_format(entry['status']) == 'pass') 150 | success &= good 151 | if not good: 152 | output.append( ( 153 | entry["gpu_id"] if "gpu_id" in entry else "NoGPU_ID", 154 | entry["info"] if "info" in entry else "NoInfo" 155 | )) 156 | else: 157 | success &= False 158 | output.append( ("No Status") ) 159 | return success,output 160 | 161 | # create output from the parsed results (can be adjusted to whatever) 162 | def build_output(output_list: tuple[str, str]) -> str: 163 | print(output_list) 164 | output = "" 165 | for test,result in output_list: 166 | if len(output): 167 | output += ";" 168 | output += f'{unify_string_format(test)}:' 169 | for result_data in result: 170 | for r in result_data: 171 | output += f'{unify_string_format(r)},' 172 | return output 173 | 174 | jdata = json.load(result) 175 | norm_d = normalize_json_keys(jdata) 176 | 177 | result_list = [] 178 | overall_success = True 179 | for path in testpaths.split(','): 180 | single_test_result = dive_to_test( norm_d, [ unify_string_format(p) for p in path.split('.') ], 0 ) 181 | test_success,output = parse_single_test_result(single_test_result) 182 | overall_success &= test_success 183 | if not test_success: 184 | result_list.append( (path, 
output) ) 185 | return overall_success, build_output(result_list) 186 | 187 | 188 | 189 | def try_dcgm(command,run_level): 190 | result = subprocess.run(command, text=True, capture_output=True) 191 | return_code = result.returncode # 0 for success 192 | if return_code != 0: 193 | print("[[ DCGM ]] DCGM process terminated with errors. Other processes might be running on GPUs. ABORT") 194 | command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv'] 195 | try: 196 | proc = subprocess.run(command, check=True, text=True, capture_output=True) 197 | except subprocess.CalledProcessError: 198 | print("[[ DCGM ]] nvidia-smi check terminated with errors. ABORT") 199 | exit() 200 | if proc.stdout: 201 | print("[[ DCGM ]] GPUs currently utilized:\n", proc.stdout) 202 | 203 | if result.stderr: 204 | print(result.stderr) 205 | print("[[ DCGM ]] exited with error: " + result.stderr + " ERR") 206 | else: 207 | testpaths = os.getenv("AUTOPILOT_DCGM_RESULT_PATHS") 208 | if args.verbose: 209 | print(result.stdout) 210 | if testpaths == None: 211 | success, output = parse_all_results(result.stdout) 212 | if success: 213 | print("[[ DCGM ]] SUCCESS") 214 | else: 215 | print("Host", nodename) 216 | print("[[ DCGM ]] FAIL") 217 | if args.label_node: 218 | patch_node(success, output,run_level) 219 | 220 | 221 | def patch_node(success, output,run_level): 222 | now = datetime.datetime.now(datetime.timezone.utc) 223 | timestamp = now.strftime("%Y-%m-%d_%H.%M.%SUTC") 224 | result = "" 225 | general_health = "PASS" 226 | try: 227 | k8s_node = v1.read_node(nodename) 228 | except ApiException as e: 229 | print("Exception when calling corev1api->read_node: %s\n" % e) 230 | exit() 231 | 232 | node_labels = k8s_node.metadata.labels 233 | if os.getenv("DCGM_FATAL_ERRORS") == "": 234 | # Only fatal errors should produce an EVICT label. Based on https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3 235 | dcgm_fatal_errors = ['PCIe','NVLink','ECC','GPU Memory'] 236 | else: 237 | dcgm_fatal_errors = os.getenv("DCGM_FATAL_ERRORS") 238 | 239 | if success and node_labels["autopilot.ibm.com/gpuhealth"] in ["PASS", "TESTING"]: 240 | # If there is some other warning coming from other tests, i.e., ping or storage, we would overwrite this information. Let's play it safe at this point. 241 | result = "PASS_"+timestamp 242 | elif not success: 243 | result = "ERR_"+timestamp 244 | general_health = "WARN" 245 | for error in dcgm_fatal_errors: 246 | unified = unify_string_format(error) 247 | if unified in output: 248 | general_health = "EVICT" 249 | 250 | label = { 251 | "metadata": { 252 | "labels": { 253 | f"autopilot.ibm.com/dcgm.level.{run_level}": result, 254 | "autopilot.ibm.com/gpuhealth": general_health}, 255 | "annotations": { 256 | f"autopilot.ibm.com/dcgm.level.{run_level}.output": str(output) 257 | } 258 | } 259 | } 260 | try: 261 | api_response = v1.patch_node(nodename, label) 262 | except ApiException as e: 263 | print("Exception when calling corev1api->patch_node: %s\n" % e) 264 | exit() 265 | 266 | if __name__ == '__main__': 267 | main() 268 | -------------------------------------------------------------------------------- /autopilot-daemon/gpu-mem/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def main(): 4 | output = os.popen('bash ./utils/briefings.sh') 5 | result = output.read() 6 | 7 | if "ABORT" not in result: 8 | print("[[ GPU-MEM ]] Briefings completed. 
Continue with memory evaluation.") 9 | output = os.popen('./gpu-mem/gpucheck') 10 | result = output.read() 11 | if "NONE" in result: 12 | print("[[ GPU-MEM ]] Health Check successful") 13 | exit() 14 | 15 | print("[[ GPU-MEM ]] Health Check unsuccessful. FAIL.") 16 | print(result) 17 | exit() 18 | 19 | if __name__ == '__main__': 20 | main() -------------------------------------------------------------------------------- /autopilot-daemon/gpu-mem/gpucheck.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define MAX_BLOCKS 512 11 | #define THREADS_PER_BLOCK 256 12 | #define btoa(x) ((x)?"true":"false") 13 | 14 | double cuda_dgemm(const char *, const char *, int *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); 15 | void cuda_dgemm_free(); 16 | 17 | #define CUDA_RC(rc) if( (rc) != cudaSuccess ) \ 18 | {printf("Error %s at %s line %d\n", cudaGetErrorString(cudaGetLastError()), __FILE__,__LINE__); exit(1);} 19 | 20 | #define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \ 21 | {printf("Error %s at %s line %d\n", cudaGetErrorString(cudaGetLastError()), __FILE__,__LINE__-1); exit(1);} 22 | 23 | double walltime(void); 24 | 25 | __global__ void daxpy(const double alpha, const double * x, double * y, int npts) 26 | { 27 | for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < npts; i += blockDim.x * gridDim.x) y[i] = alpha*x[i] + y[i]; 28 | } 29 | 30 | static nvmlDevice_t nvmldevice; 31 | static unsigned int temperature, power, smMHz; 32 | 33 | int main(int argc, char * argv[]) 34 | { 35 | int i, d, npts, iter, maxiter, mydevice, numDevices; 36 | double * __restrict__ x, * __restrict__ y; 37 | double * dev_x, * dev_y; 38 | double * Amat, * Bmat, * Cmat; 39 | int m, n, k, lda, ldb, ldc; 40 | double alpha, beta; 41 | double BW_pinned_h2d, BW_pageable_h2d, BW_pinned_d2h, BW_pageable_d2h, BW_daxpy, TFlops; 42 | double time1, time2; 43 | cudaDeviceProp prop; 44 | double * metrics; 45 | nvmlDevice_t *device; 46 | unsigned int device_count; 47 | 48 | npts = 1024*1024*(1024/8); 49 | 50 | 51 | // initialize nvml 52 | if (NVML_SUCCESS != nvmlInit()) { 53 | fprintf(stderr, "failed to initialize NVML ... exiting\n"); 54 | } 55 | 56 | if (NVML_SUCCESS != nvmlDeviceGetCount(&device_count)) { 57 | fprintf(stderr, "nvmlDeviceGetCount failed ... exiting\n"); 58 | } 59 | 60 | device = (nvmlDevice_t *) malloc(device_count*sizeof(nvmlDevice_t)); 61 | 62 | for (i = 0; i < device_count; i++) { 63 | if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &device[i])) { 64 | fprintf(stderr, "nvmlDeviceGetHandleByIndex failed ... 
exiting\n"); 65 | } 66 | } 67 | 68 | // set matrix dimensions large enough to reach close to peak Flops 69 | m = 8192; n = 8192; k = 8192; 70 | Amat = (double *) malloc(m*k*sizeof(double)); 71 | Bmat = (double *) malloc(k*n*sizeof(double)); 72 | Cmat = (double *) malloc(m*n*sizeof(double)); 73 | 74 | #pragma omp parallel for 75 | for (i=0; i<(m*k); i++) Amat[i] = 1.2e-2*((double) (i%100)); 76 | #pragma omp parallel for 77 | for (i=0; i<(k*n); i++) Bmat[i] = 1.5e-3*((double) ((i + 100)%1000)); 78 | #pragma omp parallel for 79 | for (i=0; i<(m*n); i++) Cmat[i] = 1.5e-3*((double) ((i + 500)%1000)); 80 | 81 | CUDA_RC(cudaGetDeviceCount(&numDevices)); 82 | 83 | 84 | metrics = (double *) malloc(numDevices*9*sizeof(double)); 85 | y = (double *) malloc(npts*sizeof(double)); 86 | 87 | bool* faulty = (bool*) malloc(numDevices*sizeof(bool)); 88 | for (i = 0; i < numDevices; ++i) 89 | faulty[i] = false; 90 | 91 | 92 | for (d = 0; d < numDevices; d++) { 93 | mydevice = d; /*local_rank % numDevices;*/ 94 | 95 | // assign nvmldevice to this rank's GPU 96 | nvmldevice = device[mydevice]; 97 | 98 | CUDA_RC(cudaSetDevice(mydevice)); 99 | CUDA_RC(cudaGetDeviceProperties(&prop, mydevice)); 100 | 101 | // use pinned memory for x, pageable memory for y 102 | CUDA_RC(cudaMallocHost((void **)&x, npts*sizeof(double))); 103 | // y = (double *) malloc(npts*sizeof(double)); 104 | 105 | CUDA_RC(cudaMalloc((void **)&dev_x, npts*sizeof(double))); 106 | CUDA_RC(cudaMalloc((void **)&dev_y, npts*sizeof(double))); 107 | 108 | #pragma omp parallel for 109 | for (i=0; i MAX_BLOCKS) numBlocks = MAX_BLOCKS; 157 | 158 | time1 = walltime(); 159 | for (iter=0; iter>>(alpha, dev_x, dev_y, npts); 161 | CUDA_CHECK(); 162 | } 163 | CUDA_RC(cudaDeviceSynchronize()); 164 | time2 = walltime(); 165 | 166 | BW_daxpy = 3.0*8.0e-9*((double) npts)*((double) maxiter)/(time2 - time1); 167 | metrics[9*d+4] = BW_daxpy; 168 | if(BW_daxpy < 1300) 169 | faulty[d] = true; 170 | 171 | // free(y); 172 | CUDA_RC(cudaFreeHost(x)); 173 | CUDA_RC(cudaFree(dev_x)); 174 | CUDA_RC(cudaFree(dev_y)); 175 | 176 | beta = 0.0; lda = m; ldb = k; ldc = m; 177 | TFlops = cuda_dgemm("N", "N", &m, &n, &k, &alpha, Amat, &lda, Bmat, &ldb, &beta, Cmat, &ldc); 178 | cuda_dgemm_free(); 179 | metrics[9*d+5] = TFlops; 180 | if(TFlops < 16) 181 | faulty[d] = true; 182 | 183 | metrics[9*d+6] = (double) temperature; 184 | metrics[9*d+7] = 1.0e-3*((double) power); // convert to Watts 185 | metrics[9*d+8] = (double) smMHz; 186 | } 187 | printf(" GPU H2D(p) H2D D2H(p) D2H daxpy dgemm temp power smMHz\n"); 188 | for (d = 0; d < numDevices; d++) { 189 | printf("%3d %6.2lf %6.2lf %6.2lf %6.2lf %7.2lf %6.2lf %6.0lf %8.0lf %8.0lf\n", 190 | d, metrics[9*d], metrics[9*d+1], metrics[9*d+2], metrics[9*d+3], metrics[9*d+4], metrics[9*d+5], metrics[9*d+6], metrics[9*d+7], metrics[9*d+8]); 191 | } 192 | printf("Summary of GPU errors:"); 193 | bool allgood = true; 194 | for (d = 0; d < numDevices; d++) { 195 | if (faulty[d]) { 196 | allgood = false; 197 | printf("GPU %d -- H2D(p): %f; daxpy: %f; dgemm: %f", d, metrics[9*d+0], metrics[9*d+4], metrics[9*d+5]); 198 | } 199 | } 200 | if (allgood) { 201 | printf(" NONE "); 202 | } 203 | free(y); 204 | free(metrics); 205 | free(faulty); 206 | return 0; 207 | } 208 | 209 | double walltime(void) 210 | { 211 | double elapsed; 212 | struct timeval tv; 213 | gettimeofday(&tv,NULL); 214 | elapsed = ((double) tv.tv_sec) + 1.0e-6*((double) tv.tv_usec); 215 | return elapsed; 216 | } 217 | 218 | 219 | // variables for cublas dgemm wrapper 220 | static double * 
d_A, * d_B, * d_C; 221 | static cublasHandle_t handle; 222 | 223 | // use the Fortran dgemm argument list 224 | double cuda_dgemm(const char * transa, const char * transb, int * m, int * n, int * k, 225 | double * alpha, double * A, int * lda, double * B, int * ldb, 226 | double * beta, double * C, int * ldc) 227 | { 228 | int M, N, K, LDA, LDB, LDC; 229 | int asize, bsize, csize; 230 | double time1, time2, TFlops; 231 | cublasOperation_t opA, opB; 232 | int iter, maxiter = 400, sample_iter = 350; 233 | 234 | M = *m; N = *n; K = *k; 235 | LDA = *lda; LDB = *ldb; LDC = *ldc; 236 | 237 | asize = M*K; 238 | bsize = K*N; 239 | csize = M*N; 240 | 241 | cublasCreate(&handle); 242 | cudaMalloc((void **)&d_A, asize*sizeof(double)); 243 | cudaMalloc((void **)&d_B, bsize*sizeof(double)); 244 | cudaMalloc((void **)&d_C, csize*sizeof(double)); 245 | 246 | cublasSetVector(asize, sizeof(double), A, 1, d_A, 1); 247 | cublasSetVector(bsize, sizeof(double), B, 1, d_B, 1); 248 | cublasSetVector(csize, sizeof(double), C, 1, d_C, 1); 249 | 250 | if (transa[0] == 'n' || transa[0] == 'N') opA = CUBLAS_OP_N; 251 | else if (transa[0] == 't' || transa[0] == 'T') opA = CUBLAS_OP_T; 252 | 253 | if (transb[0] == 'n' || transb[0] == 'N') opB = CUBLAS_OP_N; 254 | else if (transb[0] == 't' || transb[0] == 'T') opB = CUBLAS_OP_T; 255 | 256 | 257 | // call one time outside the timers, then time it 258 | cublasDgemm(handle, opA, opB, M, N, K, alpha, d_A, LDA, d_B, LDB, beta, d_C, LDC); 259 | cudaDeviceSynchronize(); 260 | 261 | time1 = walltime(); 262 | for (iter = 0; iter < maxiter; iter++) { 263 | cublasDgemm(handle, opA, opB, M, N, K, alpha, d_A, LDA, d_B, LDB, beta, d_C, LDC); 264 | if (iter == sample_iter) { 265 | if (NVML_SUCCESS != nvmlDeviceGetTemperature(nvmldevice, NVML_TEMPERATURE_GPU, &temperature)) temperature = 0; 266 | if (NVML_SUCCESS != nvmlDeviceGetPowerUsage(nvmldevice, &power)) power = 0; 267 | if (NVML_SUCCESS != nvmlDeviceGetClockInfo(nvmldevice, NVML_CLOCK_SM, &smMHz)) smMHz = 0; 268 | } 269 | cudaDeviceSynchronize(); 270 | } 271 | time2 = walltime(); 272 | TFlops = 2.0e-12*((double) maxiter)*((double) M)*((double) N)*((double) K)/(time2 - time1); 273 | 274 | cudaMemcpy(C, d_C, csize*sizeof(double), cudaMemcpyDeviceToHost); 275 | 276 | return TFlops; 277 | } 278 | 279 | void cuda_dgemm_free() 280 | { 281 | cudaFree(d_A); 282 | cudaFree(d_B); 283 | cudaFree(d_C); 284 | cublasDestroy(handle); 285 | return; 286 | } 287 | -------------------------------------------------------------------------------- /autopilot-daemon/gpu-power/power-throttle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUT="$(bash /home/autopilot/utils/briefings.sh | grep ABORT)" 3 | echo ${OUT} 4 | if [[ ! -z $OUT ]]; then 5 | echo "[[GPU POWER]] ABORT" 6 | exit 0 7 | fi 8 | echo "[[GPU POWER]] Briefings completed. Continue with power cap evaluation." 9 | 10 | RES=$(ls -d /dev/nvidia* 2>1) 11 | numre='^[0-9]+$' 12 | D=-1 13 | for d in $RES; do 14 | d=${d#*"nvidia"*} 15 | if [[ "$d" =~ $numre ]]; then 16 | D=0 17 | break 18 | fi 19 | done 20 | if [[ $D -eq 0 ]]; then 21 | echo -n "[GPU POWER] Detected NVIDIA GPU: " 22 | for d in $RES; do 23 | d=${d#*"nvidia"*} 24 | if [[ "$d" =~ $numre ]]; then 25 | echo -n "$d " 26 | D=$((D+1)) 27 | fi 28 | done 29 | echo "Total: $D" 30 | else 31 | echo "[GPU POWER] No NVIDIA GPU detected. Skipping the Power Throttle check." 
32 | echo "ABORT" 33 | exit 0 34 | fi 35 | RESULT="" 36 | FAIL=0 37 | for i in $(seq 0 1 $((D-1))) ; do 38 | OUT=$(nvidia-smi --format=csv -i $i --query-gpu=clocks_event_reasons.hw_slowdown) 39 | NOTACTIVE=$(echo $OUT | grep "Not Active") 40 | if [[ ! -z "$NOTACTIVE" ]]; then 41 | RESULT+="0 " 42 | else 43 | RESULT+="1 " 44 | FAIL=1 45 | fi 46 | done 47 | if [[ $FAIL -ne 0 ]]; then 48 | echo "[GPU POWER] FAIL" 49 | else 50 | echo "[GPU POWER] SUCCESS" 51 | fi 52 | echo $RESULT -------------------------------------------------------------------------------- /autopilot-daemon/gpu-remapped/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def main(): 5 | output = os.popen('bash ./utils/briefings.sh') 6 | result = output.read() 7 | print(result) 8 | 9 | if "ABORT" not in result: 10 | print("[[ REMAPPED ROWS ]] Briefings completed. Continue with remapped rows evaluation.") 11 | output = os.popen('./gpu-remapped/remapped-rows.sh') 12 | result = output.read() 13 | if "FAIL" not in result: 14 | print("[[ REMAPPED ROWS ]] SUCCESS") 15 | else: 16 | print("[[ REMAPPED ROWS ]] FAIL") 17 | print("Host ", os.getenv("NODE_NAME")) 18 | print(result.strip()) 19 | return 0 20 | print("Host ", os.getenv("NODE_NAME")) 21 | print(result.strip()) 22 | else: 23 | print("[[ REMAPPED ROWS ]] ABORT") 24 | print(result.strip()) 25 | 26 | if __name__ == '__main__': 27 | main() -------------------------------------------------------------------------------- /autopilot-daemon/gpu-remapped/remapped-rows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RES=$(ls -d /dev/nvidia* 2>/dev/null) 3 | numre='^[0-9]+$' 4 | D=-1 5 | for d in $RES; do 6 | d=${d#*"nvidia"*} 7 | if [[ "$d" =~ $numre ]]; then 8 | D=0 9 | break 10 | fi 11 | done 12 | if [[ $D -eq 0 ]]; then 13 | echo -n "Detected NVIDIA GPU: " 14 | for d in $RES; do 15 | d=${d#*"nvidia"*} 16 | if [[ "$d" =~ $numre ]]; then 17 | echo -n "$d " 18 | D=$((D+1)) 19 | fi 20 | done 21 | echo "Total: $D" 22 | else 23 | echo "No NVIDIA GPU detected. Skipping the Remapped Rows check." 24 | echo "SKIP" 25 | exit 0 26 | fi 27 | RESULT="" 28 | FAIL=0 29 | for i in $(seq 0 1 $((D-1))) ; do 30 | OUT=$(nvidia-smi -q -i $i | grep -A 10 "Remapped Rows") 31 | REMAPPED=$(echo $OUT | egrep "Pending\s*:\s+Yes") 32 | if [[ -z "$REMAPPED" ]]; then 33 | RESULT+="0 " 34 | else 35 | RESULT+="1 " 36 | FAIL=1 37 | fi 38 | done 39 | if [[ $FAIL -eq 1 ]]; then 40 | echo FAIL 41 | fi 42 | echo $RESULT -------------------------------------------------------------------------------- /autopilot-daemon/network/README.md: -------------------------------------------------------------------------------- 1 | # Network Validation Tests 2 | 3 | Autopilot provides two network validation tests: 4 | 5 | - Reachability: runs `ping` against all network interfaces available in all the Autopilot pods 6 | - Bandwidth: runs `iperf3` to validate the network bandwidth available. 7 | 8 | ## Iperf 9 | 10 | This test, in its current form, primarily runs `TCP` `data plane` `port-to-port` network workloads to gather key performance statistics. It performs a `Ring Traversal` (or, as we call it, a `ring workload`) through all network interfaces (net1-X interfaces) at varying intensity (number of simultaneous clients & servers per interface). Future versions of Autopilot may provide more workloads and further customization of the workloads. 
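In short, the schedule is the set of `n-1` rotations of the node list: at timestep `k`, node `i` sends to node `(i + k) mod n`. A minimal Python sketch of how such a schedule can be derived (illustrative only; `network_workload.py` in this directory is the actual generator, and `ring_timesteps` here is a hypothetical helper name):

```python
# Illustrative sketch: derive the n-1 "timesteps" of a ring workload.
# At timestep k, node i targets node (i + k) mod n, so every node acts
# as a client exactly once per timestep and no node targets itself.
def ring_timesteps(nodes):
    n = len(nodes)
    return {
        f"timestep_{k}": [{nodes[i]: nodes[(i + k) % n]} for i in range(n)]
        for k in range(1, n)
    }

# For ["A", "B", "C"]:
#   timestep_1 -> A->B, B->C, C->A
#   timestep_2 -> A->C, B->A, C->B
print(ring_timesteps(["A", "B", "C"]))
```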
11 | 12 | ### Ring workload 13 | A "Ring Workload", in our case, is similar to the commonly known "Ring Topology": the execution calls flow sequentially in a particular _direction_, forming a "ring"-like pattern. _Most importantly, none of the compute infrastructure is actually configured in a ring; we merely develop workloads that resemble a ring pattern._ The motivation for these workloads is to achieve full line-rate throughput on a port-by-port basis (in our case, network interfaces net1-X) for a single logical cluster. 14 | 15 | Assume we have the following set of nodes `[A,B,C]`. We can create a `ring` starting from node `A` that flows in the direction of `C`: 16 | 17 | ```console 18 | A -> B 19 | B -> C 20 | C -> A 21 | ``` 22 | 23 | In our case, a "Ring Workload" will exhaust all starting points. We call these iterations `timesteps`. In a compute infrastructure with `n` nodes, there will be `n-1` total timesteps. Said differently, there are `n-1` possible starting points that form a ring such that no node flows to itself. All pairs in a given timestep execute in parallel. 24 | 25 | ```console 26 | Timestep 1: 27 | ------------ 28 | A -> B 29 | B -> C 30 | C -> A 31 | 32 | Timestep 2: 33 | ------------ 34 | A -> C 35 | B -> A 36 | C -> B 37 | ``` 38 | 39 | As part of this workload, Autopilot first generates the Ring Workload and then starts `iperf3 servers` on each interface of each Autopilot pod, based on the configuration options provided by the user. Only after the `iperf3 servers` are started does Autopilot begin executing the workload by starting `iperf3 clients`, again based on the configuration options provided by the user. All results are logged back to the user. 40 | 41 | For each network interface on each node, an `iperf3 server` is started. The number of `iperf3 servers` depends on the `number of clients` intended to be run. For example, if the `number of clients` is `8`, then `8` `iperf3 servers` are started per interface, each on a unique `port`. 42 | 43 | For each timestep, all `pairs` are executed simultaneously. For each pair, some `number of clients` are started in parallel and run for `5 seconds` using `zero-copy` mode against the respective `iperf3 server`. 44 | 45 | Metrics such as `minimum`, `maximum`, `mean`, and `aggregate` bitrates and transfers are tracked for both the `sender` and the `receiver` for each `client -> server` execution. The results are stored both as `JSON` in the respective `pod` and summarized in the `pod logs`. 46 | 47 | Invocation through the exposed Autopilot API is shown below: 48 | 49 | ```bash 50 | # Invoked via the `status` handle: 51 | curl "http://127.0.0.1:3333/status?check=iperf&workload=ring&pclients=&startport=" 52 | 53 | # Invoked via the `iperf` handle directly: 54 | curl "http://127.0.0.1:3333/iperf?workload=ring&pclients=&startport=" 55 | ``` 56 | -------------------------------------------------------------------------------- /autopilot-daemon/network/iperf3_entrypoint.py: -------------------------------------------------------------------------------- 1 | from iperf3_utils import * 2 | from network_workload import NetworkWorkload 3 | 4 | parser = argparse.ArgumentParser() 5 | 6 | parser.add_argument( 7 | "--workload", 8 | type=str, 9 | default="ring", 10 | help=('The type of network workload. 
Supported workload values: "ring"'), 11 | ) 12 | 13 | parser.add_argument( 14 | "--pclients", 15 | type=str, 16 | default="8", 17 | help=( 18 | 'The number of clients to run in parallel. Note. This is not using the iperf3 "-P" option. ' 19 | 'This spawns "N" number of iperf3 client instances in parallel to a target server. For each client, ' 20 | 'a respective port on the target server will be pinned. For instance, if there are 3 "pclients" ' 21 | "specified, then there will be 3 instances of a particular network interface on 3 different ports." 22 | ), 23 | ) 24 | 25 | parser.add_argument( 26 | "--startport", 27 | type=str, 28 | default="5200", 29 | help=( 30 | 'The default port value. In the event that "pclients" is greater than 1, the default port value used ' 31 | "to generate servers will automatically increase to accomdate the clients running in parallel." 32 | ), 33 | ) 34 | 35 | parser.add_argument( 36 | "--cleanup", 37 | action="store_true", 38 | help=("When provided, this will kill ALL iperf servers on every node."), 39 | ) 40 | 41 | args = vars(parser.parse_args()) 42 | 43 | 44 | async def make_server_connection(event, address, handle): 45 | """ 46 | Handles connections to the target autopilot pod on a different worker-node. 47 | Attempts to ensure synchronization via asyncio events... 48 | 49 | Args: 50 | address (str): The address of the autopilot pod. 51 | handle (str): The endpoint handle for the connection. 52 | 53 | """ 54 | try: 55 | if event != None: 56 | await event.wait() 57 | url = f"http://{address}:{AUTOPILOT_PORT}{handle}" 58 | total_timeout = aiohttp.ClientTimeout(total=60 * 10) 59 | async with aiohttp.ClientSession(timeout=total_timeout) as session: 60 | async with session.get(url) as resp: 61 | reply = await resp.text() 62 | except Exception as e: 63 | # If we can't create servers we'll need to exit...something has gone wrong 64 | # with the network. 65 | log.error(f"Error when creating server on {address} at {handle}: {e}") 66 | sys.exit(1) 67 | 68 | 69 | async def make_client_connection(event, iface, src, dst, address, handle): 70 | # Task waits for the event to be set before starting its work. 71 | try: 72 | if event != None: 73 | await event.wait() 74 | url = f"http://{address}:{AUTOPILOT_PORT}{handle}" 75 | total_timeout = aiohttp.ClientTimeout(total=60 * 10) 76 | async with aiohttp.ClientSession(timeout=total_timeout) as session: 77 | async with session.get(url) as resp: 78 | reply = await resp.text() 79 | reply = "".join(reply.split()) 80 | try: 81 | json_reply = json.loads(reply) 82 | except json.JSONDecodeError as e: 83 | log.error( 84 | f"Failed to decode JSON from response: {e}. Response: {reply}" 85 | ) 86 | return {"src": src, "dst": dst, "iface": iface, "data": {}} 87 | 88 | return {"src": src, "dst": dst, "iface": iface, "data": json_reply} 89 | except Exception as e: 90 | log.error(f"Error during client connection to {address} at {handle}: {e}") 91 | log.error(f"Failure occured with from src {src} to dst {dst} on iface {iface}") 92 | return {"src": src, "dst": dst, "iface": iface, "data": {}} 93 | 94 | 95 | async def iperf_start_servers(node_map, num_servers, port_start): 96 | """ 97 | Starts iperf3 servers on each node by sending requests to the corresponding endpoints 98 | derived in the node_map. Each server will be launched from the corresponding autopilot 99 | pod that the endpoint represents on the worker-node. 100 | 101 | Args: 102 | node_map (dict): A dictionary mapping worker-nodes to representation data. 
103 | num_servers (str): The number of iperf3 servers to start on each node. 104 | port_start (str) The port to start launching servers from on each node. 105 | """ 106 | tasks = [ 107 | make_server_connection( 108 | None, 109 | node_map[node]["endpoint"], 110 | f"/iperfservers?numservers={num_servers}&startport={port_start}", 111 | ) 112 | for node in node_map 113 | ] 114 | await asyncio.gather(*tasks) 115 | 116 | 117 | async def run_workload(workload_type, nodemap, workload, num_clients, port_start): 118 | """ 119 | Starts network tests according to the specified workload. 120 | 121 | Args: 122 | workload_type (str): A workload type to run. 123 | node_map (dict): A dictionary mapping node names to their endpoints, pods, and network interfaces. 124 | workload (dict): A dictionary specifying the workload and steps for the network tests. 125 | num_clients (str): The number of parallel clients to test against the server (used to also increase port val.) 126 | port_start (str): A port associated to the server, 127 | """ 128 | if SupportedWorkload.RING.value == workload_type: 129 | event = asyncio.Event() 130 | # All the nodes "should have" the same amount of interfaces...let's just get the first node and check how many there are... 131 | # This is also assuming that the ordering of the ifaces in this list are accurate...i.e., starting with net1-0 and so forth 132 | netifaces_count = len(nodemap[next(iter(nodemap))]["netifaces"]) 133 | results = [] 134 | for iface in range(netifaces_count): 135 | interface_results = [] 136 | log.info(f"Running Interface net1-{iface}") 137 | for step in workload: 138 | tasks = [] 139 | for pair in workload[step]: 140 | for source, target in pair.items(): 141 | task = make_client_connection( 142 | event, 143 | f"net1-{iface}", 144 | f"{nodemap[source]['pod']}_on_{source}", 145 | f"{nodemap[target]['pod']}_on_{target}", 146 | nodemap[source]["endpoint"], 147 | f"/iperfclients?dstip={nodemap[target]['netifaces'][iface]}&dstport={port_start}&numclients={num_clients}", 148 | ) 149 | tasks.append(task) 150 | await asyncio.sleep(1) 151 | event.set() 152 | res = await asyncio.gather(*tasks) 153 | interface_results.append(res) 154 | results.append(interface_results) 155 | 156 | grids = [] 157 | summary_avg = [] 158 | for i, el in enumerate(results): 159 | grid = {} 160 | total_bitrate = 0 161 | count = 0 162 | for l in el: 163 | for host in l: 164 | src = host["src"] 165 | dst = host["dst"] 166 | if host["data"] == {}: 167 | # Failure had occured resulting in a 0.0 bitrate. 
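# (keeping the zero means the src/dst cell still shows up in the printed throughput grid, so dead links stay visible)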
168 | bitrate = 0.0 169 | else: 170 | bitrate = float( 171 | host["data"]["receiver"]["aggregate"]["bitrate"] 172 | ) 173 | count = count + 1 174 | total_bitrate = total_bitrate + bitrate 175 | if src not in grid: 176 | grid[src] = {} 177 | grid[src][dst] = bitrate 178 | avg = str(round(Decimal(total_bitrate / count), 2)) 179 | summary_avg.append(f"net1-{i} Average Bandwidth Gb/s: {avg}") 180 | grids.append(grid) 181 | 182 | for i, grid in enumerate(grids): 183 | print(f"Network Throughput net1-{i}:") 184 | pods = sorted(grid.keys()) 185 | print(f"{'src/dst':<40}" + "".join(f"{dst:<40}" for pod in pods)) 186 | for src_pod in pods: 187 | row = [f"{grid[src_pod].get(dst_pod, 'N/A'):<40}" for dst_pod in pods] 188 | print(f"{src_pod:<40}" + "".join(row)) 189 | print() 190 | 191 | print("Overall Network Interface Average Bandwidth:") 192 | for i in summary_avg: 193 | print(i) 194 | 195 | else: 196 | log.error("Unsupported Workload Attempted") 197 | sys.exit(1) 198 | 199 | 200 | async def cleanup_iperf_servers(node_map): 201 | """ 202 | Removes all started iperf servers across all nodes. 203 | 204 | Args: 205 | node_map (dict): A dictionary mapping worker-nodes to representation data. 206 | """ 207 | tasks = [ 208 | make_server_connection( 209 | None, 210 | node_map[node]["endpoint"], 211 | f"/iperfstopservers", 212 | ) 213 | for node in node_map 214 | ] 215 | await asyncio.gather(*tasks) 216 | 217 | 218 | async def main(): 219 | type_of_workload = args["workload"].upper() 220 | num_parallel_clients = args["pclients"] 221 | port_start = args["startport"] 222 | cleanup_iperf = args["cleanup"] 223 | 224 | wl = NetworkWorkload() 225 | autopilot_node_map = wl.gen_autopilot_node_map_json() 226 | if type_of_workload in (workload.value for workload in SupportedWorkload): 227 | if SupportedWorkload.RING.value == type_of_workload: 228 | ring_workload = wl.generate_ring_topology_json(autopilot_node_map) 229 | await iperf_start_servers( 230 | autopilot_node_map, num_parallel_clients, port_start 231 | ) 232 | await run_workload( 233 | type_of_workload, 234 | autopilot_node_map, 235 | ring_workload, 236 | num_parallel_clients, 237 | port_start, 238 | ) 239 | 240 | else: 241 | # 242 | # TODO: Build other workloads... 
243 | # 244 | log.error("Unsupported Workload Attempted") 245 | sys.exit(1) 246 | else: 247 | log.error("Unsupported Workload Attempted") 248 | sys.exit(1) 249 | 250 | if cleanup_iperf: 251 | await cleanup_iperf_servers(autopilot_node_map) 252 | 253 | 254 | if __name__ == "__main__": 255 | asyncio.run(main()) 256 | 257 | -------------------------------------------------------------------------------- /autopilot-daemon/network/iperf3_start_clients.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import json 4 | from decimal import Decimal 5 | from iperf3_utils import * 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--dstip", type=str, default="", help="IP for iperf3 server") 9 | parser.add_argument("--dstport", type=int, default=5200, help="Port for iperf3 server") 10 | parser.add_argument("--numclients", type=int, default=1, help="Number of clients") 11 | args = parser.parse_args() 12 | 13 | 14 | async def run_iperf_client(dstip, dstport, iteration, duration_seconds): 15 | dstport += iteration 16 | command = [ 17 | "iperf3", 18 | "-c", 19 | dstip, 20 | "-p", 21 | str(dstport), 22 | "-t", 23 | duration_seconds, 24 | "-i", 25 | "1.0", 26 | "-Z", 27 | ] 28 | 29 | default_res = { 30 | "interface": {"ip": dstip, "port": dstport}, 31 | "results": { 32 | "sender": { 33 | "transfer": {"rate": 0.0, "units": "n/a"}, 34 | "bitrate": {"rate": 0.0, "units": "n/a"}, 35 | }, 36 | "receiver": { 37 | "transfer": {"rate": 0.0, "units": "n/a"}, 38 | "bitrate": {"rate": 0.0, "units": "n/a"}, 39 | }, 40 | }, 41 | } 42 | 43 | try: 44 | process = await asyncio.wait_for( 45 | asyncio.create_subprocess_exec( 46 | *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE 47 | ), 48 | timeout=60, 49 | ) 50 | stdout, stderr = await process.communicate() 51 | output_filename = f"{dstip}_{dstport}_client.log" 52 | with open(output_filename, "w") as f: 53 | f.write(stdout.decode()) 54 | except Exception as e: 55 | return default_res 56 | 57 | # In theory this should not occur since we catch this above...but just to be safe let's ensure 58 | # the return code is zero... 
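# (iperf3 exits non-zero when it cannot connect to the server or times out; default_res already carries the full zero-filled {interface, results} shape, so it is returned directly)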
59 | if process.returncode != 0: 60 | return default_res 61 | 62 | result = { 63 | "sender": { 64 | "transfer": {"rate": 0.0, "units": "n/a"}, 65 | "bitrate": {"rate": 0.0, "units": "n/a"}, 66 | }, 67 | "receiver": { 68 | "transfer": {"rate": 0.0, "units": "n/a"}, 69 | "bitrate": {"rate": 0.0, "units": "n/a"}, 70 | }, 71 | } 72 | iperf3_stdout = stdout.decode().strip().splitlines() 73 | for line in iperf3_stdout: 74 | line = line.lower() 75 | if "sender" in line: 76 | parts = line.split() 77 | result["sender"]["transfer"] = {"rate": parts[4], "units": parts[5]} 78 | result["sender"]["bitrate"] = {"rate": parts[6], "units": parts[7]} 79 | elif "receiver" in line: 80 | parts = line.split() 81 | result["receiver"]["transfer"] = {"rate": parts[4], "units": parts[5]} 82 | result["receiver"]["bitrate"] = {"rate": parts[6], "units": parts[7]} 83 | return {"interface": {"ip": dstip, "port": dstport}, "results": result} 84 | 85 | 86 | def calculate_stats(values, num_clients): 87 | return { 88 | "aggregate": { 89 | "transfer": str(round(Decimal(sum(values["transfer"])), 2)), 90 | "bitrate": str(round(Decimal(sum(values["bitrate"])), 2)), 91 | }, 92 | "mean": { 93 | "transfer": str(round(Decimal(sum(values["transfer"]) / num_clients), 2)), 94 | "bitrate": str(round(Decimal(sum(values["bitrate"]) / num_clients), 2)), 95 | }, 96 | "min": { 97 | "transfer": str(round(Decimal(min(values["transfer"])), 2)), 98 | "bitrate": str(round(Decimal(min(values["bitrate"])), 2)), 99 | }, 100 | "max": { 101 | "transfer": str(round(Decimal(max(values["transfer"])), 2)), 102 | "bitrate": str(round(Decimal(max(values["bitrate"])), 2)), 103 | }, 104 | } 105 | 106 | 107 | async def main(): 108 | dstip, dstport, numclients = args.dstip, args.dstport, args.numclients 109 | duration_seconds = "5" 110 | 111 | tasks = [ 112 | asyncio.create_task(run_iperf_client(dstip, dstport, i, duration_seconds)) 113 | for i in range(numclients) 114 | ] 115 | results = await asyncio.gather(*tasks) 116 | 117 | sender_values = {"transfer": [], "bitrate": []} 118 | receiver_values = {"transfer": [], "bitrate": []} 119 | 120 | total_results = {} 121 | for idx, result in enumerate(results): 122 | total_results[str(idx)] = result 123 | sender_values["transfer"].append( 124 | float(result["results"]["sender"]["transfer"]["rate"]) 125 | ) 126 | sender_values["bitrate"].append( 127 | float(result["results"]["sender"]["bitrate"]["rate"]) 128 | ) 129 | receiver_values["transfer"].append( 130 | float(result["results"]["receiver"]["transfer"]["rate"]) 131 | ) 132 | receiver_values["bitrate"].append( 133 | float(result["results"]["receiver"]["bitrate"]["rate"]) 134 | ) 135 | 136 | stats = { 137 | "sender": calculate_stats(sender_values, numclients), 138 | "receiver": calculate_stats(receiver_values, numclients), 139 | } 140 | 141 | total_results["stats"] = stats 142 | summary_file = f"{dstip}_summary.json" 143 | with open(summary_file, "w") as f: 144 | json.dump(total_results, f, indent=4) 145 | print(json.dumps(stats, indent=4)) 146 | 147 | 148 | if __name__ == "__main__": 149 | asyncio.run(main()) 150 | 151 | -------------------------------------------------------------------------------- /autopilot-daemon/network/iperf3_start_servers.py: -------------------------------------------------------------------------------- 1 | from iperf3_utils import * 2 | 3 | parser = argparse.ArgumentParser() 4 | parser.add_argument( 5 | "--numservers", 6 | type=int, 7 | default=1, 8 | help=( 9 | 'The number of 
servers (on different ports) to have running on a single IP. Note. For "numservers" values greater than 1 ' 10 | 'the "startport" value will be adjusted for each subsequently started server by a factor of 1.' 11 | ), 12 | ) 13 | 14 | parser.add_argument( 15 | "--startport", 16 | type=int, 17 | default=5200, 18 | help=( 19 | 'The default port value. In the event that "numservers" is greater than 1, the default port value used ' 20 | "to generate servers will automatically increase to accomdate the clients running in parallel." 21 | ), 22 | ) 23 | args = vars(parser.parse_args()) 24 | 25 | 26 | def main(): 27 | num_server = args["numservers"] 28 | port = args["startport"] 29 | interfaces = [] 30 | entrylist = json.loads('{}') 31 | 32 | try: 33 | config.load_incluster_config() 34 | v1 = client.CoreV1Api() 35 | except: 36 | log.error("Failed to load Kubernetes CoreV1API.") 37 | exit(1) 38 | try: 39 | autopilot_pods = v1.list_namespaced_pod( 40 | namespace=AUTOPILOT_NAMESPACE, field_selector="metadata.name="+CURR_POD_NAME 41 | ) 42 | except ApiException as e: 43 | log.error( 44 | "Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e 45 | ) 46 | exit(1) 47 | 48 | pod = autopilot_pods.items[0] 49 | try: 50 | entrylist = json.loads( 51 | pod.metadata.annotations["k8s.v1.cni.cncf.io/network-status"] 52 | ) 53 | except KeyError: 54 | log.info( 55 | f'Key k8s.v1.cni.cncf.io/network-status not found on pod "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}"') 56 | if len(entrylist) > 0: 57 | interfaces = [ 58 | iface 59 | for iface in netifaces.interfaces() 60 | if "net" in iface and iface not in ("lo", "eth0", "tunl0") 61 | ] 62 | else: 63 | interfaces = [ 64 | iface 65 | for iface in netifaces.interfaces() 66 | if iface not in ("lo", "tunl0") 67 | ] 68 | 69 | 70 | if not interfaces: 71 | log.error( 72 | f'Secondary nics not found for "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}".' 73 | ) 74 | sys.exit(1) 75 | 76 | for iface in interfaces: 77 | for i in range(num_server): 78 | try: 79 | address = netifaces.ifaddresses(iface) 80 | ip = address[netifaces.AF_INET][0]["addr"] 81 | command = ["iperf3", "-s", "-B", ip, "-p", str(port + i), "-D"] 82 | log.info( 83 | f"Starting iperf3 server {ip}:{port + i} using {iface} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}..." 84 | ) 85 | subprocess.run(command, text=True, capture_output=True, check=True) 86 | except subprocess.CalledProcessError as e: 87 | log.error( 88 | f"Server failed to start on {ip}:{port + i} using {iface} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}.\n " 89 | f"Exited with error: {e.stderr}" 90 | ) 91 | sys.exit(1) 92 | except KeyError: 93 | log.error( 94 | f"No AF_INET (IPv4) address found for interface {iface} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}." 
--------------------------------------------------------------------------------
/autopilot-daemon/network/iperf3_stop_servers.py:
--------------------------------------------------------------------------------
1 | from iperf3_utils import *
2 | 
3 | 
4 | def kill_all_iperf_servers():
5 |     try:
6 |         result = subprocess.run(
7 |             ["ps", "aux"], text=True, capture_output=True, check=True
8 |         )
9 |     except subprocess.CalledProcessError as e:
10 |         log.error(f"Error occurred while listing processes: {e}")
11 |         sys.exit(1)
12 | 
13 |     processes = result.stdout.splitlines()
14 | 
15 |     for process in processes:
16 |         try:
17 |             # Match "iperf3" and "-s" separately; a combined "iperf3 -s" match breaks if "-s" sits elsewhere in the command line.
18 |             if "iperf3" in process and "-s" in process:
19 |                 parts = process.split()
20 |                 if len(parts) > 1:
21 |                     pid = int(parts[1])
22 |                     if pid > 1:
23 |                         try:
24 |                             os.kill(pid, signal.SIGTERM)
25 |                         except PermissionError:
26 |                             log.error(
27 |                                 f"Permission denied: Could not kill process with PID {pid} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}."
28 |                             )
29 |                             sys.exit(1)
30 |                         except ProcessLookupError:
31 |                             log.error(
32 |                                 f"Process with PID {pid} does not exist in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}."
33 |                             )
34 |                             sys.exit(1)
35 |                         except Exception as e:
36 |                             log.error(
37 |                                 f"Failed to kill process with PID {pid}: {e} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
38 |                             )
39 |                             sys.exit(1)
40 |                 else:
41 |                     log.error(
42 |                         f"Unexpected format in process line: {process} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
43 |                     )
44 |                     sys.exit(1)
45 |         except ValueError:
46 |             log.error(
47 |                 f"Could not convert PID to an integer: {process} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
48 |             )
49 |             sys.exit(1)
50 |         except Exception as e:
51 |             log.error(
52 |                 f"An unexpected error occurred: {e} in {CURR_POD_NAME} on {CURR_WORKER_NODE_NAME}"
53 |             )
54 |             sys.exit(1)
55 |     log.info("All iperf servers have been removed (not deleting default iperf server)")
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     kill_all_iperf_servers()
60 | 
--------------------------------------------------------------------------------
/autopilot-daemon/network/iperf3_utils.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from decimal import Decimal
3 | import argparse
4 | import asyncio
5 | import logging
6 | import aiohttp
7 | import os
8 | import json
9 | import requests
10 | import netifaces
11 | import subprocess
12 | import sys
13 | import signal
14 | 
15 | from kubernetes import client, config
16 | from kubernetes.client.rest import ApiException
17 | 
18 | log = logging.getLogger(__name__)
19 | logging.basicConfig(
20 |     format="[NETWORK] - [IPERF] - [%(levelname)s] : %(message)s",
21 |     level=logging.INFO,
22 | )
23 | 
24 | 
25 | #
26 | # TODO: Add this to network_workload.py
27 | #
28 | class SupportedWorkload(Enum):
29 |     RING = "RING"
30 | 
31 | 
32 | CURR_POD_NAME = os.getenv("POD_NAME")
33 | CURR_WORKER_NODE_NAME = os.getenv("NODE_NAME")
34 | AUTOPILOT_NAMESPACE = os.getenv("NAMESPACE")
35 | AUTOPILOT_PORT = os.getenv("AUTOPILOT_HEALTHCHECKS_SERVICE_PORT")
--------------------------------------------------------------------------------
/autopilot-daemon/network/network_workload.py:
--------------------------------------------------------------------------------
1 | from iperf3_utils import *
2 | 
3 | 
4 | #
5 | # TODO: Make this an abstract class...
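# A minimal sketch of that refactor, for illustration only (the Workload base
# class and method names below are assumptions, not part of this code):
#
#     from abc import ABC, abstractmethod
#
#     class Workload(ABC):
#         @abstractmethod
#         def gen_autopilot_node_map_json(self):
#             """Map worker nodes to their Autopilot pod, endpoint, and NICs."""
#
#         @abstractmethod
#         def generate_topology_json(self, worker_nodes_map):
#             """Produce the per-step pairing schedule for this topology."""
#
#     class RingWorkload(Workload):
#         def generate_topology_json(self, worker_nodes_map):
#             ...  # the ring pairing logic below would move here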
6 | #
7 | #
8 | 
9 | 
10 | class NetworkWorkload:
11 |     def __init__(self, namespace=None, workload_name="Ring Topology"):
12 |         self.namespace = namespace or os.getenv("NAMESPACE")
13 |         self.workload = workload_name
14 |         self.log = logging.getLogger(__name__)
15 |         logging.basicConfig(
16 |             format="[NETWORK] - [WORKLOAD-GEN] - [%(levelname)s] : %(message)s",
17 |             level=logging.INFO,
18 |         )
19 | 
20 |         try:
21 |             config.load_incluster_config()
22 |             self.v1 = client.CoreV1Api()
23 |         except Exception:
24 |             self.log.error("Failed to load Kubernetes CoreV1API.")
25 |             exit(1)
26 | 
27 |     def get_all_ifaces(self):
28 |         address_map = {}
29 | 
30 |         try:
31 |             autopilot_pods = self.v1.list_namespaced_pod(
32 |                 namespace=self.namespace, label_selector="app=autopilot"
33 |             )
34 |         except ApiException as e:
35 |             self.log.error(
36 |                 "Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e
37 |             )
38 |             exit(1)
39 |         entrylist = json.loads('{}')
40 |         for pod in autopilot_pods.items:
41 |             try:
42 |                 entrylist = json.loads(
43 |                     pod.metadata.annotations["k8s.v1.cni.cncf.io/network-status"]
44 |                 )
45 |             except KeyError:
46 |                 self.log.info(
47 |                     f'Key k8s.v1.cni.cncf.io/network-status not found on pod "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}"')
48 |             if len(entrylist) > 0:
49 |                 for entry in entrylist:
50 |                     try:
51 |                         iface = entry["interface"]
52 |                     except KeyError:
53 |                         self.log.info("Interface key name not found, assigning 'k8s-pod-network'.")
54 |                         iface = "k8s-pod-network"
55 |                     if address_map.get(iface) is None:
56 |                         address_map[iface] = []
57 |                     address_map.get(iface).append((pod.spec.node_name, entry["ips"]))
58 |             else:
59 |                 pod_ips = pod.status.pod_i_ps
60 |                 if pod_ips is not None:
61 |                     iface = "default"
62 |                     if address_map.get(iface) is None:
63 |                         address_map[iface] = []
64 |                     ips = []
65 |                     for pod_ip in pod_ips:
66 |                         ips.append(pod_ip.ip)
67 |                     address_map.get(iface).append((pod.spec.node_name, ips))
68 | 
69 |         if len(address_map) == 0:
70 |             self.log.error("No interfaces found. FAIL.")
71 |         return address_map
72 | 
73 |     def gen_autopilot_node_map_json(self):
74 |         #
75 |         # TODO: This queries all endpoints, but what happens if a failing
76 |         # worker doesn't have any Autopilot pods?
77 |         #
78 |         # We silently skip it, which is bad: the user won't know.
79 |         #
80 |         # Proposal: at least warn the user that NOT ALL worker nodes will be tested.
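        # One possible shape for that warning, sketched under the assumption
        # that listing nodes is acceptable here (illustrative only, not wired in):
        #
        #     all_workers = {n.metadata.name for n in self.v1.list_node().items}
        #     untested = all_workers - set(autopilot_node_map.keys())
        #     for name in sorted(untested):
        #         self.log.warning(f"Node {name} has no Autopilot endpoint and will NOT be tested.")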
81 | # 82 | try: 83 | endpoints = self.v1.list_namespaced_endpoints( 84 | self.namespace, 85 | field_selector="metadata.name=autopilot-healthchecks", 86 | ) 87 | except ApiException as e: 88 | self.log.error( 89 | "Exception when calling Kubernetes CoreV1Api->list_namespaced_endpoints: %s\n" 90 | % e 91 | ) 92 | exit(1) 93 | 94 | autopilot_node_map = {} 95 | for endpointslice in endpoints.items: 96 | addresses = endpointslice.subsets[0].addresses 97 | for item in addresses: 98 | node_name = item.node_name 99 | if node_name not in autopilot_node_map: 100 | pod_name = item.target_ref.name 101 | ip_address = item.ip 102 | autopilot_node_map[node_name] = { 103 | "pod": pod_name, 104 | "endpoint": ip_address, 105 | } 106 | 107 | addresses = self.get_all_ifaces() 108 | for add in addresses: 109 | if add != "eth0": 110 | for entry in addresses.get(add): 111 | worker_node_name = entry[0] 112 | net_interfaces = entry[1] 113 | if worker_node_name in autopilot_node_map: 114 | autopilot_node_map[worker_node_name][ 115 | "netifaces" 116 | ] = net_interfaces 117 | 118 | return autopilot_node_map 119 | 120 | def generate_ring_topology_json(self, worker_nodes_map): 121 | pair_links = {} 122 | node_count = len(worker_nodes_map) 123 | if node_count > 1: 124 | worker_nodes = list(worker_nodes_map.keys()) 125 | for t in range(1, node_count): 126 | step_pairs = [] 127 | for i in range(node_count): 128 | source = worker_nodes[i] 129 | target = worker_nodes[(i + t) % node_count] 130 | step_pairs.append({source: target}) 131 | pair_links[t] = step_pairs 132 | return pair_links 133 | 134 | def print_autopilot_node_map_json(self, worker_node_map): 135 | self.log.info(f"\n{json.dumps(worker_node_map, indent=4)}") 136 | 137 | def print_ring_topology_json(self, ring_workload): 138 | output = "" 139 | for step in ring_workload: 140 | output += f"Time Step {step}:\n" 141 | for pair in ring_workload[step]: 142 | for source, target in pair.items(): 143 | output += f" {source} -> {target}\n" 144 | self.log.info(f"\n{output}") 145 | 146 | def print_ring_workload(self): 147 | autopilot_node_map_json = self.gen_autopilot_node_map_json() 148 | ring_workload_pairs_json = self.generate_ring_topology_json( 149 | autopilot_node_map_json 150 | ) 151 | output = "" 152 | for step in ring_workload_pairs_json: 153 | output += f"Time Step {step}\n" 154 | for pair in ring_workload_pairs_json[step]: 155 | for source, dest in pair.items(): 156 | output += ( 157 | f" Pod-to-Pod: {autopilot_node_map_json[source]['pod']} " 158 | f"-> {autopilot_node_map_json[dest]['pod']}\n" 159 | f" Endpoint-to-Endpoint: {autopilot_node_map_json[source]['endpoint']} -> " 160 | f"{autopilot_node_map_json[dest]['endpoint']}\n" 161 | ) 162 | output += f"\n" 163 | self.log.info(f"\n{output}") 164 | -------------------------------------------------------------------------------- /autopilot-daemon/network/ping-entrypoint.py: -------------------------------------------------------------------------------- 1 | from kubernetes import client, config 2 | from kubernetes.client.rest import ApiException 3 | import os 4 | import json 5 | import argparse 6 | import asyncio 7 | import subprocess 8 | import time 9 | import netifaces 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--job', type=str, default='None', help='Workload node discovery w/ given namespace and label. Ex: \"--job=namespace:label-key=label-value\". Default is set to None.') 13 | parser.add_argument('--nodelabel', type=str, default='None', help='Node label to select nodes. 
Ex: \"label-key=label-value\". Default is set to None.') 14 | parser.add_argument('--nodes', type=str, default='all', help='Node(s) running autopilot that will be reached out by ping. Can be a comma separated list. Default is \"all\". Servers are reached out sequentially') 15 | args = vars(parser.parse_args()) 16 | 17 | job = args['job'] 18 | nodemap = {} 19 | namespace_self = os.getenv("NAMESPACE") 20 | nodename_self = os.getenv("NODE_NAME") 21 | config.load_incluster_config() 22 | kubeapi = client.CoreV1Api() 23 | 24 | async def main(): 25 | nodelist = args['nodes'].replace(' ', '').split(',') # list of nodes 26 | job = args['job'] 27 | nodelabel = args['nodelabel'] 28 | nodemap = {} 29 | allnodes = False 30 | check_local_ifaces() 31 | if 'all' in nodelist and job == 'None' and nodelabel == 'None': 32 | allnodes = True 33 | else: 34 | nodemap = get_job_nodes(nodelist) 35 | 36 | nodes={} 37 | ifaces=set() 38 | print("[PING] Pod running ping: ", os.getenv("POD_NAME")) 39 | print("[PING] Starting: collecting node list") 40 | try: 41 | retries = 0 42 | daemonset_size = expectedPods() 43 | autopilot_pods = kubeapi.list_namespaced_pod(namespace=namespace_self, label_selector="app=autopilot") 44 | while len(autopilot_pods.items) < daemonset_size or retries > 100: 45 | print("[PING] Waiting for all Autopilot pods to run") 46 | time.sleep(5) 47 | autopilot_pods = kubeapi.list_namespaced_pod(namespace=namespace_self, label_selector="app=autopilot") 48 | retries +=1 49 | if retries > 100 and len(autopilot_pods.items) < daemonset_size: 50 | print("[PING] Reached max retries of 100. ABORT") 51 | exit() 52 | 53 | except ApiException as e: 54 | print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e) 55 | exit() 56 | 57 | # run through all pods and create a map of all interfaces 58 | print("Creating a list of interfaces and IPs") 59 | entrylist = json.loads('{}') 60 | for pod in autopilot_pods.items: 61 | if pod.spec.node_name != nodename_self and (allnodes or (pod.spec.node_name in nodemap.keys())): 62 | try: 63 | entrylist = json.loads(pod.metadata.annotations['k8s.v1.cni.cncf.io/network-status']) 64 | except KeyError: 65 | print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod.metadata.name, "- node", pod.spec.node_name) 66 | if len(entrylist) > 0 : 67 | node={} 68 | nodes[pod.spec.node_name] = node 69 | for entry in entrylist: 70 | try: 71 | iface=entry['interface'] 72 | except KeyError: 73 | print("Interface key name not found, assigning 'k8s-pod-network'.") 74 | iface = "k8s-pod-network" 75 | ifaces = ifaces | {iface} 76 | node[iface] = { 77 | 'ips': entry['ips'], 78 | 'pod': pod.metadata.name 79 | } 80 | else: 81 | node={} 82 | nodes[pod.spec.node_name] = node 83 | pod_ips = pod.status.pod_i_ps 84 | if pod_ips != None: 85 | iface = "default" 86 | ifaces = ifaces | {iface} 87 | iplist = [] 88 | for pod_ip in pod_ips: 89 | iplist.append(pod_ip.ip) 90 | node[iface] = { 91 | 'ips': iplist, 92 | 'pod': pod.metadata.name 93 | } 94 | 95 | 96 | 97 | if len(nodes.keys()) == 0: 98 | print("[PING] No nodes found. 
ABORT") 99 | exit(0) 100 | # run ping tests to each pod on each interface 101 | print("[PING] Running ping tests for every interface") 102 | conn_dict = dict() 103 | clients = [] 104 | for nodename in nodes.keys(): 105 | conn_dict[nodename] = {} 106 | for iface in ifaces: 107 | try: 108 | ips = nodes[nodename][iface]['ips'] 109 | except KeyError: 110 | print("Interface", iface, "not found, skipping.") 111 | continue 112 | for index, ip in enumerate(ips): 113 | command = ['ping',ip,'-t','45','-c','10'] 114 | indexed_iface = iface+("-"+str(index) if len(ips)>1 else "") 115 | clients.append((subprocess.Popen(command, start_new_session=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE), nodename, ip, indexed_iface)) 116 | for c in clients: 117 | try: 118 | c[0].wait(50) 119 | except: 120 | print("Timeout while waiting for", c[2], "on node", c[1]) 121 | continue 122 | fail = False 123 | for c in clients: 124 | stdout, stderr = c[0].communicate() 125 | if stderr: 126 | print("[PING] output parse exited with error: " + stderr) 127 | fail = True 128 | else: 129 | if "Unreachable" in stdout or "100% packet loss" in stdout: 130 | print("Node", c[1], c[2], c[3], "1") 131 | fail = True 132 | else: 133 | print("Node", c[1], c[2], c[3], "0") 134 | if fail: 135 | print("[PING] At least one node unreachable. FAIL") 136 | else: 137 | print("[PING] all nodes reachable. success") 138 | 139 | def check_local_ifaces(): 140 | podname = os.getenv("POD_NAME") 141 | pod_list = kubeapi.list_namespaced_pod(namespace=namespace_self, field_selector="metadata.name="+podname) 142 | ips = [] 143 | iface_count = 0 144 | pod_self = pod_list.items[0] 145 | entrylist = json.loads('{}') 146 | ip_addresses = [netifaces.ifaddresses(iface)[netifaces.AF_INET][0]['addr'] for iface in netifaces.interfaces() if netifaces.AF_INET in netifaces.ifaddresses(iface)] 147 | try: 148 | entrylist = json.loads(pod_self.metadata.annotations['k8s.v1.cni.cncf.io/network-status']) 149 | except KeyError: 150 | print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod_self.metadata.name, "- node", pod_self.spec.node_name) 151 | if len(entrylist) > 0: 152 | for entry in entrylist: 153 | try: 154 | iface=entry['interface'] 155 | except KeyError: 156 | continue 157 | for ip in entry['ips']: 158 | if ip not in ip_addresses: 159 | print("[PING] IFACES count inconsistent. Pod annotation reports", entry['ips'], ", not found in the pod among", ip_addresses, "ABORT") 160 | exit() 161 | ips.append(entry['ips']) 162 | iface_count += len(entry['ips']) 163 | else: 164 | pod_ips = pod_self.status.pod_i_ps 165 | if pod_ips != None: 166 | for pod_ip in pod_ips: 167 | if pod_ip.ip not in ip_addresses: 168 | print("[PING] IFACES count inconsistent. 
Pod annotation reports", pod_ip.ip, ", not found in the pod among", ip_addresses, "ABORT") 169 | exit() 170 | ips.append(pod_ip.ip) 171 | iface_count += len(pod_ips) 172 | 173 | 174 | 175 | def get_job_nodes(nodelist): 176 | v1 = client.CoreV1Api() 177 | # get nodes from job is specified 178 | nodemap = {} 179 | node_name_self = os.getenv("NODE_NAME") 180 | job = args['job'] 181 | if job != 'None': 182 | job = args['job'].split(':') 183 | job_ns = job[0] # ex: "default" 184 | job_label = job[1] # ex: "job-name=my-job" or "app=my-app"] 185 | try: 186 | job_pods = v1.list_namespaced_pod(namespace=job_ns, label_selector=job_label) 187 | except ApiException as e: 188 | print("[PING] Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e) 189 | 190 | print('[PING] Workload:', ': '.join(job)) 191 | for pod in job_pods.items: 192 | if pod.spec.node_name != node_name_self: 193 | nodemap[pod.spec.node_name] = True 194 | 195 | nodelabel = args['nodelabel'] 196 | if nodelabel != 'None': 197 | try: 198 | labeled_nodes = v1.list_node(label_selector=nodelabel) 199 | except ApiException as e: 200 | print("Exception when calling CoreV1Api->list_node: %s\n" % e) 201 | exit() 202 | if len(labeled_nodes.items) == 0: 203 | print ("No node is labeled with", nodelabel, " - ABORT.") 204 | exit() 205 | for labeled_node in labeled_nodes.items: 206 | if labeled_node.metadata.name != node_name_self: 207 | nodemap[labeled_node.metadata.name] = True 208 | # get nodes from input list, if any 209 | if 'all' not in nodelist: 210 | for i in nodelist: 211 | if i != node_name_self: 212 | nodemap[i] = True 213 | return nodemap 214 | 215 | 216 | def expectedPods(): 217 | v1 = client.AppsV1Api() 218 | try: 219 | autopilot = v1.list_namespaced_daemon_set(namespace=namespace_self, label_selector="app=autopilot") 220 | except ApiException as e: 221 | print("[PING] Exception when calling fetching Autopilot by corev1api->list_namespaced_daemon_set", e) 222 | return 0 223 | return autopilot.items[0].status.desired_number_scheduled 224 | 225 | if __name__ == '__main__': 226 | asyncio.run(main()) -------------------------------------------------------------------------------- /autopilot-daemon/pkg/cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "flag" 6 | "fmt" 7 | "net/http" 8 | "os" 9 | "time" 10 | 11 | "github.com/IBM/autopilot/pkg/handler" 12 | "github.com/IBM/autopilot/pkg/healthcheck" 13 | "github.com/IBM/autopilot/pkg/utils" 14 | "github.com/prometheus/client_golang/prometheus" 15 | "github.com/prometheus/client_golang/prometheus/promhttp" 16 | "k8s.io/klog/v2" 17 | ) 18 | 19 | func main() { 20 | port := flag.String("port", "3333", "Port for the webhook to listen to. Defaulted to 3333") 21 | bwThreshold := flag.Int("bw", 4, "Sets bandwidth threshold for the init container") 22 | logFile := flag.String("logfile", "", "File where to save all the events") 23 | v := flag.String("loglevel", "2", "Log level") 24 | repeat := flag.Int("w", 24, "Run all tests periodically on each node. Time set in hours. Defaults to 24h") 25 | invasive := flag.Int("invasive-check-timer", 4, "Run invasive checks (e.g., dcgmi level 3) on each node when GPUs are free. Time set in hours. Defaults to 4h. 
Set to 0 to avoid invasive checks") 26 | 27 | flag.Parse() 28 | 29 | klog.InitFlags(nil) 30 | flag.Set("alsologtostderr", "true") 31 | if *logFile != "" { 32 | flag.Set("log_file", *logFile) 33 | } 34 | flag.Set("v", *v) 35 | flag.Set("logtostderr", "false") 36 | klog.OsExit = func(exitCode int) { 37 | fmt.Printf("os.Exit(%d)\n", exitCode) 38 | } 39 | 40 | utils.UserConfig = utils.InitConfig{ 41 | BWThreshold: *bwThreshold, 42 | } 43 | 44 | reg := prometheus.NewRegistry() 45 | utils.InitMetrics(reg) 46 | 47 | utils.InitHardwareMetrics() 48 | 49 | // Init the node status map 50 | healthcheck.InitNodeStatusMap() 51 | 52 | pMux := http.NewServeMux() 53 | promHandler := promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) 54 | pMux.Handle("/metrics", promHandler) 55 | 56 | go func() { 57 | klog.Info("Serving metrics on :8081") 58 | err := http.ListenAndServe(":8081", pMux) 59 | if err != nil { 60 | klog.Error(err.Error()) 61 | os.Exit(1) 62 | } 63 | }() 64 | 65 | readinessMux := http.NewServeMux() 66 | readinessMux.Handle("/readinessprobe", handler.ReadinessProbeHandler()) 67 | 68 | go func() { 69 | klog.Info("Serving Readiness Probe on :8080") 70 | err := http.ListenAndServe(":8080", readinessMux) 71 | if err != nil { 72 | klog.Error(err.Error()) 73 | os.Exit(1) 74 | } 75 | }() 76 | 77 | hcMux := http.NewServeMux() 78 | 79 | hcMux.Handle("/dcgm", handler.DCGMHandler()) 80 | hcMux.Handle("/gpumem", handler.GpuMemHandler()) 81 | hcMux.Handle("/gpupower", handler.GpuPowerHandler()) 82 | hcMux.Handle("/iperf", handler.IperfHandler()) 83 | hcMux.Handle("/iperfservers", handler.StartIperfServersHandler()) 84 | hcMux.Handle("/iperfstopservers", handler.StopAllIperfServersHandler()) 85 | hcMux.Handle("/iperfclients", handler.StartIperfClientsHandler()) 86 | hcMux.Handle("/invasive", handler.InvasiveCheckHandler()) 87 | hcMux.Handle("/pciebw", handler.PCIeBWHandler()) 88 | hcMux.Handle("/ping", handler.PingHandler()) 89 | hcMux.Handle("/pvc", handler.PVCHandler()) 90 | hcMux.Handle("/remapped", handler.RemappedRowsHandler()) 91 | hcMux.Handle("/status", handler.SystemStatusHandler()) 92 | 93 | s := &http.Server{ 94 | Addr: ":" + *port, 95 | Handler: hcMux, 96 | ReadTimeout: 30 * time.Minute, 97 | WriteTimeout: 30 * time.Minute, 98 | IdleTimeout: 30 * time.Minute, 99 | } 100 | 101 | go func() { 102 | klog.Info("Serving Health Checks on port :", *port) 103 | err := s.ListenAndServe() 104 | if errors.Is(err, http.ErrServerClosed) { 105 | klog.Info("Server Closed") 106 | } else if errors.Is(err, http.ErrAbortHandler) { 107 | klog.Info("Server Aborted") 108 | } else if errors.Is(err, http.ErrContentLength) { 109 | klog.Info("Response size too large") 110 | } else if errors.Is(err, http.ErrBodyReadAfterClose) { 111 | klog.Info("Read after close") 112 | } else if errors.Is(err, http.ErrHandlerTimeout) { 113 | klog.Info("Handler timed out") 114 | } 115 | if err != nil { 116 | klog.Info("EXITING") 117 | klog.Error(err.Error()) 118 | os.Exit(1) 119 | } 120 | }() 121 | 122 | // Create a Watcher over nodes. 
Needed to export metrics from data created by external jobs (i.e., dcgm Jobs) 123 | go utils.WatchNode() 124 | 125 | // Run the health checks at startup, then start the timer 126 | healthcheck.PeriodicCheck() 127 | 128 | periodicChecksTicker := time.NewTicker(time.Duration(*repeat) * time.Hour) 129 | defer periodicChecksTicker.Stop() 130 | invasiveChecksTicker := time.NewTicker(time.Duration(*invasive) * time.Hour) 131 | defer invasiveChecksTicker.Stop() 132 | for { 133 | select { 134 | case <-periodicChecksTicker.C: 135 | healthcheck.PeriodicCheck() 136 | case <-invasiveChecksTicker.C: 137 | if *invasive > 0 { 138 | healthcheck.InvasiveCheck() 139 | } 140 | } 141 | } 142 | 143 | // cert := "/etc/admission-webhook/tls/tls.crt" 144 | // key := "/etc/admission-webhook/tls/tls.key" 145 | 146 | // err := http.ListenAndServeTLS(":"+*port, cert, key, mux) 147 | // if errors.Is(err, http.ErrServerClosed) { 148 | // klog.Error("Server closed") 149 | // } else if err != nil { 150 | // klog.Error("error starting server: %s\n", err) 151 | // os.Exit(1) 152 | // } 153 | } 154 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/handler/handler.go: -------------------------------------------------------------------------------- 1 | package handler 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | "strconv" 7 | "strings" 8 | 9 | "github.com/IBM/autopilot/pkg/healthcheck" 10 | "github.com/IBM/autopilot/pkg/utils" 11 | "k8s.io/klog/v2" 12 | ) 13 | 14 | func SystemStatusHandler() http.Handler { 15 | fn := func(w http.ResponseWriter, r *http.Request) { 16 | nodelabel := r.URL.Query().Get("nodelabel") 17 | if nodelabel == "" { 18 | nodelabel = "None" 19 | } 20 | hosts := r.URL.Query().Get("host") 21 | if hosts == "" { 22 | hosts = "all" 23 | } 24 | checks := r.URL.Query().Get("check") 25 | if checks == "" { 26 | checks = "all" 27 | } 28 | batch := r.URL.Query().Get("batch") 29 | if batch == "" { 30 | batch = "0" 31 | } 32 | jobName := r.URL.Query().Get("job") 33 | if jobName == "" { 34 | jobName = "None" 35 | } 36 | dcgmR := r.URL.Query().Get("r") 37 | if dcgmR == "" { 38 | dcgmR = "1" 39 | } 40 | if strings.Contains(checks, string(healthcheck.Iperf)) { 41 | klog.Info("Running iperf3 on hosts ", hosts, " or job ", jobName) 42 | w.Write([]byte("Running iperf3 on hosts " + hosts + " or job " + jobName + "\n\n")) 43 | checks = strings.Trim(checks, "iperf") 44 | workload := r.URL.Query().Get("workload") 45 | if workload == "" { 46 | workload = "ring" 47 | } 48 | pclients := r.URL.Query().Get("pclients") 49 | if pclients == "" { 50 | pclients = "8" 51 | } 52 | startport := r.URL.Query().Get("startport") 53 | if startport == "" { 54 | startport = "5200" 55 | } 56 | cleanup := "" 57 | if r.URL.Query().Has("cleanup") { 58 | cleanup = "--cleanup" 59 | } 60 | out, err := healthcheck.RunIperf(workload, pclients, startport, cleanup) 61 | if err != nil { 62 | klog.Error(err.Error()) 63 | } 64 | if out != nil { 65 | w.Write(*out) 66 | } 67 | } 68 | if checks != "" { 69 | if hosts == utils.NodeName { 70 | utils.HealthcheckLock.Lock() 71 | defer utils.HealthcheckLock.Unlock() 72 | out, err := healthcheck.RunHealthLocalNode(checks, dcgmR, jobName, nodelabel, r) 73 | if err != nil { 74 | klog.Error(err.Error()) 75 | } 76 | w.Write(*out) 77 | hasFailures := healthcheck.GetNodeStatus() 78 | klog.Info("Errors after running local, on demand health checks: ", hasFailures) 79 | if hasFailures { 80 | utils.PatchNode(utils.GPUHealthWarnLabel, utils.NodeName, false) 81 | } else 
{ 82 | utils.PatchNode(utils.GPUHealthPassLabel, utils.NodeName, false) 83 | } 84 | 85 | } else { 86 | klog.Info("Asking to run on remote node(s) ", hosts, " or with node label ", nodelabel) 87 | w.Write([]byte("Asking to run on remote node(s) " + hosts + " or with node label " + nodelabel + "\n\n")) 88 | out, err := healthcheck.RunHealthRemoteNodes(hosts, checks, batch, jobName, dcgmR, nodelabel) 89 | if err != nil { 90 | klog.Error(err.Error()) 91 | } 92 | w.Write(*out) 93 | } 94 | } 95 | 96 | } 97 | return http.HandlerFunc(fn) 98 | } 99 | 100 | func PCIeBWHandler() http.Handler { 101 | fn := func(w http.ResponseWriter, r *http.Request) { 102 | w.Write([]byte("Requesting pcie test with bw: " + strconv.Itoa(utils.UserConfig.BWThreshold) + "\n")) 103 | out, err := healthcheck.RunPCIeBW() 104 | if err != nil { 105 | klog.Error(err.Error()) 106 | } 107 | if out != nil { 108 | w.Write(*out) 109 | } 110 | 111 | } 112 | return http.HandlerFunc(fn) 113 | } 114 | 115 | func RemappedRowsHandler() http.Handler { 116 | fn := func(w http.ResponseWriter, r *http.Request) { 117 | w.Write([]byte("Requesting Remapped Rows check on all GPUs\n")) 118 | out, err := healthcheck.RunRemappedRows() 119 | if err != nil { 120 | klog.Error(err.Error()) 121 | } 122 | if out != nil { 123 | w.Write(*out) 124 | } 125 | 126 | } 127 | return http.HandlerFunc(fn) 128 | } 129 | 130 | func PingHandler() http.Handler { 131 | fn := func(w http.ResponseWriter, r *http.Request) { 132 | w.Write([]byte("Ping test")) 133 | hosts := r.URL.Query().Get("host") 134 | if hosts == "" { 135 | hosts = "all" 136 | } 137 | jobName := r.URL.Query().Get("job") 138 | if jobName == "" { 139 | jobName = "None" 140 | } 141 | nodelabel := r.URL.Query().Get("nodelabel") 142 | if nodelabel == "" { 143 | nodelabel = "None" 144 | } 145 | out, err := healthcheck.RunPing(hosts, jobName, nodelabel) 146 | if err != nil { 147 | klog.Error(err.Error()) 148 | } 149 | if out != nil { 150 | w.Write(*out) 151 | } 152 | } 153 | return http.HandlerFunc(fn) 154 | } 155 | 156 | func InvasiveCheckHandler() http.Handler { 157 | fn := func(w http.ResponseWriter, r *http.Request) { 158 | w.Write([]byte("Launching invasive health checks. 
Results will be added to 'autopilot.ibm.com/gpuhealth' and 'autopilot.ibm.com/dcgm.level.3' node labels\n"))
159 |         healthcheck.InvasiveCheck()
160 |     }
161 |     return http.HandlerFunc(fn)
162 | }
163 | 
164 | func IperfHandler() http.Handler {
165 |     fn := func(w http.ResponseWriter, r *http.Request) {
166 | 
167 |         workload := r.URL.Query().Get("workload")
168 |         if workload == "" {
169 |             workload = "ring"
170 |         }
171 |         pclients := r.URL.Query().Get("pclients")
172 |         if pclients == "" {
173 |             pclients = "8"
174 |         }
175 |         startport := r.URL.Query().Get("startport")
176 |         if startport == "" {
177 |             startport = "5200"
178 |         }
179 |         cleanup := ""
180 |         if r.URL.Query().Has("cleanup") {
181 |             cleanup = "--cleanup"
182 |         }
183 |         out, err := healthcheck.RunIperf(workload, pclients, startport, cleanup)
184 |         if err != nil {
185 |             klog.Error(err.Error())
186 |         }
187 |         if out != nil {
188 |             w.Write(*out)
189 |         }
190 |     }
191 |     return http.HandlerFunc(fn)
192 | }
193 | 
194 | func StartIperfServersHandler() http.Handler {
195 |     fn := func(w http.ResponseWriter, r *http.Request) {
196 |         numservers := r.URL.Query().Get("numservers")
197 |         if numservers == "" {
198 |             numservers = "8"
199 |         }
200 |         startport := r.URL.Query().Get("startport")
201 |         if startport == "" {
202 |             startport = "5200"
203 |         }
204 |         out, err := healthcheck.StartIperfServers(numservers, startport)
205 | 
206 |         if err != nil {
207 |             klog.Error(err.Error())
208 |         }
209 |         if out != nil {
210 |             w.Write(*out)
211 |         }
212 |     }
213 |     return http.HandlerFunc(fn)
214 | }
215 | 
216 | func StopAllIperfServersHandler() http.Handler {
217 |     fn := func(w http.ResponseWriter, r *http.Request) {
218 |         out, err := healthcheck.StopAllIperfServers()
219 |         if err != nil {
220 |             klog.Error(err.Error())
221 |         }
222 |         if out != nil {
223 |             w.Write(*out)
224 |         }
225 |     }
226 |     return http.HandlerFunc(fn)
227 | }
228 | 
229 | func StartIperfClientsHandler() http.Handler {
230 |     fn := func(w http.ResponseWriter, r *http.Request) {
231 |         dstip := r.URL.Query().Get("dstip")
232 |         dstport := r.URL.Query().Get("dstport")
233 |         numclients := r.URL.Query().Get("numclients")
234 |         out, err := healthcheck.StartIperfClients(dstip, dstport, numclients)
235 |         if err != nil {
236 |             klog.Error(err.Error())
237 |         }
238 |         if out != nil {
239 |             w.Write(*out)
240 |         }
241 |     }
242 |     return http.HandlerFunc(fn)
243 | }
244 | 
245 | func DCGMHandler() http.Handler {
246 |     fn := func(w http.ResponseWriter, r *http.Request) {
247 |         w.Write([]byte("DCGM test"))
248 |         dcgmR := r.URL.Query().Get("r")
249 |         if dcgmR == "" {
250 |             dcgmR = "1"
251 |         }
252 |         out, err := healthcheck.RunDCGM(dcgmR)
253 |         if err != nil {
254 |             klog.Error(err.Error())
255 |         }
256 |         if out != nil {
257 |             w.Write(*out)
258 |         }
259 |     }
260 |     return http.HandlerFunc(fn)
261 | }
262 | 
263 | func GpuPowerHandler() http.Handler {
264 |     fn := func(w http.ResponseWriter, r *http.Request) {
265 |         w.Write([]byte("GPU Power Measurement test"))
266 |         out, err := healthcheck.RunGPUPower()
267 |         if err != nil {
268 |             klog.Error(err.Error())
269 |         }
270 |         if out != nil {
271 |             w.Write(*out)
272 |         }
273 |     }
274 |     return http.HandlerFunc(fn)
275 | }
276 | 
277 | func GpuMemHandler() http.Handler {
278 |     fn := func(w http.ResponseWriter, r *http.Request) {
279 |         w.Write([]byte("GPU Memory DGEMM+DAXPY test"))
280 |         out, err := healthcheck.RunGPUMem()
281 |         if err != nil {
282 |             klog.Error(err.Error())
283 |         }
284 |         if out != nil {
285 |             w.Write(*out)
286 |         }
287 |     }
288 |     return http.HandlerFunc(fn)
289 | }
290 | 
291 | func PVCHandler() 
http.Handler { 292 | fn := func(w http.ResponseWriter, r *http.Request) { 293 | w.Write([]byte("PVC create-delete test\n")) 294 | out, err := healthcheck.RunCreateDeletePVC() 295 | if err != nil { 296 | klog.Error(err.Error()) 297 | } 298 | if out != nil { 299 | w.Write(*out) 300 | } 301 | } 302 | return http.HandlerFunc(fn) 303 | } 304 | 305 | func ReadinessProbeHandler() http.Handler { 306 | fn := func(w http.ResponseWriter, r *http.Request) { 307 | data := HealthResult{"readinessProbe", "ready"} 308 | w.Header().Set("Content-Type", "application/json") 309 | w.WriteHeader(http.StatusCreated) 310 | json.NewEncoder(w).Encode(data) 311 | } 312 | return http.HandlerFunc(fn) 313 | } 314 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/handler/messagestruct.go: -------------------------------------------------------------------------------- 1 | package handler 2 | 3 | type HealthResult struct { 4 | Name string 5 | Body string 6 | } 7 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/healthcheck/functions.go: -------------------------------------------------------------------------------- 1 | package healthcheck 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "time" 7 | 8 | "github.com/IBM/autopilot/pkg/utils" 9 | corev1 "k8s.io/api/core/v1" 10 | "k8s.io/apimachinery/pkg/api/resource" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | "k8s.io/klog/v2" 13 | ) 14 | 15 | func ListPVC() (string, error) { 16 | pvc, err := utils.GetClientsetInstance().Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Get(context.Background(), utils.PodName, metav1.GetOptions{}) 17 | if err != nil { 18 | klog.Error("Error in creating the lister", err.Error()) 19 | return "ABORT", err 20 | } 21 | switch pvc.Status.Phase { 22 | case "Bound": 23 | { 24 | klog.Info("[PVC Create-Delete] PVC Bound: SUCCESS") 25 | klog.Info("Observation: ", utils.NodeName, " 0") 26 | utils.HchecksGauge.WithLabelValues("pvc", utils.NodeName, utils.CPUModel, utils.GPUModel, "").Set(0) 27 | } 28 | case "Pending": 29 | { 30 | waitonpvc := time.NewTicker(time.Minute) 31 | defer waitonpvc.Stop() 32 | <-waitonpvc.C 33 | pvc, err := utils.GetClientsetInstance().Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Get(context.Background(), utils.PodName, metav1.GetOptions{}) 34 | if err != nil { 35 | klog.Error("[PVC Create-Delete] Error in creating the lister: ", err.Error()) 36 | return "[PVC Create-Delete] PVC not found. ABORT ", err 37 | } 38 | phase := pvc.Status.Phase 39 | if pvc.Status.Phase == "Pending" { 40 | klog.Info("[PVC Create-Delete] Timer is up with PVC Pending. Force delete. FAIL") 41 | klog.Info("Observation: ", utils.NodeName, " 1") 42 | utils.HchecksGauge.WithLabelValues("pvc", utils.NodeName, utils.CPUModel, utils.GPUModel, "").Set(1) 43 | err := deletePVC(utils.PodName) 44 | if err != nil { 45 | return "[PVC Create-Delete] Error in deleting the PVC. ABORT ", err 46 | } 47 | HealthCheckStatus[PVC] = true 48 | return "[PVC Create-Delete] FAIL", nil 49 | } 50 | if phase == "Bound" { 51 | klog.Info("[PVC Create-Delete] PVC Bound: SUCCESS") 52 | klog.Info("Observation: ", utils.NodeName, " 0") 53 | utils.HchecksGauge.WithLabelValues("pvc", utils.NodeName, utils.CPUModel, utils.GPUModel, "").Set(0) 54 | } 55 | } 56 | } 57 | err = deletePVC(utils.PodName) 58 | if err != nil { 59 | return "Error in deleting the PVC. 
ABORT ", err 60 | } 61 | return "[PVC Create-Delete] PVC SUCCESS", nil 62 | } 63 | 64 | func deletePVC(pvc string) error { 65 | cset := utils.GetClientsetInstance() 66 | err := cset.Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Delete(context.TODO(), pvc, metav1.DeleteOptions{}) 67 | if err != nil { 68 | klog.Info("[PVC Delete] Failed. ABORT. ", err.Error()) 69 | } 70 | return err 71 | } 72 | 73 | func createPVC() error { 74 | cset := utils.GetClientsetInstance() 75 | storageclass := os.Getenv("PVC_TEST_STORAGE_CLASS") 76 | pvcTemplate := corev1.PersistentVolumeClaim{ 77 | ObjectMeta: metav1.ObjectMeta{ 78 | Name: utils.PodName, 79 | }, 80 | Spec: corev1.PersistentVolumeClaimSpec{ 81 | StorageClassName: &storageclass, 82 | AccessModes: []corev1.PersistentVolumeAccessMode{ 83 | corev1.ReadWriteMany, 84 | }, 85 | Resources: corev1.VolumeResourceRequirements{ 86 | Requests: corev1.ResourceList{ 87 | "storage": resource.MustParse("100Mi"), 88 | }, 89 | }, 90 | }, 91 | } 92 | // Check if any previous instance exists, cleanup if so 93 | pvc, _ := utils.GetClientsetInstance().Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Get(context.Background(), utils.PodName, metav1.GetOptions{}) 94 | 95 | if pvc.Name != "" { 96 | klog.Info("[PVC Create] Found pre-existing instance. Cleanup ", pvc.Name) 97 | deletePVC(utils.PodName) 98 | waitDelete := time.NewTimer(30 * time.Second) 99 | <-waitDelete.C 100 | } 101 | 102 | _, err := cset.Cset.CoreV1().PersistentVolumeClaims(utils.Namespace).Create(context.TODO(), &pvcTemplate, metav1.CreateOptions{}) 103 | 104 | if err != nil { 105 | klog.Info("[PVC Create] Failed. ABORT. ", err.Error()) 106 | } 107 | return err 108 | } 109 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/healthcheck/global.go: -------------------------------------------------------------------------------- 1 | package healthcheck 2 | 3 | import ( 4 | "os" 5 | "strings" 6 | 7 | "k8s.io/klog/v2" 8 | ) 9 | 10 | type HealthCheck string 11 | 12 | // Holding each test current status to facilitate node labeling 13 | var HealthCheckStatus map[HealthCheck]bool 14 | var defaultPeriodicChecks string = "pciebw,remapped,dcgm,ping,gpupower" 15 | 16 | const ( 17 | Undefined HealthCheck = "" 18 | DCGM HealthCheck = "dcgm" 19 | GPUMem HealthCheck = "gpumem" 20 | GPUPower HealthCheck = "gpupower" 21 | Iperf HealthCheck = "iperf" 22 | PCIeBW HealthCheck = "pciebw" 23 | Ping HealthCheck = "ping" 24 | PVC HealthCheck = "pvc" 25 | RowRemap HealthCheck = "remapped" 26 | ) 27 | 28 | func GetPeriodicChecks() string { 29 | checks, exists := os.LookupEnv("PERIODIC_CHECKS") 30 | if !exists { 31 | klog.Info("Run all periodic health checks\n") 32 | return defaultPeriodicChecks 33 | } 34 | return checks 35 | } 36 | 37 | func InitNodeStatusMap() { 38 | HealthCheckStatus = make(map[HealthCheck]bool) 39 | checklist := GetPeriodicChecks() 40 | for _, v := range strings.Split(checklist, ",") { 41 | klog.Info("Init entry map ", v) 42 | HealthCheckStatus[HealthCheck(v)] = false 43 | } 44 | } 45 | 46 | func GetNodeStatus() bool { 47 | hasFailures := false 48 | for v := range HealthCheckStatus { 49 | hasFailures = hasFailures || HealthCheckStatus[v] 50 | } 51 | return hasFailures 52 | } 53 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/functions.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "errors" 5 | "os" 6 | "strconv" 7 
| 8 | "context" 9 | 10 | "github.com/thanhpk/randstr" 11 | batchv1 "k8s.io/api/batch/v1" 12 | corev1 "k8s.io/api/core/v1" 13 | "k8s.io/apimachinery/pkg/api/resource" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 16 | "k8s.io/apimachinery/pkg/fields" 17 | "k8s.io/apimachinery/pkg/types" 18 | "k8s.io/client-go/kubernetes" 19 | "k8s.io/client-go/rest" 20 | "k8s.io/klog/v2" 21 | resourcehelper "k8s.io/kubectl/pkg/util/resource" 22 | ) 23 | 24 | func GetClientsetInstance() *K8sClientset { 25 | csetLock.Lock() 26 | if k8sClientset == nil { 27 | if k8sClientset == nil { 28 | k8sClientset = &K8sClientset{} 29 | config, err := rest.InClusterConfig() 30 | if err != nil { 31 | panic(err.Error()) 32 | } 33 | k8sClientset.Cset, err = kubernetes.NewForConfig(config) 34 | if err != nil { 35 | panic(err.Error()) 36 | } 37 | } 38 | 39 | } 40 | csetLock.Unlock() 41 | return k8sClientset 42 | } 43 | 44 | func GetNode(nodename string) (*corev1.Node, error) { 45 | cset := GetClientsetInstance() 46 | fieldselector, err := fields.ParseSelector("metadata.name=" + nodename) 47 | if err != nil { 48 | klog.Info("Error in creating the field selector ", err.Error()) 49 | return nil, err 50 | } 51 | instance, err := cset.Cset.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{FieldSelector: fieldselector.String()}) 52 | if err != nil { 53 | klog.Info("Error in creating the watcher ", err.Error()) 54 | return nil, err 55 | } 56 | return &instance.Items[0], nil 57 | } 58 | 59 | // Returns true if GPUs are not currently requested by any workload 60 | func GPUsAvailability() bool { 61 | node, _ := GetNode(NodeName) 62 | nodelabels := node.Labels 63 | if _, found := nodelabels["nvidia.com/gpu.present"]; !found { 64 | klog.Info("At least one GPU busy on node ", NodeName, ". Cannot run invasive health checks.") 65 | return false 66 | } 67 | // Once cleared, list pods using gpus and abort the check if gpus are in use 68 | fieldselector, err := fields.ParseSelector("spec.nodeName=" + NodeName + ",status.phase!=" + string(corev1.PodSucceeded)) 69 | if err != nil { 70 | klog.Info("Error in creating the field selector ", err.Error()) 71 | return false 72 | } 73 | cset := GetClientsetInstance() 74 | pods, err := cset.Cset.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{ 75 | FieldSelector: fieldselector.String(), 76 | }) 77 | if err != nil { 78 | klog.Info("Cannot list pods:", err.Error()) 79 | return false 80 | } 81 | for _, pod := range pods.Items { 82 | podReqs, podLimits := resourcehelper.PodRequestsAndLimits(&pod) 83 | gpuReq := podReqs["nvidia.com/gpu"] 84 | gpuLim := podLimits["nvidia.com/gpu"] 85 | if gpuReq.Value() > 0 || gpuLim.Value() > 0 { 86 | klog.Info("Pod ", pod.Name, " with requests ", gpuReq.Value(), " and limits ", gpuLim.Value(), ". Cannot run invasive health checks.") 87 | return false 88 | } 89 | } 90 | klog.Info("GPUs are free. 
Will run invasive health checks.") 91 | return true 92 | } 93 | 94 | func CreateJob(healthcheck string) error { 95 | var args []string 96 | var cmd []string 97 | switch healthcheck { 98 | case "dcgm": 99 | cmd = []string{"python3"} 100 | args = []string{"gpu-dcgm/entrypoint.py", "-r", "3", "-l", "-v"} 101 | } 102 | cset := GetClientsetInstance() 103 | 104 | fieldselector, err := fields.ParseSelector("metadata.name=" + PodName) 105 | if err != nil { 106 | klog.Info("Error in creating the field selector", err.Error()) 107 | return err 108 | } 109 | pods, err := cset.Cset.CoreV1().Pods("autopilot").List(context.TODO(), metav1.ListOptions{ 110 | FieldSelector: fieldselector.String(), 111 | }) 112 | if err != nil { 113 | klog.Info("Cannot get pod:", err.Error()) 114 | return err 115 | } 116 | autopilotPod := pods.Items[0] 117 | // setting TTL to 30 sec, but looking for used defined value 118 | ttlsec := int32(30) 119 | if os.Getenv("INVASIVE_JOB_TTLSEC") != "" { 120 | val, _ := strconv.Atoi(os.Getenv("INVASIVE_JOB_TTLSEC")) 121 | ttlsec = int32(val) 122 | } 123 | 124 | backofflimits := int32(0) 125 | job := &batchv1.Job{ 126 | ObjectMeta: metav1.ObjectMeta{ 127 | Name: healthcheck + "-" + randstr.Hex(6), 128 | Namespace: autopilotPod.Namespace, 129 | }, 130 | Spec: batchv1.JobSpec{ 131 | TTLSecondsAfterFinished: &ttlsec, 132 | BackoffLimit: &backofflimits, 133 | Template: corev1.PodTemplateSpec{ 134 | Spec: corev1.PodSpec{ 135 | RestartPolicy: "Never", 136 | ServiceAccountName: "autopilot", 137 | NodeName: NodeName, 138 | InitContainers: []corev1.Container{ 139 | { 140 | Name: "init", 141 | Image: autopilotPod.Spec.InitContainers[0].DeepCopy().Image, 142 | ImagePullPolicy: "IfNotPresent", 143 | Command: autopilotPod.Spec.InitContainers[0].DeepCopy().Command, 144 | Args: autopilotPod.Spec.InitContainers[0].DeepCopy().Args, 145 | }, 146 | }, 147 | Containers: []corev1.Container{ 148 | { 149 | Name: "main", 150 | Image: autopilotPod.Spec.Containers[0].DeepCopy().Image, 151 | ImagePullPolicy: "IfNotPresent", 152 | Command: cmd, 153 | Args: args, 154 | Resources: corev1.ResourceRequirements{ 155 | Limits: corev1.ResourceList{ 156 | "nvidia.com/gpu": resource.MustParse("8"), 157 | }, 158 | Requests: corev1.ResourceList{ 159 | "nvidia.com/gpu": resource.MustParse("8"), 160 | }, 161 | }, 162 | Env: []corev1.EnvVar{ 163 | { 164 | Name: "NODE_NAME", 165 | Value: NodeName, 166 | }, 167 | }, 168 | }, 169 | }, 170 | }, 171 | }, 172 | }, 173 | } 174 | klog.Info("Try create Job") 175 | _, err = cset.Cset.BatchV1().Jobs(Namespace).Create(context.TODO(), job, 176 | metav1.CreateOptions{}) 177 | if err != nil { 178 | klog.Info("Couldn't create Job ", err.Error()) 179 | return err 180 | } 181 | klog.Info("Created") 182 | return nil 183 | } 184 | 185 | func PatchNode(label string, nodename string, force bool) error { 186 | cset := GetClientsetInstance() 187 | 188 | // Should not patch the gpuhealth label if it's currently in TESTING or EVICT 189 | node, err := cset.Cset.CoreV1().Nodes().Get(context.TODO(), nodename, v1.GetOptions{}) 190 | if err != nil { 191 | klog.Info("[Node Patch] Failed read node ", err.Error()) 192 | return err 193 | } 194 | labels := node.GetLabels() 195 | if current, found := labels["autopilot.ibm.com/gpuhealth"]; found { 196 | klog.Info("Node ", nodename, " label found ", current) 197 | if current == "TESTING" || current == "EVICT" { 198 | if !force { 199 | klog.Info("Cannot patch node's label, value found: ", current) 200 | return errors.New("Node status " + current) 201 | } else { 202 | 
klog.Info("Force patch for completed testing") 203 | } 204 | } 205 | } else { 206 | klog.Info("No label found, will go ahead patching the node") 207 | } 208 | _, err = cset.Cset.CoreV1().Nodes().Patch(context.TODO(), nodename, types.StrategicMergePatchType, []byte(label), v1.PatchOptions{}) 209 | if err != nil { 210 | klog.Info("[Node Patch] Failed. ", err.Error()) 211 | return err 212 | } 213 | klog.Info("Node patched with label ", label) 214 | return nil 215 | } 216 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/global.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os" 5 | "sync" 6 | 7 | "k8s.io/client-go/kubernetes" 8 | ) 9 | 10 | type InitConfig struct { 11 | BWThreshold int 12 | } 13 | 14 | var UserConfig InitConfig 15 | 16 | type K8sClientset struct { 17 | Cset *kubernetes.Clientset 18 | } 19 | 20 | var k8sClientset *K8sClientset 21 | var csetLock sync.Mutex 22 | 23 | var HealthcheckLock sync.Mutex 24 | 25 | var CPUModel string 26 | var GPUModel string 27 | 28 | var NodeName string = os.Getenv("NODE_NAME") 29 | var Namespace string = os.Getenv("NAMESPACE") 30 | var PodName string = os.Getenv("POD_NAME") 31 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/listwatch.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "strings" 6 | 7 | corev1 "k8s.io/api/core/v1" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | "k8s.io/apimachinery/pkg/fields" 10 | "k8s.io/apimachinery/pkg/watch" 11 | "k8s.io/client-go/tools/cache" 12 | toolswatch "k8s.io/client-go/tools/watch" 13 | "k8s.io/klog/v2" 14 | ) 15 | 16 | func WatchNode() { 17 | watchFunc := func(options metav1.ListOptions) (watch.Interface, error) { 18 | timeout := int64(60) 19 | fieldselector, err := fields.ParseSelector("metadata.name=" + NodeName) 20 | if err != nil { 21 | klog.Info("Error in creating the field selector", err.Error()) 22 | return nil, err 23 | } 24 | instance, err := GetClientsetInstance().Cset.CoreV1().Nodes().Watch(context.Background(), metav1.ListOptions{TimeoutSeconds: &timeout, FieldSelector: fieldselector.String()}) 25 | if err != nil { 26 | klog.Info("Error in creating the watcher", err.Error()) 27 | return nil, err 28 | } 29 | return instance, err 30 | } 31 | 32 | watcher, _ := toolswatch.NewRetryWatcher("1", &cache.ListWatch{WatchFunc: watchFunc}) 33 | 34 | for event := range watcher.ResultChan() { 35 | item := event.Object.(*corev1.Node) 36 | 37 | switch event.Type { 38 | case watch.Modified: 39 | { 40 | key := "autopilot.ibm.com/dcgm.level.3" 41 | labels := item.GetLabels() 42 | if val, found := labels[key]; found { 43 | var res float64 44 | res = 0 45 | if strings.Contains(val, "EVICT") { 46 | res = 1 47 | klog.Info("[DCGM level 3] Update observation: ", NodeName, " Fatal error found") 48 | } 49 | HchecksGauge.WithLabelValues("dcgm", NodeName, CPUModel, GPUModel, "").Set(res) 50 | } 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/nodelabels.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // All GPU tests pass 4 | var GPUHealthPassLabel string = ` 5 | { 6 | "metadata": { 7 | "labels": { 8 | "autopilot.ibm.com/gpuhealth": "PASS" 9 | } 10 | } 11 | } 12 | ` 13 | 14 | // 
At least one GPU test fails. No info about the severity of the failure 15 | var GPUHealthWarnLabel string = ` 16 | { 17 | "metadata": { 18 | "labels": { 19 | "autopilot.ibm.com/gpuhealth": "WARN" 20 | } 21 | } 22 | } 23 | ` 24 | 25 | var GPUHealthEmptyLabel string = ` 26 | { 27 | "metadata": { 28 | "labels": { 29 | "autopilot.ibm.com/gpuhealth": "" 30 | } 31 | } 32 | } 33 | ` 34 | 35 | var GPUHealthTestingLabel string = ` 36 | { 37 | "metadata": { 38 | "labels": { 39 | "autopilot.ibm.com/gpuhealth": "TESTING" 40 | } 41 | } 42 | } 43 | ` 44 | 45 | // Some health check failed. Can be any health check 46 | var NodeHealthWarnLabel string = ` 47 | { 48 | "metadata": { 49 | "labels": { 50 | "autopilot.ibm.com/nodehealth": "WARN" 51 | } 52 | } 53 | } 54 | ` 55 | 56 | var NodeHealthEmptyLabel string = ` 57 | { 58 | "metadata": { 59 | "labels": { 60 | "autopilot.ibm.com/nodehealth": "" 61 | } 62 | } 63 | } 64 | ` 65 | -------------------------------------------------------------------------------- /autopilot-daemon/pkg/utils/prometheus.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os/exec" 5 | "strings" 6 | 7 | "github.com/prometheus/client_golang/prometheus" 8 | "k8s.io/klog/v2" 9 | ) 10 | 11 | var ( 12 | Requests = prometheus.NewCounter( 13 | prometheus.CounterOpts{ 14 | Namespace: "autopilot", 15 | Name: "health_checks_req_total", 16 | Help: "Number of invocations to Autopilot", 17 | }, 18 | ) 19 | 20 | HchecksGauge = prometheus.NewGaugeVec( 21 | prometheus.GaugeOpts{ 22 | Namespace: "autopilot", 23 | Name: "health_checks", 24 | Help: "Summary of the health checks measurements on compute nodes. Gauge Vector version", 25 | }, 26 | []string{"health", "node", "cpumodel", "gpumodel", "deviceid"}, 27 | ) 28 | ) 29 | 30 | func InitMetrics(reg prometheus.Registerer) { 31 | // Register custom metrics with the global prometheus registry 32 | reg.MustRegister(HchecksGauge) 33 | } 34 | 35 | func InitHardwareMetrics() { 36 | // Define CPUModel global variable 37 | cpu := "N/A" 38 | 39 | cmd := "cat /proc/cpuinfo | egrep '^model name' | uniq | awk '{print substr($0, index($0,$4))}'| sed 's/(//; s/)//'" 40 | out, err := exec.Command("bash", "-c", cmd).CombinedOutput() 41 | if err != nil { 42 | klog.Info("Error retrieving cpu model info", err.Error()) 43 | } else { 44 | cpu = strings.TrimSpace(string(out[:])) 45 | } 46 | klog.Info("CPU_MODEL: ", cpu) 47 | CPUModel = cpu 48 | 49 | // Define GPUModel global variable 50 | gpu := "N/A" 51 | 52 | cmd2 := exec.Command("nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader") 53 | out, err = cmd2.CombinedOutput() 54 | if err != nil { 55 | klog.Info("Error retrieving gpu model info", err.Error()) 56 | } else { 57 | tmp := strings.TrimSpace(string(out[:])) 58 | gpu = strings.Split(tmp, "\n")[0] 59 | } 60 | klog.Info("GPU_MODEL: ", gpu) 61 | GPUModel = gpu 62 | } 63 | -------------------------------------------------------------------------------- /autopilot-daemon/utils/briefings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exists=`which nvidia-smi` 3 | if [[ -z $exists ]] 4 | then 5 | echo !! nvidia-smi not present. ABORT. 6 | killall5 7 | fi 8 | 9 | CMD="$(nvidia-smi)" 10 | errors="$(echo ${CMD} | grep -i err)" 11 | if [[ -n $errors ]] 12 | then 13 | echo !! nvidia-smi failed to start. ABORT. 
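# killall5 signals every process in the container (a hard stop, unlike the
# plain `exit` used below), so the pod terminates and the failure surfaces
# to Kubernetes. This reading of the intent is an assumption.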
14 |     killall5
15 | fi
16 | 
17 | CMD="$(nvidia-smi --query-gpu=mig.mode.current --format=csv)"
18 | mig="$(echo ${CMD} | grep Enabled)"
19 | if [[ -n $mig ]]
20 | then
21 |     echo !! MIG enabled. ABORT.
22 |     exit
23 | fi
24 | 
25 | CMD="$(dcgmi --version)"
26 | errors="$(echo ${CMD} | grep -iE 'fail|error')"
27 | if [[ -n $errors ]]
28 | then
29 |     echo !! dcgmi failed to start. ABORT.
30 |     exit
31 | fi
--------------------------------------------------------------------------------
/autopilot-daemon/utils/runHealthchecks.py:
--------------------------------------------------------------------------------
1 | ##################################################################################
2 | # Python program that uses the Python Client Library for Kubernetes to
3 | # run autopilot health checks on all nodes or a specific node(s).
4 | # Health checks include PCIEBW and GPU REMAPPED ROWS.
5 | # Image: us.icr.io/cil15-shared-registry/gracek/run-healthchecks:3.0.1
6 | ##################################################################################
7 | import argparse
8 | import os
9 | import time
10 | import asyncio
11 | import aiohttp
12 | from itertools import islice
13 | import pprint
14 | from kubernetes import client, config
15 | from kubernetes.client.rest import ApiException
16 | from multiprocessing import Pool
17 | 
18 | # load in cluster kubernetes config for access to cluster
19 | config.load_incluster_config()
20 | v1 = client.CoreV1Api()
21 | 
22 | # get arguments for service, namespace, node(s), and check (test type)
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument('--service', type=str, default='autopilot-healthchecks', help='Autopilot healthchecks service name. Default is \"autopilot-healthchecks\".')
25 | 
26 | parser.add_argument('--namespace', type=str, default='autopilot', help='Namespace where autopilot DaemonSet is deployed. Default is \"autopilot\".')
27 | 
28 | parser.add_argument('--nodes', type=str, default='all', help='Node(s) that will run a healthcheck. Can be a comma separated list. Default is \"all\" unless --wkload is provided, then set to None. Specific nodes can be provided in addition to --wkload.')
29 | 
30 | parser.add_argument('--check', type=str, default='all', help='The specific test(s) that will run: \"all\", \"pciebw\", \"dcgm\", \"remapped\", \"ping\", \"gpumem\", \"pvc\" or \"gpupower\". Default is \"all\". Can be a comma separated list.')
31 | 
32 | parser.add_argument('--batchSize', default='0', type=str, help='Number of nodes to check in parallel. Default is set to the number of the worker nodes.')
33 | 
34 | parser.add_argument('--wkload', type=str, default='None', help='Workload node discovery w/ given namespace and label. Ex: \"--wkload=namespace:label-key=label-value\". Default is set to None.')
35 | 
36 | parser.add_argument('--dcgmR', type=str, default='1', help='Run a dcgmi diagnostic. (Note: higher numbered tests include all beneath.)\n\t1 - Quick (System Validation ~ seconds)\n\t2 - Medium (Extended System Validation ~ 2 minutes)\n\t3 - Long (System HW Diagnostics ~ 15 minutes)\n\t4 - Extended (Longer-running System HW Diagnostics)')
37 | 
38 | parser.add_argument('--nodelabel', type=str, default='None', help='Node label to select nodes. Ex: \"label-key=label-value\". Default is set to None.')
39 | 
40 | args = vars(parser.parse_args())
41 | service = args['service']
42 | namespace = args['namespace']
43 | node = args['nodes'].replace(' ', '').split(',') # list of nodes
44 | checks = args['check'].replace(' ', '').split(',') # list of checks
45 | batch_size = int(args['batchSize'])
46 | nodelabel = args['nodelabel']
47 | wkload = args['wkload']
48 | if wkload != 'None':
49 |     wkload = args['wkload'].split(':')
50 |     if '' in wkload:
51 |         print("Invalid job definition, must be namespace:label=value. Got", wkload)
52 |         exit()
53 | 
54 | if ((wkload != "None") or (nodelabel != "None")) and (args['nodes'] == 'all'):
55 |     node = []
56 | 
57 | # debug: runtime
58 | start_time = time.time()
59 | 
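# Example invocations (illustrative; node names and label values are placeholders):
#
#   python3 runHealthchecks.py --check=pciebw,remapped --nodes=worker-1,worker-2
#   python3 runHealthchecks.py --check=dcgm --dcgmR=3 --nodelabel=gpu.model=a100
#   python3 runHealthchecks.py --check=ping --wkload=default:job-name=my-job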
60 | def find_labeled_nodes():
61 |     try:
62 |         labeled_nodes = v1.list_node(label_selector=nodelabel)
63 |     except ApiException as e:
64 |         print("Exception when calling CoreV1Api->list_node: %s\n" % e)
65 |         exit()
66 |     if len(labeled_nodes.items) == 0:
67 |         print("No node is labeled with", nodelabel, "- ABORT.")
68 |         exit()
69 |     for labeled_node in labeled_nodes.items:
70 |         node_name = labeled_node.metadata.name
71 |         if node_name not in node:
72 |             node.append(node_name)
73 | 
74 | # find workload addresses
75 | def find_wkload():
76 |     node_len = len(node)
77 |     copy = False
78 |     wkload_ns = wkload[0] # ex: "default"
79 |     wkload_label = wkload[1] # ex: "job-name=my-job" or "app=my-app"
80 |     try:
81 |         wkload_pods = v1.list_namespaced_pod(namespace=wkload_ns, label_selector=wkload_label)
82 |     except ApiException as e:
83 |         print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e)
84 |         exit()
85 |     print('Workload:', ': '.join(wkload))
86 |     if len(wkload_pods.items) == 0:
87 |         print("No workload labeled with", wkload_label, "- ABORT.")
88 |         exit()
89 |     for pod in wkload_pods.items:
90 |         node_name = pod.spec.node_name
91 |         if node_name not in node:
92 |             node.append(node_name)
93 |         else:
94 |             copy = True
95 |     if (len(node) == node_len) and not copy:
96 |         print('Error: Issue with --wkload parameter.\nMake sure your workload is spelled correctly and exists in the cluster. 
ABORT') 97 | exit() 98 | 99 | 100 | # get addresses in desired endpointslice (autopilot-healthchecks) based on which node(s) the user chooses 101 | def get_addresses(): 102 | global server_address 103 | server_address = '' 104 | try: 105 | endpoints = v1.list_namespaced_endpoints(namespace=namespace) 106 | except ApiException as e: 107 | print("Exception when calling CoreV1Api->list_namespaced_endpoints: %s\n" % e) 108 | exit() 109 | for endpointslice in endpoints.items: 110 | if endpointslice.metadata.name == service: 111 | # print("EndpointSlice: " + str(endpointslice.metadata.name)) 112 | addresses = endpointslice.subsets[0].addresses 113 | if node[0] == 'all': 114 | # server_address = [addresses[0], addresses[len(addresses)-1]] 115 | return addresses 116 | else: 117 | address_list = [] 118 | for address in addresses: 119 | if address.node_name in node: 120 | address_list.append(address) 121 | else: 122 | server_address = address 123 | if len(address_list) > 0: 124 | return address_list 125 | # if server_address == '': # when all nodes are being tested / there's only one node 126 | # print('Iperf test cannot be completed') 127 | 128 | # create url for test 129 | def create_url(address, daemon_node): 130 | urls = [] 131 | for check in checks: 132 | if check == 'all': 133 | urls.append('http://' + str(address.ip) + ':3333/status?host=' + daemon_node) 134 | return urls 135 | extra_params = "" 136 | if "ping" in args['check']: 137 | if args['wkload'] != 'None': 138 | extra_params += "&job=" + args['wkload'] 139 | if nodelabel != 'None': 140 | extra_params += "&nodelabel=" + nodelabel 141 | if args['nodes'] != 'all' : 142 | extra_params += "&pingnodes=" + args['nodes'] 143 | if "dcgm" in args['check']: 144 | extra_params += "&r=" + args['dcgmR'] 145 | urls.append('http://' + str(address.ip) + ':3333/status?host=' + daemon_node + '&check=' + args['check'] + extra_params) 146 | return urls 147 | 148 | # check and print status of each node 149 | def get_node_status(responses): 150 | node_status_list = [] 151 | for response in responses: 152 | response_list = response.split('\n') 153 | for line in response_list: 154 | if (('FAIL' in line) or ('ABORT' in line)): 155 | if ('PCIE' in line): 156 | node_status_list.append('PCIE Failed') 157 | elif('REMAPPED ROWS' in line): 158 | node_status_list.append('REMAPPED ROWS Failed') 159 | elif('DCGM' in line): 160 | node_status_list.append('DCGM Failed') 161 | elif('GPU POWER' in line): 162 | node_status_list.append('GPU POWER Failed') 163 | elif('PING' in line): 164 | node_status_list.append('PING Failed') 165 | elif('GPU-MEM' in line): 166 | node_status_list.append("GPU MEM Test Failed") 167 | elif('PVC' in line): 168 | node_status_list.append("PVC Create-Delete Test Failed") 169 | elif('Disconnected' in line): 170 | node_status_list.append('Connection to Server Failed') 171 | 172 | if len(node_status_list) < 1: 173 | node_status_list.append('OK') 174 | return node_status_list 175 | 176 | async def makeconnection(address): 177 | daemon_node = str(address.node_name) 178 | pid = os.getpid() 179 | url = create_url(address, daemon_node) 180 | output = '\nAutopilot Endpoint: {ip}\nNode: {daemon_node}\nurl(s): {url}'.format(ip=address.ip, daemon_node=daemon_node, url='\n '.join(url)) 181 | print(f"Initiated connection to {url}.") 182 | total_timeout=aiohttp.ClientTimeout(total=60*60*24) 183 | try: 184 | async with aiohttp.ClientSession(timeout=total_timeout) as session: 185 | async with session.get(url[0]) as resp: 186 | reply = await resp.text() 187 | except 
aiohttp.client_exceptions.ServerDisconnectedError: 188 | print("Server Disconnected") 189 | reply = "Server Disconnected. ABORT" 190 | 191 | response=[reply] 192 | node_status_list = get_node_status(response) 193 | output += '\nResponse:\n{response}\nNode Status: {status}\n-------------------------------------\n'.format(response='~~\n'.join(response), status=', '.join(node_status_list)) 194 | # output += "\n-------------------------------------\n" # separator 195 | return output, pid, daemon_node, node_status_list 196 | 197 | 198 | async def main(addresses): 199 | res = await asyncio.gather(*(makeconnection(addr) for addr in addresses)) 200 | return res 201 | 202 | def batch_of_nodes(nodelist, batch_size): 203 | it = iter(nodelist) 204 | while True: 205 | batch = list(islice(it, batch_size)) 206 | if not batch: 207 | break 208 | yield batch 209 | 210 | # start program 211 | if __name__ == "__main__": 212 | # initializing some variables 213 | if wkload != 'None': 214 | find_wkload() 215 | if nodelabel != 'None': 216 | find_labeled_nodes() 217 | addresses = get_addresses() 218 | total_nodes = len(addresses) 219 | node_status = {} # updates after each node is tested 220 | pids_tups = [] # debug: process list 221 | pids_dict = {} # debug: process list 222 | 223 | if batch_size == 0 or batch_size > total_nodes: 224 | batch_size = total_nodes 225 | asyncres = [] 226 | 227 | for b in batch_of_nodes(addresses, batch_size): 228 | asyncres.extend(asyncio.run(main(b))) 229 | 230 | for result, pid, daemon_node, node_status_list in asyncres: 231 | pids_tups.append((pid, daemon_node)) 232 | node_status[daemon_node] = node_status_list 233 | print(result) 234 | 235 | print("Node Summary:\n") 236 | pprint.pprint(node_status) 237 | 238 | # debug: print each process with the nodes they ran 239 | # for p, n in pids_tups: 240 | # pids_dict.setdefault(p, []).append(n) 241 | # print("\n~~~DEBUGGING BELOW~~~\nProcesses (randomly ordered) and the nodes they ran (process:[nodes]):") 242 | # pprint.pprint(pids_dict, width=1) 243 | 244 | # print runtime 245 | print('\nruntime:', str(time.time() - start_time), 'sec') 246 | -------------------------------------------------------------------------------- /figures/autopilot-daemon-pod.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.pdf -------------------------------------------------------------------------------- /figures/autopilot-daemon-pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.png -------------------------------------------------------------------------------- /figures/autopilot-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-logo.png -------------------------------------------------------------------------------- /figures/autopilot-main-loop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-main-loop.pdf -------------------------------------------------------------------------------- /figures/autopilot-main-loop.svg: 
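A typical invocation of the script above, as a minimal sketch (the flag names mirror the `argparse` options read into `args` above; the service and namespace values are the defaults used elsewhere in this repository, and `--batchSize=1` is an illustrative choice, not a required value):

```bash
# Run every health check on every node, contacting one Autopilot pod at a time.
# Assumes a reachable cluster and a configured kubernetes Python client.
python3 utils/runHealthchecks.py \
    --service=autopilot-healthchecks \
    --namespace=autopilot \
    --nodes=all \
    --check=all \
    --batchSize=1
```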
--------------------------------------------------------------------------------
/figures/autopilot-daemon-pod.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.pdf
--------------------------------------------------------------------------------
/figures/autopilot-daemon-pod.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-daemon-pod.png
--------------------------------------------------------------------------------
/figures/autopilot-logo.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-logo.png
--------------------------------------------------------------------------------
/figures/autopilot-main-loop.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/autopilot-main-loop.pdf
--------------------------------------------------------------------------------
/figures/autopilot-main-loop.svg:
--------------------------------------------------------------------------------
 [SVG diagram; recoverable text labels only: "Autopilot Main Loop"; "Run Health Checks"; "Check GPUs Availability"; "Periodic / On Demand"; "Invasive"; "Label nodes"; "Via Job"; "allows"; "Look at node resource request/limit"; "Via Autopilot Pod"]
--------------------------------------------------------------------------------
/figures/big-picture.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/big-picture.pdf
--------------------------------------------------------------------------------
/figures/big-picture.svg:
--------------------------------------------------------------------------------
 [SVG diagram; recoverable text labels only: "On-demand Evaluation using node/Job label through Autopilot"; "Workload lifetime"; "Continuous system evaluation through Autopilot"; "Migration decision"; "Placement decision"; "Scheduler or opinionated tool (e.g., CodeFlare)"]
--------------------------------------------------------------------------------
/figures/invasive-check-flow.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/invasive-check-flow.pdf
--------------------------------------------------------------------------------
/figures/periodic-check-flow.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/IBM/autopilot/a475e783455abaeecb7281cbfdfb9bf5353f6398/figures/periodic-check-flow.pdf
--------------------------------------------------------------------------------
/helm-charts/autopilot/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: autopilot
3 | description: A Helm chart for Kubernetes
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: v2.1.3
19 |
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | appVersion: latest
25 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/README.md:
--------------------------------------------------------------------------------
1 | # Helm Chart Customization
2 |
3 | ## Latest tag
4 |
5 | At every PR merge, we automatically build the `latest` tag, which can be pulled by using `quay.io/autopilot/autopilot:latest`.
6 |
7 | This tag contains the latest changes and must be considered a dev image. For stable releases, always refer to the published ones.
8 |
9 | ## Customize Helm chart
10 |
11 | Autopilot is set to run on NVIDIA GPU nodes. It can also run on heterogeneous clusters (i.e., a mix of CPU-only and GPU nodes), on GPU nodes only, or on CPU nodes only.
12 |
13 | ```yaml
14 | onlyOnGPUNodes: true
15 | ```
16 |
17 | Running on GPU nodes only will:
18 |
19 | 1) add the `nvidia.com/gpu.present: 'true'` label, and
20 | 2) enable the init container, which checks that the NVIDIA device plug-in is set up
21 |
22 | Alternatively, `onlyOnGPUNodes` can be set to false and Autopilot will run on all worker nodes, regardless of their accelerators.
23 | Notice that, in this heterogeneous case, the GPU health checks will error out on the non-GPU nodes.
24 |
25 | - Autopilot runs tests periodically. The default is every hour for regular diagnostics and every 4 hours for deep diagnostics, but these can be customized by changing the following:
26 |
27 | ```yaml
28 | repeat: # periodic health checks timer (default 1h)
29 | invasive: # deeper diagnostic timer (default 4h, 0 to disable)
30 | ```
31 |
32 | - The list of GPU errors considered fatal as a result of a dcgmi run can be customized through the `DCGM_FATAL_ERRORS` environment variable. This is used to label nodes with extra WARN/EVICT labels. The list defaults to [PCIe,NVLink,ECC,GPU Memory] and refers to https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3
33 |
34 | ```yaml
35 | - name: "DCGM_FATAL_ERRORS"
36 | value: ""
37 | ```
38 |
39 | - Invasive checks (e.g., dcgm level 3) are executed as a separate Job. The Job deletes itself by default after 30s. This can be customized through the environment variable below:
40 |
41 | ```yaml
42 | - name: "INVASIVE_JOB_TTLSEC"
43 | value: ""
44 | ```
45 |
46 | - The PCIe bandwidth critical value defaults to 4GB/s. It is recommended to set a threshold that is 25% or lower of the expected peak PCIe bandwidth capability, which maps to the maximum peak dropping from 16 lanes to 4 lanes. For example, for a PCIe Gen4x16, the reported peak bandwidth is 63GB/s. A degradation to 25% is 15.75GB/s, which corresponds to PCIe Gen4x4. The measured bandwidth is expected to be at least 80% of the expected peak PCIe generation bandwidth.
47 |
48 | ```yaml
49 | PCIeBW:
50 | ```
51 |
52 | - If secondary NICs are available through, for instance, Multus or the Multi-NIC Operator, they can be enabled in Autopilot by setting the following:
53 |
54 | ```yaml
55 | annotations:
56 | k8s.v1.cni.cncf.io/networks:
57 | ```
58 |
59 | - The list of periodic health checks can be customized through an environment variable. In the example below, we select all health checks and specify the storage class for the `pvc` test.
60 |
61 | If running on CPU nodes only, `pciebw`, `remapped`, `dcgm`, and `gpupower` can be removed.
62 |
63 | ```yaml
64 | env:
65 | - name: "PERIODIC_CHECKS"
66 | value: "pciebw,remapped,dcgm,ping,gpupower,pvc"
67 | - name: "PVC_TEST_STORAGE_CLASS"
68 | value: "example-storage-class"
69 | ```
70 |
71 | All these values can be saved in a `config.yaml` file (a sketch is shown at the end of this README).
72 |
73 | ## Install
74 |
75 | If you have your own configuration file, it can be passed to the `helm` install command with the `-f` parameter. If you want to install with the default values, just omit the parameter.
76 |
77 | ```bash
78 | helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace <-f your-config.yml>
79 | ```
80 |
81 | For more customization, please refer to `values.yaml`.
82 |
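For convenience, here is a minimal sketch of such a `config.yaml`, written and installed from the shell. The values shown simply restate the defaults documented in this README and in `values.yaml`; the file name is arbitrary:

```bash
# Sketch only: collect a few of the values described above into config.yaml,
# then hand it to helm with -f. The values shown are the documented defaults.
cat > config.yaml <<'EOF'
onlyOnGPUNodes: true
repeat: 1     # periodic health checks timer, in hours
invasive: 4   # deeper diagnostics timer, in hours (0 to disable)
PCIeBW: 4     # PCIe bandwidth threshold, in GB/s
env:
  - name: "PERIODIC_CHECKS"
    value: "pciebw,remapped,dcgm,ping,gpupower"
EOF
helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f config.yaml
```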
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Autopilot DaemonSet deployed.
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "mutating-webhook.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 |
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "mutating-webhook.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 |
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "mutating-webhook.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 |
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "mutating-webhook.labels" -}}
37 | helm.sh/chart: {{ include "mutating-webhook.chart" . }}
38 | {{ include "mutating-webhook.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 |
45 | {{/*
46 | Selector labels
47 | */}}
48 | {{- define "mutating-webhook.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "mutating-webhook.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 |
53 | {{/*
54 | Create the name of the service account to use
55 | */}}
56 | {{- define "mutating-webhook.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "mutating-webhook.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 |
64 | {{/*
65 | Create the name of the namespace to use
66 | */}}
67 | {{- define "mutating-webhook.namespaceName" -}}
68 | {{- if .Values.namespace.create }}
69 | {{- default (include "mutating-webhook.fullname" .) .Values.namespace.name }}
70 | {{- end }}
71 | {{- end }}
72 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/autopilot.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 |   labels:
5 |     app: autopilot
6 |   name: {{ printf "%s" .Chart.Name }}
7 | spec:
8 |   selector:
9 |     matchLabels:
10 |       app: autopilot
11 |   template:
12 |     metadata:
13 |       annotations:
14 |         {{- toYaml .Values.annotations | nindent 8 }}
15 |       labels:
16 |         app: autopilot
17 |     spec:
18 |       {{- if .Values.affinity }}
19 |       affinity:
20 |         {{- toYaml .Values.affinity | nindent 8 }}
21 |       {{- end}}
22 |       nodeSelector:
23 |       {{- if .Values.nodeSelector }}
24 |         {{- toYaml .Values.nodeSelector | nindent 8 }}
25 |       {{- end}}
26 |       {{- if .Values.onlyOnGPUNodes }}
27 |         nvidia.com/gpu.present: 'true'
28 |       {{- end}}
29 |       serviceAccountName: autopilot
30 |       {{- if .Values.pullSecrets.create }}
31 |       imagePullSecrets:
32 |       - name: {{ .Values.pullSecrets.name }}
33 |       {{- end}}
34 |       {{- if .Values.onlyOnGPUNodes }}
35 |       initContainers:
36 |       - args:
37 |         - until [ -f /usr/bin/nvidia-smi ]; do echo waiting for nvidia device plug-in to be setup; sleep 5 && exit -1; done
38 |         command:
39 |         - sh
40 |         - -c
41 |         image: {{ .Values.image.repository }}:{{ default .Chart.AppVersion .Values.image.tag }}
42 |         imagePullPolicy: Always
43 |         name: device-plugin-validation
44 |         securityContext:
45 |           runAsNonRoot: true
46 |           runAsUser: 1000910000
47 |       {{- end}}
48 |       containers:
49 |       - image: {{ .Values.image.repository }}:{{ default .Chart.AppVersion .Values.image.tag }}
50 |         command:
51 |         - sh
52 |         - -c
53 |         - |
54 |           /usr/local/bin/autopilot --port {{ .Values.service.port }} --loglevel={{ .Values.loglevel }} --bw {{ .Values.PCIeBW }} --w {{ .Values.repeat }} --invasive-check-timer {{ .Values.invasive }}
55 |         imagePullPolicy: {{ .Values.image.pullPolicy }}
56 |         name: autopilot
57 |         securityContext:
58 |           runAsNonRoot: true
59 |           runAsUser: 1000910000
60 |           capabilities:
61 |             add:
62 |             - NET_RAW
63 |             - NET_ADMIN
64 |         env:
65 |         {{- range .Values.env }}
66 |         - name: {{ .name }}
67 |           value: {{ .value | quote}}
68 |         {{- end }}
69 |         - name: "NODE_NAME"
70 |           valueFrom:
71 |             fieldRef:
72 |               fieldPath: spec.nodeName
73 |         - name: "NAMESPACE"
74 |           valueFrom:
75 |             fieldRef:
76 |               fieldPath: metadata.namespace
77 |         - name: "POD_NAME"
78 |           valueFrom:
79 |             fieldRef:
80 |               fieldPath: metadata.name
81 |         ports:
82 |         - containerPort: {{ .Values.service.port }}
83 |           name: healthcheck
84 |         - containerPort: 8081
85 |           name: http
86 |         - containerPort: 8080
87 |           name: readinessprobe
88 |         readinessProbe:
89 |           httpGet:
90 |             path: /readinessprobe
91 |             port: 8080
92 |           initialDelaySeconds: 15
93 |           periodSeconds: 120
94 |           timeoutSeconds: 10
95 |         livenessProbe:
96 |           initialDelaySeconds: 15
97 |           periodSeconds: 120
98 |           timeoutSeconds: 15
99 |           {{- if .Values.onlyOnGPUNodes }}
100 |           exec:
101 |             command:
102 |             - nvidia-smi
103 |           {{- else }}
104 |           httpGet:
105 |             path: /readinessprobe
106 |             port: 8080
107 |           {{- end}}
108 |         resources:
109 |           {{- toYaml .Values.resources | nindent 12 }}
110 |         volumeMounts:
111 |         {{- if .Values.additionalVolumeMounts }}
112 |           {{- toYaml .Values.additionalVolumeMounts | nindent 12 }}
113 |         {{- end }}
114 |       volumes:
115 |       {{- if .Values.additionalVolumeClaimTemplates }}
116 |         {{- toYaml .Values.additionalVolumeClaimTemplates | nindent 8 }}
117 |       {{- end}}
118 |
119 |
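Once the chart is installed (see the README above), a quick, hypothetical way to verify that this DaemonSet is running: the DaemonSet takes the chart name (`autopilot`) and its pods carry the `app: autopilot` label, so something like the following should work:

```bash
# Confirm the DaemonSet exists and its pods are scheduled on the expected nodes.
kubectl get daemonset autopilot -n autopilot
kubectl get pods -n autopilot -l app=autopilot -o wide
```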
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/metrics_service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 |   labels:
5 |     app: autopilot
6 |   name: autopilot-metrics-service
7 | spec:
8 |   ports:
9 |   - name: http
10 |     port: 8081
11 |     protocol: TCP
12 |     targetPort: http
13 |   selector:
14 |     app: autopilot
15 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/pullsecret.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.pullSecrets.create -}}
2 | apiVersion: v1
3 | data:
4 |   .dockerconfigjson: {{ .Values.pullSecrets.imagePullSecretData }}
5 | kind: Secret
6 | metadata:
7 |   name: {{ .Values.pullSecrets.name }}
8 | type: kubernetes.io/dockerconfigjson
9 | {{- end}}
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 |   labels:
5 |     app: autopilot
6 |   name: autopilot-healthchecks
7 |   annotations:
8 |     {{- toYaml .Values.serviceAnnotations | nindent 4 }}
9 | spec:
10 |   ports:
11 |   - port: {{ .Values.service.port }}
12 |     protocol: TCP
13 |     name: healthcheck
14 |   selector:
15 |     app: autopilot
16 | ---
17 | apiVersion: v1
18 | kind: Service
19 | metadata:
20 |   labels:
21 |     app: autopilot
22 |   name: autopilot-readinessprobe
23 | spec:
24 |   ports:
25 |   - port: 8080
26 |     protocol: TCP
27 |     name: readinessprobe
28 |   selector:
29 |     app: autopilot
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | {{ if .Capabilities.APIVersions.Has "security.openshift.io/v1" -}}
2 | kind: SecurityContextConstraints
3 | apiVersion: security.openshift.io/v1
4 | metadata:
5 |   name: scc-autopilot
6 | allowPrivilegedContainer: true
7 | runAsUser:
8 |   type: RunAsAny
9 | seLinuxContext:
10 |   type: RunAsAny
11 | fsGroup:
12 |   type: RunAsAny
13 | supplementalGroups:
14 |   type: RunAsAny
15 | users:
16 | - system:serviceaccount:{{ .Release.Namespace }}:autopilot
17 | allowedCapabilities:
18 | - 'NET_RAW'
19 | - 'NET_ADMIN'
20 | volumes:
21 | - configMap
22 | - csi
23 | - downwardAPI
24 | - emptyDir
25 | - ephemeral
26 | - hostPath
27 | - persistentVolumeClaim
28 | - projected
29 | - secret
30 | {{ end -}}
31 | ---
32 | apiVersion: v1
33 | kind: ServiceAccount
34 | metadata:
35 |   name: autopilot
36 | ---
37 | apiVersion: rbac.authorization.k8s.io/v1
38 | kind: ClusterRole
39 | metadata:
40 |   name: autopilot
41 | rules:
42 | - apiGroups: [""]
43 |   resources: ["endpoints"]
44 |   verbs: ["get", "list"]
45 | - apiGroups: [""]
46 |   resources: ["pods"]
47 |   verbs: ["get", "list"]
48 | - apiGroups: ["batch"]
49 |   resources: ["jobs"]
50 |   verbs: ["get", "list", "create"]
51 | - apiGroups: [""]
52 |   resources: ["nodes"]
53 |   verbs: ["list", "get", "patch", "watch"]
54 | - apiGroups: ["apps"]
55 |   resources: ["daemonsets"]
56 |   verbs: ["list", "get"]
57 | - apiGroups: [""]
58 |   resources: ["persistentvolumeclaims"]
59 |   verbs: ["list", "get", "create", "delete"]
60 | ---
61 | apiVersion: rbac.authorization.k8s.io/v1
62 | kind: ClusterRoleBinding
63 | metadata:
64 |   name: autopilot
65 | subjects:
66 | - kind: ServiceAccount
67 |   namespace: {{ .Release.Namespace }}
68 |   name: autopilot
69 | roleRef:
70 |   kind: ClusterRole
71 |   name: autopilot
72 |   apiGroup: rbac.authorization.k8s.io
73 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/templates/servicemonitor.yaml:
--------------------------------------------------------------------------------
1 | # Prometheus Monitor Service (Metrics)
2 | {{ if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" -}}
3 | apiVersion: monitoring.coreos.com/v1
4 | kind: ServiceMonitor
5 | metadata:
6 |   labels:
7 |     app: autopilot
8 |     app.kubernetes.io/name: servicemonitor
9 |     app.kubernetes.io/component: metrics
10 |     release: prometheus
11 |   name: autopilot-metrics-monitor
12 | spec:
13 |   endpoints:
14 |   - path: /metrics
15 |     port: http
16 |     scheme: http
17 |   selector:
18 |     matchLabels:
19 |       app: autopilot
20 | {{ end -}}
21 |
--------------------------------------------------------------------------------
/helm-charts/autopilot/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for the Autopilot DaemonSet.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 | image:
5 |   repository: quay.io/autopilot/autopilot
6 |   pullPolicy: Always
7 |
8 | # Bandwidth threshold below which PCIe links are considered defective (GB/s)
9 | # It is recommended to set a threshold that is 25% or lower of the expected peak PCIe bandwidth capability, which maps to the maximum peak dropping from 16 lanes to 4 lanes. For example, for a PCIe Gen4x16, the reported peak bandwidth is 63GB/s. A degradation to 25% is 15.75GB/s, which corresponds to PCIe Gen4x4. The measured bandwidth is expected to be at least 80% of the expected peak PCIe generation bandwidth.
10 | PCIeBW: 4
11 |
12 | # Timer for periodic checks, in hours
13 | repeat: 1
14 |
15 | # Timer for periodic invasive checks, in hours (e.g., dcgmi diag -r 3). Set to 0 to disable (for non-NVIDIA GPU systems)
16 | invasive: 4
17 |
18 | # Image pull secret if the image is in a private repository
19 | pullSecrets:
20 |   create: false
21 |   name: autopilot-pull-secret
22 |   imagePullSecretData:
23 |
24 | env:
25 |   # List of periodic checks to be executed every `repeat` hours.
26 |   # If not running on GPU nodes, pciebw, remapped, dcgm, and gpupower can be removed
27 |   - name: "PERIODIC_CHECKS"
28 |     value: "pciebw,remapped,dcgm,ping,gpupower"
29 |   # Storage class name to test
30 |   - name: "PVC_TEST_STORAGE_CLASS"
31 |     value: ""
32 |   # List of GPU errors considered fatal, as a result of a dcgmi run. This is used to label nodes with extra WARN/EVICT labels. The list defaults to [PCIe,NVLink,ECC,GPU Memory] and refers to https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#id3
33 |   - name: "DCGM_FATAL_ERRORS"
34 |     value: ""
35 |   # Invasive checks (e.g., dcgm level 3) are executed as a separate Job. The Job deletes itself by default after 30s. This can be customized through the env variable below
36 |   - name: "INVASIVE_JOB_TTLSEC"
37 |     value: ""
38 |
39 | service:
40 |   port: 3333
41 |
42 | annotations:
43 |   # k8s.v1.cni.cncf.io/networks: multi-nic-network
44 |
45 | nodeSelector:
46 |   # nvidia.com/gpu.present: 'true'
47 |   # nvidia.com/mig.config: 'all-disabled'
48 |
49 | affinity:
50 |
51 | # Running on GPU nodes only, will:
52 | # 1) add the `nvidia.com/gpu.present: 'true'` label and
53 | # 2) enable the init container, which checks that the NVIDIA device plug-in is set up
54 | onlyOnGPUNodes: true
55 |
56 | resources:
57 |   # We advise not to set CPU and memory limits. DCGM requires several GB of memory to run, and limits may cause the pod to be OOMKilled.
58 |   limits:
59 |     nvidia.com/gpu: 0
60 |   requests:
61 |     nvidia.com/gpu: 0
62 |
63 | # klog configuration
64 | loglevel: 2
65 | # logfile: "/home/autopilot/data/report.log"
66 |
67 | # optional remote storage. A PVC and secret must exist
68 | additionalVolumeClaimTemplates:
69 |   # - name: logdir
70 |   #   persistentVolumeClaim:
71 |   #     claimName: my-pvc
72 |   # - name: autopilot-tls-secret
73 |   #   secret:
74 |   #     secretName: autopilot-webhook
75 | additionalVolumeMounts:
76 |   # - name: autopilot-tls-secret
77 |   #   mountPath: "/etc/autopilot-tls-secret/tls"
78 |   #   readOnly: true
79 |   # - mountPath: /data
80 |   #   name: logdir
81 |
--------------------------------------------------------------------------------
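As a final sketch, the health-check endpoint exposed on `service.port` (3333) can also be probed manually. The `check` and `host` query parameters mirror `create_url()` in `utils/runHealthchecks.py` above; the node name below is a placeholder:

```bash
# Port-forward the autopilot-healthchecks Service locally, then request a single check.
kubectl -n autopilot port-forward svc/autopilot-healthchecks 3333:3333 &
curl "http://127.0.0.1:3333/status?check=pciebw&host=<node-name>"
```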