├── .codespell_exclude_lines.txt ├── .github ├── dependabot.yml └── workflows │ ├── docs-build-pr.yaml │ ├── docs-build.yaml │ ├── docs-preview-pr.yaml │ └── docs-remove-stale-reviews.yaml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── assets ├── NVIDIA_Horizontal_Logo_RGBBlack.png ├── NVLogo_H_B&W.png ├── NVLogo_H_B_W.png ├── favicon.ico ├── nvidia-logo-white.png └── nvidia_logo.png ├── container-toolkit ├── arch-overview.md ├── assets │ ├── nvidia-containerd-arch.png │ ├── nvidia-crio-lxc-arch.png │ ├── nvidia-docker-arch-new.png │ ├── nvidia-docker-arch.png │ └── runtime-architecture.png ├── cdi-support.md ├── docker-specialized.md ├── index.md ├── install-guide.md ├── output │ └── nvidia-smi.txt ├── release-notes.md ├── sample-workload.md ├── supported-platforms.md ├── troubleshooting.md ├── versions.json └── versions1.json ├── contents.rst ├── css └── custom.css ├── deps └── repo-deps.packman.xml ├── docker └── Dockerfile ├── driver-containers ├── graphics │ ├── driver-container-demo.gif │ └── nvidia-driver-container-image.png ├── redirected.rst ├── versions.json └── versions1.json ├── edge ├── anthos-guide.rst ├── graphics │ └── anthos │ │ └── virt │ │ ├── image01.png │ │ ├── image02.png │ │ ├── image03.png │ │ ├── image04.png │ │ ├── image05.png │ │ ├── image06.png │ │ ├── image07.png │ │ ├── image08.png │ │ ├── image09.png │ │ ├── image10.png │ │ ├── image11.png │ │ ├── image12.png │ │ ├── image13.png │ │ ├── image14.png │ │ ├── image15.png │ │ ├── image16.png │ │ └── image17.png ├── index.rst ├── nvidia-gpu-with-device-edge.rst ├── versions.json └── versions1.json ├── gpu-operator ├── amazon-eks.rst ├── cdi.rst ├── custom-driver-params.rst ├── dra-cds.rst ├── dra-gpus.rst ├── dra-intro-install.rst ├── getting-started.rst ├── google-gke.rst ├── gpu-driver-configuration.rst ├── gpu-driver-upgrades.rst ├── gpu-operator-kubevirt.rst ├── gpu-operator-mig.rst ├── gpu-operator-rdma.rst ├── gpu-sharing.rst ├── graphics │ ├── gpu-op-confidential-containers.svg │ ├── gpu-operator-demo.gif │ ├── nvd-basics.svg │ ├── nvidia-gpu-operator-image.jpg │ └── upgrade-controller-state-machine.png ├── index.rst ├── install-gpu-operator-air-gapped.rst ├── install-gpu-operator-gov-ready.rst ├── install-gpu-operator-nvaie.rst ├── install-gpu-operator-outdated-kernels.rst ├── install-gpu-operator-proxy.rst ├── install-gpu-operator-service-mesh.rst ├── install-gpu-operator-vgpu.rst ├── life-cycle-policy.rst ├── manifests │ ├── input │ │ ├── amazon-eks-cluster-config.yaml │ │ ├── custom-mig-config.yaml │ │ ├── google-gke-gpu-operator-quota.yaml │ │ ├── gpu-direct-rdma-demo-pod-1.yaml │ │ ├── gpu-direct-rdma-demo-pod-2.yaml │ │ ├── mig-cm-values.yaml │ │ ├── nvd-all.yaml │ │ ├── nvd-demo-gold.yaml │ │ ├── nvd-driver-multiple.yaml │ │ ├── nvd-precompiled-all.yaml │ │ ├── nvd-precompiled-some.yaml │ │ ├── tf-notebook.yaml │ │ ├── time-slicing-config-all.yaml │ │ ├── time-slicing-config-fine.yaml │ │ ├── time-slicing-config-sample.yaml │ │ └── time-slicing-verification.yaml │ └── output │ │ ├── cdi-get-pods-restart.txt │ │ ├── common-cuda-vectoradd-logs.txt │ │ ├── mig-get-pods.txt │ │ ├── mig-mixed-node-labels.json │ │ ├── mig-mixed-nvidia-smi.txt │ │ ├── mig-node-labels.json │ │ ├── mig-nvidia-smi.txt │ │ ├── precomp-driver-conventional-running.txt │ │ ├── precomp-driver-running.txt │ │ ├── precomp-driver-terminating.txt │ │ ├── time-slicing-get-events.txt │ │ ├── time-slicing-get-pods.txt │ │ └── time-slicing-logs-pods.txt ├── 
microsoft-aks.rst ├── overview.rst ├── platform-support.rst ├── precompiled-drivers.rst ├── release-notes.rst ├── security.rst ├── troubleshooting.rst ├── uninstall.rst ├── upgrade.rst ├── versions.json └── versions1.json ├── gpu-telemetry ├── about-telemetry.rst ├── dcgm-exporter.rst ├── graphics │ ├── dcgm-e2e │ │ ├── 001-dcgm-e2e-prom-screenshot.png │ │ ├── 002-dcgm-e2e-grafana-screenshot.png │ │ ├── 003-dcgm-e2e-grafana-home-screenshot.png │ │ ├── 004-dcgm-e2e-grafana-manage-screenshot.png │ │ ├── 005-dcgm-e2e-grafana-import-screenshot.png │ │ ├── 006-dcgm-e2e-grafana-import-screenshot.png │ │ ├── 007-dcgm-e2e-grafana-import-screenshot.png │ │ ├── 008-dcgm-e2e-grafana-dashboard-screenshot.png │ │ ├── 009-dcgm-e2e-deepstream-screenshot.png │ │ ├── 010-dcgm-e2e-deepstream-screenshot.png │ │ └── 011-dcgm-e2e-prom-dashboard-metrics-screenshot.png │ ├── dcgm-exporter-bare-metal.png │ ├── dcgm-exporter-containers.png │ ├── dcgm-exporter_embedded.png │ └── dcgm_and_dcgm-exporter.png ├── index.rst ├── integrating-telemetry-kubernetes.rst ├── kube-prometheus.rst ├── versions.json └── versions1.json ├── kubernetes ├── index.rst ├── versions.json └── versions1.json ├── make.bat ├── mig ├── mig-examples.rst ├── mig-k8s.rst └── mig.rst ├── openshift ├── appendix-ocp.rst ├── clean-up.rst ├── download │ └── 0003-cluster-wide-machineconfigs.yaml.template ├── enable-gpu-monitoring-dashboard.rst ├── get-entitlement.rst ├── gpu-operator-with-precompiled-drivers.rst ├── graphics │ ├── Mig-profile-A100.png │ ├── cluster-policy-image-version.png │ ├── cluster-policy-repository.png │ ├── cluster-policy-state-ready.png │ ├── cluster-policy-suceed.png │ ├── cluster_entitlement_1.png │ ├── cluster_entitlement_2.png │ ├── cluster_entitlement_3.png │ ├── cluster_entitlement_4.png │ ├── cluster_entitlement_5.png │ ├── cluster_entitlement_6.png │ ├── cluster_entitlement_attachsub.png │ ├── cluster_policy1.png │ ├── cluster_policy2.png │ ├── cluster_policy_1.png │ ├── cluster_policy_3.png │ ├── cluster_policy_4.png │ ├── cluster_policy_configure_vgpu.png │ ├── cluster_policy_enable_sandbox_workloads.png │ ├── cluster_policy_suceed.png │ ├── cluster_policy_vGPU_confg.png │ ├── cluster_policy_vgpu_1.png │ ├── cluster_policy_vgpu_2.png │ ├── create_cluster_policy.png │ ├── create_config_map1.png │ ├── create_project_1.png │ ├── create_project_2.png │ ├── createclusterpolicy2.png │ ├── createclusterpolicy3.png │ ├── created_pull-secret.png │ ├── disconnected_cluster.png │ ├── driver_toolkit_alert.png │ ├── enable-gpu-direct-rdma.png │ ├── entitlement_hypervisor.png │ ├── gpu-operator-certified-cli-install.png │ ├── gpu_dashboards.png │ ├── locate-cluster-acm.png │ ├── mig-mixed-profile-A100.png │ ├── mig_strategy.png │ ├── navigate_to_cluster_policy.png │ ├── nvaie2.3_cluster_policy.png │ ├── ocp_main_console_alerts.png │ ├── pci_passthrough.png │ ├── precompiled_driver_config_repository.png │ ├── precompiled_driver_config_version_and_image.png │ ├── pull-secret.png │ ├── secrets.png │ ├── secrets_2.png │ └── vmx_secure_boot.png ├── index.rst ├── install-gpu-ocp.rst ├── install-gpu-operator-gov-ready-openshift.rst ├── install-nfd.rst ├── introduction.rst ├── mig-ocp.rst ├── mirror-gpu-ocp-disconnected.rst ├── nvaie-with-ocp.rst ├── openshift-virtualization.rst ├── prerequisites.rst ├── steps-overview.rst ├── time-slicing-gpus-in-openshift.rst ├── troubleshooting-gpu-ocp.rst ├── versions.json └── versions1.json ├── partner-validated ├── PARTNER-VALIDATED-TEMPLATE.rst ├── index.rst ├── k0rdent.rst ├── mirantis-mke.rst ├── 
versions.json └── versions1.json ├── playground ├── dind.rst └── x-arch.rst ├── repo ├── repo.bat ├── repo.toml ├── review ├── index.rst ├── versions.json └── versions1.json ├── scripts └── create_archive.sh ├── secure-services-istio-keycloak ├── configure.md ├── images │ ├── keycloak-1.png │ ├── keycloak-10.png │ ├── keycloak-11.png │ ├── keycloak-12.png │ ├── keycloak-13.png │ ├── keycloak-14.png │ ├── keycloak-15.png │ ├── keycloak-16.png │ ├── keycloak-2.png │ ├── keycloak-3.png │ ├── keycloak-4.png │ ├── keycloak-5.png │ ├── keycloak-6.png │ ├── keycloak-7.png │ ├── keycloak-8.png │ ├── keycloak-9.png │ └── reference-arch-01.png ├── implementation.md ├── index.md ├── manifests │ ├── authorizationPolicy.yaml │ ├── istio-sample-manifest.yaml │ └── requestAuthentication.yaml ├── platform-support.md ├── versions.json └── versions1.json ├── templates ├── breadcrumbs.html └── last-updated.html ├── tools ├── packman │ ├── bootstrap │ │ ├── configure.bat │ │ ├── download_file_from_url.ps1 │ │ ├── fetch_file_from_packman_bootstrap.cmd │ │ ├── generate_temp_file_name.ps1 │ │ ├── generate_temp_folder.ps1 │ │ └── install_package.py │ ├── config.packman.xml │ ├── packman │ ├── packman.cmd │ ├── packmanconf.py │ ├── python.bat │ └── python.sh └── repoman │ └── repoman.py └── work └── dcgm-offline.inv /.codespell_exclude_lines.txt: -------------------------------------------------------------------------------- 1 | # Include whole lines that have codespell-recognized typos. 2 | # This is better than accepting a typo for ask someplace random. 3 | # End the file with a blank line. 4 | Approaches for Working with Azure AKS 5 | You can approach running workloads in Azure AKS with NVIDIA GPUs in at least two ways. 6 | Default AKS configuration without the GPU Operator 7 | By default, you can run Azure AKS images on GPU-enabled virtual machines with NVIDIA GPUs, 8 | AKS images include a preinstalled NVIDIA GPU Driver and preinstalled NVIDIA Container Toolkit. 9 | `Use GPUs for compute-intensive workloads on Azure Kubernetes Services `__ 10 | The images that are available in AKS always include a preinstalled NVIDIA GPU driver 11 | After you start your Azure AKS cluster, you are ready to install the NVIDIA GPU Operator. 12 | GPU Operator with Azure AKS 13 | * Added support for running the Operator with Microsoft Azure Kubernetes Service (AKS). 14 | You must use an AKS image with a preinstalled NVIDIA GPU driver and a preinstalled 15 | Create AKS Cluster with a Node Pool to Skip GPU Driver installation 16 | command-line argument to the ``az aks nodepool add`` command. 17 | $ az aks nodepool add --resource-group --name gpunodes --cluster-name \ 18 | `Skip GPU driver installation (preview) `__ 19 | After you start your Azure AKS cluster with an image that includes a preinstalled NVIDIA GPU Driver 20 | Azure AKS 21 | .. |prod-name-short| replace:: MKE 22 | Mirantis Kubernetes Engine (MKE) gives you the power to build, run, and scale cloud-native 23 | * - MKE 3.6.2+ and 3.5.7+ 24 | * A running MKE cluster with at least one control plane node and two worker nodes. 25 | * A seed node to connect to the MKE instance, with Helm 3.x installed on the seed node. 26 | * The kubeconfig file for the MKE cluster on the seed node. 27 | You can get the file from the MKE web interface by downloading a client bundle. 28 | Alternatively, if the MKE cluster is a managed cluster of a Mirantis Container Cloud (MCC) instance, 29 | In this case, the MKE web interface can be accessed from the MCC web interface. 
30 | * You have an MKE administrator user name and password, and you have the MKE host URL. 31 | Perform the following steps to prepare the MKE cluster: 32 | #. MKE does not apply a label to worker nodes. 33 | $ export MKE_USERNAME= \ 34 | MKE_PASSWORD= \ 35 | MKE_HOST= 36 | #. Get an API key from MKE so that you can make API calls later: 37 | '{"username":"'$MKE_USERNAME'","password":"'$MKE_PASSWORD'"}' \ 38 | https://$MKE_HOST/auth/login | jq --raw-output .auth_token) 39 | #. Download the MKE configuration file: 40 | $ curl --silent --insecure -X GET "https://$MKE_HOST/api/ucp/config-toml" \ 41 | #. Upload the edited MKE configuration file: 42 | https://$MKE_HOST/api/ucp/config-toml 43 | The MKE cluster is ready for you to install the GPU Operator with Helm. 44 | Refer to the MKE product documentation for information about working with MKE. 45 | * https://docs.mirantis.com/mke/3.6/overview.html 46 | $ cat < nvidia-container-microshift.te 47 | $ checkmodule -m -M -o nvidia-container-microshift.mod nvidia-container-microshift.te 48 | 2023/06/22 14:25:38 Retreiving plugins. 49 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | -------------------------------------------------------------------------------- /.github/workflows/docs-build-pr.yaml: -------------------------------------------------------------------------------- 1 | name: docs-build-pr 2 | 3 | on: 4 | pull_request: 5 | branches: [ main, release-* ] 6 | types: [ opened, synchronize ] 7 | 8 | env: 9 | GH_TOKEN: ${{ github.token }} 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | build-docs: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v5 21 | - name: Set up Docker Buildx 22 | uses: docker/setup-buildx-action@v3 23 | - name: Build image 24 | uses: docker/build-push-action@v6 25 | with: 26 | context: . 
27 | file: docker/Dockerfile 28 | load: true 29 | tags: pr-image:${{ github.sha }} 30 | - name: Build docs 31 | run: | 32 | docker run -v $(pwd):/work -w /work pr-image:${{ github.sha }} ./repo docs 33 | - name: Delete unnecessary files 34 | run: | 35 | sudo find _build -name .doctrees -prune -exec rm -rf {} \; 36 | sudo find _build -name .buildinfo -exec rm {} \; 37 | - name: Copy review page 38 | run: | 39 | sudo mv _build/docs/review/latest/* _build/docs 40 | sudo rm -rf _build/docs/review _build/docs/tmp _build/docs/sphinx_warnings.txt 41 | - name: Upload HTML 42 | uses: actions/upload-artifact@v4 43 | with: 44 | name: html-build-artifact 45 | path: _build/docs 46 | if-no-files-found: error 47 | retention-days: 1 48 | - name: Store PR information 49 | run: | 50 | mkdir ./pr 51 | echo ${{ github.event.number }} > ./pr/pr.txt 52 | echo ${{ github.event.pull_request.merged }} > ./pr/merged.txt 53 | echo ${{ github.event.action }} > ./pr/action.txt 54 | - name: Upload PR information 55 | uses: actions/upload-artifact@v4 56 | with: 57 | name: pr 58 | path: pr/ 59 | -------------------------------------------------------------------------------- /.github/workflows/docs-preview-pr.yaml: -------------------------------------------------------------------------------- 1 | name: docs-preview-pr 2 | 3 | on: 4 | workflow_run: 5 | workflows: [ docs-build-pr ] 6 | types: [ completed ] 7 | branches-ignore: [ main ] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | env: 14 | WF_ID: ${{ github.event.workflow_run.id }} 15 | 16 | jobs: 17 | preview: 18 | uses: nvidia-merlin/.github/.github/workflows/docs-preview-pr-common.yaml@main -------------------------------------------------------------------------------- /.github/workflows/docs-remove-stale-reviews.yaml: -------------------------------------------------------------------------------- 1 | name: docs-remove-stale-reviews 2 | 3 | on: 4 | schedule: 5 | # 42 minutes after 0:00 UTC on Sundays 6 | - cron: "42 0 * * 0" 7 | workflow_dispatch: 8 | 9 | jobs: 10 | remove: 11 | uses: nvidia-merlin/.github/.github/workflows/docs-remove-stale-reviews-common.yaml@main 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | _* 3 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | CONTAINER_TEST_IMAGE: "${CI_REGISTRY_IMAGE}:${CI_COMMIT_REF_SLUG}" 3 | CONTAINER_RELEASE_IMAGE: "${CI_REGISTRY_IMAGE}:0.5.1" 4 | BUILDER_IMAGE: ghcr.io/nvidia/cloud-native-docs:0.5.1 5 | PUBLISHER_IMAGE: "${CI_REGISTRY_PUBLISHER}/publisher:3.1.0" 6 | 7 | stages: 8 | - .pre 9 | - build_image 10 | - build_docs 11 | - publish_docs 12 | 13 | .build_image: 14 | image: docker:23.0.6 15 | stage: .pre 16 | services: 17 | - docker:23.0.6-dind 18 | variables: 19 | GIT_STRATEGY: clone 20 | script: 21 | - apk add git 22 | - git fetch origin "${CI_DEFAULT_BRANCH}" 23 | - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" 24 | - if ! 
docker manifest inspect "${BUILDER_IMAGE}" 2>&1 > /dev/null ; then export NEEDS_IMAGE=true ; fi 25 | - FILES=$(git diff --name-only "${CI_COMMIT_SHA}" "origin/${CI_DEFAULT_BRANCH}" | tr '\n' ' ') 26 | - if echo "${FILES}" | grep -q "deps/\|Dockerfile\|repo.toml" ; then export NEEDS_IMAGE=true ; fi 27 | - > 28 | if [[ "${NEEDS_IMAGE}" ]]; then 29 | docker build -t "${CONTAINER_TEST_IMAGE}" . -f docker/Dockerfile 30 | docker push "${CONTAINER_TEST_IMAGE}" 31 | echo "BUILDER_IMAGE=${CONTAINER_TEST_IMAGE}" >> build.env 32 | else 33 | echo "BUILDER_IMAGE=${BUILDER_IMAGE}" >> build.env 34 | fi 35 | - > 36 | if [ "${NEEDS_IMAGE}" ] && [ "${CI_COMMIT_BRANCH}" == "${CI_DEFAULT_BRANCH}" ] && [ "${CI_PIPELINE_SOURCE}" == "push" ]; then 37 | docker tag "${CONTAINER_TEST_IMAGE}" "${CONTAINER_RELEASE_IMAGE}" 38 | docker push "${CONTAINER_RELEASE_IMAGE}" 39 | fi 40 | artifacts: 41 | reports: 42 | dotenv: build.env 43 | 44 | .build_image_rules: 45 | rules: 46 | - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH) || $CI_PIPELINE_SOURCE == "merge_request_event" 47 | 48 | build_image_ext: 49 | extends: .build_image 50 | rules: 51 | - if: $INTERNAL != null 52 | when: never 53 | - !reference [.build_image_rules, rules] 54 | 55 | build_image_int: 56 | extends: .build_image 57 | tags: 58 | - os/linux 59 | - type/docker 60 | rules: 61 | - if: $INTERNAL == null 62 | when: never 63 | - !reference [.build_image_rules, rules] 64 | 65 | .build: 66 | stage: build_docs 67 | image: "${BUILDER_IMAGE}" 68 | script: 69 | - ./repo docs 70 | - echo "BUILDER_IMAGE=${BUILDER_IMAGE}" >> build.env 71 | artifacts: 72 | name: ${CI_PROJECT_NAME}-${CI_COMMIT_SHORT_SHA} 73 | paths: 74 | - _build 75 | expire_in: 4w 76 | reports: 77 | dotenv: build.env 78 | 79 | .build_rules: 80 | rules: 81 | - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH) || $CI_PIPELINE_SOURCE == "merge_request_event" 82 | - if: $CI_COMMIT_REF_NAME =~ /-v[0-9]/ 83 | 84 | build_ext: 85 | extends: .build 86 | variables: 87 | APIURL: "${CI_API_V4_URL}/projects/${CI_MERGE_REQUEST_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}/discussions" 88 | after_script: 89 | - PROJPART=$(echo "${CI_PROJECT_PATH#$CI_PROJECT_ROOT_NAMESPACE}") 90 | - BASEURL=$(echo "https://${CI_PROJECT_ROOT_NAMESPACE}.${CI_PAGES_DOMAIN}/-${PROJPART}") 91 | - REVURL=$(echo "${BASEURL}/-/jobs/${CI_JOB_ID}/artifacts/_build/docs/review/latest/index.html") 92 | - MSG=$(echo "{\"body\":\"

Review HTML ${REVURL}
\"}") 93 | - echo "${REVURL}" 94 | - echo "${MSG}" 95 | - 'curl -X POST -H "Authorization: Bearer ${MR_COMMENT}" "${APIURL}" -H "Content-Type: application/json" --data-raw "${MSG}"' 96 | rules: 97 | - if: $INTERNAL != null 98 | when: never 99 | - !reference [.build_rules, rules] 100 | 101 | build_int: 102 | extends: .build 103 | tags: 104 | - os/linux 105 | - type/docker 106 | rules: 107 | - if: $INTERNAL == null 108 | when: never 109 | - !reference [.build_rules, rules] 110 | 111 | pages: 112 | image: "${CONTAINER_RELEASE_IMAGE}" 113 | stage: publish_docs 114 | script: 115 | - rm -rf public 116 | - cp -r _build/docs/ public 117 | artifacts: 118 | paths: 119 | - public 120 | expire_in: 1 week 121 | dependencies: 122 | - build_ext 123 | rules: 124 | - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push" && $INTERNAL != 'true' 125 | 126 | 127 | publish_docs: 128 | image: 129 | name: "${PUBLISHER_IMAGE}" 130 | entrypoint: [ "" ] 131 | stage: publish_docs 132 | tags: 133 | - os/linux 134 | - type/docker 135 | variables: 136 | HTML_PATH: "_build/docs" 137 | FORCE_LATEST: "true" 138 | script: 139 | - echo "Pushing docs live to https://docs.nvidia.com/datacenter/cloud-native" 140 | - |+ 141 | if [[ "${CI_COMMIT_REF_NAME}" =~ (.+)-v([0-9]+\.[0-9]+(\.[a-zA-Z0-9]+)?) ]]; then 142 | export DOCSET="${BASH_REMATCH[1]}" 143 | export VERSION="${BASH_REMATCH[2]}" 144 | fi 145 | - |+ 146 | if [ -z "${DOCSET}" ] || [ -z "${VERSION}" ]; then 147 | echo "Failed to determine the docset or version." 148 | exit 1 149 | fi 150 | - |+ 151 | if [[ "${CI_COMMIT_MESSAGE}" =~ $'/not-latest\n' ]]; then 152 | export FORCE_LATEST=false 153 | fi 154 | - echo "Publishing docs for ${DOCSET} and version ${VERSION}" 155 | - pushd "${HTML_PATH}/${DOCSET}/latest" 156 | - deploy_s3.sh --archive "${DOCSET}" "${VERSION}" 157 | - |+ 158 | if [ "true" == "${FORCE_LATEST}" ]; then 159 | deploy_s3.sh --latest "${DOCSET}" 160 | fi 161 | - deploy_s3.sh --flush "${DOCSET}" 162 | dependencies: 163 | - build_int 164 | rules: 165 | - if: $CI_COMMIT_TAG =~ /-v[0-9]/ && ($CI_PIPELINE_SOURCE == "push" || $CI_PIPELINE_SOURCE == "web" ) && $INTERNAL 166 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: mixed-line-ending 6 | - id: trailing-whitespace 7 | - id: check-yaml 8 | - repo: https://github.com/codespell-project/codespell 9 | rev: v2.2.2 10 | hooks: 11 | - id: codespell 12 | args: [ "-x", ".codespell_exclude_lines.txt"] 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to the Docs 2 | 3 | Thanks for contributing to the documentation repository! The documentation is licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0). Before 4 | patches are accepted and merged, we require that these relatively simple guidelines be followed: 5 | * Adhere to the documentation style guidelines 6 | * Sign your work 7 | 8 | Also, read an [overview](https://developers.google.com/tech-writing/overview) on Technical Writing from Google on authoring good technical content! 
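This repository also configures [pre-commit](https://pre-commit.com/) hooks (see `.pre-commit-config.yaml`) that check for mixed line endings, trailing whitespace, YAML syntax, and spelling with codespell. As a minimal sketch for running the same checks locally before you push (assuming you can install the `pre-commit` tool with `pip`, which is not a documented requirement of this repository):

```console
$ pip install pre-commit
$ pre-commit run --all-files
```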
9 | 10 | ## Documentation style guidelines 11 | 12 | This documentation is authored using [reStructuredText](http://docutils.sourceforge.net/rst.html) as a markup language and uses the 13 | [Sphinx](https://www.sphinx-doc.org/en/master/) documentation generator. 14 | 15 | ### Filenames 16 | 17 | Use only lowercase alphanumeric characters and hyphens `-` where required. Filenames are suffixed with the `.rst` extension. 18 | 19 | ### Headings 20 | 21 | Use title case for headings. 22 | Refer to https://titlecase.com/ for more information. 23 | 24 | The headings follow this convention: 25 | 26 | 1. `H1` or document title based on `#` with overline 27 | 1. `H2` based on `*` with overline 28 | 1. `H3` based on `=` 29 | 1. `H4` based on `-` 30 | 1. `H5` based on `^` 31 | 1. `H6` based on `"` 32 | 33 | If you need more levels, then consider creating a new document. A document has only one `H1`. 34 | 35 | ### Guideline for Kubernetes Object Types in Body Text 36 | 37 | Prefer lowercase plain text such as namespace, pod, daemon set, container, service, and so on. 38 | This guideline applies to multi-word types like custom resource definition. 39 | 40 | Use the camel case name only if you follow the name with object, resource, and so on. 41 | For example, "Delete the ``Pod`` object..." 42 | However, that example is not compelling and is just as clear when written as "Delete the pod..." 43 | 44 | ### Console Outputs 45 | 46 | #### Directives 47 | 48 | For console outputs in this document, use `code-block:: console` directive. This results in a red prompt, which makes it easy to distinguish between the prompt 49 | and the command. 50 | 51 | #### Commands 52 | 53 | Separate each command into its own `code-block`. Since this repository uses the Sphinx `copy-button` to allow for easy copy/pasting of commands 54 | by users, it makes sense to separate each command for readability and usage. 55 | 56 | If you need to aggregate multiple commands, then use the separator, 2-space indentation and `&&` on each line as shown in the example below: 57 | ```console 58 | $ command1 \ 59 | && command2 \ 60 | && command3 61 | ``` 62 | 63 | #### Outputs 64 | 65 | Separate outputs and commands into their own `code-block` sequence. Since the repository is configured to copy everything (including items after the prompt lines by 66 | setting `copybutton_only_copy_prompt_lines` to false), it is desirable to only copy commands. 67 | 68 | ## Sign your work 69 | 70 | The sign-off is a simple line at the end of the explanation for the patch. Your 71 | signature certifies that you wrote the patch or otherwise have the right to pass 72 | it on as an open-source patch. The rules are pretty simple: if you can certify 73 | the below (from [developercertificate.org](http://developercertificate.org/)): 74 | 75 | ``` 76 | Developer Certificate of Origin 77 | Version 1.1 78 | 79 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 80 | 1 Letterman Drive 81 | Suite D4700 82 | San Francisco, CA, 94129 83 | 84 | Everyone is permitted to copy and distribute verbatim copies of this 85 | license document, but changing it is not allowed. 
86 | 87 | Developer's Certificate of Origin 1.1 88 | 89 | By making a contribution to this project, I certify that: 90 | 91 | (a) The contribution was created in whole or in part by me and I 92 | have the right to submit it under the open source license 93 | indicated in the file; or 94 | 95 | (b) The contribution is based upon previous work that, to the best 96 | of my knowledge, is covered under an appropriate open source 97 | license and I have the right under that license to submit that 98 | work with modifications, whether created in whole or in part 99 | by me, under the same open source license (unless I am 100 | permitted to submit under a different license), as indicated 101 | in the file; or 102 | 103 | (c) The contribution was provided directly to me by some other 104 | person who certified (a), (b) or (c) and I have not modified 105 | it. 106 | 107 | (d) I understand and agree that this project and the contribution 108 | are public and that a record of the contribution (including all 109 | personal information I submit with it, including my sign-off) is 110 | maintained indefinitely and may be redistributed consistent with 111 | this project or the open source license(s) involved. 112 | ``` 113 | 114 | Then you just add a line to every git commit message: 115 | 116 | Signed-off-by: Joe Smith 117 | 118 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 119 | 120 | If you set your `user.name` and `user.email` git configs, you can sign your 121 | commit automatically with `git commit -s`. 122 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
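# For example, `make html` expands to `sphinx-build -M html . _build`; pass extra
# Sphinx options through O, e.g. `make html O=-W`.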
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /assets/NVIDIA_Horizontal_Logo_RGBBlack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/NVIDIA_Horizontal_Logo_RGBBlack.png -------------------------------------------------------------------------------- /assets/NVLogo_H_B&W.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/NVLogo_H_B&W.png -------------------------------------------------------------------------------- /assets/NVLogo_H_B_W.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/NVLogo_H_B_W.png -------------------------------------------------------------------------------- /assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/favicon.ico -------------------------------------------------------------------------------- /assets/nvidia-logo-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/nvidia-logo-white.png -------------------------------------------------------------------------------- /assets/nvidia_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/nvidia_logo.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-containerd-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-containerd-arch.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-crio-lxc-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-crio-lxc-arch.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-docker-arch-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-docker-arch-new.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-docker-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-docker-arch.png -------------------------------------------------------------------------------- /container-toolkit/assets/runtime-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/runtime-architecture.png -------------------------------------------------------------------------------- /container-toolkit/index.md: 
-------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | ```{toctree} 4 | :caption: NVIDIA Container Toolkit 5 | :hidden: true 6 | :titlesonly: true 7 | 8 | self 9 | Installing the Toolkit 10 | sample-workload 11 | supported-platforms.md 12 | troubleshooting.md 13 | release-notes.md 14 | ``` 15 | 16 | ```{toctree} 17 | :caption: Advanced Configuration 18 | :hidden: true 19 | :titlesonly: true 20 | 21 | arch-overview.md 22 | Container Device Interface 23 | docker-specialized.md 24 | ``` 25 | 26 | The NVIDIA Container Toolkit is a collection of libraries and utilities enabling users to build and run GPU-accelerated containers. It currently includes: 27 | 28 | * The NVIDIA Container Runtime (`nvidia-container-runtime`) 29 | * The NVIDIA Container Toolkit CLI (`nvidia-ctk`) 30 | * The NVIDIA CDI Hooks (`nvidia-cdi-hook`) 31 | * The NVIDIA Container Runtime Hook (`nvidia-container-runtime-hook`) 32 | * The NVIDIA Container CLI (`nvidia-container-cli`) 33 | * The NVIDIA Container Library (`libnvidia-container1`) 34 | 35 | ## License 36 | 37 | The NVIDIA Container Toolkit (and all included components) is licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) and 38 | contributions are accepted with a Developer Certificate of Origin (DCO). Refer to the [contributing](https://github.com/NVIDIA/nvidia-container-toolkit/blob/master/CONTRIBUTING.md) document for 39 | more information. 40 | -------------------------------------------------------------------------------- /container-toolkit/output/nvidia-smi.txt: -------------------------------------------------------------------------------- 1 | +-----------------------------------------------------------------------------+ 2 | | NVIDIA-SMI 535.86.10 Driver Version: 535.86.10 CUDA Version: 12.2 | 3 | |-------------------------------+----------------------+----------------------+ 4 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 5 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 6 | | | | MIG M. | 7 | |===============================+======================+======================| 8 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 9 | | N/A 34C P8 9W / 70W | 0MiB / 15109MiB | 0% Default | 10 | | | | N/A | 11 | +-------------------------------+----------------------+----------------------+ 12 | 13 | +-----------------------------------------------------------------------------+ 14 | | Processes: | 15 | | GPU GI CI PID Type Process name GPU Memory | 16 | | ID ID Usage | 17 | |=============================================================================| 18 | | No running processes found | 19 | +-----------------------------------------------------------------------------+ 20 | -------------------------------------------------------------------------------- /container-toolkit/sample-workload.md: -------------------------------------------------------------------------------- 1 | # Running a Sample Workload 2 | 3 | ## Running a Sample Workload with Docker 4 | 5 | After you install and configure the toolkit and install an NVIDIA GPU Driver, 6 | you can verify your installation by running a sample workload. 
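If Docker has not yet been configured to use the NVIDIA runtime, a minimal sketch of that step (assuming a systemd-managed Docker daemon; see the installation guide for the complete procedure) is:

```console
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```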
7 | 8 | - Run a sample CUDA container: 9 | 10 | ```console 11 | sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi 12 | ``` 13 | 14 | Your output should resemble the following output: 15 | 16 | ```{literalinclude} ./output/nvidia-smi.txt 17 | --- 18 | language: output 19 | --- 20 | ``` 21 | 22 | ## Running a Sample Workload with Podman 23 | 24 | After you install and configure the toolkit (including [generating a CDI specification](cdi-support.md)) and install an NVIDIA GPU Driver, 25 | you can verify your installation by running a sample workload. 26 | 27 | - Run a sample CUDA container: 28 | 29 | ```console 30 | podman run --rm --security-opt=label=disable \ 31 | --device=nvidia.com/gpu=all \ 32 | ubuntu nvidia-smi 33 | ``` 34 | 35 | Your output should resemble the following output: 36 | 37 | ```{literalinclude} ./output/nvidia-smi.txt 38 | --- 39 | language: output 40 | --- 41 | ``` 42 | 43 | ## Running Sample Workloads with containerd or CRI-O 44 | 45 | These runtimes are more common with Kubernetes than desktop computing. 46 | Refer to {doc}`gpuop:index` in the NVIDIA GPU Operator documentation for more information. -------------------------------------------------------------------------------- /container-toolkit/supported-platforms.md: -------------------------------------------------------------------------------- 1 | % Date: August 10 2020 2 | 3 | % Author: pramarao 4 | 5 | (supported-platforms)= 6 | 7 | # Platform support 8 | 9 | Recent NVIDIA Container Toolkit releases are tested and expected to work on these Linux distributions: 10 | 11 | | OS Name / Version | amd64 / x86_64 | ppc64le | arm64 / aarch64 {sup}`1` | 12 | | ------------------------ | -------------- | ------- | ------------------------ | 13 | | Amazon Linux 2023 | X | | X {sup}`2` | 14 | | Amazon Linux 2 | X | | X | 15 | | Open Suse/SLES 15.x | X | | | 16 | | Debian Linux 11 | X | | | 17 | | CentOS 8 | X | X | X | 18 | | RHEL 8.x | X | X | X | 19 | | RHEL 9.x | X | X | X | 20 | | RHEL 10.x | X | X | X | 21 | | Ubuntu 20.04 | X | X | X | 22 | | Ubuntu 22.04 | X | X | X | 23 | | Ubuntu 24.04 | X | | X | 24 | 25 | 26 | ## Report issues 27 | 28 | Our qualification-testing procedures are constantly evolving and we might miss 29 | certain problems. [Report](https://github.com/NVIDIA/nvidia-container-toolkit/issues) issues in 30 | particular as they occur on a platform listed above. 31 | 32 | 33 | ## Other Linux distributions 34 | 35 | Releases may work on more platforms than indicated in the table above (such as on distribution versions older and newer than listed). 36 | Give things a try and we invite you to [report](https://github.com/NVIDIA/nvidia-container-toolkit/issues) any issue observed even if your Linux distribution is not listed. 37 | 38 | ---- 39 | 40 | 1. The `arm64` / `aarch64` architecture includes support for Tegra-based systems. 41 | 2. For Amazon Linux 2023 on Arm64, a `g5g.2xlarge` Amazon EC2 instance was used for validation. 42 | The `g5g.xlarge` instance caused failures due to the limited system memory. 
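As a quick sketch for checking which row of the table above applies to a given host (standard Linux commands, nothing toolkit-specific assumed):

```console
$ grep PRETTY_NAME /etc/os-release
$ uname -m
```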
43 | -------------------------------------------------------------------------------- /container-toolkit/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "1.18.1", 3 | "versions": 4 | [ 5 | { 6 | "version": "1.18.1" 7 | }, 8 | { 9 | "version": "1.18.0" 10 | }, 11 | { 12 | "version": "1.17.8" 13 | }, 14 | { 15 | "version": "1.17.7" 16 | }, 17 | { 18 | "version": "1.17.6" 19 | }, 20 | { 21 | "version": "1.17.5" 22 | }, 23 | { 24 | "version": "1.17.4" 25 | }, 26 | { 27 | "version": "1.17.3" 28 | }, 29 | { 30 | "version": "1.17.2" 31 | }, 32 | { 33 | "version": "1.17.1" 34 | }, 35 | { 36 | "version": "1.17.0" 37 | }, 38 | { 39 | "version": "1.16.2" 40 | }, 41 | { 42 | "version": "1.16.1" 43 | }, 44 | { 45 | "version": "1.16.0" 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /container-toolkit/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.18.1", 5 | "version": "1.18.1" 6 | }, 7 | { 8 | "url": "../1.18.0", 9 | "version": "1.18.0" 10 | }, 11 | { 12 | "url": "../1.17.8", 13 | "version": "1.17.8" 14 | }, 15 | { 16 | "url": "../1.17.7", 17 | "version": "1.17.7" 18 | }, 19 | { 20 | "url": "../1.17.6", 21 | "version": "1.17.6" 22 | }, 23 | { 24 | "url": "../1.17.5", 25 | "version": "1.17.5" 26 | }, 27 | { 28 | "url": "../1.17.4", 29 | "version": "1.17.4" 30 | }, 31 | { 32 | "url": "../1.17.3", 33 | "version": "1.17.3" 34 | }, 35 | { 36 | "url": "../1.17.2", 37 | "version": "1.17.2" 38 | }, 39 | { 40 | "url": "../1.17.1", 41 | "version": "1.17.1" 42 | }, 43 | { 44 | "url": "../1.17.0", 45 | "version": "1.17.0" 46 | }, 47 | { 48 | "url": "../1.16.2", 49 | "version": "1.16.2" 50 | }, 51 | { 52 | "url": "../1.16.1", 53 | "version": "1.16.1" 54 | }, 55 | { 56 | "url": "../1.16.0", 57 | "version": "1.16.0" 58 | } 59 | ] -------------------------------------------------------------------------------- /contents.rst: -------------------------------------------------------------------------------- 1 | .. NVIDIA Cloud Native Technologies documentation master file, created by 2 | sphinx-quickstart on Mon Jul 27 23:51:30 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | NVIDIA Cloud Native Technologies 7 | ================================ 8 | This documentation repository contains the product documentation for the 9 | :ref:`NVIDIA Container Toolkit `, the :ref:`NVIDIA GPU Operator `, and 10 | using NVIDIA GPUs with Kubernetes. 11 | 12 | .. toctree:: 13 | :hidden: 14 | 15 | .. Documentation home 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | :caption: NVIDIA Container Toolkit: 20 | 21 | container-toolkit/overview.rst 22 | container-toolkit/concepts.rst 23 | container-toolkit/arch-overview.rst 24 | container-toolkit/install-guide.rst 25 | container-toolkit/troubleshooting.rst 26 | container-toolkit/user-guide.rst 27 | container-toolkit/release-notes.rst 28 | container-toolkit/archive.rst 29 | 30 | .. 
toctree:: 31 | :maxdepth: 2 32 | :caption: NVIDIA GPU Operator: 33 | 34 | gpu-operator/overview.rst 35 | gpu-operator/getting-started.rst 36 | gpu-operator/platform-support.rst 37 | gpu-operator/release-notes.rst 38 | gpu-operator/gpu-driver-upgrades.rst 39 | gpu-operator/install-gpu-operator-vgpu.rst 40 | gpu-operator/install-gpu-operator-nvaie.rst 41 | GPU Operator on OpenShift 42 | gpu-operator/gpu-operator-mig.rst 43 | gpu-operator/gpu-sharing.rst 44 | gpu-operator/gpu-operator-rdma.rst 45 | gpu-operator/gpu-operator-kubevirt.rst 46 | gpu-operator/appendix.rst 47 | gpu-operator/archive.rst 48 | 49 | .. toctree:: 50 | :maxdepth: 2 51 | :caption: Kubernetes with GPUs: 52 | 53 | kubernetes/install-k8s.rst 54 | kubernetes/mig-k8s.rst 55 | kubernetes/anthos-guide.rst 56 | 57 | .. toctree:: 58 | :titlesonly: 59 | :caption: NVIDIA GPUs and Red Hat Device Edge 60 | 61 | edge/nvidia-gpu-with-device-edge.rst 62 | 63 | .. toctree:: 64 | :maxdepth: 2 65 | :caption: GPU Telemetry: 66 | 67 | gpu-telemetry/dcgm-exporter.rst 68 | 69 | .. toctree:: 70 | :maxdepth: 2 71 | :caption: Multi-Instance GPU: 72 | 73 | mig/mig.rst 74 | mig/mig-k8s.rst 75 | 76 | .. toctree:: 77 | :maxdepth: 2 78 | :caption: Driver Containers: 79 | 80 | driver-containers/overview.rst 81 | 82 | .. toctree:: 83 | :maxdepth: 2 84 | :caption: Playground: 85 | 86 | playground/dind.rst 87 | playground/x-arch.rst 88 | 89 | .. Indices and tables 90 | .. ================== 91 | .. 92 | .. * :ref:`genindex` 93 | -------------------------------------------------------------------------------- /css/custom.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | */ 5 | html[data-theme=light] .highlight .go { 6 | font-style:unset 7 | } 8 | 9 | .bd-page-width { 10 | max-width: 176rem; 11 | } 12 | 13 | .bd-main { 14 | flex: 1 1 auto; 15 | } 16 | 17 | .bd-main .bd-content .bd-article-container { 18 | max-width: 100%; 19 | } 20 | 21 | .bd-sidebar-secondary { 22 | /* flex: 0 0 auto; */ 23 | flex-basis: 15%; 24 | min-width: var(--pst-sidebar-secondary); 25 | } 26 | 27 | html[data-theme=light] .bd-toc-nav .nav-link-expand { 28 | display: none !important; 29 | } 30 | 31 | .bd-sidebar-primary li.has-children>details>summary .toctree-toggle { 32 | display: none !important; 33 | } 34 | -------------------------------------------------------------------------------- /deps/repo-deps.packman.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Imported from https://hub.docker.com/r/sphinxdoc/sphinx/dockerfile 2 | # maintainer="Sphinx Team " 3 | # $ docker build --pull \ 4 | # --tag ${REGISTRY}/sphinxdoc 5 | # --file Dockerfile . 
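# To build the documentation with the resulting image, the CI in this repository
# runs (with <image-tag> as a placeholder for the tag built above):
# $ docker run -v $(pwd):/work -w /work <image-tag> ./repo docs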
6 | FROM python:3.10-slim 7 | 8 | WORKDIR /docs 9 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ 10 | && apt-get install --no-install-recommends -y \ 11 | curl \ 12 | && apt-get autoremove \ 13 | && apt-get clean \ 14 | && rm -rf /var/lib/apt/lists/* 15 | 16 | ENV PM_PACKAGES_ROOT=/var/tmp/packman 17 | 18 | RUN --mount=type=bind,source=.,destination=/x,rw /x/repo docs -p review || true 19 | 20 | RUN --mount=type=bind,source=.,destination=/x,rw /x/tools/packman/python.sh -m pip install --no-cache-dir --no-deps -U \ 21 | -t /tmp/extension \ 22 | sphinx-copybutton \ 23 | nvidia-sphinx-theme \ 24 | pydata-sphinx-theme \ 25 | linuxdoc 26 | 27 | RUN (cd /tmp/extension; tar cf - . ) | (cd /var/tmp/packman/chk/sphinx/4.5.0.2-py3.7-linux-x86_64/; tar xf -) 28 | RUN rm -rf /tmp/extension 29 | 30 | RUN --mount=type=bind,target=/work echo 'alias build-docs="./repo docs"' >> ~/.bashrc 31 | -------------------------------------------------------------------------------- /driver-containers/graphics/driver-container-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/driver-containers/graphics/driver-container-demo.gif -------------------------------------------------------------------------------- /driver-containers/graphics/nvidia-driver-container-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/driver-containers/graphics/nvidia-driver-container-image.png -------------------------------------------------------------------------------- /driver-containers/redirected.rst: -------------------------------------------------------------------------------- 1 | Documentation for the driver containers is obsolete. 2 | 3 | Refer to :external+gpuop:doc:`index`. 
-------------------------------------------------------------------------------- /driver-containers/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /driver-containers/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image01.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image02.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image03.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image04.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image05.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image06.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image07.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image08.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image09.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image10.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image10.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image11.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image12.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image13.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image14.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image15.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image16.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image17.png -------------------------------------------------------------------------------- /edge/index.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings (h1/h2/h3/h4/h5) are # * = - 18 | 19 | ########################################### 20 | NVIDIA Cloud Native Reference Architectures 21 | ########################################### 22 | 23 | .. 
toctree:: 24 | :titlesonly: 25 | 26 | nvidia-gpu-with-device-edge 27 | anthos-guide -------------------------------------------------------------------------------- /edge/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /edge/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /gpu-operator/cdi.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ############################################################ 20 | Container Device Interface (CDI) Support in the GPU Operator 21 | ############################################################ 22 | 23 | ************************************ 24 | About the Container Device Interface 25 | ************************************ 26 | 27 | The `Container Device Interface (CDI) `_ 28 | is an open specification for container runtimes that abstracts what access to a device, such as an NVIDIA GPU, means, 29 | and standardizes access across container runtimes. Popular container runtimes can read and process the specification to 30 | ensure that a device is available in a container. CDI simplifies adding support for devices such as NVIDIA GPUs because 31 | the specification is applicable to all container runtimes that support CDI. 32 | 33 | Starting with GPU Operator v25.10.0, CDI is used by default for enabling GPU support in containers running on Kubernetes. 34 | Specifically, CDI support in container runtimes, e.g. containerd and cri-o, is used to inject GPU(s) into workload 35 | containers. This differs from prior GPU Operator releases where CDI was used via a CDI-enabled ``nvidia`` runtime class. 36 | 37 | Use of CDI is transparent to cluster administrators and application developers. 38 | The benefits of CDI are largely to reduce development and support for runtime-specific 39 | plugins. 40 | 41 | ******************************** 42 | Enabling CDI During Installation 43 | ******************************** 44 | 45 | CDI is enabled by default during installation in GPU Operator v25.10.0 and later. 46 | Follow the instructions for installing the Operator with Helm on the :doc:`getting-started` page. 47 | 48 | CDI is also enabled by default during a Helm upgrade to GPU Operator v25.10.0 and later. 
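As an illustrative sketch only, modeled on the Helm commands used elsewhere in this
documentation and assuming the chart exposes a ``cdi.enabled`` value that mirrors
``spec.cdi.enabled`` in the cluster policy, an installation that sets the flag
explicitly looks like the following:

.. code-block:: console

   $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
       --version=${version} \
       --set cdi.enabled=true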
49 | 50 | ******************************* 51 | Enabling CDI After Installation 52 | ******************************* 53 | 54 | CDI is enabled by default in GPU Operator v25.10.0 and later. 55 | Use the following procedure to enable CDI if you disabled CDI during installation. 56 | 57 | .. rubric:: Procedure 58 | 59 | #. Enable CDI by modifying the cluster policy: 60 | 61 | .. code-block:: console 62 | 63 | $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ 64 | -p='[{"op": "replace", "path": "/spec/cdi/enabled", "value":true}]' 65 | 66 | *Example Output* 67 | 68 | .. code-block:: output 69 | 70 | clusterpolicy.nvidia.com/cluster-policy patched 71 | 72 | #. (Optional) Confirm that the container toolkit and device plugin pods restart: 73 | 74 | .. code-block:: console 75 | 76 | $ kubectl get pods -n gpu-operator 77 | 78 | *Example Output* 79 | 80 | .. literalinclude:: ./manifests/output/cdi-get-pods-restart.txt 81 | :language: output 82 | :emphasize-lines: 6,9 83 | 84 | 85 | ************* 86 | Disabling CDI 87 | ************* 88 | 89 | While CDI is the default and recommended mechanism for injecting GPU support into containers, you can 90 | disable CDI and use the legacy NVIDIA Container Toolkit stack instead with the following procedure: 91 | 92 | #. If your nodes use the CRI-O container runtime, then temporarily disable the 93 | GPU Operator validator: 94 | 95 | .. code-block:: console 96 | 97 | $ kubectl label nodes \ 98 | nvidia.com/gpu.deploy.operator-validator=false \ 99 | -l nvidia.com/gpu.present=true \ 100 | --overwrite 101 | 102 | .. tip:: 103 | 104 | You can run ``kubectl get nodes -o wide`` and view the ``CONTAINER-RUNTIME`` 105 | column to determine if your nodes use CRI-O. 106 | 107 | #. Disable CDI by modifying the cluster policy: 108 | 109 | .. code-block:: console 110 | 111 | $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ 112 | -p='[{"op": "replace", "path": "/spec/cdi/enabled", "value":false}]' 113 | 114 | *Example Output* 115 | 116 | .. code-block:: output 117 | 118 | clusterpolicy.nvidia.com/cluster-policy patched 119 | 120 | #. If you temporarily disabled the GPU Operator validator, re-enable the validator: 121 | 122 | .. code-block:: console 123 | 124 | $ kubectl label nodes \ 125 | nvidia.com/gpu.deploy.operator-validator=true \ 126 | nvidia.com/gpu.present=true \ 127 | --overwrite 128 | -------------------------------------------------------------------------------- /gpu-operator/custom-driver-params.rst: -------------------------------------------------------------------------------- 1 | .. Date: Mar 11 2022 2 | .. Author: cdesiniotis 3 | 4 | .. _custom-driver-params: 5 | 6 | Customizing NVIDIA GPU Driver Parameters during Installation 7 | ************************************************************ 8 | 9 | The NVIDIA Driver kernel modules accept a number of parameters which can be used to customize the behavior of the driver. 10 | By default, the GPU Operator loads the kernel modules with default values. 11 | On a machine with the driver already installed, you can list the parameter names and values with the ``cat /proc/driver/nvidia/params`` command. 12 | You can pass custom parameters to the kernel modules that get loaded as part of the 13 | NVIDIA Driver installation (``nvidia``, ``nvidia-modeset``, ``nvidia-uvm``, and ``nvidia-peermem``). 14 | 15 | Configure Custom Driver Parameters 16 | ----------------------------------- 17 | 18 | To pass custom parameters, execute the following steps. 19 | 20 | #. 
Create a configuration file named ``.conf``, where ```` is the name of the kernel module the parameters are for. 21 | The file should contain parameters as key-value pairs -- one parameter per line. 22 | 23 | The following example shows the GPU firmware logging parameter being passed to the ``nvidia`` module. 24 | 25 | .. code-block:: console 26 | 27 | $ cat nvidia.conf 28 | NVreg_EnableGpuFirmwareLogs=2 29 | 30 | #. Create a ``ConfigMap`` for the configuration file. 31 | If multiple modules are being configured, pass multiple files when creating the ``ConfigMap``. 32 | 33 | .. code-block:: console 34 | 35 | $ kubectl create configmap kernel-module-params -n gpu-operator --from-file=nvidia.conf=./nvidia.conf 36 | 37 | #. Install the GPU Operator and set ``driver.kernelModuleConfig.name`` to the name of the ``ConfigMap`` 38 | containing the kernel module parameters. 39 | 40 | .. code-block:: console 41 | 42 | $ helm install --wait --generate-name \ 43 | -n gpu-operator --create-namespace \ 44 | nvidia/gpu-operator \ 45 | --version=${version} \ 46 | --set driver.kernelModuleConfig.name="kernel-module-params" 47 | 48 | ----------------------------------- 49 | Example using ``nvidia-uvm`` module 50 | ----------------------------------- 51 | 52 | This example shows the Heterogeneous Memory Management (HMM) being disabled in the ``nvidia-uvm`` module. 53 | Refer to `Simplifying GPU Application Development with Heterogeneous Memory Management `_ for more information about HMM. 54 | 55 | #. Create a configuration file named ``nvidia-uvm.conf``: 56 | 57 | .. code-block:: console 58 | 59 | $ cat nvidia-uvm.conf 60 | uvm_disable_hmm=1 61 | 62 | 63 | #. Create a ``ConfigMap`` for the configuration file. 64 | If multiple modules are being configured, pass multiple files when creating the ``ConfigMap``. 65 | 66 | .. code-block:: console 67 | 68 | $ kubectl create configmap kernel-module-params -n gpu-operator --from-file=nvidia-uvm.conf=./nvidia-uvm.conf 69 | 70 | #. Install the GPU Operator and set ``driver.kernelModuleConfig.name`` to the name of the ``ConfigMap`` 71 | containing the kernel module parameters. 72 | 73 | .. code-block:: console 74 | 75 | $ helm install --wait --generate-name \ 76 | -n gpu-operator --create-namespace \ 77 | nvidia/gpu-operator \ 78 | --version=${version} \ 79 | --set driver.kernelModuleConfig.name="kernel-module-params" 80 | 81 | #. Verify the parameter has been correctly applied, go to ``/sys/module/nvidia_uvm/parameters/`` on the node: 82 | 83 | .. code-block:: console 84 | 85 | $ ls /sys/module/nvidia_uvm/parameters/ 86 | 87 | *Example Output* 88 | 89 | .. code-block:: output 90 | 91 | ... 92 | uvm_disable_hmm uvm_perf_access_counter_migration_enable uvm_perf_prefetch_min_faults 93 | uvm_downgrade_force_membar_sys uvm_perf_access_counter_threshold uvm_perf_prefetch_threshold 94 | ... 95 | 96 | Then check the value of the parameter: 97 | 98 | .. code-block:: console 99 | 100 | $ cat /sys/module/nvidia_uvm/parameters/uvm_disable_hmm 101 | 102 | *Example Output* 103 | 104 | .. code-block:: output 105 | 106 | Y -------------------------------------------------------------------------------- /gpu-operator/dra-gpus.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | ########################## 6 | NVIDIA DRA Driver for GPUs 7 | ########################## 8 | 9 | .. 
_dra_docs_gpus: 10 | 11 | ************** 12 | GPU allocation 13 | ************** 14 | 15 | Compared to `traditional GPU allocation `_ using coarse-grained count-based requests, the GPU allocation side of this driver enables fine-grained control and powerful features long desired by the community, such as: 16 | 17 | #. Controlled sharing of individual GPUs between multiple pods and/or containers. 18 | #. GPU selection via complex constraints expressed via `CEL `_. 19 | #. Dynamic partitioning. 20 | 21 | To learn more about this part of the driver and about what we are planning to build in the future, have a look at `these release notes `_. 22 | 23 | While the GPU allocation features of this driver can be tried out, they are not yet officially supported. 24 | Hence, the GPU kubelet plugin is currently disabled by default in the Helm chart installation. 25 | 26 | For documentation on how to use and test the current set of GPU allocation features, please head over to the `demo section `_ of the driver's README and to its `quickstart directory `_. 27 | 28 | .. note:: 29 | This part of the NVIDIA DRA Driver for GPUs is in **Technology Preview**. 30 | It is not yet supported in production environments and not yet functionally complete. 31 | Generally speaking, Technology Preview features provide early access to upcoming product features, enabling users to test functionality and provide feedback during the development process. 32 | Technology Preview releases may not have full documentation, and testing is limited. 33 | -------------------------------------------------------------------------------- /gpu-operator/graphics/gpu-operator-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-operator/graphics/gpu-operator-demo.gif -------------------------------------------------------------------------------- /gpu-operator/graphics/nvidia-gpu-operator-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-operator/graphics/nvidia-gpu-operator-image.jpg -------------------------------------------------------------------------------- /gpu-operator/graphics/upgrade-controller-state-machine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-operator/graphics/upgrade-controller-state-machine.png -------------------------------------------------------------------------------- /gpu-operator/index.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ..
toctree:: 20 | :caption: NVIDIA GPU Operator 21 | :titlesonly: 22 | :hidden: 23 | 24 | About the Operator 25 | Install 26 | Upgrade 27 | Uninstall 28 | Platform Support 29 | Release Notes 30 | Troubleshooting 31 | gpu-driver-upgrades.rst 32 | install-gpu-operator-vgpu.rst 33 | install-gpu-operator-nvaie.rst 34 | Security Considerations 35 | 36 | 37 | 38 | .. toctree:: 39 | :caption: Advanced Operator Configuration 40 | :titlesonly: 41 | :hidden: 42 | 43 | Multi-Instance GPU 44 | Time-Slicing GPUs 45 | gpu-operator-rdma.rst 46 | Outdated Kernels 47 | Custom GPU Driver Parameters 48 | precompiled-drivers.rst 49 | GPU Driver CRD 50 | Container Device Interface (CDI) Support 51 | 52 | .. toctree:: 53 | :caption: Sandboxed Workloads 54 | :titlesonly: 55 | :hidden: 56 | 57 | KubeVirt 58 | 59 | .. toctree:: 60 | :caption: Specialized Networks 61 | :titlesonly: 62 | :hidden: 63 | 64 | HTTP Proxy 65 | Air-Gapped Network 66 | Service Mesh 67 | 68 | .. toctree:: 69 | :caption: CSP configurations 70 | :titlesonly: 71 | :hidden: 72 | 73 | Amazon EKS 74 | Azure AKS 75 | Google GKE 76 | 77 | .. toctree:: 78 | :caption: NVIDIA DRA Driver for GPUs 79 | :titlesonly: 80 | :hidden: 81 | 82 | Introduction & Installation 83 | GPUs 84 | ComputeDomains 85 | 86 | .. include:: overview.rst 87 | -------------------------------------------------------------------------------- /gpu-operator/install-gpu-operator-outdated-kernels.rst: -------------------------------------------------------------------------------- 1 | .. Date: Aug 2 2021 2 | .. Author: cdesiniotis 3 | 4 | .. _install-gpu-operator-outdated-kernels: 5 | 6 | Considerations when Installing with Outdated Kernels in Cluster 7 | *************************************************************** 8 | 9 | The ``driver`` container deployed as part of the GPU Operator requires certain packages to be available as part of the driver installation. 10 | On GPU nodes where the running kernel is not the latest, the ``driver`` container may fail to find the right version of these packages 11 | (e.g. kernel-headers, kernel-devel) that correspond to the running kernel version. In the ``driver`` container logs, you will most likely 12 | see the following error message: ``Could not resolve Linux kernel version``. 13 | 14 | In general, upgrading your system to the latest kernel should fix this issue. But if this is not an option, the following is a 15 | workaround to successfully deploy the GPU Operator when GPU nodes in your cluster may not be running the latest kernel. 16 | 17 | Add Archived Package Repositories 18 | ================================= 19 | 20 | The workaround is to find the package archive containing packages for your outdated kernel and to add this repository to the package 21 | manager running inside the ``driver`` container. To achieve this, we can simply mount a repository list file into the ``driver`` container using a ``ConfigMap``. 22 | The ``ConfigMap`` containing the repository list file needs to be created in the ``gpu-operator`` namespace. 23 | 24 | Let us demonstrate this workaround via an example. The system used in this example is running CentOS 7 with an outdated kernel: 25 | 26 | .. code-block:: console 27 | 28 | $ uname -r 29 | 3.10.0-1062.12.1.el7.x86_64 30 | 31 | The official archive for older CentOS packages is https://vault.centos.org/. Typically, most archived CentOS repositories 32 | are found in ``/etc/yum.repos.d/CentOS-Vault.repo`` but they are disabled by default. 
If the appropriate archive repository 33 | was enabled, then the ``driver`` container would resolve the kernel version and be able to install the correct versions 34 | of the prerequisite packages. 35 | 36 | We can simply drop in a replacement of ``/etc/yum.repos.d/CentOS-Vault.repo`` to ensure the appropriate CentOS archive is enabled. 37 | For the kernel running in this example, the ``CentOS-7.7.1908`` archive contains the kernel-headers version we are looking for. 38 | Here is our example drop-in replacement file: 39 | 40 | .. code-block:: 41 | 42 | [C7.7.1908-base] 43 | name=CentOS-7.7.1908 - Base 44 | baseurl=http://vault.centos.org/7.7.1908/os/$basearch/ 45 | gpgcheck=1 46 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 47 | enabled=1 48 | 49 | [C7.7.1908-updates] 50 | name=CentOS-7.7.1908 - Updates 51 | baseurl=http://vault.centos.org/7.7.1908/updates/$basearch/ 52 | gpgcheck=1 53 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 54 | enabled=1 55 | 56 | Once the repo list file is created, we can create a ``ConfigMap`` for it: 57 | 58 | .. code-block:: console 59 | 60 | $ kubectl create configmap repo-config -n gpu-operator --from-file= 61 | 62 | Once the ``ConfigMap`` is created using the above command, update ``values.yaml`` with this information, to let the GPU Operator mount the repo configuration 63 | within the ``driver`` container to pull required packages. 64 | 65 | For Ubuntu: 66 | 67 | .. code-block:: yaml 68 | 69 | driver: 70 | repoConfig: 71 | configMapName: repo-config 72 | destinationDir: /etc/apt/sources.list.d 73 | 74 | For RHEL/Centos/RHCOS: 75 | 76 | .. code-block:: yaml 77 | 78 | driver: 79 | repoConfig: 80 | configMapName: repo-config 81 | destinationDir: /etc/yum.repos.d 82 | 83 | Deploy GPU Operator with updated ``values.yaml``: 84 | 85 | .. code-block:: console 86 | 87 | $ helm install --wait --generate-name \ 88 | -n gpu-operator --create-namespace \ 89 | nvidia/gpu-operator \ 90 | --version=${version} \ 91 | -f values.yaml 92 | 93 | 94 | Check the status of the pods to ensure all the containers are running: 95 | 96 | .. code-block:: console 97 | 98 | $ kubectl get pods -n gpu-operator 99 | -------------------------------------------------------------------------------- /gpu-operator/install-gpu-operator-proxy.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | .. Date: Sep 16 2021 20 | .. Author: cdesiniotis 21 | 22 | .. _install-gpu-operator-proxy: 23 | 24 | Install GPU Operator in Proxy Environments 25 | ****************************************** 26 | 27 | Introduction 28 | ============ 29 | 30 | This page describes how to successfully deploy the GPU Operator in clusters behind an HTTP proxy. 
31 | By default, the GPU Operator requires internet access for the following reasons: 32 | 33 | 1) Container images need to be pulled during GPU Operator installation. 34 | 2) The ``driver`` container needs to download several OS packages prior to driver installation. 35 | 36 | .. tip:: 37 | Using :doc:`precompiled-drivers` removes the need for the ``driver`` containers to 38 | download operating system packages. 39 | 40 | To address these requirements, all Kubernetes nodes as well as the ``driver`` container need proper configuration 41 | in order to direct traffic through the proxy. 42 | 43 | This document demonstrates how to configure the GPU Operator so that the ``driver`` container can successfully 44 | download packages behind a HTTP proxy. Since configuring Kubernetes/container runtime components to use 45 | a proxy is not specific to the GPU Operator, we do not include those instructions here. 46 | 47 | The instructions for Openshift are different, so skip the section titled :ref:`proxy_config_openshift` if you are not running Openshift. 48 | 49 | Prerequisites 50 | ============= 51 | 52 | * Kubernetes cluster is configured with HTTP proxy settings (container runtime should be enabled with HTTP proxy) 53 | 54 | .. _proxy_config_openshift: 55 | 56 | HTTP Proxy Configuration for Openshift 57 | ====================================== 58 | 59 | For Openshift, it is recommended to use the cluster-wide Proxy object to provide proxy information for the cluster. 60 | Follow the procedure described in `Configuring the cluster-wide proxy `_ 61 | from Red Hat Openshift public documentation. The GPU Operator will automatically inject proxy related ENV into the ``driver`` container 62 | based on information present in the cluster-wide Proxy object. 63 | 64 | HTTP Proxy Configuration 65 | ======================== 66 | 67 | First, get the ``values.yaml`` file used for GPU Operator configuration: 68 | 69 | .. code-block:: console 70 | 71 | $ curl -sO https://raw.githubusercontent.com/NVIDIA/gpu-operator/${version}/deployments/gpu-operator/values.yaml 72 | 73 | Specify ``driver.env`` in ``values.yaml`` with appropriate HTTP_PROXY, HTTPS_PROXY, and NO_PROXY environment variables 74 | (in both uppercase and lowercase). 75 | 76 | .. code-block:: yaml 77 | 78 | driver: 79 | env: 80 | - name: HTTPS_PROXY 81 | value: http:// 82 | - name: HTTP_PROXY 83 | value: http:// 84 | - name: NO_PROXY 85 | value: 86 | - name: https_proxy 87 | value: http:// 88 | - name: http_proxy 89 | value: http:// 90 | - name: no_proxy 91 | value: 92 | 93 | .. note:: 94 | 95 | * Proxy related ENV are automatically injected by GPU Operator into the ``driver`` container to indicate proxy information used when downloading necessary packages. 96 | * If HTTPS Proxy server is setup then change the values of HTTPS_PROXY and https_proxy to use ``https`` instead. 97 | 98 | Deploy GPU Operator 99 | =================== 100 | 101 | Download and deploy GPU Operator Helm Chart with the updated ``values.yaml``. 102 | 103 | Fetch the chart from the NGC repository: 104 | 105 | .. code-block:: console 106 | 107 | $ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/gpu-operator-${version}.tgz 108 | 109 | Install the GPU Operator with updated ``values.yaml``: 110 | 111 | .. code-block:: console 112 | 113 | $ helm install --wait gpu-operator \ 114 | -n gpu-operator --create-namespace \ 115 | gpu-operator-${version}.tgz \ 116 | -f values.yaml 117 | 118 | Check the status of the pods to ensure all the containers are running: 119 | 120 | .. 
code-block:: console 121 | 122 | $ kubectl get pods -n gpu-operator 123 | -------------------------------------------------------------------------------- /gpu-operator/install-gpu-operator-service-mesh.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ###################################### 20 | Install GPU Operator with Service Mesh 21 | ###################################### 22 | 23 | 24 | ***************************************** 25 | Special Considerations for Service Meshes 26 | ***************************************** 27 | 28 | You can use NVIDIA GPU Operator in a cluster that uses a service mesh provided by Istio CNI or Linkerd CNI. 29 | 30 | The typical consideration for using the Operator with a service mesh is that the ``k8s-driver-manager`` init container 31 | for the ``driver`` container needs network access to the Kubernetes API server of the cluster. 32 | 33 | The data plane---implemented by Istio CNI or Linkerd CNI as proxies running as sidecar containers---must be running for any pod networking to work. 34 | The proxy sidecar containers start only after the init phase of the pod, so init containers are not able to communicate with the API server. 35 | 36 | To address the connectivity challenge, NVIDIA recommends disabling injection for the GPU Operator namespace. 37 | Refer to the following documentation for more information: 38 | 39 | - `Controlling the injection policy `_ 40 | in the Istio documentation. 41 | - `Overriding injection `_ 42 | in the Linkerd documentation. 43 | 44 | 45 | **************************************** 46 | Label the Namespace to Disable Injection 47 | **************************************** 48 | 49 | - Label the Operator namespace to prevent automatic injection: 50 | 51 | .. code-block:: console 52 | 53 | $ kubectl label namespace gpu-operator istio-injection=disabled 54 | 55 | Or, for Linkerd: 56 | 57 | .. code-block:: console 58 | 59 | $ kubectl label namespace gpu-operator linkerd.io/inject=disabled 60 | 61 | 62 | If the GPU Operator is not already installed, refer to 63 | :doc:`getting-started` 64 | for information about custom options and common installation scenarios. 
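As a quick check, you can confirm that the label was applied before installing or upgrading the Operator. This sketch assumes the Operator namespace is ``gpu-operator``, as in the preceding commands:

.. code-block:: console

   $ kubectl get namespace gpu-operator --show-labels

The output should include ``istio-injection=disabled`` or ``linkerd.io/inject=disabled``, depending on the service mesh in use.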
65 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/amazon-eks-cluster-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: eksctl.io/v1alpha5 2 | kind: ClusterConfig 3 | metadata: 4 | name: demo-cluster 5 | region: us-west-2 6 | version: "1.25" 7 | nodeGroups: 8 | - name: demo-gpu-workers 9 | instanceType: g4dn.xlarge 10 | ami: ami-0770ab88ec35aa875 11 | amiFamily: Ubuntu2004 12 | minSize: 1 13 | desiredCapacity: 3 14 | maxSize: 3 15 | volumeSize: 100 16 | overrideBootstrapCommand: | 17 | #!/bin/bash 18 | source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh 19 | /etc/eks/bootstrap.sh ${CLUSTER_NAME} --container-runtime containerd --kubelet-extra-args "--node-labels=${NODE_LABELS}" 20 | ssh: 21 | allow: true 22 | publicKeyPath: ~/.ssh/id_rsa.pub -------------------------------------------------------------------------------- /gpu-operator/manifests/input/custom-mig-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: custom-mig-config 5 | data: 6 | config.yaml: | 7 | version: v1 8 | mig-configs: 9 | all-disabled: 10 | - devices: all 11 | mig-enabled: false 12 | 13 | five-1g-one-2g: 14 | - devices: all 15 | mig-enabled: true 16 | mig-devices: 17 | "1g.10gb": 5 18 | "2g.20gb": 1 19 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/google-gke-gpu-operator-quota.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ResourceQuota 3 | metadata: 4 | name: gpu-operator-quota 5 | spec: 6 | hard: 7 | pods: 100 8 | scopeSelector: 9 | matchExpressions: 10 | - operator: In 11 | scopeName: PriorityClass 12 | values: 13 | - system-node-critical 14 | - system-cluster-critical -------------------------------------------------------------------------------- /gpu-operator/manifests/input/gpu-direct-rdma-demo-pod-1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: demo-pod-1 5 | annotations: 6 | k8s.v1.cni.cncf.io/networks: demo-macvlannetwork 7 | # If a network with static IPAM is used replace network annotation with the below. 8 | # k8s.v1.cni.cncf.io/networks: '[ 9 | # { "name": "rdma-net", 10 | # "ips": ["192.168.111.101/24"], 11 | # "gateway": ["192.168.111.1"] 12 | # } 13 | # ]' 14 | spec: 15 | nodeSelector: 16 | # Note: Replace hostname or remove selector altogether 17 | kubernetes.io/hostname: nvnode1 18 | restartPolicy: OnFailure 19 | containers: 20 | - image: mellanox/cuda-perftest 21 | name: rdma-gpu-test-ctr 22 | securityContext: 23 | capabilities: 24 | add: [ "IPC_LOCK" ] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 1 28 | rdma/rdma_shared_device_a: 1 29 | requests: 30 | nvidia.com/gpu: 1 31 | rdma/rdma_shared_device_a: 1 -------------------------------------------------------------------------------- /gpu-operator/manifests/input/gpu-direct-rdma-demo-pod-2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: demo-pod-2 5 | annotations: 6 | k8s.v1.cni.cncf.io/networks: demo-macvlannetwork 7 | # If a network with static IPAM is used replace network annotation with the below. 
8 | # k8s.v1.cni.cncf.io/networks: '[ 9 | # { "name": "rdma-net", 10 | # "ips": ["192.168.111.101/24"], 11 | # "gateway": ["192.168.111.1"] 12 | # } 13 | # ]' 14 | spec: 15 | nodeSelector: 16 | # Note: Replace hostname or remove selector altogether 17 | kubernetes.io/hostname: nvnode2 18 | restartPolicy: OnFailure 19 | containers: 20 | - image: mellanox/cuda-perftest 21 | name: rdma-gpu-test-ctr 22 | securityContext: 23 | capabilities: 24 | add: [ "IPC_LOCK" ] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 1 28 | rdma/rdma_shared_device_a: 1 29 | requests: 30 | nvidia.com/gpu: 1 31 | rdma/rdma_shared_device_a: 1 -------------------------------------------------------------------------------- /gpu-operator/manifests/input/mig-cm-values.yaml: -------------------------------------------------------------------------------- 1 | migManager: 2 | config: 3 | name: custom-mig-config 4 | create: true 5 | data: 6 | config.yaml: |- 7 | version: v1 8 | mig-configs: 9 | all-disabled: 10 | - devices: all 11 | mig-enabled: false 12 | custom-mig: 13 | - devices: [0] 14 | mig-enabled: true 15 | mig-devices: 16 | "1g.10gb": 2 17 | "2g.20gb": 2 18 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-all.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-all 5 | spec: 6 | driverType: gpu 7 | image: driver 8 | imagePullPolicy: IfNotPresent 9 | imagePullSecrets: [] 10 | manager: {} 11 | rdma: 12 | enabled: false 13 | useHostMofed: false 14 | gds: 15 | enabled: false 16 | repository: nvcr.io/nvidia 17 | startupProbe: 18 | failureThreshold: 120 19 | initialDelaySeconds: 60 20 | periodSeconds: 10 21 | timeoutSeconds: 60 22 | usePrecompiled: false 23 | version: 535.104.12 24 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-demo-gold.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-gold 5 | spec: 6 | driverType: gpu 7 | gdrcopy: 8 | enabled: false 9 | repository: nvcr.io/nvidia/cloud-native 10 | image: gdrdrv 11 | version: v2.4.1 12 | imagePullPolicy: IfNotPresent 13 | imagePullSecrets: [] 14 | env: [] 15 | args: [] 16 | image: driver 17 | imagePullPolicy: IfNotPresent 18 | imagePullSecrets: [] 19 | kernelModuleType: auto 20 | manager: {} 21 | nodeSelector: 22 | driver.config: "gold" 23 | rdma: 24 | enabled: false 25 | useHostMofed: false 26 | gds: 27 | enabled: false 28 | repository: nvcr.io/nvidia 29 | startupProbe: 30 | failureThreshold: 120 31 | initialDelaySeconds: 60 32 | periodSeconds: 10 33 | timeoutSeconds: 60 34 | usePrecompiled: false 35 | version: 535.104.12 36 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-driver-multiple.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-gold 5 | spec: 6 | driverType: gpu 7 | env: [] 8 | image: driver 9 | imagePullPolicy: IfNotPresent 10 | imagePullSecrets: [] 11 | manager: {} 12 | nodeSelector: 13 | driver.config: "gold" 14 | repository: nvcr.io/nvidia 15 | version: "535.104.12" 16 | --- 17 | apiVersion: nvidia.com/v1alpha1 18 | kind: NVIDIADriver 19 | metadata: 20 | name: demo-silver 21 | spec: 22 | 
driverType: gpu 23 | env: [] 24 | image: driver 25 | imagePullPolicy: IfNotPresent 26 | imagePullSecrets: [] 27 | manager: {} 28 | nodeSelector: 29 | driver.config: "silver" 30 | repository: nvcr.io/nvidia 31 | version: "470.141.10" 32 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-precompiled-all.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-precomp-all 5 | spec: 6 | driverType: gpu 7 | env: [] 8 | image: driver 9 | imagePullPolicy: IfNotPresent 10 | imagePullSecrets: [] 11 | manager: {} 12 | nodeSelector: {} 13 | repository: nvcr.io/nvidia 14 | resources: {} 15 | usePrecompiled: true 16 | version: "535" 17 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-precompiled-some.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-precomp 5 | spec: 6 | driverType: gpu 7 | env: [] 8 | image: driver 9 | imagePullPolicy: IfNotPresent 10 | imagePullSecrets: [] 11 | manager: {} 12 | nodeSelector: 13 | driver.precompiled: "true" 14 | driver.version: "535" 15 | repository: nvcr.io/nvidia 16 | resources: {} 17 | usePrecompiled: true 18 | version: "535" 19 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/tf-notebook.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: tf-notebook 6 | labels: 7 | app: tf-notebook 8 | spec: 9 | type: NodePort 10 | ports: 11 | - port: 80 12 | name: http 13 | targetPort: 8888 14 | nodePort: 30001 15 | selector: 16 | app: tf-notebook 17 | --- 18 | apiVersion: v1 19 | kind: Pod 20 | metadata: 21 | name: tf-notebook 22 | labels: 23 | app: tf-notebook 24 | spec: 25 | securityContext: 26 | fsGroup: 0 27 | containers: 28 | - name: tf-notebook 29 | image: tensorflow/tensorflow:latest-gpu-jupyter 30 | resources: 31 | limits: 32 | nvidia.com/gpu: 1 33 | ports: 34 | - containerPort: 8888 35 | name: notebook -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-config-all.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: time-slicing-config-all 5 | data: 6 | any: |- 7 | version: v1 8 | flags: 9 | migStrategy: none 10 | sharing: 11 | timeSlicing: 12 | resources: 13 | - name: nvidia.com/gpu 14 | replicas: 4 15 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-config-fine.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: time-slicing-config-fine 5 | data: 6 | a100-40gb: |- 7 | version: v1 8 | flags: 9 | migStrategy: mixed 10 | sharing: 11 | timeSlicing: 12 | resources: 13 | - name: nvidia.com/gpu 14 | replicas: 8 15 | - name: nvidia.com/mig-1g.5gb 16 | replicas: 2 17 | - name: nvidia.com/mig-2g.10gb 18 | replicas: 2 19 | - name: nvidia.com/mig-3g.20gb 20 | replicas: 3 21 | - name: nvidia.com/mig-7g.40gb 22 | replicas: 7 23 | tesla-t4: |- 24 | version: v1 25 | flags: 26 | migStrategy: none 27 | sharing: 
28 | timeSlicing: 29 | resources: 30 | - name: nvidia.com/gpu 31 | replicas: 4 32 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-config-sample.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: time-slicing-config 5 | data: 6 | any: |- 7 | version: v1 8 | flags: 9 | migStrategy: none 10 | sharing: 11 | timeSlicing: 12 | renameByDefault: false 13 | failRequestsGreaterThanOne: false 14 | resources: 15 | - name: nvidia.com/gpu 16 | replicas: 4 17 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-verification.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: time-slicing-verification 5 | labels: 6 | app: time-slicing-verification 7 | spec: 8 | replicas: 5 9 | selector: 10 | matchLabels: 11 | app: time-slicing-verification 12 | template: 13 | metadata: 14 | labels: 15 | app: time-slicing-verification 16 | spec: 17 | tolerations: 18 | - key: nvidia.com/gpu 19 | operator: Exists 20 | effect: NoSchedule 21 | hostPID: true 22 | containers: 23 | - name: cuda-sample-vector-add 24 | image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04" 25 | command: ["/bin/bash", "-c", "--"] 26 | args: 27 | - while true; do /cuda-samples/vectorAdd; done 28 | resources: 29 | limits: 30 | nvidia.com/gpu: 1 31 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/cdi-get-pods-restart.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | gpu-feature-discovery-qnw2q 1/1 Running 0 47h 3 | gpu-operator-6d59774ff-hznmr 1/1 Running 0 2d 4 | gpu-operator-node-feature-discovery-master-6d6649d597-7l8bj 1/1 Running 0 2d 5 | gpu-operator-node-feature-discovery-worker-v86vj 1/1 Running 0 2d 6 | nvidia-container-toolkit-daemonset-2768s 1/1 Running 0 2m11s 7 | nvidia-cuda-validator-ls4vc 0/1 Completed 0 47h 8 | nvidia-dcgm-exporter-fxp9h 1/1 Running 0 47h 9 | nvidia-device-plugin-daemonset-dvp4v 1/1 Running 0 2m26s 10 | nvidia-device-plugin-validator-kvxbs 0/1 Completed 0 47h 11 | nvidia-driver-daemonset-m86r7 1/1 Running 0 2d 12 | nvidia-operator-validator-xg98r 1/1 Running 0 47h 13 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/common-cuda-vectoradd-logs.txt: -------------------------------------------------------------------------------- 1 | [Vector addition of 50000 elements] 2 | Copy input data from the host memory to the CUDA device 3 | CUDA kernel launch with 196 blocks of 256 threads 4 | Copy output data from the CUDA device to the host memory 5 | Test PASSED 6 | Done 7 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/mig-get-pods.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | gpu-feature-discovery-qmwb2 1/1 Running 0 14m 3 | gpu-operator-7bbf8bb6b7-xz664 1/1 Running 0 14m 4 | gpu-operator-node-feature-discovery-gc-79d6d968bb-sg4t6 1/1 Running 0 14m 5 | gpu-operator-node-feature-discovery-master-6d9f8d497c-7cwrp 1/1 Running 0 14m 6 | gpu-operator-node-feature-discovery-worker-x5z62 1/1 Running 0 14m 7 | 
nvidia-container-toolkit-daemonset-pkcpr 1/1 Running 0 14m 8 | nvidia-cuda-validator-wt6bc 0/1 Completed 0 12m 9 | nvidia-dcgm-exporter-zsskv 1/1 Running 0 14m 10 | nvidia-device-plugin-daemonset-924x6 1/1 Running 0 14m 11 | nvidia-driver-daemonset-klj5s 1/1 Running 0 14m 12 | nvidia-mig-manager-8d6wz 1/1 Running 0 12m 13 | nvidia-operator-validator-fnsmk 1/1 Running 0 14m 14 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/mig-mixed-nvidia-smi.txt: -------------------------------------------------------------------------------- 1 | GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-b4895dbf-9350-2524-a89b-98161ddd9fe4) 2 | MIG 3g.40gb Device 0: (UUID: MIG-7089d0f3-293f-58c9-8f8c-5ea666eedbde) 3 | MIG 2g.20gb Device 1: (UUID: MIG-56c30729-347f-5dd6-8da0-c3cc59e969e0) 4 | MIG 1g.10gb Device 2: (UUID: MIG-9d14fb21-4ae1-546f-a636-011582899c39) 5 | MIG 1g.10gb Device 3: (UUID: MIG-0f709664-740c-52b0-ae79-3e4c9ede6d3b) 6 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/mig-nvidia-smi.txt: -------------------------------------------------------------------------------- 1 | GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-b4895dbf-9350-2524-a89b-98161ddd9fe4) 2 | MIG 1g.10gb Device 0: (UUID: MIG-3f6f389f-b0cc-5e5c-8e32-eaa8fd067902) 3 | MIG 1g.10gb Device 1: (UUID: MIG-35f93699-4b53-5a19-8289-80b8418eec60) 4 | MIG 1g.10gb Device 2: (UUID: MIG-9d14fb21-4ae1-546f-a636-011582899c39) 5 | MIG 1g.10gb Device 3: (UUID: MIG-0f709664-740c-52b0-ae79-3e4c9ede6d3b) 6 | MIG 1g.10gb Device 4: (UUID: MIG-5d23f73a-d378-50ac-a6f5-3bf5184773bb) 7 | MIG 1g.10gb Device 5: (UUID: MIG-6cea15c7-8a56-578c-b965-0e73cb6dfc10) 8 | MIG 1g.10gb Device 6: (UUID: MIG-981c86e9-3607-57d7-9426-295347e4b925) 9 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/precomp-driver-conventional-running.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | nvidia-driver-daemonset-qwprp 1/1 Running 0 10m 3 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/precomp-driver-running.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | nvidia-driver-daemonset-5.15.0-69-generic-ubuntu22.04-thbts 1/1 Running 0 44s 3 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/precomp-driver-terminating.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | pod/gpu-feature-discovery-pzzr8 2/2 Running 0 19m 3 | pod/gpu-operator-859cb64846-57hfn 1/1 Running 0 47m 4 | pod/gpu-operator-node-feature-discovery-master-6d6649d597-7l8bj 1/1 Running 0 10d 5 | pod/gpu-operator-node-feature-discovery-worker-v86vj 1/1 Running 0 10d 6 | pod/nvidia-container-toolkit-daemonset-6ltbv 1/1 Running 0 19m 7 | pod/nvidia-cuda-validator-62w6r 0/1 Completed 0 17m 8 | pod/nvidia-dcgm-exporter-fh5wz 1/1 Running 0 19m 9 | pod/nvidia-device-plugin-daemonset-rwslh 2/2 Running 0 19m 10 | pod/nvidia-device-plugin-validator-gq4ww 0/1 Completed 0 17m 11 | pod/nvidia-driver-daemonset-xqrxk 1/1 Terminating 0 20m 12 | pod/nvidia-operator-validator-78mzv 1/1 Running 0 19m 13 | 14 | -------------------------------------------------------------------------------- 
/gpu-operator/manifests/output/time-slicing-get-events.txt: -------------------------------------------------------------------------------- 1 | LAST SEEN TYPE REASON OBJECT MESSAGE 2 | 33s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container toolkit-validation 3 | 33s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container toolkit-validation 4 | 33s Normal Started pod/gpu-feature-discovery-rvlg9 Started container toolkit-validation 5 | 33s Normal Created pod/gpu-feature-discovery-rvlg9 Created container toolkit-validation 6 | 33s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1" already present on machine 7 | 33s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1" already present on machine 8 | 32s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container config-manager-init 9 | 32s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 10 | 32s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 11 | 32s Normal Created pod/gpu-feature-discovery-rvlg9 Created container config-manager-init 12 | 32s Normal Started pod/gpu-feature-discovery-rvlg9 Started container config-manager-init 13 | 32s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container config-manager-init 14 | 31s Normal Created pod/gpu-feature-discovery-rvlg9 Created container config-manager 15 | 31s Normal Started pod/gpu-feature-discovery-rvlg9 Started container gpu-feature-discovery 16 | 31s Normal Created pod/gpu-feature-discovery-rvlg9 Created container gpu-feature-discovery 17 | 31s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/gpu-feature-discovery:v0.7.0-ubi8" already present on machine 18 | 31s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container config-manager 19 | 31s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container config-manager 20 | 31s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 21 | 31s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container nvidia-device-plugin 22 | 31s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container nvidia-device-plugin 23 | 31s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 24 | 31s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 25 | 31s Normal Started pod/gpu-feature-discovery-rvlg9 Started container config-manager 26 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/time-slicing-get-pods.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | time-slicing-verification-7cdc7f87c5-lkd9d 1/1 Running 0 23s 3 | time-slicing-verification-7cdc7f87c5-rrzq7 1/1 Running 0 23s 4 | time-slicing-verification-7cdc7f87c5-s8qwk 1/1 Running 0 23s 5 | time-slicing-verification-7cdc7f87c5-xhmb7 1/1 Running 0 23s 6 | time-slicing-verification-7cdc7f87c5-zsncp 1/1 Running 0 23s 7 | 
-------------------------------------------------------------------------------- /gpu-operator/manifests/output/time-slicing-logs-pods.txt: -------------------------------------------------------------------------------- 1 | Found 5 pods, using pod/time-slicing-verification-7cdc7f87c5-s8qwk 2 | [Vector addition of 50000 elements] 3 | Copy input data from the host memory to the CUDA device 4 | CUDA kernel launch with 196 blocks of 256 threads 5 | Copy output data from the CUDA device to the host memory 6 | Test PASSED 7 | Done 8 | [Vector addition of 50000 elements] 9 | Copy input data from the host memory to the CUDA device 10 | CUDA kernel launch with 196 blocks of 256 threads 11 | Copy output data from the CUDA device to the host memory 12 | ... 13 | -------------------------------------------------------------------------------- /gpu-operator/overview.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | 20 | ***************************** 21 | About the NVIDIA GPU Operator 22 | ***************************** 23 | 24 | .. image:: graphics/nvidia-gpu-operator-image.jpg 25 | :width: 600 26 | 27 | Kubernetes provides access to special hardware resources such as NVIDIA GPUs, NICs, Infiniband adapters and other devices 28 | through the `device plugin framework `_. 29 | However, configuring and managing nodes with these hardware resources requires 30 | configuration of multiple software components such as drivers, container runtimes or other libraries which are difficult 31 | and prone to errors. The NVIDIA GPU Operator uses the `operator framework `_ 32 | within Kubernetes to automate the management of all NVIDIA software components needed to provision GPU. These components include the NVIDIA drivers (to enable CUDA), 33 | Kubernetes device plugin for GPUs, the `NVIDIA Container Toolkit `_, 34 | automatic node labeling using `GFD `_, `DCGM `_ based monitoring and others. 35 | 36 | 37 | .. card:: Red Hat OpenShift Container Platform 38 | 39 | For information about installing, managing, and upgrading the Operator, 40 | refer to :external+ocp:doc:`index`. 41 | 42 | Information about supported versions is available in :ref:`Supported Operating Systems and Kubernetes Platforms`. 43 | 44 | 45 | About This Documentation 46 | ======================== 47 | 48 | Browse through the following documents for getting started, platform support and release notes. 49 | 50 | Getting Started 51 | --------------- 52 | 53 | The :ref:`operator-install-guide` guide includes information on installing the GPU Operator in a Kubernetes cluster. 54 | 55 | Release Notes 56 | --------------- 57 | 58 | Refer to :ref:`operator-release-notes` for information about releases. 
59 | 60 | Platform Support 61 | ------------------ 62 | 63 | The :ref:`operator-platform-support` describes the supported platform configurations. 64 | 65 | Licenses and Contributing 66 | ========================= 67 | 68 | .. _pstai: https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/ 69 | .. |pstai| replace:: Product-Specific Terms for NVIDIA AI Products 70 | 71 | The NVIDIA GPU Operator source code is licensed under `Apache 2.0 `__ and 72 | contributions are accepted with a DCO. Refer to the `contributing `_ document for 73 | more information on how to contribute and the release artifacts. 74 | 75 | The base images used by the software might include software that is licensed under open-source licenses such as GPL. 76 | The source code for these components is archived on the CUDA opensource `index `_. 77 | 78 | The following table identifieis the licenses for the Operator and software components. 79 | By installing and using the GPU Operator, you accept the terms and conditions of these licenses. 80 | 81 | .. list-table:: 82 | :header-rows: 1 83 | :widths: 30 10 60 84 | 85 | * - Component 86 | - Artifact Type 87 | - Artifact Licenses 88 | 89 | * - NVIDIA GPU Operator 90 | - Helm Chart 91 | - `Apache 2.0 `__ 92 | 93 | * - NVIDIA GPU Operator 94 | - Image 95 | - |pstai|_ 96 | 97 | * - NVIDIA GPU Feature Discovery 98 | - Image 99 | - |pstai|_ 100 | 101 | * - NVIDIA GPU Driver 102 | - Image 103 | - `License for Customer Use of NVIDIA Software `__ 104 | 105 | |pstai|_ 106 | 107 | * - NVIDIA Container Toolkit 108 | - Image 109 | - |pstai|_ 110 | 111 | * - NVIDIA Kubernetes Device Plugin 112 | - Image 113 | - |pstai|_ 114 | 115 | * - NVIDIA MIG Manager for Kubernetes 116 | - Image 117 | - |pstai|_ 118 | 119 | * - Validator for NVIDIA GPU Operator 120 | - Image 121 | - |pstai|_ 122 | 123 | * - NVIDIA DCGM 124 | - Image 125 | - |pstai|_ 126 | 127 | * - NVIDIA DCGM Exporter 128 | - Image 129 | - |pstai|_ 130 | 131 | * - NVIDIA Driver Manager for Kubernetes 132 | - Image 133 | - |pstai|_ 134 | 135 | * - NVIDIA KubeVirt GPU Device Plugin 136 | - Image 137 | - |pstai|_ 138 | 139 | * - NVIDIA vGPU Device Manager 140 | - Image 141 | - |pstai|_ 142 | 143 | * - NVIDIA GDS Driver 144 | - Image 145 | - `License for Customer Use of NVIDIA Software `__ 146 | 147 | |pstai|_ 148 | 149 | * - NVIDIA Confidential Computing 150 | Manager for Kubernetes 151 | - Image 152 | - |pstai|_ 153 | 154 | * - NVIDIA Kata Manager for Kubernetes 155 | - Image 156 | - |pstai|_ 157 | 158 | * - NVIDIA GDRCopy Driver 159 | - Image 160 | - |pstai|_ -------------------------------------------------------------------------------- /gpu-operator/security.rst: -------------------------------------------------------------------------------- 1 | 2 | ***************************** 3 | Security Considerations 4 | ***************************** 5 | 6 | 7 | Pod Security Context of the Operator and Operands 8 | ================================================= 9 | 10 | Several of the NVIDIA GPU Operator operands, such as the driver containers and container toolkit, 11 | require the following elevated privileges: 12 | 13 | - ``privileged: true`` 14 | - ``hostPID: true`` 15 | - ``hostIPC: true`` 16 | 17 | The elevated privileges are required for the following reasons: 18 | 19 | - Access to the host file system and hardware devices, such as NVIDIA GPUs. 20 | - Restart system services such as containerd. 21 | - Loading and unloading kernel modules. 
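For illustration only, the following fragment sketches the shape of these settings in a pod specification. It is not the exact manifest that the Operator renders; the details vary by component and release:

.. code-block:: yaml

   # Illustrative sketch only -- not the manifest rendered by the Operator.
   spec:
     hostPID: true
     hostIPC: true
     containers:
       - name: example-operand        # hypothetical container name
         securityContext:
           privileged: true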
22 | 23 | Only the Kubernetes cluster administrator needs to access or manage the Operator namespace. 24 | As a best practice, establish proper security policies and prevent any other users from accessing the Operator namespace. 25 | 26 | 27 | CVEs 28 | ================================================= 29 | 30 | The following is a list of known CVEs in the GPU Operator or its operands. 31 | To view any published security bulletins for NVIDIA products published security bulletins for NVIDIA products, refer to the NVIDIA product security page at https://www.nvidia.com/en-us/security/. 32 | 33 | .. list-table:: CVEs 34 | :widths: 20 45 35 35 | :header-rows: 1 36 | 37 | * - CVE ID 38 | - Affected Components 39 | - Fixed Version 40 | 41 | * - `NVIDIA CVE-2025-23359 `_ 42 | - NVIDIA Container Toolkit, all versions up to and including 1.17.3 43 | 44 | NVIDIA GPU Operator, all versions up to and including 24.9.1 45 | - NVIDIA Container Toolkit 1.17.4 46 | 47 | NVIDIA GPU Operator 24.9.2 48 | 49 | * - `NVIDIA CVE-2024-0135 `_ 50 | - NVIDIA Container Toolkit, all versions up to and including 1.17.2 51 | 52 | NVIDIA GPU Operator, all versions up to and including 24.9.0 53 | - NVIDIA Container Toolkit 1.17.3 54 | 55 | NVIDIA GPU Operator 24.9.1 56 | 57 | * - `NVIDIA CVE-2024-0136 `_ 58 | - NVIDIA Container Toolkit, all versions up to and including 1.17.2 59 | 60 | NVIDIA GPU Operator, all versions up to and including 24.9.0 61 | - NVIDIA Container Toolkit 1.17.3 62 | 63 | NVIDIA GPU Operator 24.9.1 64 | 65 | * - `NVIDIA CVE-2024-0137 `_ 66 | - NVIDIA Container Toolkit, all versions up to and including 1.17.2 67 | 68 | NVIDIA GPU Operator, all versions up to and including 24.9.0 69 | - NVIDIA Container Toolkit 1.17.3 70 | 71 | NVIDIA GPU Operator 24.9.1 72 | 73 | * - `NVIDIA CVE-2024-0134 `_ 74 | - NVIDIA Container Toolkit, all versions up to and including 1.16.2 75 | 76 | NVIDIA GPU Operator, all versions up to and including 24.6.2 77 | - NVIDIA Container Toolkit 1.17.0 78 | 79 | NVIDIA GPU Operator 24.9.0 80 | 81 | * - `NVIDIA CVE-2024-0132 `_ 82 | - NVIDIA Container Toolkit, all versions up to and including 1.16.1 83 | 84 | NVIDIA GPU Operator, all versions up to and including 24.6.1 85 | - NVIDIA Container Toolkit 1.16.2 86 | 87 | NVIDIA GPU Operator 24.6.2 88 | * - `NVIDIA CVE-2024-0133 `_ 89 | - NVIDIA Container Toolkit, all versions up to and including 1.16.1 90 | 91 | NVIDIA GPU Operator, all versions up to and including 24.6.1 92 | - NVIDIA Container Toolkit 1.16.2 93 | 94 | NVIDIA GPU Operator 24.6.2 95 | 96 | Report a Vulnerability 97 | ----------------------------- 98 | 99 | For details on reporting a suspected vulnerability, refer to the `GPU Operator Security policies `_ page. 100 | -------------------------------------------------------------------------------- /gpu-operator/uninstall.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ############################# 20 | Uninstalling the GPU Operator 21 | ############################# 22 | 23 | Perform the following steps to uninstall the Operator. 24 | 25 | #. Optional: List and delete NVIDIA driver custom resources. 26 | 27 | .. code-block:: console 28 | 29 | $ kubectl get nvidiadrivers 30 | 31 | *Example Output* 32 | 33 | .. code-block:: output 34 | 35 | NAME STATUS AGE 36 | demo-gold ready 2023-10-16T17:57:12Z 37 | demo-silver ready 2023-10-16T17:57:12Z 38 | 39 | .. code-block:: console 40 | 41 | $ kubectl delete nvidiadriver demo-gold 42 | $ kubectl delete nvidiadriver demo-silver 43 | 44 | .. code-block:: console 45 | 46 | $ kubectl delete crd nvidiadrivers.nvidia.com 47 | 48 | #. Delete the Operator: 49 | 50 | .. code-block:: console 51 | 52 | $ helm delete -n gpu-operator $(helm list -n gpu-operator | grep gpu-operator | awk '{print $1}') 53 | 54 | #. Optional: List the pods in the Operator namespace to confirm the pods are deleted or in the process of deleting: 55 | 56 | .. code-block:: console 57 | 58 | $ kubectl get pods -n gpu-operator 59 | 60 | *Example Output* 61 | 62 | .. code-block:: output 63 | 64 | No resources found. 65 | 66 | By default, Helm does not `support deleting existing CRDs `__ 67 | when you delete the chart. 68 | As a result, the ``clusterpolicy`` CRD and ``nvidiadrivers`` CRD will still remain, by default. 69 | 70 | .. code-block:: console 71 | 72 | $ kubectl get crd clusterpolicies.nvidia.com 73 | 74 | To overcome this, the Operator uses a `post-delete hook `__ 75 | to perform the CRD cleanup. 76 | The ``operator.cleanupCRD`` chart parameter is added to enable this hook. 77 | This parameter is disabled by default. 78 | You can enable the hook by specifying ``--set operator.cleanupCRD=true`` during install or upgrade to perform automatic CRD cleanup on chart deletion. 79 | 80 | Alternatively, you can delete the custom resource definition: 81 | 82 | .. code-block:: console 83 | 84 | $ kubectl delete crd clusterpolicies.nvidia.com 85 | 86 | .. note:: 87 | 88 | * After uninstalling the Operator, the NVIDIA driver modules might still be loaded. 89 | Either reboot the node or unload them using the following command: 90 | 91 | .. code-block:: console 92 | 93 | $ sudo rmmod nvidia_modeset nvidia_uvm nvidia 94 | 95 | * Helm hooks used with the GPU Operator use the Operator image itself. 96 | If the Operator image cannot be pulled successfully (either due to network error or an invalid NGC registry secret in case of NVAIE), hooks will fail. 97 | In this case, delete the chart and specify the ``--no-hooks`` argument to avoid hanging on hook failures. 
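For example, the following sketch reuses the release-name lookup from the earlier deletion step and adds the ``--no-hooks`` argument:

.. code-block:: console

   $ helm delete --no-hooks -n gpu-operator \
       $(helm list -n gpu-operator | grep gpu-operator | awk '{print $1}')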
98 | -------------------------------------------------------------------------------- /gpu-operator/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "25.10", 3 | "versions": 4 | [ 5 | { 6 | "version": "25.10" 7 | }, 8 | { 9 | "version": "25.3" 10 | }, 11 | { 12 | "version": "24.9" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /gpu-operator/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../25.10", 5 | "version": "25.10" 6 | }, 7 | { 8 | "url": "../25.3", 9 | "version": "25.3" 10 | }, 11 | { 12 | "url": "../24.9", 13 | "version": "24.9" 14 | } 15 | ] 16 | -------------------------------------------------------------------------------- /gpu-telemetry/about-telemetry.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings (h1/h2/h3/h4/h5) are # * = - 18 | 19 | ################### 20 | About GPU Telemetry 21 | ################### 22 | 23 | Monitoring stacks usually consist of a collector, a time-series database to store metrics and a visualization layer. 24 | A popular open-source stack is `Prometheus `_ used along with `Grafana `_ as 25 | the visualization tool to create rich dashboards. Prometheus also includes an `Alertmanager `_, 26 | to create and manage alerts. Prometheus is deployed along with `kube-state-metrics `_ and 27 | `node_exporter `_ to expose cluster-level metrics for Kubernetes API objects and node-level 28 | metrics such as CPU utilization. 29 | 30 | An architecture of Prometheus is shown in the figure below: 31 | 32 | .. image:: https://boxboat.com/2019/08/08/monitoring-kubernetes-with-prometheus/prometheus-architecture.png 33 | :width: 800 34 | 35 | 36 | To gather GPU telemetry in Kubernetes, its recommended to use DCGM Exporter. DCGM Exporter, based on `DCGM `_ exposes 37 | GPU metrics for Prometheus and can be visualized using Grafana. DCGM Exporter is architected to take advantage of 38 | ``KubeletPodResources`` `API `_ and exposes GPU metrics in a format that can be 39 | scraped by Prometheus. A ``ServiceMonitor`` is also included to expose endpoints. 
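As a point of reference, you can query the exporter's metrics endpoint directly from a GPU node. This is a minimal sketch that assumes the default DCGM Exporter listen port of ``9400``; metric names and labels vary by GPU and configuration:

.. code-block:: console

   $ curl -s localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL

The output lists Prometheus-format samples, such as the ``DCGM_FI_DEV_GPU_UTIL`` utilization gauge, that Prometheus scrapes through the ``ServiceMonitor``.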
40 | -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/001-dcgm-e2e-prom-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/001-dcgm-e2e-prom-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/002-dcgm-e2e-grafana-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/002-dcgm-e2e-grafana-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/003-dcgm-e2e-grafana-home-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/003-dcgm-e2e-grafana-home-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/004-dcgm-e2e-grafana-manage-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/004-dcgm-e2e-grafana-manage-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/005-dcgm-e2e-grafana-import-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/005-dcgm-e2e-grafana-import-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/006-dcgm-e2e-grafana-import-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/006-dcgm-e2e-grafana-import-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/007-dcgm-e2e-grafana-import-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/007-dcgm-e2e-grafana-import-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/008-dcgm-e2e-grafana-dashboard-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/008-dcgm-e2e-grafana-dashboard-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/009-dcgm-e2e-deepstream-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/009-dcgm-e2e-deepstream-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/010-dcgm-e2e-deepstream-screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/010-dcgm-e2e-deepstream-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/011-dcgm-e2e-prom-dashboard-metrics-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/011-dcgm-e2e-prom-dashboard-metrics-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-exporter-bare-metal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-exporter-bare-metal.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-exporter-containers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-exporter-containers.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-exporter_embedded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-exporter_embedded.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm_and_dcgm-exporter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm_and_dcgm-exporter.png -------------------------------------------------------------------------------- /gpu-telemetry/index.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings (h1/h2/h3/h4/h5) are # * = - 18 | 19 | .. toctree:: 20 | :caption: DCGM Exporter 21 | :titlesonly: 22 | :hidden: 23 | 24 | about-telemetry 25 | dcgm-exporter 26 | integrating-telemetry-kubernetes 27 | kube-prometheus 28 | 29 | 30 | .. 
include:: about-telemetry.rst 31 | :start-line: 18 32 | -------------------------------------------------------------------------------- /gpu-telemetry/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /gpu-telemetry/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /kubernetes/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /kubernetes/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /mig/mig.rst: -------------------------------------------------------------------------------- 1 | .. Date: April 26 2021 2 | .. Author: pramarao 3 | 4 | .. headings (h1/h2/h3/h4/h5) are # * - = 5 | 6 | .. _mig-landing: 7 | 8 | #################### 9 | Multi-Instance GPU 10 | #################### 11 | 12 | ************* 13 | Introduction 14 | ************* 15 | 16 | The new Multi-Instance GPU (MIG) feature allows GPUs based on the NVIDIA Ampere architecture 17 | (such as NVIDIA A100) to be securely partitioned into up to seven separate GPU Instances for 18 | CUDA applications, providing multiple users with separate GPU resources for optimal GPU 19 | utilization. This feature is particularly beneficial for workloads that do not fully saturate 20 | the GPU’s compute capacity and therefore users may want to run different workloads in parallel 21 | to maximize utilization. 22 | 23 | Refer to the `MIG User Guide `_ 24 | for more details on the technical concepts, setting up and using MIG on NVIDIA Ampere GPUs. 
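As a brief sketch of what partitioning looks like on a single node (the GPU index and profile IDs below are examples only and vary by GPU model; the MIG User Guide is the authoritative reference), MIG is managed with ``nvidia-smi``:

.. code-block:: console

   $ sudo nvidia-smi -i 0 -mig 1       # enable MIG mode on GPU 0
   $ sudo nvidia-smi mig -lgip         # list the GPU instance profiles that the GPU supports
   $ sudo nvidia-smi mig -cgi 9,9 -C   # create GPU instances (and default compute instances) from listed profile IDs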
25 | 26 | 27 | -------------------------------------------------------------------------------- /openshift/appendix-ocp.rst: -------------------------------------------------------------------------------- 1 | .. Date: November 17 2021 2 | .. Author: kquinn 3 | 4 | .. _ocp-appendix: 5 | 6 | ********** 7 | Appendix 8 | ********** 9 | 10 | .. _cluster-entitlement: 11 | 12 | Entitled NVIDIA Driver Builds No Longer Supported 13 | ================================================= 14 | 15 | Introduction 16 | ------------- 17 | 18 | .. important:: 19 | 20 | **Entitled NVIDIA driver builds are deprecated and not supported starting with Red Hat OpenShift 4.10.** 21 | 22 | The Driver Toolkit (DTK) enables entitlement-free deployments of the GPU Operator. In the past, entitled builds were used pre-DTK and for some OpenShift versions where Driver Toolkit images were broken. 23 | 24 | If you encounter the :ref:`"broken driver toolkit detected" ` warning on OpenShift 4.10 or later, you should :ref:`troubleshoot ` to find the root cause instead of falling back to entitled driver builds. 25 | 26 | If the broken DTK warning is encountered on an older version of OpenShift, refer to the documentation for an older version of the NVIDIA GPU Operator to enable entitled builds. Keep in mind that older versions of OpenShift might no longer be supported. 27 | 28 | .. _broken-dtk-troubleshooting: 29 | 30 | Troubleshooting Broken Driver Toolkit Errors 31 | -------------------------------------------- 32 | 33 | The most likely reason for the broken DTK message is Node Feature Discovery (NFD) not working correctly. NFD might be disabled, failing, or not updating the kernel version label for other reasons. Another cause might be a missing or incomplete DTK image stream, for example, because of broken mirroring. 34 | 35 | Follow these steps for initial troubleshooting of Node Feature Discovery: 36 | 37 | #. **Check Node Feature Discovery (NFD) status:** 38 | 39 | .. code-block:: console 40 | 41 | $ oc get pods -n openshift-nfd 42 | 43 | Ensure NFD pods are running and healthy. If NFD is not deployed or is failing, this can cause DTK issues. 44 | 45 | #. **Verify kernel version labels are present and correct:** 46 | 47 | .. code-block:: console 48 | 49 | $ oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{":\t"}{.metadata.labels.feature\.node\.kubernetes\.io/kernel-version\.full}{"\n"}{end}' 50 | 51 | Ensure nodes have proper kernel version labels that match the current OpenShift version of the cluster. 52 | 53 | #. **Check Driver Toolkit image stream:** 54 | 55 | .. code-block:: console 56 | 57 | $ oc get -n openshift is/driver-toolkit 58 | 59 | Verify the driver-toolkit image stream exists and has the correct tags that correspond to the current OpenShift version. 60 | 61 | For additional troubleshooting resources: 62 | 63 | * `Node Feature Discovery documentation `_. 64 | * `Red Hat Node Feature Discovery Operator documentation `_ 65 | * `OpenShift Driver Toolkit documentation `_ 66 | * `OpenShift Driver Toolkit GitHub repository `_ 67 | * `OpenShift troubleshooting guide `_ 68 | -------------------------------------------------------------------------------- /openshift/clean-up.rst: -------------------------------------------------------------------------------- 1 | .. Date: September 01 2021 2 | .. Author: kquinn 3 | 4 | .. 
_clean-up: 5 | 6 | ***************************************** 7 | Cleanup 8 | ***************************************** 9 | This section describes how to clean up (remove) the GPU Operator if it is no longer needed. 10 | 11 | #. Delete the NVIDIA GPU Operator from the cluster following the guidance outlined in `Deleting Operators from a cluster `_. 12 | 13 | #. Delete the cluster policy by using the OpenShift Container Platform CLI. 14 | 15 | .. code-block:: console 16 | 17 | $ oc delete crd clusterpolicies.nvidia.com 18 | 19 | .. code-block:: console 20 | 21 | customresourcedefinition.apiextensions.k8s.io "clusterpolicies.nvidia.com" deleted 22 | -------------------------------------------------------------------------------- /openshift/download/0003-cluster-wide-machineconfigs.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: machineconfiguration.openshift.io/v1 2 | kind: MachineConfig 3 | metadata: 4 | labels: 5 | machineconfiguration.openshift.io/role: worker 6 | name: 50-rhsm-conf 7 | spec: 8 | config: 9 | ignition: 10 | version: 2.2.0 11 | storage: 12 | files: 13 | - contents: 14 | source: data:text/plain;charset=utf-8;base64,IyBSZWQgSGF0IFN1YnNjcmlwdGlvbiBNYW5hZ2VyIENvbmZpZ3VyYXRpb24gRmlsZToKCiMgVW5pZmllZCBFbnRpdGxlbWVudCBQbGF0Zm9ybSBDb25maWd1cmF0aW9uCltzZXJ2ZXJdCiMgU2VydmVyIGhvc3RuYW1lOgpob3N0bmFtZSA9IHN1YnNjcmlwdGlvbi5yaHNtLnJlZGhhdC5jb20KCiMgU2VydmVyIHByZWZpeDoKcHJlZml4ID0gL3N1YnNjcmlwdGlvbgoKIyBTZXJ2ZXIgcG9ydDoKcG9ydCA9IDQ0MwoKIyBTZXQgdG8gMSB0byBkaXNhYmxlIGNlcnRpZmljYXRlIHZhbGlkYXRpb246Cmluc2VjdXJlID0gMAoKIyBTZXQgdGhlIGRlcHRoIG9mIGNlcnRzIHdoaWNoIHNob3VsZCBiZSBjaGVja2VkCiMgd2hlbiB2YWxpZGF0aW5nIGEgY2VydGlmaWNhdGUKc3NsX3ZlcmlmeV9kZXB0aCA9IDMKCiMgYW4gaHR0cCBwcm94eSBzZXJ2ZXIgdG8gdXNlCnByb3h5X2hvc3RuYW1lID0KCiMgVGhlIHNjaGVtZSB0byB1c2UgZm9yIHRoZSBwcm94eSB3aGVuIHVwZGF0aW5nIHJlcG8gZGVmaW5pdGlvbnMsIGlmIG5lZWRlZAojIGUuZy4gaHR0cCBvciBodHRwcwpwcm94eV9zY2hlbWUgPSBodHRwCgojIHBvcnQgZm9yIGh0dHAgcHJveHkgc2VydmVyCnByb3h5X3BvcnQgPQoKIyB1c2VyIG5hbWUgZm9yIGF1dGhlbnRpY2F0aW5nIHRvIGFuIGh0dHAgcHJveHksIGlmIG5lZWRlZApwcm94eV91c2VyID0KCiMgcGFzc3dvcmQgZm9yIGJhc2ljIGh0dHAgcHJveHkgYXV0aCwgaWYgbmVlZGVkCnByb3h5X3Bhc3N3b3JkID0KCiMgaG9zdC9kb21haW4gc3VmZml4IGJsYWNrbGlzdCBmb3IgcHJveHksIGlmIG5lZWRlZApub19wcm94eSA9CgpbcmhzbV0KIyBDb250ZW50IGJhc2UgVVJMOgpiYXNldXJsID0gaHR0cHM6Ly9jZG4ucmVkaGF0LmNvbQoKIyBSZXBvc2l0b3J5IG1ldGFkYXRhIEdQRyBrZXkgVVJMOgpyZXBvbWRfZ3BnX3VybCA9CgojIFNlcnZlciBDQSBjZXJ0aWZpY2F0ZSBsb2NhdGlvbjoKY2FfY2VydF9kaXIgPSAvZXRjL3Joc20vY2EvCgojIERlZmF1bHQgQ0EgY2VydCB0byB1c2Ugd2hlbiBnZW5lcmF0aW5nIHl1bSByZXBvIGNvbmZpZ3M6CnJlcG9fY2FfY2VydCA9ICUoY2FfY2VydF9kaXIpc3JlZGhhdC11ZXAucGVtCgojIFdoZXJlIHRoZSBjZXJ0aWZpY2F0ZXMgc2hvdWxkIGJlIHN0b3JlZApwcm9kdWN0Q2VydERpciA9IC9ldGMvcGtpL3Byb2R1Y3QKZW50aXRsZW1lbnRDZXJ0RGlyID0gL2V0Yy9wa2kvZW50aXRsZW1lbnQKY29uc3VtZXJDZXJ0RGlyID0gL2V0Yy9wa2kvY29uc3VtZXIKCiMgTWFuYWdlIGdlbmVyYXRpb24gb2YgeXVtIHJlcG9zaXRvcmllcyBmb3Igc3Vic2NyaWJlZCBjb250ZW50OgptYW5hZ2VfcmVwb3MgPSAxCgojIFJlZnJlc2ggcmVwbyBmaWxlcyB3aXRoIHNlcnZlciBvdmVycmlkZXMgb24gZXZlcnkgeXVtIGNvbW1hbmQKZnVsbF9yZWZyZXNoX29uX3l1bSA9IDAKCiMgSWYgc2V0IHRvIHplcm8sIHRoZSBjbGllbnQgd2lsbCBub3QgcmVwb3J0IHRoZSBwYWNrYWdlIHByb2ZpbGUgdG8KIyB0aGUgc3Vic2NyaXB0aW9uIG1hbmFnZW1lbnQgc2VydmljZS4KcmVwb3J0X3BhY2thZ2VfcHJvZmlsZSA9IDEKCiMgVGhlIGRpcmVjdG9yeSB0byBzZWFyY2ggZm9yIHN1YnNjcmlwdGlvbiBtYW5hZ2VyIHBsdWdpbnMKcGx1Z2luRGlyID0gL3Vzci9zaGFyZS9yaHNtLXBsdWdpbnMKCiMgVGhlIGRpcmVjdG9yeSB0byBzZWFyY2ggZm9yIHBsdWdpbiBjb25maWd1cmF0aW9uIGZpbGVzCnBsdWdpbkNvbmZEaXIgPSAvZXRjL3Joc20vcGx1Z2luY29uZi5kCgojIE1hbmFnZSBhdXRv
bWF0aWMgZW5hYmxpbmcgb2YgeXVtL2RuZiBwbHVnaW5zIChwcm9kdWN0LWlkLCBzdWJzY3JpcHRpb24tbWFuYWdlcikKYXV0b19lbmFibGVfeXVtX3BsdWdpbnMgPSAxCgojIFJ1biB0aGUgcGFja2FnZSBwcm9maWxlIG9uIGVhY2ggeXVtL2RuZiB0cmFuc2FjdGlvbgpwYWNrYWdlX3Byb2ZpbGVfb25fdHJhbnMgPSAwCgojIElub3RpZnkgaXMgdXNlZCBmb3IgbW9uaXRvcmluZyBjaGFuZ2VzIGluIGRpcmVjdG9yaWVzIHdpdGggY2VydGlmaWNhdGVzLgojIEN1cnJlbnRseSBvbmx5IHRoZSAvZXRjL3BraS9jb25zdW1lciBkaXJlY3RvcnkgaXMgbW9uaXRvcmVkIGJ5IHRoZQojIHJoc20uc2VydmljZS4gV2hlbiB0aGlzIGRpcmVjdG9yeSBpcyBtb3VudGVkIHVzaW5nIGEgbmV0d29yayBmaWxlIHN5c3RlbQojIHdpdGhvdXQgaW5vdGlmeSBub3RpZmljYXRpb24gc3VwcG9ydCAoZS5nLiBORlMpLCB0aGVuIGRpc2FibGluZyBpbm90aWZ5CiMgaXMgc3Ryb25nbHkgcmVjb21tZW5kZWQuIFdoZW4gaW5vdGlmeSBpcyBkaXNhYmxlZCwgcGVyaW9kaWNhbCBkaXJlY3RvcnkKIyBwb2xsaW5nIGlzIHVzZWQgaW5zdGVhZC4KaW5vdGlmeSA9IDEKCltyaHNtY2VydGRdCiMgSW50ZXJ2YWwgdG8gcnVuIGNlcnQgY2hlY2sgKGluIG1pbnV0ZXMpOgpjZXJ0Q2hlY2tJbnRlcnZhbCA9IDI0MAojIEludGVydmFsIHRvIHJ1biBhdXRvLWF0dGFjaCAoaW4gbWludXRlcyk6CmF1dG9BdHRhY2hJbnRlcnZhbCA9IDE0NDAKIyBJZiBzZXQgdG8gemVybywgdGhlIGNoZWNrcyBkb25lIGJ5IHRoZSByaHNtY2VydGQgZGFlbW9uIHdpbGwgbm90IGJlIHNwbGF5ZWQgKHJhbmRvbWx5IG9mZnNldCkKc3BsYXkgPSAxCiMgSWYgc2V0IHRvIDEsIHJoc21jZXJ0ZCB3aWxsIG5vdCBleGVjdXRlLgpkaXNhYmxlID0gMAoKW2xvZ2dpbmddCmRlZmF1bHRfbG9nX2xldmVsID0gSU5GTwojIHN1YnNjcmlwdGlvbl9tYW5hZ2VyID0gREVCVUcKIyBzdWJzY3JpcHRpb25fbWFuYWdlci5tYW5hZ2VyY2xpID0gREVCVUcKIyByaHNtID0gREVCVUcKIyByaHNtLmNvbm5lY3Rpb24gPSBERUJVRwojIHJoc20tYXBwID0gREVCVUcKIyByaHNtLWFwcC5yaHNtZCA9IERFQlVHCg== 15 | filesystem: root 16 | mode: 0644 17 | path: /etc/rhsm/rhsm.conf 18 | --- 19 | apiVersion: machineconfiguration.openshift.io/v1 20 | kind: MachineConfig 21 | metadata: 22 | labels: 23 | machineconfiguration.openshift.io/role: worker 24 | name: 50-entitlement-pem 25 | spec: 26 | config: 27 | ignition: 28 | version: 2.2.0 29 | storage: 30 | files: 31 | - contents: 32 | source: data:text/plain;charset=utf-8;base64,BASE64_ENCODED_PEM_FILE 33 | filesystem: root 34 | mode: 0644 35 | path: /etc/pki/entitlement/entitlement.pem 36 | --- 37 | apiVersion: machineconfiguration.openshift.io/v1 38 | kind: MachineConfig 39 | metadata: 40 | labels: 41 | machineconfiguration.openshift.io/role: worker 42 | name: 50-entitlement-key-pem 43 | spec: 44 | config: 45 | ignition: 46 | version: 2.2.0 47 | storage: 48 | files: 49 | - contents: 50 | source: data:text/plain;charset=utf-8;base64,BASE64_ENCODED_PEM_FILE 51 | filesystem: root 52 | mode: 0644 53 | path: /etc/pki/entitlement/entitlement-key.pem 54 | -------------------------------------------------------------------------------- /openshift/enable-gpu-monitoring-dashboard.rst: -------------------------------------------------------------------------------- 1 | .. Date: August 27 2023 2 | .. Author: empovit 3 | 4 | .. _enable-gpu-monitoring-dashboard: 5 | 6 | ##################################### 7 | Enabling the GPU Monitoring Dashboard 8 | ##################################### 9 | 10 | The GPU Operator exposes GPU telemetry for Prometheus by using the NVIDIA DCGM Exporter. 11 | These metrics can be visualized using a monitoring dashboard based on Grafana. 12 | 13 | Perform the following procedure to add the dashboard to the **Observe** section of the OpenShift Container Platform web console. 14 | 15 | 16 | ************* 17 | Prerequisites 18 | ************* 19 | 20 | * Your cluster uses OpenShift Container Platform 4.10 or higher. 21 | * You have access to the cluster as a user with the ``cluster-admin`` cluster role. 
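One way to confirm these prerequisites before you begin (shown for illustration only; any equivalent check works) is with the OpenShift CLI:

.. code-block:: console

   $ oc version
   $ oc auth can-i '*' '*' --all-namespaces

The second command returns ``yes`` when the logged-in user has cluster-wide administrative permissions.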
22 | 23 | 24 | ********************************************** 25 | Configuring the NVIDIA DCGM Exporter Dashboard 26 | ********************************************** 27 | 28 | #. Download the latest NVIDIA DCGM Exporter Dashboard from the DCGM Exporter repository on GitHub: 29 | 30 | .. code-block:: console 31 | 32 | $ curl -LfO https://github.com/NVIDIA/dcgm-exporter/raw/main/grafana/dcgm-exporter-dashboard.json 33 | 34 | #. Create a config map from the downloaded file in the ``openshift-config-managed`` namespace: 35 | 36 | .. code-block:: console 37 | 38 | $ oc create configmap nvidia-dcgm-exporter-dashboard -n openshift-config-managed --from-file=dcgm-exporter-dashboard.json 39 | 40 | #. Label the config map to expose the dashboard in the **Administrator** perspective of the web console: 41 | 42 | .. code-block:: console 43 | 44 | $ oc label configmap nvidia-dcgm-exporter-dashboard -n openshift-config-managed "console.openshift.io/dashboard=true" 45 | 46 | #. Optional: Label the config map to expose the dashboard in the **Developer** perspecitive of the web console: 47 | 48 | .. code-block:: console 49 | 50 | $ oc label configmap nvidia-dcgm-exporter-dashboard -n openshift-config-managed "console.openshift.io/odc-dashboard=true" 51 | 52 | #. View the created resource and verify the labels: 53 | 54 | .. code-block:: console 55 | 56 | $ oc -n openshift-config-managed get cm nvidia-dcgm-exporter-dashboard --show-labels 57 | 58 | 59 | ################### 60 | Viewing GPU Metrics 61 | ################### 62 | 63 | - In the OpenShift Container Platform web console from the side menu, switch to the **Administrator** perspective, then navigate to 64 | **Observe** > **Dashboards** and select **NVIDIA DCGM Exporter Dashboard** from the **Dashboard** list. 65 | 66 | If the dashboard was added to the **Developer** perspective, in the OpenShift Container Platform web console from the side menu, switch to 67 | the **Developer** perspective, navigate to **Observe** > **Dashboard** and select **NVIDIA DCGM Exporter Dashboard** from the **Dashboard** list. 68 | 69 | The **NVIDIA DCGM Exporter Dashboard** displays the GPU-related graphs. 70 | 71 | .. image:: graphics/gpu_dashboards.png 72 | 73 | The provided Grafana dashboard includes a default set of DCGM metrics. 74 | You can create and deploy a custom dashboard definition in Grafana 6.x format. 75 | 76 | 77 | *********************************** 78 | Default NVIDIA DCGM Exporter Graphs 79 | *********************************** 80 | 81 | The following table provides a brief description of the graphs on the default dashboard. 82 | 83 | +--------------------------+------------------------------------------------------------+ 84 | | Graph | Description | 85 | +==========================+============================================================+ 86 | | GPU Temperature | GPU temperature in Celsius. | 87 | +--------------------------+------------------------------------------------------------+ 88 | | GPU Avg. Temp | Average GPU temperature in Celsius. | 89 | +--------------------------+------------------------------------------------------------+ 90 | | GPU Power Usage | Power usage in watts for each GPU. | 91 | +--------------------------+------------------------------------------------------------+ 92 | | GPU Power Total | Total power usage in watts. | 93 | +--------------------------+------------------------------------------------------------+ 94 | | GPU SM Clocks | SM clock frequency in hertz. 
| 95 | +--------------------------+------------------------------------------------------------+ 96 | | GPU Utilization | GPU utilization, percent. | 97 | +--------------------------+------------------------------------------------------------+ 98 | | GPU Framebuffer Mem Used | Frame buffer memory used in MB. | 99 | +--------------------------+------------------------------------------------------------+ 100 | | Tensor Core Utilization | Ratio of cycles the tensor (HMMA) pipe is active, percent. | 101 | +--------------------------+------------------------------------------------------------+ 102 | -------------------------------------------------------------------------------- /openshift/get-entitlement.rst: -------------------------------------------------------------------------------- 1 | .. Date: Sept 07 2021 2 | .. Author: kquinn 3 | 4 | .. _get-entitlement: 5 | 6 | #################################################### 7 | Entitled Driver Builds No Longer Supported 8 | #################################################### 9 | 10 | .. important:: 11 | 12 | **Entitled NVIDIA driver builds are deprecated and not supported.** 13 | 14 | If you encounter issues with the NVIDIA GPU driver build that might require entitlement, refer to the Driver Toolkit (DTK) troubleshooting section: :ref:`broken-dtk-troubleshooting`. 15 | -------------------------------------------------------------------------------- /openshift/graphics/Mig-profile-A100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/Mig-profile-A100.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-image-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-image-version.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-repository.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-repository.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-state-ready.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-state-ready.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-suceed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-suceed.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_2.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_3.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_4.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_5.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_6.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_attachsub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_attachsub.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy2.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_3.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_4.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_configure_vgpu.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_configure_vgpu.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_enable_sandbox_workloads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_enable_sandbox_workloads.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_suceed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_suceed.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_vGPU_confg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_vGPU_confg.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_vgpu_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_vgpu_1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_vgpu_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_vgpu_2.png -------------------------------------------------------------------------------- /openshift/graphics/create_cluster_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_cluster_policy.png -------------------------------------------------------------------------------- /openshift/graphics/create_config_map1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_config_map1.png -------------------------------------------------------------------------------- /openshift/graphics/create_project_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_project_1.png -------------------------------------------------------------------------------- /openshift/graphics/create_project_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_project_2.png -------------------------------------------------------------------------------- /openshift/graphics/createclusterpolicy2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/createclusterpolicy2.png -------------------------------------------------------------------------------- /openshift/graphics/createclusterpolicy3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/createclusterpolicy3.png -------------------------------------------------------------------------------- /openshift/graphics/created_pull-secret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/created_pull-secret.png -------------------------------------------------------------------------------- /openshift/graphics/disconnected_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/disconnected_cluster.png -------------------------------------------------------------------------------- /openshift/graphics/driver_toolkit_alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/driver_toolkit_alert.png -------------------------------------------------------------------------------- /openshift/graphics/enable-gpu-direct-rdma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/enable-gpu-direct-rdma.png -------------------------------------------------------------------------------- /openshift/graphics/entitlement_hypervisor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/entitlement_hypervisor.png -------------------------------------------------------------------------------- /openshift/graphics/gpu-operator-certified-cli-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/gpu-operator-certified-cli-install.png -------------------------------------------------------------------------------- /openshift/graphics/gpu_dashboards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/gpu_dashboards.png -------------------------------------------------------------------------------- /openshift/graphics/locate-cluster-acm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/locate-cluster-acm.png -------------------------------------------------------------------------------- /openshift/graphics/mig-mixed-profile-A100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/mig-mixed-profile-A100.png -------------------------------------------------------------------------------- /openshift/graphics/mig_strategy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/mig_strategy.png -------------------------------------------------------------------------------- /openshift/graphics/navigate_to_cluster_policy.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/navigate_to_cluster_policy.png -------------------------------------------------------------------------------- /openshift/graphics/nvaie2.3_cluster_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/nvaie2.3_cluster_policy.png -------------------------------------------------------------------------------- /openshift/graphics/ocp_main_console_alerts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/ocp_main_console_alerts.png -------------------------------------------------------------------------------- /openshift/graphics/pci_passthrough.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/pci_passthrough.png -------------------------------------------------------------------------------- /openshift/graphics/precompiled_driver_config_repository.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/precompiled_driver_config_repository.png -------------------------------------------------------------------------------- /openshift/graphics/precompiled_driver_config_version_and_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/precompiled_driver_config_version_and_image.png -------------------------------------------------------------------------------- /openshift/graphics/pull-secret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/pull-secret.png -------------------------------------------------------------------------------- /openshift/graphics/secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/secrets.png -------------------------------------------------------------------------------- /openshift/graphics/secrets_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/secrets_2.png -------------------------------------------------------------------------------- /openshift/graphics/vmx_secure_boot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/vmx_secure_boot.png -------------------------------------------------------------------------------- /openshift/index.rst: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | NVIDIA GPU Operator on Red Hat OpenShift Container Platform 3 | ########################################################### 4 | 5 | .. 
toctree:: 6 | :titlesonly: 7 | 8 | Introduction 9 | Prerequisites 10 | Installation and Upgrade Overview 11 | NFD Operator Installation 12 | GPU Operator Installation 13 | nvaie-with-ocp.rst 14 | mig-ocp.rst 15 | clean-up.rst 16 | mirror-gpu-ocp-disconnected.rst 17 | enable-gpu-monitoring-dashboard.rst 18 | time-slicing-gpus-in-openshift.rst 19 | openshift-virtualization.rst 20 | gpu-operator-with-precompiled-drivers.rst 21 | troubleshooting-gpu-ocp.rst 22 | appendix-ocp.rst 23 | -------------------------------------------------------------------------------- /openshift/install-nfd.rst: -------------------------------------------------------------------------------- 1 | .. Date: Nov 15 2021 2 | .. Author: kquinn 3 | 4 | .. _install-nfd: 5 | 6 | ########################################################### 7 | Installing the Node Feature Discovery Operator on OpenShift 8 | ########################################################### 9 | 10 | ********* 11 | Procedure 12 | ********* 13 | 14 | The Node Feature Discovery (NFD) Operator is a prerequisite for the **NVIDIA GPU Operator**. Install the NFD Operator using the Red Hat OperatorHub catalog in the OpenShift Container Platform web console. 15 | 16 | #. Follow the Red Hat documentation guidance in the `Node Feature Discovery Operator guide `_ to install the Node Feature Discovery Operator. 17 | 18 | #. Verify the Node Feature Discovery Operator is running: 19 | 20 | .. code-block:: console 21 | 22 | $ oc get pods -n openshift-nfd 23 | 24 | .. code-block:: console 25 | 26 | NAME READY STATUS RESTARTS AGE 27 | nfd-controller-manager-7f86ccfb58-nqgxm 2/2 Running 0 11m 28 | 29 | #. When the Node Feature Discovery is installed, create an instance of Node Feature Discovery using the **NodeFeatureDiscovery** tab: 30 | 31 | #. Click **Operators** > **Installed Operators** from the side menu. 32 | 33 | #. Find the **Node Feature Discovery** entry. 34 | 35 | #. Click **NodeFeatureDiscovery** under the **Provided APIs** field. 36 | 37 | #. Click **Create NodeFeatureDiscovery**. 38 | 39 | #. In the following screen, click **Create**. This starts the Node Feature Discovery Operator that proceeds to label the nodes in the cluster that have GPUs. 40 | 41 | .. note:: The values prepopulated by the OperatorHub are valid for the GPU Operator. 42 | 43 | ************************************************************************* 44 | Verify that the Node Feature Discovery Operator is functioning correctly 45 | ************************************************************************* 46 | 47 | The Node Feature Discovery Operator uses vendor PCI IDs to identify hardware in a node. NVIDIA uses the PCI ID ``10de``. Use the OpenShift Container Platform web console or the CLI to verify that the Node Feature Discovery Operator is functioning correctly. 48 | 49 | 50 | #. In the OpenShift Container Platform web console, click **Compute** > **Nodes** from the side menu. 51 | 52 | #. Select a worker node that contains a GPU. 53 | 54 | #. Click the **Details** tab. 55 | 56 | #. Under **Node Labels**, verify that the following label is present: 57 | 58 | .. code-block:: console 59 | 60 | feature.node.kubernetes.io/pci-10de.present=true 61 | 62 | .. note:: ``0x10de`` is the PCI vendor ID assigned to NVIDIA. 63 | 64 | #. Verify that the GPU device (``pci-10de``) is discovered on the GPU node: 65 | 66 | .. code-block:: console 67 | 68 | $ oc describe node | egrep 'Roles|pci' | grep -v master 69 | 70 | .. 
code-block:: console 71 | 72 | Roles: worker 73 | feature.node.kubernetes.io/pci-10de.present=true 74 | feature.node.kubernetes.io/pci-1d0f.present=true 75 | Roles: worker 76 | feature.node.kubernetes.io/pci-1013.present=true 77 | feature.node.kubernetes.io/pci-8086.present=true 78 | Roles: worker 79 | feature.node.kubernetes.io/pci-1013.present=true 80 | feature.node.kubernetes.io/pci-8086.present=true 81 | Roles: worker 82 | feature.node.kubernetes.io/pci-1013.present=true 83 | feature.node.kubernetes.io/pci-8086.present=true 84 | -------------------------------------------------------------------------------- /openshift/introduction.rst: -------------------------------------------------------------------------------- 1 | .. Date: Oct 24 2022 2 | .. Author: kquinn 3 | 4 | .. _essug: https://docs.nvidia.com/enterprise-support-and-services-user-guide/about-this-user-guide/index.html 5 | .. |essug| replace:: *NVIDIA Enterprise Support and Services User Guide* 6 | 7 | .. _openshift-introduction: 8 | 9 | ************************************************ 10 | Introduction to NVIDIA GPU Operator on OpenShift 11 | ************************************************ 12 | 13 | Kubernetes is an open-source platform for automating the deployment, scaling, and managing of containerized applications. 14 | 15 | Red Hat OpenShift Container Platform is a security-centric and enterprise-grade hardened Kubernetes platform for deploying and managing Kubernetes clusters at scale, developed and supported by Red Hat. 16 | Red Hat OpenShift Container Platform includes enhancements to Kubernetes so users can easily configure and use GPU resources for accelerating workloads like deep learning. 17 | 18 | The NVIDIA GPU Operator uses the operator framework within Kubernetes to automate the management of all NVIDIA software components needed to provision GPU. These components include the NVIDIA drivers (to enable CUDA), 19 | Kubernetes device plugin for GPUs, the `NVIDIA Container Toolkit `_, 20 | automatic node labeling using `GFD `_, `DCGM `_-based monitoring, and others. 21 | 22 | For guidance on the specific NVIDIA support entitlement needs, 23 | refer |essug|_ if you have an NVIDIA AI Enterprise entitlement. 24 | Otherwise, refer to the `Obtaining Support from NVIDIA `_ 25 | Red Hat Knowledgebase article. 26 | -------------------------------------------------------------------------------- /openshift/prerequisites.rst: -------------------------------------------------------------------------------- 1 | .. Date: November 26 2021 2 | .. Author: kquinn 3 | 4 | ******************************************* 5 | Prerequisites for GPU Operator on OpenShift 6 | ******************************************* 7 | 8 | Before following the steps in this guide, ensure that your environment has: 9 | 10 | * A working OpenShift cluster up and running with a GPU worker node. Refer to the `OpenShift Container Platform installation overview `_ for installation guidance. 11 | Refer to :external+gpuop:ref:`Container Platforms ` for the support matrix of the GPU Operator releases and the supported container platforms for more information. 12 | * Access to the OpenShift cluster as a ``cluster-admin`` to perform the necessary steps. 13 | * OpenShift CLI (``oc``) installed. 
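As a quick sanity check of these prerequisites (commands shown for illustration only), confirm that ``oc`` is installed and that you can reach the cluster:

.. code-block:: console

   $ oc whoami
   $ oc get nodes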
14 | -------------------------------------------------------------------------------- /openshift/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "25.10", 3 | "versions": 4 | [ 5 | { 6 | "version": "25.10" 7 | }, 8 | { 9 | "version": "25.3" 10 | }, 11 | { 12 | "version": "24.9" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /openshift/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../25.10", 5 | "version": "25.10" 6 | }, 7 | { 8 | "url": "../25.3", 9 | "version": "25.3" 10 | }, 11 | { 12 | "url": "../24.9", 13 | "version": "24.9" 14 | } 15 | ] 16 | -------------------------------------------------------------------------------- /partner-validated/PARTNER-VALIDATED-TEMPLATE.rst: -------------------------------------------------------------------------------- 1 | .. headings # #, * *, =, -, ^, " 2 | 3 | .. |prod-name-long| replace:: Your Product Name v1.0 4 | .. |prod-name-short| replace:: YPN 5 | 6 | ######################################### 7 | NVIDIA GPU Operator with |prod-name-long| 8 | ######################################### 9 | 10 | 11 | ********************************************* 12 | About the GPU Operator with |prod-name-short| 13 | ********************************************* 14 | 15 | Use this section of the documentation to describe the benefits that customers 16 | can experience by using the NVIDIA GPU Operator and the product together. 17 | 18 | Providing a summary of the competitive advantages that your product provides 19 | is appropriate. 20 | 21 | Providing a URL to your product documentation so readers can learn more about 22 | your product is also appropriate. 23 | 24 | 25 | ****************************** 26 | Validated Configuration Matrix 27 | ****************************** 28 | 29 | Identify the hardware baseline that was used to self-validate your product with 30 | the Operator. 31 | 32 | .. rubric:: Example 33 | 34 | .. list-table:: 35 | :header-rows: 1 36 | 37 | * - 38 | - | NVIDIA 39 | | GPU Operator 40 | - | Operating 41 | | System 42 | - | Container 43 | | Runtime 44 | - Kubernetes 45 | - Helm 46 | - NVIDIA GPU 47 | - Hardware Model 48 | 49 | * - |prod-name-long| 50 | - v23.3.1 51 | - | Ubuntu 22.04 52 | | Ubuntu 20.04 53 | - containerd v1.6 54 | - 1.25, 1.26 55 | - v3 56 | - | NVIDIA HGX H100 57 | | NVIDIA H100 58 | | NVIDIA A100 59 | - | Dell PowerEdge R740 60 | | 2 $\times$ Intel Xeon Silver 2.2 GHz 61 | | 64GB RAM, 1TB NVMe 62 | 63 | Include at least the following pieces of information: 64 | 65 | * **Product name.** 66 | Specify your product name and version. 67 | 68 | * **GPU Operator version.** 69 | Specify the version of the NVIDIA GPU Operator that you self-validated. 70 | 71 | * **Operating system.** 72 | Specify the operating system name and version that you self-validated. 73 | 74 | * **Container runtime.** 75 | Specify the container runtime name and version. 76 | Refer to the 77 | `Supported Container Runtimes `_ 78 | section of the platform support page. 79 | 80 | * **Kubernetes version.** 81 | Specify the Kubernetes version, such as ``1.25``, that your product uses. 82 | 83 | * **Helm version.** 84 | Specify the version of Helm that you used with your product to self-validate. 85 | If Helm is not used to install the NVIDIA GPU Operator, identify the product 86 | and version that you used for installation. 
87 | 88 | * **NVIDIA GPU model.** 89 | Use the same product model name that is provided in the 90 | `Supported NVIDIA GPUs and Systems `_ 91 | section of the platform support page. 92 | 93 | * **Hardware model.** 94 | Including a summary of the CPU model, number of CPUs, memory, and other 95 | popular specifications is appropriate. 96 | 97 | 98 | ************* 99 | Prerequisites 100 | ************* 101 | 102 | Specify the conditions that the customer must meet before beginning to install 103 | the NVIDIA GPU Operator. 104 | 105 | References to product documentation are appropriate. 106 | 107 | A few commands with brief example output that customers can run to verify their 108 | readiness is appropriate. 109 | 110 | A bulleted list is an effective presentation for simple and brief prerequisites 111 | information, but is not required. 112 | 113 | If the prerequisites are not simple and require running several commands to 114 | verify readiness to begin, organize the commands or requirements into stages 115 | and create a level 3 heading for each of the stages. 116 | 117 | 118 | ********* 119 | Procedure 120 | ********* 121 | 122 | You can keep the heading as Procedure, or you can replace with text similar to 123 | Configuring |prod-name-short| with the GPU Operator. 124 | 125 | If the procedure is in the range of 7 to 10 steps, then present them after 126 | the heading. 127 | 128 | If the procedure is more sophisticated, organize the steps into stages and 129 | create a level 3 heading for each of the stages. 130 | 131 | 132 | **************************************************** 133 | Verifying |prod-name-short| with the GPU Operator 134 | **************************************************** 135 | 136 | Optionally, include commands that the customer can run to verify that the 137 | installation is successful and that workloads can use the NVIDIA GPUs. 138 | 139 | 140 | *************** 141 | Getting Support 142 | *************** 143 | 144 | Indicate how end users can receive support from you regarding your product. 145 | 146 | * URL for product documentation. 147 | * Information to help an end user to open a support request with you. 148 | 149 | 150 | ******************* 151 | Related Information 152 | ******************* 153 | 154 | Provide URLs to product documentation, support forums, and so on. -------------------------------------------------------------------------------- /partner-validated/k0rdent.rst: -------------------------------------------------------------------------------- 1 | .. headings # #, * *, =, -, ^, " 2 | 3 | .. |prod-name-long| replace:: Mirantis k0rdent 4 | .. |prod-name-short| replace:: k0rdent 5 | 6 | ############################################# 7 | |prod-name-long| with the NVIDIA GPU Operator 8 | ############################################# 9 | 10 | 11 | ********************************************* 12 | About |prod-name-short| with the GPU Operator 13 | ********************************************* 14 | 15 | |prod-name-short| is as a "super control plane" designed to ensure the consistent provisioning and lifecycle 16 | management of Kubernetes clusters and the services that make them useful. The goal of the k0rdent project is 17 | to provide platform engineers with the means to deliver a distributed container management environment (DCME) 18 | and enable them to compose unique internal developer platforms (IDP) to support a diverse range of complex 19 | modern application workloads. 
20 | 21 | The NVIDIA GPU Operator uses the operator framework within Kubernetes to automate 22 | both the deployment and management of all NVIDIA software components needed to provision NVIDIA GPUs. 23 | These components include the NVIDIA GPU drivers to enable CUDA, Kubernetes device plugin for GPUs, 24 | the NVIDIA Container Toolkit, automatic node labeling using GFD, DCGM based monitoring and others. 25 | 26 | 27 | ****************************** 28 | Validated Configuration Matrix 29 | ****************************** 30 | 31 | |prod-name-long| has self-validated with the following components and versions: 32 | 33 | .. list-table:: 34 | :header-rows: 1 35 | 36 | * - Version 37 | - | NVIDIA 38 | | GPU 39 | | Operator 40 | - | Operating 41 | | System 42 | - | Container 43 | | Runtime 44 | - Kubernetes 45 | - Helm 46 | - NVIDIA GPU 47 | - Hardware Model 48 | 49 | * - k0rdent 0.2.0 / k0s v1.31.5+k0s 50 | - v24.9.2 51 | - | Ubuntu 22.04 52 | - containerd v1.7.24 with the NVIDIA Container Toolkit v1.17.4 53 | - 1.31.5 54 | - Helm v3 55 | - | 2x NVIDIA RTX 4000 SFF Ada 20GB GDDR6 (ECC) 56 | - | Supermicro SuperServer 6028U-E1CNR4T+ 57 | 58 | | 1000W Supermicro PWS-1K02A-1R 59 | 60 | | 2x Intel Xeon E5-2630v4, 10C/20T 2.2/3.1 GHz LGA 2011-3 25MB 85W 61 | 62 | | 32GB DDR4-2666 RDIMM, M393A4K40BB2-CTD6Q 63 | 64 | | NVMe 960GB PM983 NVMe M.2, MZ1LB960HAJQ-00007 65 | 66 | | 2 x NVIDIA RTX 4000 SFF Ada 20GB GDDR6 (ECC), 70W, PCIe 4.0x16, 4x 67 | 68 | | 4x Mini DisplayPort 1.4a 69 | 70 | 71 | ************* 72 | Prerequisites 73 | ************* 74 | 75 | * A running |prod-name-short| managed cluster with at least one control plane node and two worker nodes. 76 | The recommended configuration is at least three control plane nodes and at least two worker nodes. 77 | 78 | * At least one worker node with an NVIDIA GPU physically installed. 79 | The GPU Operator can locate the GPU and label the node accordingly. 80 | 81 | * The kubeconfig file for the |prod-name-short| managed cluster on the seed node. 82 | You can get the file from the |prod-name-short| control plane. 83 | 84 | * You have access to the |prod-name-short| cluster. 85 | 86 | 87 | ********* 88 | Procedure 89 | ********* 90 | 91 | Perform the following steps to prepare the |prod-name-short| cluster: 92 | 93 | #. Install template to k0rdent 94 | 95 | .. code-block:: console 96 | 97 | $ helm install gpu-operator oci://ghcr.io/k0rdent/catalog/charts/gpu-operator-service-template \ 98 | --version 24.9.2 -n kcm-system 99 | 100 | #. Verify service template: 101 | 102 | .. code-block:: console 103 | 104 | $ kubectl get servicetemplates -A 105 | 106 | *Example Output* 107 | 108 | .. code-block:: output 109 | 110 | NAMESPACE NAME VALID 111 | kcm-system gpu-operator-24-9-2 true 112 | 113 | #. Deploy service template to child cluster: 114 | 115 | .. 
87 | *********
88 | Procedure
89 | *********
90 | 
91 | Perform the following steps to prepare the |prod-name-short| cluster:
92 | 
93 | #. Install the service template to k0rdent:
94 | 
95 |    .. code-block:: console
96 | 
97 |       $ helm install gpu-operator oci://ghcr.io/k0rdent/catalog/charts/gpu-operator-service-template \
98 |           --version 24.9.2 -n kcm-system
99 | 
100 | #. Verify the service template:
101 | 
102 |    .. code-block:: console
103 | 
104 |       $ kubectl get servicetemplates -A
105 | 
106 |    *Example Output*
107 | 
108 |    .. code-block:: output
109 | 
110 |       NAMESPACE    NAME                  VALID
111 |       kcm-system   gpu-operator-24-9-2   true
112 | 
113 | #. Deploy the service template to a child cluster by creating the following ``MultiClusterService`` resource:
114 | 
115 |    .. code-block:: yaml
116 | 
117 |       apiVersion: k0rdent.mirantis.com/v1alpha1
118 |       kind: MultiClusterService
119 |       metadata:
120 |         name: gpu-operator
121 |       spec:
122 |         clusterSelector:
123 |           matchLabels:
124 |             group: demo
125 |         serviceSpec:
126 |           services:
127 |             - template: gpu-operator-24-9-2
128 |               name: gpu-operator
129 |               namespace: gpu-operator
130 |               values: |
131 |                 operator:
132 |                   defaultRuntime: containerd
133 |                 toolkit:
134 |                   env:
135 |                     - name: CONTAINERD_CONFIG
136 |                       value: /etc/k0s/containerd.d/nvidia.toml
137 |                     - name: CONTAINERD_SOCKET
138 |                       value: /run/k0s/containerd.sock
139 |                     - name: CONTAINERD_RUNTIME_CLASS
140 |                       value: nvidia
141 | 
142 | 
143 | The |prod-name-short| managed clusters that match the cluster selector will now have the NVIDIA GPU Operator deployed.
144 | 
145 | *************************************************
146 | Verifying |prod-name-short| with the GPU Operator
147 | *************************************************
148 | 
149 | Refer to :external+gpuop:ref:`running sample gpu applications` to verify the installation.
150 | 
151 | ***************
152 | Getting Support
153 | ***************
154 | 
155 | Refer to the k0rdent product documentation for information about working with k0rdent.
156 | 
157 | *******************
158 | Related Information
159 | *******************
160 | 
161 | * https://docs.k0rdent.io/v0.2.0/
162 | 
--------------------------------------------------------------------------------
/partner-validated/versions.json:
--------------------------------------------------------------------------------
1 | {
2 |     "versions":
3 |     [
4 |         {
5 |             "version": "1.0.0"
6 |         }
7 |     ]
8 | }
--------------------------------------------------------------------------------
/partner-validated/versions1.json:
--------------------------------------------------------------------------------
1 | [
2 |     {
3 |         "preferred": "true",
4 |         "url": "../1.0.0",
5 |         "version": "1.0.0"
6 |     }
7 | ]
--------------------------------------------------------------------------------
/playground/dind.rst:
--------------------------------------------------------------------------------
1 | .. Date: November 13 2020
2 | .. Author: pramarao
3 | 
4 | .. _dind:
5 | 
6 | ##################
7 | Docker-in-Docker
8 | ##################
9 | 
10 | You can also run GPU containers with Docker-in-Docker (dind). Just mount the Docker socket into the container and then
11 | specify the CUDA container that you want to run:
12 | 
13 | .. code-block:: console
14 | 
15 |    $ sudo docker run -v /var/run/docker.sock:/var/run/docker.sock \
16 |        docker run --rm --gpus all \
17 |        nvidia/cuda:11.0-base \
18 |        nvidia-smi
19 | 
20 | With the resulting output:
21 | 
22 | .. code-block:: console
23 | 
24 |    +-----------------------------------------------------------------------------+
25 |    | NVIDIA-SMI 455.45.01 Driver Version: 455.45.01 CUDA Version: 11.1 |
26 |    |-------------------------------+----------------------+----------------------+
27 |    | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
28 |    | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
29 |    | | | MIG M.
| 30 | |===============================+======================+======================| 31 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 32 | | N/A 31C P8 9W / 70W | 0MiB / 15109MiB | 0% Default | 33 | | | | N/A | 34 | +-------------------------------+----------------------+----------------------+ 35 | 36 | +-----------------------------------------------------------------------------+ 37 | | Processes: | 38 | | GPU GI CI PID Type Process name GPU Memory | 39 | | ID ID Usage | 40 | |=============================================================================| 41 | | No running processes found | 42 | +-----------------------------------------------------------------------------+ 43 | 44 | Or launch an interactive session within an interactive session, Inception style! 45 | 46 | .. code-block:: console 47 | 48 | $ sudo docker run -ti -v /var/run/docker.sock:/var/run/docker.sock docker 49 | 50 | .. code-block:: console 51 | 52 | / # docker run -it --gpus all nvidia/cuda:11.1-base 53 | Unable to find image 'nvidia/cuda:11.1-base' locally 54 | 11.1-base: Pulling from nvidia/cuda 55 | 6a5697faee43: Pull complete 56 | ba13d3bc422b: Pull complete 57 | a254829d9e55: Pull complete 58 | f853e5702a31: Pull complete 59 | 29cfce72a460: Pull complete 60 | 4bb689f629d3: Pull complete 61 | Digest: sha256:6007208f8a1f626c0175260ebd46b1cbde10aab67e6d810fa593357b8199bfbe 62 | Status: Downloaded newer image for nvidia/cuda:11.1-base 63 | root@f29740c58731:/# nvidia-smi 64 | 65 | +-----------------------------------------------------------------------------+ 66 | | NVIDIA-SMI 455.45.01 Driver Version: 455.45.01 CUDA Version: 11.1 | 67 | |-------------------------------+----------------------+----------------------+ 68 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 69 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 70 | | | | MIG M. | 71 | |===============================+======================+======================| 72 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 73 | | N/A 31C P8 9W / 70W | 0MiB / 15109MiB | 0% Default | 74 | | | | N/A | 75 | +-------------------------------+----------------------+----------------------+ 76 | 77 | +-----------------------------------------------------------------------------+ 78 | | Processes: | 79 | | GPU GI CI PID Type Process name GPU Memory | 80 | | ID ID Usage | 81 | |=============================================================================| 82 | | No running processes found | 83 | +-----------------------------------------------------------------------------+ 84 | 85 | What other cool stuff can you do? Send us details via GitHub `issues `_! 86 | 87 | -------------------------------------------------------------------------------- /playground/x-arch.rst: -------------------------------------------------------------------------------- 1 | .. Date: November 13 2020 2 | .. Author: pramarao 3 | 4 | .. _x-arch: 5 | 6 | ########################################## 7 | Running Cross-Architecture Containers 8 | ########################################## 9 | 10 | For many reasons, it is desirable to build and run containers for one CPU architecture (e.g. ``x86_64``) 11 | on another CPU architecture (e.g. ``Arm64``). 12 | 13 | ************************ 14 | Emulation Environment 15 | ************************ 16 | 17 | One solution would be to use an emulation environment using the `QEMU `_ emulator and Docker. 
18 | Using **QEMU**, `binfmt_misc `_ and the registration scripts via the
19 | `multiarch/qemu-user-static `_ project, we can run containers built for
20 | either *Arm64* or *POWER* architectures on *x86_64* servers or workstations.
21 | 
22 | Installing QEMU
23 | -----------------
24 | 
25 | Install the *qemu*, *binfmt-support*, and *qemu-user-static* packages. The *binfmt-support* package contains scripts to register binary
26 | formats with the kernel using the *binfmt_misc* module; and the *qemu-user-static* package registers binary formats that emulators can handle.
27 | 
28 | .. code-block:: console
29 | 
30 |    $ sudo apt-get install -y qemu \
31 |        binfmt-support \
32 |        qemu-user-static
33 | 
34 | Run the ``multiarch/qemu-user-static`` container to register the binary formats:
35 | 
36 | .. code-block:: console
37 | 
38 |    $ sudo docker run --rm --privileged \
39 |        multiarch/qemu-user-static \
40 |        --reset \
41 |        -p yes
42 | 
43 | Now, verify that the *binfmt* entries were registered on the system:
44 | 
45 | .. code-block:: console
46 | 
47 |    $ update-binfmts --display
48 | 
49 | .. code-block:: console
50 | 
51 |    ...
52 |    qemu-aarch64 (enabled):
53 |        package = qemu-user-static
54 |        type = magic
55 |        offset = 0
56 |        magic = \x7f\x45\x4c\x46\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb7\x00
57 |        mask = \xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff
58 |        interpreter = /usr/bin/qemu-aarch64-static
59 |        detector =
60 |    ...
61 | 
62 | Running Containers
63 | --------------------
64 | 
65 | The community maintains a number of Docker containers on DockerHub under `arm64v8 `_.
66 | Without an emulator, if you try running an ``arm64`` Alpine container on ``x86_64``, you will observe a format error from Docker.
67 | 
68 | This can be seen in the example below:
69 | 
70 | .. code-block:: console
71 | 
72 |    $ uname -m
73 |    x86_64
74 | 
75 | .. code-block:: console
76 | 
77 |    $ sudo docker run --rm arm64v8/alpine uname -m
78 | 
79 | .. code-block:: console
80 | 
81 |    standard_init_linux.go:211: exec user process caused "exec format error"
82 | 
83 | After installing the QEMU emulator and registering:
84 | 
85 | .. code-block:: console
86 | 
87 |    $ sudo docker run --rm arm64v8/alpine uname -m
88 | 
89 | .. code-block:: console
90 | 
91 |    aarch64
92 | 
93 | 
94 | 
95 | 
--------------------------------------------------------------------------------
/repo:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | 
5 | SCRIPT_DIR=$(dirname ${BASH_SOURCE})
6 | cd "$SCRIPT_DIR"
7 | 
8 | exec "tools/packman/python.sh" tools/repoman/repoman.py $@
9 | 
--------------------------------------------------------------------------------
/repo.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | 
3 | call "%~dp0tools\packman\python.bat" %~dp0tools\repoman\repoman.py %*
4 | if %errorlevel% neq 0 ( goto Error )
5 | 
6 | :Success
7 | exit /b 0
8 | 
9 | :Error
10 | exit /b %errorlevel%
11 | 
--------------------------------------------------------------------------------
/review/index.rst:
--------------------------------------------------------------------------------
1 | .. license-header
2 |    SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |    SPDX-License-Identifier: Apache-2.0
4 | 
5 |    Licensed under the Apache License, Version 2.0 (the "License");
6 |    you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ################ 20 | Technical Review 21 | ################ 22 | 23 | Refer to the following URLs for the review HTML: 24 | 25 | * `NVIDIA Container Toolkit <./container-toolkit/latest/index.html>`__ 26 | * `NVIDIA Driver Containers <./driver-containers/latest/index.html>`__ 27 | * `NVIDIA GPU Operator <./gpu-operator/latest/index.html>`__ 28 | * `NVIDIA GPU Operator on Red Hat OpenShift Container Platform <./openshift/latest/index.html>`__ 29 | * `NVIDIA GPUs and Edge Computing <./edge/latest/index.html>`__ 30 | * `Partner-Validated Configurations <./partner-validated/latest/index.html>`__ 31 | -------------------------------------------------------------------------------- /review/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "0.1.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /review/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /scripts/create_archive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # WARNING: assumes you are running this script from the top-level directory (e.g. scripts/create_archive.sh) 4 | # Example: 5 | # PROJECT=gpu-operator VERSION=1.9.0 ./scripts/create_archive.sh 6 | 7 | set -e 8 | 9 | PROJECT=${PROJECT:?"Missing PROJECT to archive"} 10 | VERSION=${VERSION:?"Missing VERSION to archive"} 11 | ARCHIVE="${PROJECT}/archive/${VERSION}" 12 | 13 | # Create archive directory and copy over all current files/directories, excluding the archive directory itself 14 | rm -rf $ARCHIVE 15 | mkdir -p $ARCHIVE 16 | rsync -aq "${PROJECT}/" $ARCHIVE --exclude "archive/" 17 | 18 | # Find all labels in the project documentation and extract the label name. 19 | # Labels are in the format: ".. _label-name:" 20 | labels=$(grep -ohr --include \*.rst ".. _[^:]*" ${ARCHIVE} | cut -c 5- | xargs -n1 | sort | xargs) 21 | 22 | # For each label, append a version suffix and update any references to the label. 23 | for label in $labels; do 24 | echo "Updating all references to label: $label" 25 | find $ARCHIVE -name "*.rst" -exec sed -i '' "s/.. _${label}/&-${VERSION}/g" {} \; 26 | find $ARCHIVE -name "*.rst" -exec sed -i '' "s/:ref:\`.*${label}/&-${VERSION}/g" {} \; 27 | done 28 | 29 | -------------------------------------------------------------------------------- /secure-services-istio-keycloak/configure.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | # Configure RBAC 7 | 8 | ````{only} not publish_bsp 9 | ```{contents} 10 | :depth: 2 11 | :backlinks: none 12 | :local: true 13 | ``` 14 | ```` 15 | 16 | ## Inject Istio 17 | 18 | 1. Label the namespace to enable Istio injection. 
19 | 
20 |     ```console
21 |     kubectl label namespace <namespace> istio-injection=enabled --overwrite
22 |     ```
23 | 
24 |     Replace the `<namespace>` placeholder with your target namespace.
25 | 
26 | 2. Delete the existing pods to recreate them with Istio sidecar containers.
27 | 
28 |     ```console
29 |     kubectl delete pod $(kubectl get pods -n <namespace> | awk '{print $1}') -n <namespace>
30 |     ```
31 | 
32 | ## Deploy Manifests
33 | 
34 | 1. The following sample manifest deploys a gateway and ingress virtual service.
35 | 
36 |     - Update the target namespace for the virtual service resource.
37 |     - The sample manifest applies to NVIDIA NIM for LLMs. For other NVIDIA microservices, update the `match` and `route` for the microservice endpoints.
38 |     - For information about the microservice endpoints, refer to the following documents:
39 |       - [NIM Inference API Reference](https://docs.nvidia.com/nim/large-language-models/latest/api-reference.html)
40 |       - [NIM Embedding API Reference](https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/reference.html)
41 |       - [NIM ReRanking API Reference](https://docs.nvidia.com/nim/nemo-retriever/text-reranking/latest/reference.html)
42 | 
43 |     ```{literalinclude} ./manifests/istio-sample-manifest.yaml
44 |     :language: yaml
45 |     ```
46 | 
47 | 2. Apply the manifest.
48 | 
49 |     ```console
50 |     kubectl apply -f istio-sample-manifest.yaml
51 |     ```
52 | 
53 | 3. Determine the Istio ingress gateway node port.
54 | 
55 |     ```console
56 |     kubectl get svc -n istio-system | grep ingress
57 |     ```
58 | 
59 |     *Example Output*
60 | 
61 |     ```output
62 |     istio-ingressgateway   LoadBalancer   10.102.8.149   10.28.234.101   15021:32658/TCP,80:30611/TCP,443:31874/TCP,31400:30160/TCP,15443:32430/TCP   22h
63 |     ```
64 | 
65 | 4. List the worker IP addresses.
66 | 
67 |     ```console
68 |     for node in `kubectl get nodes | awk '{print $1}' | grep -v NAME`; do echo $node ' ' | tr -d '\n'; kubectl describe node $node | grep -i 'internalIP:' | awk '{print $2}'; done
69 |     ```
70 | 
71 |     *Example Output*
72 | 
73 |     ```output
74 |     nim-test-cluster-03-worker-nbhk9-56b4b888dd-8lpqd   10.120.199.16
75 |     nim-test-cluster-03-worker-nbhk9-56b4b888dd-hnrxr   10.120.199.23
76 |     ```
77 | 
78 | 5. The following manifest creates request authentication resources.
79 | 
80 |     - Update the target namespace.
81 |     - Modify the issuer in the manifest with one of the preceding IP addresses and the preceding Istio ingress gateway node port that is mapped to port 80.
82 | 
83 |     ```{literalinclude} ./manifests/requestAuthentication.yaml
84 |     :language: yaml
85 |     ```
86 | 
87 | 6. Apply the manifest.
88 | 
89 |     ```console
90 |     kubectl apply -f requestAuthentication.yaml
91 |     ```
92 | 
93 | 7. The following manifest creates an authorization policy resource.
94 | 
95 |     - Update the target namespace.
96 |     - Update the rules that apply to the target microservices.
97 | 
98 |     ```{literalinclude} ./manifests/authorizationPolicy.yaml
99 |     :language: yaml
100 |     ```
101 | 
102 | 8. Apply the manifest.
103 | 
104 |     ```console
105 |     kubectl apply -f authorizationPolicy.yaml
106 |     ```
107 | 
108 | 9. Create a token for Keycloak authentication.
109 |     Update the node IP address and ingress gateway node port.
110 | 
111 |     ```console
112 |     TOKEN=`curl -X POST -d "client_id=nvidia-nim" -d "username=nim" -d "password=nvidia123" -d "grant_type=password" "http://10.217.19.114:30611/realms/nvidia-nim-llm/protocol/openid-connect/token"| jq .access_token| tr -d '"' `
113 |     ```
114 | 
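    Optionally, you can decode the token payload to confirm that it carries the realm roles that the authorization policy checks (for example, `completions` or `chat`). The following is a minimal sketch that uses `python3` on the client; it assumes that `python3` is available and that the token is a standard three-part JWT with the roles in the `realm_access` claim.

    ```console
    python3 -c 'import base64, json, sys; p = sys.argv[1].split(".")[1]; p += "=" * (-len(p) % 4); print(json.dumps(json.loads(base64.urlsafe_b64decode(p)).get("realm_access", {}), indent=2))' "$TOKEN"
    ```
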
115 | 10. Verify access to the microservice from Keycloak through the Istio gateway.
116 | 
117 |     ```console
118 |     curl -v -X POST http://10.217.19.114:30611/v1/completions -H "Authorization: Bearer $TOKEN" -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ "model": "llama-2-13b-chat","prompt": "What is Kubernetes?","max_tokens": 16,"temperature": 1, "n": 1, "stream": false, "stop": "string", "frequency_penalty": 0.0 }'
119 |     ```
120 | 
121 |     Update the node IP address and ingress gateway port.
122 |     Update the model name if it is not `llama-2-13b-chat`.
123 | 
124 | 11. Generate some more traffic so that it can be visualized on the Kiali dashboard in the next step.
125 | 
126 |     ```console
127 |     for i in $(seq 1 100); do curl -X POST http://10.217.19.114:30611/v1/chat/completions -H 'accept: application/json' -H "Authorization: Bearer $TOKEN" -H 'Content-Type: application/json' -d '{"model": "llama-2-13b-chat","messages": [{"role": "system","content": "You are a helpful assistant."},{"role": "user", "content": "Hello!"}]}' -s -o /dev/null; done
128 |     ```
129 | 
130 | 12. Access the Kiali dashboard, specifying your client system IP address.
131 | 
132 |     ```console
133 |     istioctl dashboard kiali --address <system-ip>
134 |     ```
135 | 
136 |     Access the dashboard in a browser with the `<system-ip>` address and port `20001`.
137 | 
138 | ## Conclusion
139 | 
140 | This architecture offers a robust solution for deploying NVIDIA NeMo MicroServices in a secure, scalable, and efficient manner. Integrating advanced service mesh capabilities with OIDC authentication sets a new standard for building sophisticated AI-driven applications.
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-1.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-10.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-11.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-12.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-13.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-14.png
-------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-15.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-16.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-2.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-3.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-4.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-5.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-6.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-7.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-8.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-9.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/reference-arch-01.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/reference-arch-01.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/manifests/authorizationPolicy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1beta1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | name: nim-auth-policy 5 | namespace: 6 | spec: 7 | selector: 8 | matchLabels: 9 | app.kubernetes.io/name: inferencing 10 | rules: 11 | - from: 12 | - source: 13 | requestPrincipals: ["*"] 14 | to: 15 | - operation: 16 | methods: ["POST"] 17 | paths: ["/v1/completions*"] 18 | when: 19 | - key: request.auth.claims[realm_access][roles] 20 | values: ["completions"] 21 | - from: 22 | - source: 23 | requestPrincipals: ["*"] 24 | to: 25 | - operation: 26 | methods: ["POST"] 27 | paths: ["/v1/chat/completions*"] 28 | when: 29 | - key: request.auth.claims[realm_access][roles] 30 | values: ["chat"] -------------------------------------------------------------------------------- /secure-services-istio-keycloak/manifests/istio-sample-manifest.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: networking.istio.io/v1alpha3 3 | kind: Gateway 4 | metadata: 5 | name: rag-gateway 6 | namespace: istio-system 7 | spec: 8 | selector: 9 | istio: ingressgateway 10 | servers: 11 | - port: 12 | number: 80 13 | name: http2 14 | protocol: HTTP 15 | hosts: 16 | - "*" 17 | 18 | --- 19 | apiVersion: networking.istio.io/v1alpha3 20 | kind: VirtualService 21 | metadata: 22 | name: sample-vs 23 | namespace: 24 | spec: 25 | hosts: 26 | - "*" 27 | gateways: 28 | - istio-system/rag-gateway 29 | http: 30 | - match: 31 | - uri: 32 | prefix: /admin 33 | - uri: 34 | prefix: /resources 35 | - uri: 36 | prefix: /welcome 37 | - uri: 38 | prefix: /realms 39 | route: 40 | - destination: 41 | host: keycloak.default.svc.cluster.local 42 | port: 43 | number: 8080 44 | - match: 45 | - uri: 46 | prefix: /v1/completions 47 | - uri: 48 | prefix: /v1/chat/completions 49 | route: 50 | - destination: 51 | host: inferencing 52 | port: 53 | number: 8080 54 | -------------------------------------------------------------------------------- /secure-services-istio-keycloak/manifests/requestAuthentication.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: security.istio.io/v1beta1 3 | kind: RequestAuthentication 4 | metadata: 5 | name: nim-request-authentication 6 | namespace: 7 | spec: 8 | selector: 9 | matchLabels: 10 | app.kubernetes.io/name: inferencing 11 | jwtRules: 12 | - issuer: "http://10.176.21.249:30669/realms/nvidia-nim" 13 | jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs" 14 | forwardOriginalToken: true 15 | fromHeaders: 16 | - name: Authorization 17 | prefix: "Bearer" 18 | - issuer: "http://10.176.21.249/realms/nvidia-nim" 19 | jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs" 20 | forwardOriginalToken: true 21 | fromHeaders: 22 | - name: Authorization 23 | prefix: "Bearer" 24 | --- 25 | apiVersion: security.istio.io/v1beta1 26 | kind: RequestAuthentication 27 | metadata: 28 | name: nim-request-authentication-gw 29 | namespace: istio-system 30 | spec: 31 | selector: 32 | matchLabels: 33 | 
      istio: ingressgateway
34 |   jwtRules:
35 |   - issuer: "http://10.176.21.249:30669/realms/nvidia-nim"
36 |     jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs"
37 |     forwardOriginalToken: true
38 |     fromHeaders:
39 |     - name: Authorization
40 |       prefix: "Bearer"
41 |   - issuer: "http://10.176.21.249/realms/nvidia-nim"
42 |     jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs"
43 |     forwardOriginalToken: true
44 |     fromHeaders:
45 |     - name: Authorization
46 |       prefix: "Bearer"
47 | 
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/platform-support.md:
--------------------------------------------------------------------------------
1 | 
5 | 
6 | # Platform Support
7 | 
8 | ````{only} not publish_bsp
9 | ```{contents}
10 | :depth: 2
11 | :backlinks: none
12 | :local: true
13 | ```
14 | ````
15 | 
16 | ## Operating Systems and Kubernetes Platforms
17 | 
18 | ```{list-table}
19 | :header-rows: 1
20 | :stub-columns: 1
21 | 
22 | * - Operating System
23 |   - Kubernetes
24 |   - Red Hat OpenShift
25 |   - VMware vSphere with Tanzu
26 | 
27 | * - Ubuntu 22.04
28 |   - 1.29---1.31
29 |   -
30 |   - 8.0 Update 2
31 | 
32 | * - Red Hat Core OS
33 |   -
34 |   - 4.16
35 |   -
36 | ```
37 | 
38 | ## Container Runtimes
39 | 
40 | ```{list-table}
41 | :header-rows: 1
42 | 
43 | * - Operating System
44 |   - containerd
45 |   - CRI-O
46 | 
47 | * - Ubuntu 22.04
48 |   - 1.6, 1.7
49 |   - 1.30
50 | 
51 | * - Red Hat Core OS
52 |   - None
53 |   - Yes [{sup}`1`](cri-o-ocp)
54 | ```
55 | 
56 | (cri-o-ocp)=
57 | {sup}`1` The CRI-O version that is shipped with the supported OpenShift Container Platform version is supported.
58 | 
59 | ## Command-Line Tools
60 | 
61 | ```{list-table}
62 | :header-rows: 1
63 | :widths: 30 70
64 | 
65 | * - Tool
66 |   - Installation Documentation
67 | 
68 | * - kubectl (match cluster version)
69 |   - Refer to
70 |     [Install Tools](https://kubernetes.io/docs/tasks/tools/)
71 |     in the Kubernetes documentation for more information.
72 | 
73 | * - Helm v3 and higher
74 |   - Refer to
75 |     [Install Helm](https://helm.sh/docs/intro/install/)
76 |     in the Helm documentation for more information.
77 | ```
78 | 
79 | ## Installed Components
80 | 
81 | ```{list-table}
82 | :header-rows: 1
83 | :widths: 30 70
84 | 
85 | * - Component
86 |   - Verified Version
87 | 
88 | * - Istio
89 |   - 1.23.2
90 |     Refer to [Istio Releases](https://github.com/istio/istio/tree/release-1.23)
91 |     for more information.
92 | 
93 | * - Keycloak
94 |   - 26.0.0
95 |     Refer to [Keycloak Releases](https://github.com/keycloak/keycloak/tree/release/26.0)
96 |     for more information.
97 | ``` -------------------------------------------------------------------------------- /secure-services-istio-keycloak/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "0.1.0", 3 | "versions": 4 | [ 5 | { 6 | "version": "0.1.0" 7 | } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /secure-services-istio-keycloak/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /templates/breadcrumbs.html: -------------------------------------------------------------------------------- 1 | {% extends '!components/breadcrumbs.html' %} 2 | 3 | {% set docs_home = "https://docs.nvidia.com" %} 4 | {% set home = docs_home + "/datacenter/cloud-native" %} 5 | 6 | {%- block breadcrumbs %} 7 | 28 | {%- endblock %} 29 | -------------------------------------------------------------------------------- /templates/last-updated.html: -------------------------------------------------------------------------------- 1 | {# Suppress the default last-updated template. #} 2 | -------------------------------------------------------------------------------- /tools/packman/bootstrap/download_file_from_url.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Copyright 2019 NVIDIA CORPORATION 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | #> 16 | 17 | param( 18 | [Parameter(Mandatory=$true)][string]$source=$null, 19 | [string]$output="out.exe" 20 | ) 21 | $filename = $output 22 | 23 | $triesLeft = 4 24 | $delay = 2 25 | do 26 | { 27 | $triesLeft -= 1 28 | 29 | try 30 | { 31 | Write-Host "Downloading from bootstrap.packman.nvidia.com ..." 32 | $wc = New-Object net.webclient 33 | $wc.Downloadfile($source, $fileName) 34 | exit 0 35 | } 36 | catch 37 | { 38 | Write-Host "Error downloading $source!" 39 | Write-Host $_.Exception|format-list -force 40 | if ($triesLeft) 41 | { 42 | Write-Host "Retrying in $delay seconds ..." 43 | Start-Sleep -seconds $delay 44 | } 45 | $delay = $delay * $delay 46 | } 47 | } while ($triesLeft -gt 0) 48 | # We only get here if the retries have been exhausted, remove any left-overs: 49 | if (Test-Path $fileName) 50 | { 51 | Remove-Item $fileName 52 | } 53 | exit 1 -------------------------------------------------------------------------------- /tools/packman/bootstrap/fetch_file_from_packman_bootstrap.cmd: -------------------------------------------------------------------------------- 1 | :: Copyright 2019 NVIDIA CORPORATION 2 | :: 3 | :: Licensed under the Apache License, Version 2.0 (the "License"); 4 | :: you may not use this file except in compliance with the License. 
5 | :: You may obtain a copy of the License at 6 | :: 7 | :: http://www.apache.org/licenses/LICENSE-2.0 8 | :: 9 | :: Unless required by applicable law or agreed to in writing, software 10 | :: distributed under the License is distributed on an "AS IS" BASIS, 11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | :: See the License for the specific language governing permissions and 13 | :: limitations under the License. 14 | 15 | :: You need to specify as input to this command 16 | @setlocal 17 | @set PACKAGE_NAME=%1 18 | @set TARGET_PATH=%2 19 | 20 | @echo Fetching %PACKAGE_NAME% ... 21 | 22 | @powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0download_file_from_url.ps1" ^ 23 | -source "http://bootstrap.packman.nvidia.com/%PACKAGE_NAME%" -output %TARGET_PATH% 24 | :: A bug in powershell prevents the errorlevel code from being set when using the -File execution option 25 | :: We must therefore do our own failure analysis, basically make sure the file exists: 26 | @if not exist %TARGET_PATH% goto ERROR_DOWNLOAD_FAILED 27 | 28 | @endlocal 29 | @exit /b 0 30 | 31 | :ERROR_DOWNLOAD_FAILED 32 | @echo Failed to download file from S3 33 | @echo Most likely because endpoint cannot be reached or file %PACKAGE_NAME% doesn't exist 34 | @endlocal 35 | @exit /b 1 -------------------------------------------------------------------------------- /tools/packman/config.packman.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /tools/packman/packman: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019-2023 NVIDIA CORPORATION 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | SAVED_SETTINGS="$-" 17 | set -eu 18 | 19 | if echo ${PM_VERBOSITY-} | grep -i "debug" > /dev/null ; then 20 | set -x 21 | else 22 | PM_CURL_SILENT="-s -S" 23 | PM_WGET_QUIET="--quiet" 24 | fi 25 | PM_PACKMAN_VERSION=7.5 26 | 27 | # This is necessary for newer macOS 28 | if [ `uname` == 'Darwin' ]; then 29 | export LC_ALL=en_US.UTF-8 30 | export LANG=en_US.UTF-8 31 | fi 32 | 33 | # We cannot rely on realpath, it isn't installed on macOS and some Linux distros 34 | get_abs_filename() { 35 | echo "$(cd "$(dirname "$1")" && pwd)/$(basename "$1")" 36 | } 37 | 38 | # Specify where packman command exists 39 | export PM_INSTALL_PATH="$(get_abs_filename "$(dirname "${BASH_SOURCE}")")" 40 | 41 | # The packages root may already be configured by the user 42 | if [ -z "${PM_PACKAGES_ROOT:-}" ]; then 43 | # Set variable temporarily in this process so that the following execution will work 44 | if [ `uname` == 'Darwin' ]; then 45 | export PM_PACKAGES_ROOT="${HOME}/Library/Application Support/packman-cache" 46 | else 47 | if [ -z "${XDG_CACHE_HOME:-}" ]; then 48 | export PM_PACKAGES_ROOT="${HOME}/.cache/packman" 49 | else 50 | export PM_PACKAGES_ROOT="${XDG_CACHE_HOME}/packman" 51 | fi 52 | fi 53 | fi 54 | 55 | # Ensure the packages root path exists: 56 | if [ ! -d "$PM_PACKAGES_ROOT" ]; then 57 | echo "Creating packman packages cache at $PM_PACKAGES_ROOT" 58 | mkdir -p -m a+rwx "$PM_PACKAGES_ROOT" 59 | fi 60 | 61 | fetch_file_from_s3() 62 | { 63 | SOURCE=$1 64 | SOURCE_URL=http://bootstrap.packman.nvidia.com/$SOURCE 65 | TARGET=$2 66 | echo "Fetching $SOURCE from bootstrap.packman.nvidia.com ..." 67 | if command -v wget >/dev/null 2>&1; then 68 | wget $PM_WGET_QUIET -O$TARGET $SOURCE_URL 69 | else 70 | curl -o $TARGET $SOURCE_URL $PM_CURL_SILENT 71 | fi 72 | } 73 | 74 | generate_temp_file_name() 75 | { 76 | if [ `uname` == "Darwin" ]; then 77 | local tmpfile=`mktemp -t packman` 78 | else 79 | local tmpfile=`mktemp -t packman.XXXXXXXX` 80 | fi 81 | echo "$tmpfile" 82 | } 83 | 84 | install_python() 85 | { 86 | PLATFORM=`uname` 87 | PROCESSOR=`uname -m` 88 | PYTHON_VERSION=3.10.5-1 89 | 90 | if [ $PLATFORM == 'Darwin' ]; then 91 | PYTHON_PACKAGE=$PYTHON_VERSION-macos-x86_64 92 | elif [ $PLATFORM == 'Linux' ] && [ $PROCESSOR == 'x86_64' ]; then 93 | PYTHON_PACKAGE=$PYTHON_VERSION-linux-x86_64 94 | elif [ $PLATFORM == 'Linux' ] && [ $PROCESSOR == 'aarch64' ]; then 95 | PYTHON_PACKAGE=$PYTHON_VERSION-linux-aarch64 96 | else 97 | echo "Operating system not supported" 98 | exit 1 99 | fi 100 | 101 | PYTHON_INSTALL_FOLDER="$PM_PACKAGES_ROOT/python/$PYTHON_PACKAGE" 102 | if [ ! -d "$PYTHON_INSTALL_FOLDER" ]; then 103 | mkdir -p "$PYTHON_INSTALL_FOLDER" 104 | fi 105 | 106 | export PM_PYTHON="$PYTHON_INSTALL_FOLDER/python" 107 | 108 | if [ ! -f "$PM_PYTHON" ]; then 109 | PYTHON_PACKAGE_TMP=$(generate_temp_file_name) 110 | fetch_file_from_s3 "python@$PYTHON_PACKAGE.tar.gz" "$PYTHON_PACKAGE_TMP" 111 | if [ "$?" -eq "0" ]; then 112 | echo "Unpacking python" 113 | tar -xf "$PYTHON_PACKAGE_TMP" -C "$PYTHON_INSTALL_FOLDER" 114 | rm "$PYTHON_PACKAGE_TMP" 115 | else 116 | echo "Failed downloading the Python interpreter" 117 | exit $? 
118 | fi 119 | fi 120 | } 121 | 122 | # Ensure python is available: 123 | if [ -z "${PM_PYTHON_EXT:-}" ]; then 124 | install_python 125 | else 126 | PM_PYTHON="$PM_PYTHON_EXT" 127 | fi 128 | 129 | # The packman module may be externally configured 130 | if [ -z "${PM_MODULE_DIR_EXT:-}" ]; then 131 | PM_MODULE_DIR="$PM_PACKAGES_ROOT/packman-common/$PM_PACKMAN_VERSION" 132 | else 133 | PM_MODULE_DIR="$PM_MODULE_DIR_EXT" 134 | fi 135 | export PM_MODULE="$PM_MODULE_DIR/run.py" 136 | 137 | # Ensure the packman package exists: 138 | if [ ! -f "$PM_MODULE" ]; then 139 | # Remove a previously corrupt packman-common if it's there 140 | if [ -d "$PM_MODULE_DIR" ]; then 141 | rm -rf "$PM_MODULE_DIR" 142 | fi 143 | PM_MODULE_PACKAGE="packman-common@$PM_PACKMAN_VERSION.zip" 144 | TARGET=$(generate_temp_file_name) 145 | # We always fetch packman from S3: 146 | fetch_file_from_s3 "$PM_MODULE_PACKAGE" "$TARGET" 147 | if [ "$?" -eq "0" ]; then 148 | echo "Unpacking ..." 149 | "$PM_PYTHON" -S -s -u -E "$PM_INSTALL_PATH/bootstrap/install_package.py" "$TARGET" "$PM_MODULE_DIR" 150 | rm "$TARGET" 151 | else 152 | echo "Failure while fetching packman module from S3!" 153 | exit 1 154 | fi 155 | fi 156 | 157 | # Generate temporary file name for environment variables: 158 | PM_VAR_PATH=`mktemp -u -t tmp.$$.pmvars.XXXXXX` 159 | 160 | if [ $# -ne 0 ] 161 | then 162 | PM_VAR_PATH_ARG=--var-path="$PM_VAR_PATH" 163 | fi 164 | 165 | "$PM_PYTHON" -S -s -u -E "$PM_MODULE" "$@" ${PM_VAR_PATH_ARG:-} 166 | exit_code=$? 167 | # Export the variables if the file was used and remove the file: 168 | if [ -f "$PM_VAR_PATH" ]; then 169 | while read -r line 170 | do 171 | if [ ${#line} -gt 0 ]; then 172 | export "$line" 173 | fi 174 | done < "$PM_VAR_PATH" 175 | rm -f "$PM_VAR_PATH" 176 | fi 177 | 178 | # avoid leaking -e and -u into the host script if they weren't originally set 179 | if [[ ! ( "$SAVED_SETTINGS" =~ e ) ]]; then 180 | set +e 181 | fi 182 | 183 | if [[ ! ( "$SAVED_SETTINGS" =~ u ) ]]; then 184 | set +u 185 | fi 186 | 187 | # Return the exit code from python 188 | if [ "$exit_code" != 0 ]; then 189 | exit "$exit_code" 190 | fi 191 | -------------------------------------------------------------------------------- /tools/packman/packman.cmd: -------------------------------------------------------------------------------- 1 | :: RUN_PM_MODULE must always be at the same spot for packman update to work (batch reloads file during update!) 
2 | :: [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx] 3 | :: Reset errorlevel status (don't inherit from caller) 4 | @call :ECHO_AND_RESET_ERROR 5 | 6 | :: You can remove this section if you do your own manual configuration of the dev machines 7 | call :CONFIGURE 8 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 9 | 10 | :: Everything below is mandatory 11 | if not defined PM_PYTHON goto :PYTHON_ENV_ERROR 12 | if not defined PM_MODULE goto :MODULE_ENV_ERROR 13 | 14 | set PM_VAR_PATH_ARG= 15 | 16 | if "%1"=="pull" goto :SET_VAR_PATH 17 | if "%1"=="install" goto :SET_VAR_PATH 18 | 19 | :RUN_PM_MODULE 20 | "%PM_PYTHON%" -S -s -u -E "%PM_MODULE%" %* %PM_VAR_PATH_ARG% 21 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 22 | 23 | :: Marshall environment variables into the current environment if they have been generated and remove temporary file 24 | if exist "%PM_VAR_PATH%" ( 25 | for /F "usebackq tokens=*" %%A in ("%PM_VAR_PATH%") do set "%%A" 26 | ) 27 | if %errorlevel% neq 0 ( goto :VAR_ERROR ) 28 | 29 | if exist "%PM_VAR_PATH%" ( 30 | del /F "%PM_VAR_PATH%" 31 | ) 32 | if %errorlevel% neq 0 ( goto :VAR_ERROR ) 33 | 34 | set PM_VAR_PATH= 35 | goto :eof 36 | 37 | :: Subroutines below 38 | :PYTHON_ENV_ERROR 39 | @echo User environment variable PM_PYTHON is not set! Please configure machine for packman or call configure.bat. 40 | exit /b 1 41 | 42 | :MODULE_ENV_ERROR 43 | @echo User environment variable PM_MODULE is not set! Please configure machine for packman or call configure.bat. 44 | exit /b 1 45 | 46 | :VAR_ERROR 47 | @echo Error while processing and setting environment variables! 48 | exit /b 1 49 | 50 | :: pad [xxxx] 51 | :ECHO_AND_RESET_ERROR 52 | @echo off 53 | if /I "%PM_VERBOSITY%"=="debug" ( 54 | @echo on 55 | ) 56 | exit /b 0 57 | 58 | :SET_VAR_PATH 59 | :: Generate temporary path for variable file 60 | for /f "delims=" %%a in ('%PM_PYTHON% -S -s -u -E -c "import tempfile;file = tempfile.NamedTemporaryFile(mode='w+t', delete=False);print(file.name)"') do (set PM_VAR_PATH=%%a) 61 | set PM_VAR_PATH_ARG=--var-path="%PM_VAR_PATH%" 62 | goto :RUN_PM_MODULE 63 | 64 | :CONFIGURE 65 | :: Must capture and set code page to work around issue #279, powershell invocation mutates console font 66 | :: This issue only happens in Windows CMD shell when using 65001 code page. Some Git Bash implementations 67 | :: don't support chcp so this workaround is a bit convoluted. 68 | :: Test for chcp: 69 | chcp > nul 2>&1 70 | if %errorlevel% equ 0 ( 71 | for /f "tokens=2 delims=:" %%a in ('chcp') do (set PM_OLD_CODE_PAGE=%%a) 72 | ) else ( 73 | call :ECHO_AND_RESET_ERROR 74 | ) 75 | :: trim leading space (this is safe even when PM_OLD_CODE_PAGE has not been set) 76 | set PM_OLD_CODE_PAGE=%PM_OLD_CODE_PAGE:~1% 77 | if "%PM_OLD_CODE_PAGE%" equ "65001" ( 78 | chcp 437 > nul 79 | set PM_RESTORE_CODE_PAGE=1 80 | ) 81 | call "%~dp0\bootstrap\configure.bat" 82 | set PM_CONFIG_ERRORLEVEL=%errorlevel% 83 | if defined PM_RESTORE_CODE_PAGE ( 84 | :: Restore code page 85 | chcp %PM_OLD_CODE_PAGE% > nul 86 | ) 87 | set PM_OLD_CODE_PAGE= 88 | set PM_RESTORE_CODE_PAGE= 89 | exit /b %PM_CONFIG_ERRORLEVEL% 90 | -------------------------------------------------------------------------------- /tools/packman/packmanconf.py: -------------------------------------------------------------------------------- 1 | # Use this file to bootstrap packman into your Python environment (3.7.x). 
Simply 2 | # add the path by doing sys.insert to where packmanconf.py is located and then execute: 3 | # 4 | # >>> import packmanconf 5 | # >>> packmanconf.init() 6 | # 7 | # It will use the configured remote(s) and the version of packman in the same folder, 8 | # giving you full access to the packman API via the following module 9 | # 10 | # >> import packmanapi 11 | # >> dir(packmanapi) 12 | 13 | import os 14 | import platform 15 | import sys 16 | 17 | 18 | def init(): 19 | """Call this function to initialize the packman configuration. 20 | 21 | Calls to the packman API will work after successfully calling this function. 22 | 23 | Note: 24 | This function only needs to be called once during the execution of your 25 | program. Calling it repeatedly is harmless but wasteful. 26 | Compatibility with your Python interpreter is checked and upon failure 27 | the function will report what is required. 28 | 29 | Example: 30 | >>> import packmanconf 31 | >>> packmanconf.init() 32 | >>> import packmanapi 33 | >>> packmanapi.set_verbosity_level(packmanapi.VERBOSITY_HIGH) 34 | """ 35 | major = sys.version_info[0] 36 | minor = sys.version_info[1] 37 | if major != 3 or minor != 10: 38 | raise RuntimeError( 39 | f"This version of packman requires Python 3.10.x, but {major}.{minor} was provided" 40 | ) 41 | conf_dir = os.path.dirname(os.path.abspath(__file__)) 42 | os.environ["PM_INSTALL_PATH"] = conf_dir 43 | packages_root = get_packages_root(conf_dir) 44 | version = get_version(conf_dir) 45 | module_dir = get_module_dir(conf_dir, packages_root, version) 46 | sys.path.insert(1, module_dir) 47 | 48 | 49 | def get_packages_root(conf_dir: str) -> str: 50 | root = os.getenv("PM_PACKAGES_ROOT") 51 | if not root: 52 | platform_name = platform.system() 53 | if platform_name == "Windows": 54 | drive, _ = os.path.splitdrive(conf_dir) 55 | root = os.path.join(drive, "packman-repo") 56 | elif platform_name == "Darwin": 57 | # macOS 58 | root = os.path.join( 59 | os.path.expanduser("~"), "/Library/Application Support/packman-cache" 60 | ) 61 | elif platform_name == "Linux": 62 | try: 63 | cache_root = os.environ["XDG_HOME_CACHE"] 64 | except KeyError: 65 | cache_root = os.path.join(os.path.expanduser("~"), ".cache") 66 | return os.path.join(cache_root, "packman") 67 | else: 68 | raise RuntimeError(f"Unsupported platform '{platform_name}'") 69 | # make sure the path exists: 70 | os.makedirs(root, exist_ok=True) 71 | return root 72 | 73 | 74 | def get_module_dir(conf_dir, packages_root: str, version: str) -> str: 75 | module_dir = os.path.join(packages_root, "packman-common", version) 76 | if not os.path.exists(module_dir): 77 | import tempfile 78 | 79 | tf = tempfile.NamedTemporaryFile(delete=False) 80 | target_name = tf.name 81 | tf.close() 82 | url = f"http://bootstrap.packman.nvidia.com/packman-common@{version}.zip" 83 | print(f"Downloading '{url}' ...") 84 | import urllib.request 85 | 86 | urllib.request.urlretrieve(url, target_name) 87 | from importlib.machinery import SourceFileLoader 88 | 89 | # import module from path provided 90 | script_path = os.path.join(conf_dir, "bootstrap", "install_package.py") 91 | ip = SourceFileLoader("install_package", script_path).load_module() 92 | print("Unpacking ...") 93 | ip.install_package(target_name, module_dir) 94 | os.unlink(tf.name) 95 | return module_dir 96 | 97 | 98 | def get_version(conf_dir: str): 99 | path = os.path.join(conf_dir, "packman") 100 | if not os.path.exists(path): # in dev repo fallback 101 | path += ".sh" 102 | with open(path, "rt", encoding="utf8") 
as launch_file: 103 | for line in launch_file.readlines(): 104 | if line.startswith("PM_PACKMAN_VERSION"): 105 | _, value = line.split("=") 106 | return value.strip() 107 | raise RuntimeError(f"Unable to find 'PM_PACKMAN_VERSION' in '{path}'") 108 | -------------------------------------------------------------------------------- /tools/packman/python.bat: -------------------------------------------------------------------------------- 1 | :: Copyright 2019-2020 NVIDIA CORPORATION 2 | :: 3 | :: Licensed under the Apache License, Version 2.0 (the "License"); 4 | :: you may not use this file except in compliance with the License. 5 | :: You may obtain a copy of the License at 6 | :: 7 | :: http://www.apache.org/licenses/LICENSE-2.0 8 | :: 9 | :: Unless required by applicable law or agreed to in writing, software 10 | :: distributed under the License is distributed on an "AS IS" BASIS, 11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | :: See the License for the specific language governing permissions and 13 | :: limitations under the License. 14 | 15 | @echo off 16 | setlocal enableextensions 17 | 18 | call "%~dp0\packman" init 19 | set "PYTHONPATH=%PM_MODULE_DIR%;%PYTHONPATH%" 20 | 21 | if not defined PYTHONNOUSERSITE ( 22 | set PYTHONNOUSERSITE=1 23 | ) 24 | 25 | REM For performance, default to unbuffered; however, allow overriding via 26 | REM PYTHONUNBUFFERED=0 since PYTHONUNBUFFERED on windows can truncate output 27 | REM when printing long strings 28 | if not defined PYTHONUNBUFFERED ( 29 | set PYTHONUNBUFFERED=1 30 | ) 31 | 32 | "%PM_PYTHON%" %* -------------------------------------------------------------------------------- /tools/packman/python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019-2020 NVIDIA CORPORATION 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | 19 | PACKMAN_CMD="$(dirname "${BASH_SOURCE}")/packman" 20 | if [ ! 
-f "$PACKMAN_CMD" ]; then 21 | PACKMAN_CMD="${PACKMAN_CMD}.sh" 22 | fi 23 | source "$PACKMAN_CMD" init 24 | export PYTHONPATH="${PM_MODULE_DIR}:${PYTHONPATH}" 25 | 26 | if [ -z "${PYTHONNOUSERSITE:-}" ]; then 27 | export PYTHONNOUSERSITE=1 28 | fi 29 | 30 | # For performance, default to unbuffered; however, allow overriding via 31 | # PYTHONUNBUFFERED=0 since PYTHONUNBUFFERED on windows can truncate output 32 | # when printing long strings 33 | if [ -z "${PYTHONUNBUFFERED:-}" ]; then 34 | export PYTHONUNBUFFERED=1 35 | fi 36 | 37 | # workaround for our python not shipping with certs 38 | if [[ -z ${SSL_CERT_DIR:-} ]]; then 39 | export SSL_CERT_DIR=/etc/ssl/certs/ 40 | fi 41 | 42 | "${PM_PYTHON}" "$@" 43 | -------------------------------------------------------------------------------- /tools/repoman/repoman.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import io 4 | import contextlib 5 | import packmanapi 6 | 7 | REPO_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..") 8 | REPO_DEPS_FILE = os.path.join(REPO_ROOT, "deps/repo-deps.packman.xml") 9 | 10 | 11 | def bootstrap(): 12 | """ 13 | Bootstrap all omni.repo modules. 14 | 15 | Pull with packman from repo.packman.xml and add them all to python sys.path to enable importing. 16 | """ 17 | with contextlib.redirect_stdout(io.StringIO()): 18 | deps = packmanapi.pull(REPO_DEPS_FILE) 19 | for dep_path in deps.values(): 20 | if dep_path not in sys.path: 21 | sys.path.append(dep_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | bootstrap() 26 | import omni.repo.man 27 | 28 | omni.repo.man.main(REPO_ROOT) 29 | -------------------------------------------------------------------------------- /work/dcgm-offline.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/work/dcgm-offline.inv --------------------------------------------------------------------------------