├── .codespell_exclude_lines.txt ├── .github ├── dependabot.yml └── workflows │ ├── docs-build-pr.yaml │ ├── docs-build.yaml │ ├── docs-preview-pr.yaml │ └── docs-remove-stale-reviews.yaml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── assets ├── NVIDIA_Horizontal_Logo_RGBBlack.png ├── NVLogo_H_B&W.png ├── NVLogo_H_B_W.png ├── favicon.ico ├── nvidia-logo-white.png └── nvidia_logo.png ├── container-toolkit ├── arch-overview.md ├── assets │ ├── nvidia-containerd-arch.png │ ├── nvidia-crio-lxc-arch.png │ ├── nvidia-docker-arch-new.png │ ├── nvidia-docker-arch.png │ └── runtime-architecture.png ├── cdi-support.md ├── docker-specialized.md ├── index.md ├── install-guide.md ├── output │ └── nvidia-smi.txt ├── release-notes.md ├── sample-workload.md ├── supported-platforms.md ├── troubleshooting.md ├── versions.json └── versions1.json ├── contents.rst ├── css └── custom.css ├── deps └── repo-deps.packman.xml ├── docker └── Dockerfile ├── driver-containers ├── graphics │ ├── driver-container-demo.gif │ └── nvidia-driver-container-image.png ├── redirected.rst ├── versions.json └── versions1.json ├── edge ├── anthos-guide.rst ├── graphics │ └── anthos │ │ └── virt │ │ ├── image01.png │ │ ├── image02.png │ │ ├── image03.png │ │ ├── image04.png │ │ ├── image05.png │ │ ├── image06.png │ │ ├── image07.png │ │ ├── image08.png │ │ ├── image09.png │ │ ├── image10.png │ │ ├── image11.png │ │ ├── image12.png │ │ ├── image13.png │ │ ├── image14.png │ │ ├── image15.png │ │ ├── image16.png │ │ └── image17.png ├── index.rst ├── nvidia-gpu-with-device-edge.rst ├── versions.json └── versions1.json ├── gpu-operator ├── amazon-eks.rst ├── cdi.rst ├── custom-driver-params.rst ├── dra-cds.rst ├── dra-gpus.rst ├── dra-intro-install.rst ├── getting-started.rst ├── google-gke.rst ├── gpu-driver-configuration.rst ├── gpu-driver-upgrades.rst ├── gpu-operator-kubevirt.rst ├── gpu-operator-mig.rst ├── gpu-operator-rdma.rst ├── gpu-sharing.rst ├── graphics │ ├── gpu-op-confidential-containers.svg │ ├── gpu-operator-demo.gif │ ├── nvd-basics.svg │ ├── nvidia-gpu-operator-image.jpg │ └── upgrade-controller-state-machine.png ├── index.rst ├── install-gpu-operator-air-gapped.rst ├── install-gpu-operator-gov-ready.rst ├── install-gpu-operator-nvaie.rst ├── install-gpu-operator-outdated-kernels.rst ├── install-gpu-operator-proxy.rst ├── install-gpu-operator-service-mesh.rst ├── install-gpu-operator-vgpu.rst ├── life-cycle-policy.rst ├── manifests │ ├── input │ │ ├── amazon-eks-cluster-config.yaml │ │ ├── custom-mig-config.yaml │ │ ├── google-gke-gpu-operator-quota.yaml │ │ ├── gpu-direct-rdma-demo-pod-1.yaml │ │ ├── gpu-direct-rdma-demo-pod-2.yaml │ │ ├── mig-cm-values.yaml │ │ ├── nvd-all.yaml │ │ ├── nvd-demo-gold.yaml │ │ ├── nvd-driver-multiple.yaml │ │ ├── nvd-precompiled-all.yaml │ │ ├── nvd-precompiled-some.yaml │ │ ├── tf-notebook.yaml │ │ ├── time-slicing-config-all.yaml │ │ ├── time-slicing-config-fine.yaml │ │ ├── time-slicing-config-sample.yaml │ │ └── time-slicing-verification.yaml │ └── output │ │ ├── cdi-get-pods-restart.txt │ │ ├── common-cuda-vectoradd-logs.txt │ │ ├── mig-get-pods.txt │ │ ├── mig-mixed-node-labels.json │ │ ├── mig-mixed-nvidia-smi.txt │ │ ├── mig-node-labels.json │ │ ├── mig-nvidia-smi.txt │ │ ├── precomp-driver-conventional-running.txt │ │ ├── precomp-driver-running.txt │ │ ├── precomp-driver-terminating.txt │ │ ├── time-slicing-get-events.txt │ │ ├── time-slicing-get-pods.txt │ │ └── time-slicing-logs-pods.txt ├── 
microsoft-aks.rst ├── overview.rst ├── platform-support.rst ├── precompiled-drivers.rst ├── release-notes.rst ├── security.rst ├── troubleshooting.rst ├── uninstall.rst ├── upgrade.rst ├── versions.json └── versions1.json ├── gpu-telemetry ├── about-telemetry.rst ├── dcgm-exporter.rst ├── graphics │ ├── dcgm-e2e │ │ ├── 001-dcgm-e2e-prom-screenshot.png │ │ ├── 002-dcgm-e2e-grafana-screenshot.png │ │ ├── 003-dcgm-e2e-grafana-home-screenshot.png │ │ ├── 004-dcgm-e2e-grafana-manage-screenshot.png │ │ ├── 005-dcgm-e2e-grafana-import-screenshot.png │ │ ├── 006-dcgm-e2e-grafana-import-screenshot.png │ │ ├── 007-dcgm-e2e-grafana-import-screenshot.png │ │ ├── 008-dcgm-e2e-grafana-dashboard-screenshot.png │ │ ├── 009-dcgm-e2e-deepstream-screenshot.png │ │ ├── 010-dcgm-e2e-deepstream-screenshot.png │ │ └── 011-dcgm-e2e-prom-dashboard-metrics-screenshot.png │ ├── dcgm-exporter-bare-metal.png │ ├── dcgm-exporter-containers.png │ ├── dcgm-exporter_embedded.png │ └── dcgm_and_dcgm-exporter.png ├── index.rst ├── integrating-telemetry-kubernetes.rst ├── kube-prometheus.rst ├── versions.json └── versions1.json ├── kubernetes ├── index.rst ├── versions.json └── versions1.json ├── make.bat ├── mig ├── mig-examples.rst ├── mig-k8s.rst └── mig.rst ├── openshift ├── appendix-ocp.rst ├── clean-up.rst ├── download │ └── 0003-cluster-wide-machineconfigs.yaml.template ├── enable-gpu-monitoring-dashboard.rst ├── get-entitlement.rst ├── gpu-operator-with-precompiled-drivers.rst ├── graphics │ ├── Mig-profile-A100.png │ ├── cluster-policy-image-version.png │ ├── cluster-policy-repository.png │ ├── cluster-policy-state-ready.png │ ├── cluster-policy-suceed.png │ ├── cluster_entitlement_1.png │ ├── cluster_entitlement_2.png │ ├── cluster_entitlement_3.png │ ├── cluster_entitlement_4.png │ ├── cluster_entitlement_5.png │ ├── cluster_entitlement_6.png │ ├── cluster_entitlement_attachsub.png │ ├── cluster_policy1.png │ ├── cluster_policy2.png │ ├── cluster_policy_1.png │ ├── cluster_policy_3.png │ ├── cluster_policy_4.png │ ├── cluster_policy_configure_vgpu.png │ ├── cluster_policy_enable_sandbox_workloads.png │ ├── cluster_policy_suceed.png │ ├── cluster_policy_vGPU_confg.png │ ├── cluster_policy_vgpu_1.png │ ├── cluster_policy_vgpu_2.png │ ├── create_cluster_policy.png │ ├── create_config_map1.png │ ├── create_project_1.png │ ├── create_project_2.png │ ├── createclusterpolicy2.png │ ├── createclusterpolicy3.png │ ├── created_pull-secret.png │ ├── disconnected_cluster.png │ ├── driver_toolkit_alert.png │ ├── enable-gpu-direct-rdma.png │ ├── entitlement_hypervisor.png │ ├── gpu-operator-certified-cli-install.png │ ├── gpu_dashboards.png │ ├── locate-cluster-acm.png │ ├── mig-mixed-profile-A100.png │ ├── mig_strategy.png │ ├── navigate_to_cluster_policy.png │ ├── nvaie2.3_cluster_policy.png │ ├── ocp_main_console_alerts.png │ ├── pci_passthrough.png │ ├── precompiled_driver_config_repository.png │ ├── precompiled_driver_config_version_and_image.png │ ├── pull-secret.png │ ├── secrets.png │ ├── secrets_2.png │ └── vmx_secure_boot.png ├── index.rst ├── install-gpu-ocp.rst ├── install-gpu-operator-gov-ready-openshift.rst ├── install-nfd.rst ├── introduction.rst ├── mig-ocp.rst ├── mirror-gpu-ocp-disconnected.rst ├── nvaie-with-ocp.rst ├── openshift-virtualization.rst ├── prerequisites.rst ├── steps-overview.rst ├── time-slicing-gpus-in-openshift.rst ├── troubleshooting-gpu-ocp.rst ├── versions.json └── versions1.json ├── partner-validated ├── PARTNER-VALIDATED-TEMPLATE.rst ├── index.rst ├── k0rdent.rst ├── mirantis-mke.rst ├── 
versions.json └── versions1.json ├── playground ├── dind.rst └── x-arch.rst ├── repo ├── repo.bat ├── repo.toml ├── review ├── index.rst ├── versions.json └── versions1.json ├── scripts └── create_archive.sh ├── secure-services-istio-keycloak ├── configure.md ├── images │ ├── keycloak-1.png │ ├── keycloak-10.png │ ├── keycloak-11.png │ ├── keycloak-12.png │ ├── keycloak-13.png │ ├── keycloak-14.png │ ├── keycloak-15.png │ ├── keycloak-16.png │ ├── keycloak-2.png │ ├── keycloak-3.png │ ├── keycloak-4.png │ ├── keycloak-5.png │ ├── keycloak-6.png │ ├── keycloak-7.png │ ├── keycloak-8.png │ ├── keycloak-9.png │ └── reference-arch-01.png ├── implementation.md ├── index.md ├── manifests │ ├── authorizationPolicy.yaml │ ├── istio-sample-manifest.yaml │ └── requestAuthentication.yaml ├── platform-support.md ├── versions.json └── versions1.json ├── templates ├── breadcrumbs.html └── last-updated.html ├── tools ├── packman │ ├── bootstrap │ │ ├── configure.bat │ │ ├── download_file_from_url.ps1 │ │ ├── fetch_file_from_packman_bootstrap.cmd │ │ ├── generate_temp_file_name.ps1 │ │ ├── generate_temp_folder.ps1 │ │ └── install_package.py │ ├── config.packman.xml │ ├── packman │ ├── packman.cmd │ ├── packmanconf.py │ ├── python.bat │ └── python.sh └── repoman │ └── repoman.py └── work └── dcgm-offline.inv /.codespell_exclude_lines.txt: -------------------------------------------------------------------------------- 1 | # Include whole lines that have codespell-recognized typos. 2 | # This is better than accepting a typo for ask someplace random. 3 | # End the file with a blank line. 4 | Approaches for Working with Azure AKS 5 | You can approach running workloads in Azure AKS with NVIDIA GPUs in at least two ways. 6 | Default AKS configuration without the GPU Operator 7 | By default, you can run Azure AKS images on GPU-enabled virtual machines with NVIDIA GPUs, 8 | AKS images include a preinstalled NVIDIA GPU Driver and preinstalled NVIDIA Container Toolkit. 9 | `Use GPUs for compute-intensive workloads on Azure Kubernetes Services `__ 10 | The images that are available in AKS always include a preinstalled NVIDIA GPU driver 11 | After you start your Azure AKS cluster, you are ready to install the NVIDIA GPU Operator. 12 | GPU Operator with Azure AKS 13 | * Added support for running the Operator with Microsoft Azure Kubernetes Service (AKS). 14 | You must use an AKS image with a preinstalled NVIDIA GPU driver and a preinstalled 15 | Create AKS Cluster with a Node Pool to Skip GPU Driver installation 16 | command-line argument to the ``az aks nodepool add`` command. 17 | $ az aks nodepool add --resource-group --name gpunodes --cluster-name \ 18 | `Skip GPU driver installation (preview) `__ 19 | After you start your Azure AKS cluster with an image that includes a preinstalled NVIDIA GPU Driver 20 | Azure AKS 21 | .. |prod-name-short| replace:: MKE 22 | Mirantis Kubernetes Engine (MKE) gives you the power to build, run, and scale cloud-native 23 | * - MKE 3.6.2+ and 3.5.7+ 24 | * A running MKE cluster with at least one control plane node and two worker nodes. 25 | * A seed node to connect to the MKE instance, with Helm 3.x installed on the seed node. 26 | * The kubeconfig file for the MKE cluster on the seed node. 27 | You can get the file from the MKE web interface by downloading a client bundle. 28 | Alternatively, if the MKE cluster is a managed cluster of a Mirantis Container Cloud (MCC) instance, 29 | In this case, the MKE web interface can be accessed from the MCC web interface. 
30 | * You have an MKE administrator user name and password, and you have the MKE host URL. 31 | Perform the following steps to prepare the MKE cluster: 32 | #. MKE does not apply a label to worker nodes. 33 | $ export MKE_USERNAME= \ 34 | MKE_PASSWORD= \ 35 | MKE_HOST= 36 | #. Get an API key from MKE so that you can make API calls later: 37 | '{"username":"'$MKE_USERNAME'","password":"'$MKE_PASSWORD'"}' \ 38 | https://$MKE_HOST/auth/login | jq --raw-output .auth_token) 39 | #. Download the MKE configuration file: 40 | $ curl --silent --insecure -X GET "https://$MKE_HOST/api/ucp/config-toml" \ 41 | #. Upload the edited MKE configuration file: 42 | https://$MKE_HOST/api/ucp/config-toml 43 | The MKE cluster is ready for you to install the GPU Operator with Helm. 44 | Refer to the MKE product documentation for information about working with MKE. 45 | * https://docs.mirantis.com/mke/3.6/overview.html 46 | $ cat < nvidia-container-microshift.te 47 | $ checkmodule -m -M -o nvidia-container-microshift.mod nvidia-container-microshift.te 48 | 2023/06/22 14:25:38 Retreiving plugins. 49 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | -------------------------------------------------------------------------------- /.github/workflows/docs-build-pr.yaml: -------------------------------------------------------------------------------- 1 | name: docs-build-pr 2 | 3 | on: 4 | pull_request: 5 | branches: [ main, release-* ] 6 | types: [ opened, synchronize ] 7 | 8 | env: 9 | GH_TOKEN: ${{ github.token }} 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | build-docs: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v5 21 | - name: Set up Docker Buildx 22 | uses: docker/setup-buildx-action@v3 23 | - name: Build image 24 | uses: docker/build-push-action@v6 25 | with: 26 | context: . 
27 | file: docker/Dockerfile 28 | load: true 29 | tags: pr-image:${{ github.sha }} 30 | - name: Build docs 31 | run: | 32 | docker run -v $(pwd):/work -w /work pr-image:${{ github.sha }} ./repo docs 33 | - name: Delete unnecessary files 34 | run: | 35 | sudo find _build -name .doctrees -prune -exec rm -rf {} \; 36 | sudo find _build -name .buildinfo -exec rm {} \; 37 | - name: Copy review page 38 | run: | 39 | sudo mv _build/docs/review/latest/* _build/docs 40 | sudo rm -rf _build/docs/review _build/docs/tmp _build/docs/sphinx_warnings.txt 41 | - name: Upload HTML 42 | uses: actions/upload-artifact@v4 43 | with: 44 | name: html-build-artifact 45 | path: _build/docs 46 | if-no-files-found: error 47 | retention-days: 1 48 | - name: Store PR information 49 | run: | 50 | mkdir ./pr 51 | echo ${{ github.event.number }} > ./pr/pr.txt 52 | echo ${{ github.event.pull_request.merged }} > ./pr/merged.txt 53 | echo ${{ github.event.action }} > ./pr/action.txt 54 | - name: Upload PR information 55 | uses: actions/upload-artifact@v4 56 | with: 57 | name: pr 58 | path: pr/ 59 | -------------------------------------------------------------------------------- /.github/workflows/docs-preview-pr.yaml: -------------------------------------------------------------------------------- 1 | name: docs-preview-pr 2 | 3 | on: 4 | workflow_run: 5 | workflows: [ docs-build-pr ] 6 | types: [ completed ] 7 | branches-ignore: [ main ] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | env: 14 | WF_ID: ${{ github.event.workflow_run.id }} 15 | 16 | jobs: 17 | preview: 18 | uses: nvidia-merlin/.github/.github/workflows/docs-preview-pr-common.yaml@main -------------------------------------------------------------------------------- /.github/workflows/docs-remove-stale-reviews.yaml: -------------------------------------------------------------------------------- 1 | name: docs-remove-stale-reviews 2 | 3 | on: 4 | schedule: 5 | # 42 minutes after 0:00 UTC on Sundays 6 | - cron: "42 0 * * 0" 7 | workflow_dispatch: 8 | 9 | jobs: 10 | remove: 11 | uses: nvidia-merlin/.github/.github/workflows/docs-remove-stale-reviews-common.yaml@main 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | _* 3 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | CONTAINER_TEST_IMAGE: "${CI_REGISTRY_IMAGE}:${CI_COMMIT_REF_SLUG}" 3 | CONTAINER_RELEASE_IMAGE: "${CI_REGISTRY_IMAGE}:0.5.1" 4 | BUILDER_IMAGE: ghcr.io/nvidia/cloud-native-docs:0.5.1 5 | PUBLISHER_IMAGE: "${CI_REGISTRY_PUBLISHER}/publisher:3.1.0" 6 | 7 | stages: 8 | - .pre 9 | - build_image 10 | - build_docs 11 | - publish_docs 12 | 13 | .build_image: 14 | image: docker:23.0.6 15 | stage: .pre 16 | services: 17 | - docker:23.0.6-dind 18 | variables: 19 | GIT_STRATEGY: clone 20 | script: 21 | - apk add git 22 | - git fetch origin "${CI_DEFAULT_BRANCH}" 23 | - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" 24 | - if ! 
docker manifest inspect "${BUILDER_IMAGE}" 2>&1 > /dev/null ; then export NEEDS_IMAGE=true ; fi 25 | - FILES=$(git diff --name-only "${CI_COMMIT_SHA}" "origin/${CI_DEFAULT_BRANCH}" | tr '\n' ' ') 26 | - if echo "${FILES}" | grep -q "deps/\|Dockerfile\|repo.toml" ; then export NEEDS_IMAGE=true ; fi 27 | - > 28 | if [[ "${NEEDS_IMAGE}" ]]; then 29 | docker build -t "${CONTAINER_TEST_IMAGE}" . -f docker/Dockerfile 30 | docker push "${CONTAINER_TEST_IMAGE}" 31 | echo "BUILDER_IMAGE=${CONTAINER_TEST_IMAGE}" >> build.env 32 | else 33 | echo "BUILDER_IMAGE=${BUILDER_IMAGE}" >> build.env 34 | fi 35 | - > 36 | if [ "${NEEDS_IMAGE}" ] && [ "${CI_COMMIT_BRANCH}" == "${CI_DEFAULT_BRANCH}" ] && [ "${CI_PIPELINE_SOURCE}" == "push" ]; then 37 | docker tag "${CONTAINER_TEST_IMAGE}" "${CONTAINER_RELEASE_IMAGE}" 38 | docker push "${CONTAINER_RELEASE_IMAGE}" 39 | fi 40 | artifacts: 41 | reports: 42 | dotenv: build.env 43 | 44 | .build_image_rules: 45 | rules: 46 | - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH) || $CI_PIPELINE_SOURCE == "merge_request_event" 47 | 48 | build_image_ext: 49 | extends: .build_image 50 | rules: 51 | - if: $INTERNAL != null 52 | when: never 53 | - !reference [.build_image_rules, rules] 54 | 55 | build_image_int: 56 | extends: .build_image 57 | tags: 58 | - os/linux 59 | - type/docker 60 | rules: 61 | - if: $INTERNAL == null 62 | when: never 63 | - !reference [.build_image_rules, rules] 64 | 65 | .build: 66 | stage: build_docs 67 | image: "${BUILDER_IMAGE}" 68 | script: 69 | - ./repo docs 70 | - echo "BUILDER_IMAGE=${BUILDER_IMAGE}" >> build.env 71 | artifacts: 72 | name: ${CI_PROJECT_NAME}-${CI_COMMIT_SHORT_SHA} 73 | paths: 74 | - _build 75 | expire_in: 4w 76 | reports: 77 | dotenv: build.env 78 | 79 | .build_rules: 80 | rules: 81 | - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH) || $CI_PIPELINE_SOURCE == "merge_request_event" 82 | - if: $CI_COMMIT_REF_NAME =~ /-v[0-9]/ 83 | 84 | build_ext: 85 | extends: .build 86 | variables: 87 | APIURL: "${CI_API_V4_URL}/projects/${CI_MERGE_REQUEST_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}/discussions" 88 | after_script: 89 | - PROJPART=$(echo "${CI_PROJECT_PATH#$CI_PROJECT_ROOT_NAMESPACE}") 90 | - BASEURL=$(echo "https://${CI_PROJECT_ROOT_NAMESPACE}.${CI_PAGES_DOMAIN}/-${PROJPART}") 91 | - REVURL=$(echo "${BASEURL}/-/jobs/${CI_JOB_ID}/artifacts/_build/docs/review/latest/index.html") 92 | - MSG=$(echo "{\"body\":\"

Review HTML ${REVURL}
\"}") 93 | - echo "${REVURL}" 94 | - echo "${MSG}" 95 | - 'curl -X POST -H "Authorization: Bearer ${MR_COMMENT}" "${APIURL}" -H "Content-Type: application/json" --data-raw "${MSG}"' 96 | rules: 97 | - if: $INTERNAL != null 98 | when: never 99 | - !reference [.build_rules, rules] 100 | 101 | build_int: 102 | extends: .build 103 | tags: 104 | - os/linux 105 | - type/docker 106 | rules: 107 | - if: $INTERNAL == null 108 | when: never 109 | - !reference [.build_rules, rules] 110 | 111 | pages: 112 | image: "${CONTAINER_RELEASE_IMAGE}" 113 | stage: publish_docs 114 | script: 115 | - rm -rf public 116 | - cp -r _build/docs/ public 117 | artifacts: 118 | paths: 119 | - public 120 | expire_in: 1 week 121 | dependencies: 122 | - build_ext 123 | rules: 124 | - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push" && $INTERNAL != 'true' 125 | 126 | 127 | publish_docs: 128 | image: 129 | name: "${PUBLISHER_IMAGE}" 130 | entrypoint: [ "" ] 131 | stage: publish_docs 132 | tags: 133 | - os/linux 134 | - type/docker 135 | variables: 136 | HTML_PATH: "_build/docs" 137 | FORCE_LATEST: "true" 138 | script: 139 | - echo "Pushing docs live to https://docs.nvidia.com/datacenter/cloud-native" 140 | - |+ 141 | if [[ "${CI_COMMIT_REF_NAME}" =~ (.+)-v([0-9]+\.[0-9]+(\.[a-zA-Z0-9]+)?) ]]; then 142 | export DOCSET="${BASH_REMATCH[1]}" 143 | export VERSION="${BASH_REMATCH[2]}" 144 | fi 145 | - |+ 146 | if [ -z "${DOCSET}" ] || [ -z "${VERSION}" ]; then 147 | echo "Failed to determine the docset or version." 148 | exit 1 149 | fi 150 | - |+ 151 | if [[ "${CI_COMMIT_MESSAGE}" =~ $'/not-latest\n' ]]; then 152 | export FORCE_LATEST=false 153 | fi 154 | - echo "Publishing docs for ${DOCSET} and version ${VERSION}" 155 | - pushd "${HTML_PATH}/${DOCSET}/latest" 156 | - deploy_s3.sh --archive "${DOCSET}" "${VERSION}" 157 | - |+ 158 | if [ "true" == "${FORCE_LATEST}" ]; then 159 | deploy_s3.sh --latest "${DOCSET}" 160 | fi 161 | - deploy_s3.sh --flush "${DOCSET}" 162 | dependencies: 163 | - build_int 164 | rules: 165 | - if: $CI_COMMIT_TAG =~ /-v[0-9]/ && ($CI_PIPELINE_SOURCE == "push" || $CI_PIPELINE_SOURCE == "web" ) && $INTERNAL 166 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: mixed-line-ending 6 | - id: trailing-whitespace 7 | - id: check-yaml 8 | - repo: https://github.com/codespell-project/codespell 9 | rev: v2.2.2 10 | hooks: 11 | - id: codespell 12 | args: [ "-x", ".codespell_exclude_lines.txt"] 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to the Docs 2 | 3 | Thanks for contributing to the documentation repository! The documentation is licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0). Before 4 | patches are accepted and merged, we require that these relatively simple guidelines be followed: 5 | * Adhere to the documentation style guidelines 6 | * Sign your work 7 | 8 | Also, read an [overview](https://developers.google.com/tech-writing/overview) on Technical Writing from Google on authoring good technical content! 
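This repository also configures [pre-commit](https://pre-commit.com/) hooks (see `.pre-commit-config.yaml`) that check for mixed line endings, trailing whitespace, YAML syntax, and spelling with codespell. As a minimal sketch for running the same checks locally before you push (assuming you can install the `pre-commit` tool with `pip`, which is not a documented requirement of this repository):

```console
$ pip install pre-commit
$ pre-commit run --all-files
```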
9 | 10 | ## Documentation style guidelines 11 | 12 | This documentation is authored using [reStructuredText](http://docutils.sourceforge.net/rst.html) as a markup language and uses the 13 | [Sphinx](https://www.sphinx-doc.org/en/master/) documentation generator. 14 | 15 | ### Filenames 16 | 17 | Use only lowercase alphanumeric characters and hyphens `-` where required. Filenames are suffixed with the `.rst` extension. 18 | 19 | ### Headings 20 | 21 | Use title case for headings. 22 | Refer to https://titlecase.com/ for more information. 23 | 24 | The headings follow this convention: 25 | 26 | 1. `H1` or document title based on `#` with overline 27 | 1. `H2` based on `*` with overline 28 | 1. `H3` based on `=` 29 | 1. `H4` based on `-` 30 | 1. `H5` based on `^` 31 | 1. `H6` based on `"` 32 | 33 | If you need more levels, then consider creating a new document. A document has only one `H1`. 34 | 35 | ### Guideline for Kubernetes Object Types in Body Text 36 | 37 | Prefer lowercase plain text such as namespace, pod, daemon set, container, service, and so on. 38 | This guideline applies to multi-word types like custom resource definition. 39 | 40 | Use the camel case name only if you follow the name with object, resource, and so on. 41 | For example, "Delete the ``Pod`` object..." 42 | However, that example is not compelling and is just as clear when written as "Delete the pod..." 43 | 44 | ### Console Outputs 45 | 46 | #### Directives 47 | 48 | For console outputs in this document, use `code-block:: console` directive. This results in a red prompt, which makes it easy to distinguish between the prompt 49 | and the command. 50 | 51 | #### Commands 52 | 53 | Separate each command into its own `code-block`. Since this repository uses the Sphinx `copy-button` to allow for easy copy/pasting of commands 54 | by users, it makes sense to separate each command for readability and usage. 55 | 56 | If you need to aggregate multiple commands, then use the separator, 2-space indentation and `&&` on each line as shown in the example below: 57 | ```console 58 | $ command1 \ 59 | && command2 \ 60 | && command3 61 | ``` 62 | 63 | #### Outputs 64 | 65 | Separate outputs and commands into their own `code-block` sequence. Since the repository is configured to copy everything (including items after the prompt lines by 66 | setting `copybutton_only_copy_prompt_lines` to false), it is desirable to only copy commands. 67 | 68 | ## Sign your work 69 | 70 | The sign-off is a simple line at the end of the explanation for the patch. Your 71 | signature certifies that you wrote the patch or otherwise have the right to pass 72 | it on as an open-source patch. The rules are pretty simple: if you can certify 73 | the below (from [developercertificate.org](http://developercertificate.org/)): 74 | 75 | ``` 76 | Developer Certificate of Origin 77 | Version 1.1 78 | 79 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 80 | 1 Letterman Drive 81 | Suite D4700 82 | San Francisco, CA, 94129 83 | 84 | Everyone is permitted to copy and distribute verbatim copies of this 85 | license document, but changing it is not allowed. 
86 | 87 | Developer's Certificate of Origin 1.1 88 | 89 | By making a contribution to this project, I certify that: 90 | 91 | (a) The contribution was created in whole or in part by me and I 92 | have the right to submit it under the open source license 93 | indicated in the file; or 94 | 95 | (b) The contribution is based upon previous work that, to the best 96 | of my knowledge, is covered under an appropriate open source 97 | license and I have the right under that license to submit that 98 | work with modifications, whether created in whole or in part 99 | by me, under the same open source license (unless I am 100 | permitted to submit under a different license), as indicated 101 | in the file; or 102 | 103 | (c) The contribution was provided directly to me by some other 104 | person who certified (a), (b) or (c) and I have not modified 105 | it. 106 | 107 | (d) I understand and agree that this project and the contribution 108 | are public and that a record of the contribution (including all 109 | personal information I submit with it, including my sign-off) is 110 | maintained indefinitely and may be redistributed consistent with 111 | this project or the open source license(s) involved. 112 | ``` 113 | 114 | Then you just add a line to every git commit message: 115 | 116 | Signed-off-by: Joe Smith 117 | 118 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 119 | 120 | If you set your `user.name` and `user.email` git configs, you can sign your 121 | commit automatically with `git commit -s`. 122 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
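# For example, `make html` expands to `sphinx-build -M html . _build`; pass extra
# Sphinx options through O, e.g. `make html O=-W`.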
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /assets/NVIDIA_Horizontal_Logo_RGBBlack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/NVIDIA_Horizontal_Logo_RGBBlack.png -------------------------------------------------------------------------------- /assets/NVLogo_H_B&W.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/NVLogo_H_B&W.png -------------------------------------------------------------------------------- /assets/NVLogo_H_B_W.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/NVLogo_H_B_W.png -------------------------------------------------------------------------------- /assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/favicon.ico -------------------------------------------------------------------------------- /assets/nvidia-logo-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/nvidia-logo-white.png -------------------------------------------------------------------------------- /assets/nvidia_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/assets/nvidia_logo.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-containerd-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-containerd-arch.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-crio-lxc-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-crio-lxc-arch.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-docker-arch-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-docker-arch-new.png -------------------------------------------------------------------------------- /container-toolkit/assets/nvidia-docker-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/nvidia-docker-arch.png -------------------------------------------------------------------------------- /container-toolkit/assets/runtime-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/container-toolkit/assets/runtime-architecture.png -------------------------------------------------------------------------------- /container-toolkit/index.md: 
-------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | ```{toctree} 4 | :caption: NVIDIA Container Toolkit 5 | :hidden: true 6 | :titlesonly: true 7 | 8 | self 9 | Installing the Toolkit 10 | sample-workload 11 | supported-platforms.md 12 | troubleshooting.md 13 | release-notes.md 14 | ``` 15 | 16 | ```{toctree} 17 | :caption: Advanced Configuration 18 | :hidden: true 19 | :titlesonly: true 20 | 21 | arch-overview.md 22 | Container Device Interface 23 | docker-specialized.md 24 | ``` 25 | 26 | The NVIDIA Container Toolkit is a collection of libraries and utilities enabling users to build and run GPU-accelerated containers. It currently includes: 27 | 28 | * The NVIDIA Container Runtime (`nvidia-container-runtime`) 29 | * The NVIDIA Container Toolkit CLI (`nvidia-ctk`) 30 | * The NVIDIA CDI Hooks (`nvidia-cdi-hook`) 31 | * The NVIDIA Container Runtime Hook (`nvidia-container-runtime-hook`) 32 | * The NVIDIA Container CLI (`nvidia-container-cli`) 33 | * The NVIDIA Container Library (`libnvidia-container1`) 34 | 35 | ## License 36 | 37 | The NVIDIA Container Toolkit (and all included components) is licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) and 38 | contributions are accepted with a Developer Certificate of Origin (DCO). Refer to the [contributing](https://github.com/NVIDIA/nvidia-container-toolkit/blob/master/CONTRIBUTING.md) document for 39 | more information. 40 | -------------------------------------------------------------------------------- /container-toolkit/output/nvidia-smi.txt: -------------------------------------------------------------------------------- 1 | +-----------------------------------------------------------------------------+ 2 | | NVIDIA-SMI 535.86.10 Driver Version: 535.86.10 CUDA Version: 12.2 | 3 | |-------------------------------+----------------------+----------------------+ 4 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 5 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 6 | | | | MIG M. | 7 | |===============================+======================+======================| 8 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 9 | | N/A 34C P8 9W / 70W | 0MiB / 15109MiB | 0% Default | 10 | | | | N/A | 11 | +-------------------------------+----------------------+----------------------+ 12 | 13 | +-----------------------------------------------------------------------------+ 14 | | Processes: | 15 | | GPU GI CI PID Type Process name GPU Memory | 16 | | ID ID Usage | 17 | |=============================================================================| 18 | | No running processes found | 19 | +-----------------------------------------------------------------------------+ 20 | -------------------------------------------------------------------------------- /container-toolkit/sample-workload.md: -------------------------------------------------------------------------------- 1 | # Running a Sample Workload 2 | 3 | ## Running a Sample Workload with Docker 4 | 5 | After you install and configure the toolkit and install an NVIDIA GPU Driver, 6 | you can verify your installation by running a sample workload. 
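If Docker has not yet been configured to use the NVIDIA runtime, a minimal sketch of that step (assuming a systemd-managed Docker daemon; see the installation guide for the complete procedure) is:

```console
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```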
7 | 8 | - Run a sample CUDA container: 9 | 10 | ```console 11 | sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi 12 | ``` 13 | 14 | Your output should resemble the following output: 15 | 16 | ```{literalinclude} ./output/nvidia-smi.txt 17 | --- 18 | language: output 19 | --- 20 | ``` 21 | 22 | ## Running a Sample Workload with Podman 23 | 24 | After you install and configure the toolkit (including [generating a CDI specification](cdi-support.md)) and install an NVIDIA GPU Driver, 25 | you can verify your installation by running a sample workload. 26 | 27 | - Run a sample CUDA container: 28 | 29 | ```console 30 | podman run --rm --security-opt=label=disable \ 31 | --device=nvidia.com/gpu=all \ 32 | ubuntu nvidia-smi 33 | ``` 34 | 35 | Your output should resemble the following output: 36 | 37 | ```{literalinclude} ./output/nvidia-smi.txt 38 | --- 39 | language: output 40 | --- 41 | ``` 42 | 43 | ## Running Sample Workloads with containerd or CRI-O 44 | 45 | These runtimes are more common with Kubernetes than desktop computing. 46 | Refer to {doc}`gpuop:index` in the NVIDIA GPU Operator documentation for more information. -------------------------------------------------------------------------------- /container-toolkit/supported-platforms.md: -------------------------------------------------------------------------------- 1 | % Date: August 10 2020 2 | 3 | % Author: pramarao 4 | 5 | (supported-platforms)= 6 | 7 | # Platform support 8 | 9 | Recent NVIDIA Container Toolkit releases are tested and expected to work on these Linux distributions: 10 | 11 | | OS Name / Version | amd64 / x86_64 | ppc64le | arm64 / aarch64 {sup}`1` | 12 | | ------------------------ | -------------- | ------- | ------------------------ | 13 | | Amazon Linux 2023 | X | | X {sup}`2` | 14 | | Amazon Linux 2 | X | | X | 15 | | Open Suse/SLES 15.x | X | | | 16 | | Debian Linux 11 | X | | | 17 | | CentOS 8 | X | X | X | 18 | | RHEL 8.x | X | X | X | 19 | | RHEL 9.x | X | X | X | 20 | | RHEL 10.x | X | X | X | 21 | | Ubuntu 20.04 | X | X | X | 22 | | Ubuntu 22.04 | X | X | X | 23 | | Ubuntu 24.04 | X | | X | 24 | 25 | 26 | ## Report issues 27 | 28 | Our qualification-testing procedures are constantly evolving and we might miss 29 | certain problems. [Report](https://github.com/NVIDIA/nvidia-container-toolkit/issues) issues in 30 | particular as they occur on a platform listed above. 31 | 32 | 33 | ## Other Linux distributions 34 | 35 | Releases may work on more platforms than indicated in the table above (such as on distribution versions older and newer than listed). 36 | Give things a try and we invite you to [report](https://github.com/NVIDIA/nvidia-container-toolkit/issues) any issue observed even if your Linux distribution is not listed. 37 | 38 | ---- 39 | 40 | 1. The `arm64` / `aarch64` architecture includes support for Tegra-based systems. 41 | 2. For Amazon Linux 2023 on Arm64, a `g5g.2xlarge` Amazon EC2 instance was used for validation. 42 | The `g5g.xlarge` instance caused failures due to the limited system memory. 
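As a quick sketch for checking which row of the table above applies to a given host (standard Linux commands, nothing toolkit-specific assumed):

```console
$ grep PRETTY_NAME /etc/os-release
$ uname -m
```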
43 | -------------------------------------------------------------------------------- /container-toolkit/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "1.18.1", 3 | "versions": 4 | [ 5 | { 6 | "version": "1.18.1" 7 | }, 8 | { 9 | "version": "1.18.0" 10 | }, 11 | { 12 | "version": "1.17.8" 13 | }, 14 | { 15 | "version": "1.17.7" 16 | }, 17 | { 18 | "version": "1.17.6" 19 | }, 20 | { 21 | "version": "1.17.5" 22 | }, 23 | { 24 | "version": "1.17.4" 25 | }, 26 | { 27 | "version": "1.17.3" 28 | }, 29 | { 30 | "version": "1.17.2" 31 | }, 32 | { 33 | "version": "1.17.1" 34 | }, 35 | { 36 | "version": "1.17.0" 37 | }, 38 | { 39 | "version": "1.16.2" 40 | }, 41 | { 42 | "version": "1.16.1" 43 | }, 44 | { 45 | "version": "1.16.0" 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /container-toolkit/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.18.1", 5 | "version": "1.18.1" 6 | }, 7 | { 8 | "url": "../1.18.0", 9 | "version": "1.18.0" 10 | }, 11 | { 12 | "url": "../1.17.8", 13 | "version": "1.17.8" 14 | }, 15 | { 16 | "url": "../1.17.7", 17 | "version": "1.17.7" 18 | }, 19 | { 20 | "url": "../1.17.6", 21 | "version": "1.17.6" 22 | }, 23 | { 24 | "url": "../1.17.5", 25 | "version": "1.17.5" 26 | }, 27 | { 28 | "url": "../1.17.4", 29 | "version": "1.17.4" 30 | }, 31 | { 32 | "url": "../1.17.3", 33 | "version": "1.17.3" 34 | }, 35 | { 36 | "url": "../1.17.2", 37 | "version": "1.17.2" 38 | }, 39 | { 40 | "url": "../1.17.1", 41 | "version": "1.17.1" 42 | }, 43 | { 44 | "url": "../1.17.0", 45 | "version": "1.17.0" 46 | }, 47 | { 48 | "url": "../1.16.2", 49 | "version": "1.16.2" 50 | }, 51 | { 52 | "url": "../1.16.1", 53 | "version": "1.16.1" 54 | }, 55 | { 56 | "url": "../1.16.0", 57 | "version": "1.16.0" 58 | } 59 | ] -------------------------------------------------------------------------------- /contents.rst: -------------------------------------------------------------------------------- 1 | .. NVIDIA Cloud Native Technologies documentation master file, created by 2 | sphinx-quickstart on Mon Jul 27 23:51:30 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | NVIDIA Cloud Native Technologies 7 | ================================ 8 | This documentation repository contains the product documentation for the 9 | :ref:`NVIDIA Container Toolkit `, the :ref:`NVIDIA GPU Operator `, and 10 | using NVIDIA GPUs with Kubernetes. 11 | 12 | .. toctree:: 13 | :hidden: 14 | 15 | .. Documentation home 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | :caption: NVIDIA Container Toolkit: 20 | 21 | container-toolkit/overview.rst 22 | container-toolkit/concepts.rst 23 | container-toolkit/arch-overview.rst 24 | container-toolkit/install-guide.rst 25 | container-toolkit/troubleshooting.rst 26 | container-toolkit/user-guide.rst 27 | container-toolkit/release-notes.rst 28 | container-toolkit/archive.rst 29 | 30 | .. 
toctree:: 31 | :maxdepth: 2 32 | :caption: NVIDIA GPU Operator: 33 | 34 | gpu-operator/overview.rst 35 | gpu-operator/getting-started.rst 36 | gpu-operator/platform-support.rst 37 | gpu-operator/release-notes.rst 38 | gpu-operator/gpu-driver-upgrades.rst 39 | gpu-operator/install-gpu-operator-vgpu.rst 40 | gpu-operator/install-gpu-operator-nvaie.rst 41 | GPU Operator on OpenShift 42 | gpu-operator/gpu-operator-mig.rst 43 | gpu-operator/gpu-sharing.rst 44 | gpu-operator/gpu-operator-rdma.rst 45 | gpu-operator/gpu-operator-kubevirt.rst 46 | gpu-operator/appendix.rst 47 | gpu-operator/archive.rst 48 | 49 | .. toctree:: 50 | :maxdepth: 2 51 | :caption: Kubernetes with GPUs: 52 | 53 | kubernetes/install-k8s.rst 54 | kubernetes/mig-k8s.rst 55 | kubernetes/anthos-guide.rst 56 | 57 | .. toctree:: 58 | :titlesonly: 59 | :caption: NVIDIA GPUs and Red Hat Device Edge 60 | 61 | edge/nvidia-gpu-with-device-edge.rst 62 | 63 | .. toctree:: 64 | :maxdepth: 2 65 | :caption: GPU Telemetry: 66 | 67 | gpu-telemetry/dcgm-exporter.rst 68 | 69 | .. toctree:: 70 | :maxdepth: 2 71 | :caption: Multi-Instance GPU: 72 | 73 | mig/mig.rst 74 | mig/mig-k8s.rst 75 | 76 | .. toctree:: 77 | :maxdepth: 2 78 | :caption: Driver Containers: 79 | 80 | driver-containers/overview.rst 81 | 82 | .. toctree:: 83 | :maxdepth: 2 84 | :caption: Playground: 85 | 86 | playground/dind.rst 87 | playground/x-arch.rst 88 | 89 | .. Indices and tables 90 | .. ================== 91 | .. 92 | .. * :ref:`genindex` 93 | -------------------------------------------------------------------------------- /css/custom.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | */ 5 | html[data-theme=light] .highlight .go { 6 | font-style:unset 7 | } 8 | 9 | .bd-page-width { 10 | max-width: 176rem; 11 | } 12 | 13 | .bd-main { 14 | flex: 1 1 auto; 15 | } 16 | 17 | .bd-main .bd-content .bd-article-container { 18 | max-width: 100%; 19 | } 20 | 21 | .bd-sidebar-secondary { 22 | /* flex: 0 0 auto; */ 23 | flex-basis: 15%; 24 | min-width: var(--pst-sidebar-secondary); 25 | } 26 | 27 | html[data-theme=light] .bd-toc-nav .nav-link-expand { 28 | display: none !important; 29 | } 30 | 31 | .bd-sidebar-primary li.has-children>details>summary .toctree-toggle { 32 | display: none !important; 33 | } 34 | -------------------------------------------------------------------------------- /deps/repo-deps.packman.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Imported from https://hub.docker.com/r/sphinxdoc/sphinx/dockerfile 2 | # maintainer="Sphinx Team " 3 | # $ docker build --pull \ 4 | # --tag ${REGISTRY}/sphinxdoc 5 | # --file Dockerfile . 
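# To build the documentation with the resulting image, the CI in this repository
# runs (with <image-tag> as a placeholder for the tag built above):
# $ docker run -v $(pwd):/work -w /work <image-tag> ./repo docs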
6 | FROM python:3.10-slim 7 | 8 | WORKDIR /docs 9 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ 10 | && apt-get install --no-install-recommends -y \ 11 | curl \ 12 | && apt-get autoremove \ 13 | && apt-get clean \ 14 | && rm -rf /var/lib/apt/lists/* 15 | 16 | ENV PM_PACKAGES_ROOT=/var/tmp/packman 17 | 18 | RUN --mount=type=bind,source=.,destination=/x,rw /x/repo docs -p review || true 19 | 20 | RUN --mount=type=bind,source=.,destination=/x,rw /x/tools/packman/python.sh -m pip install --no-cache-dir --no-deps -U \ 21 | -t /tmp/extension \ 22 | sphinx-copybutton \ 23 | nvidia-sphinx-theme \ 24 | pydata-sphinx-theme \ 25 | linuxdoc 26 | 27 | RUN (cd /tmp/extension; tar cf - . ) | (cd /var/tmp/packman/chk/sphinx/4.5.0.2-py3.7-linux-x86_64/; tar xf -) 28 | RUN rm -rf /tmp/extension 29 | 30 | RUN --mount=type=bind,target=/work echo 'alias build-docs="./repo docs"' >> ~/.bashrc 31 | -------------------------------------------------------------------------------- /driver-containers/graphics/driver-container-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/driver-containers/graphics/driver-container-demo.gif -------------------------------------------------------------------------------- /driver-containers/graphics/nvidia-driver-container-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/driver-containers/graphics/nvidia-driver-container-image.png -------------------------------------------------------------------------------- /driver-containers/redirected.rst: -------------------------------------------------------------------------------- 1 | Documentation for the driver containers is obsolete. 2 | 3 | Refer to :external+gpuop:doc:`index`. 
-------------------------------------------------------------------------------- /driver-containers/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /driver-containers/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image01.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image02.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image03.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image04.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image05.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image06.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image07.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image08.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image09.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image10.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image10.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image11.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image12.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image13.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image14.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image15.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image16.png -------------------------------------------------------------------------------- /edge/graphics/anthos/virt/image17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/edge/graphics/anthos/virt/image17.png -------------------------------------------------------------------------------- /edge/index.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings (h1/h2/h3/h4/h5) are # * = - 18 | 19 | ########################################### 20 | NVIDIA Cloud Native Reference Architectures 21 | ########################################### 22 | 23 | .. 
toctree:: 24 | :titlesonly: 25 | 26 | nvidia-gpu-with-device-edge 27 | anthos-guide -------------------------------------------------------------------------------- /edge/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /edge/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /gpu-operator/cdi.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ############################################################ 20 | Container Device Interface (CDI) Support in the GPU Operator 21 | ############################################################ 22 | 23 | ************************************ 24 | About the Container Device Interface 25 | ************************************ 26 | 27 | The `Container Device Interface (CDI) `_ 28 | is an open specification for container runtimes that abstracts what access to a device, such as an NVIDIA GPU, means, 29 | and standardizes access across container runtimes. Popular container runtimes can read and process the specification to 30 | ensure that a device is available in a container. CDI simplifies adding support for devices such as NVIDIA GPUs because 31 | the specification is applicable to all container runtimes that support CDI. 32 | 33 | Starting with GPU Operator v25.10.0, CDI is used by default for enabling GPU support in containers running on Kubernetes. 34 | Specifically, CDI support in container runtimes, e.g. containerd and cri-o, is used to inject GPU(s) into workload 35 | containers. This differs from prior GPU Operator releases where CDI was used via a CDI-enabled ``nvidia`` runtime class. 36 | 37 | Use of CDI is transparent to cluster administrators and application developers. 38 | The benefits of CDI are largely to reduce development and support for runtime-specific 39 | plugins. 40 | 41 | ******************************** 42 | Enabling CDI During Installation 43 | ******************************** 44 | 45 | CDI is enabled by default during installation in GPU Operator v25.10.0 and later. 46 | Follow the instructions for installing the Operator with Helm on the :doc:`getting-started` page. 47 | 48 | CDI is also enabled by default during a Helm upgrade to GPU Operator v25.10.0 and later. 
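As an illustrative sketch only, modeled on the Helm commands used elsewhere in this
documentation and assuming the chart exposes a ``cdi.enabled`` value that mirrors
``spec.cdi.enabled`` in the cluster policy, an installation that sets the flag
explicitly looks like the following:

.. code-block:: console

   $ helm install --wait --generate-name \
       -n gpu-operator --create-namespace \
       nvidia/gpu-operator \
       --version=${version} \
       --set cdi.enabled=true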
49 | 50 | ******************************* 51 | Enabling CDI After Installation 52 | ******************************* 53 | 54 | CDI is enabled by default in GPU Operator v25.10.0 and later. 55 | Use the following procedure to enable CDI if you disabled CDI during installation. 56 | 57 | .. rubric:: Procedure 58 | 59 | #. Enable CDI by modifying the cluster policy: 60 | 61 | .. code-block:: console 62 | 63 | $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ 64 | -p='[{"op": "replace", "path": "/spec/cdi/enabled", "value":true}]' 65 | 66 | *Example Output* 67 | 68 | .. code-block:: output 69 | 70 | clusterpolicy.nvidia.com/cluster-policy patched 71 | 72 | #. (Optional) Confirm that the container toolkit and device plugin pods restart: 73 | 74 | .. code-block:: console 75 | 76 | $ kubectl get pods -n gpu-operator 77 | 78 | *Example Output* 79 | 80 | .. literalinclude:: ./manifests/output/cdi-get-pods-restart.txt 81 | :language: output 82 | :emphasize-lines: 6,9 83 | 84 | 85 | ************* 86 | Disabling CDI 87 | ************* 88 | 89 | While CDI is the default and recommended mechanism for injecting GPU support into containers, you can 90 | disable CDI and use the legacy NVIDIA Container Toolkit stack instead with the following procedure: 91 | 92 | #. If your nodes use the CRI-O container runtime, then temporarily disable the 93 | GPU Operator validator: 94 | 95 | .. code-block:: console 96 | 97 | $ kubectl label nodes \ 98 | nvidia.com/gpu.deploy.operator-validator=false \ 99 | -l nvidia.com/gpu.present=true \ 100 | --overwrite 101 | 102 | .. tip:: 103 | 104 | You can run ``kubectl get nodes -o wide`` and view the ``CONTAINER-RUNTIME`` 105 | column to determine if your nodes use CRI-O. 106 | 107 | #. Disable CDI by modifying the cluster policy: 108 | 109 | .. code-block:: console 110 | 111 | $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ 112 | -p='[{"op": "replace", "path": "/spec/cdi/enabled", "value":false}]' 113 | 114 | *Example Output* 115 | 116 | .. code-block:: output 117 | 118 | clusterpolicy.nvidia.com/cluster-policy patched 119 | 120 | #. If you temporarily disabled the GPU Operator validator, re-enable the validator: 121 | 122 | .. code-block:: console 123 | 124 | $ kubectl label nodes \ 125 | nvidia.com/gpu.deploy.operator-validator=true \ 126 | nvidia.com/gpu.present=true \ 127 | --overwrite 128 | -------------------------------------------------------------------------------- /gpu-operator/custom-driver-params.rst: -------------------------------------------------------------------------------- 1 | .. Date: Mar 11 2022 2 | .. Author: cdesiniotis 3 | 4 | .. _custom-driver-params: 5 | 6 | Customizing NVIDIA GPU Driver Parameters during Installation 7 | ************************************************************ 8 | 9 | The NVIDIA Driver kernel modules accept a number of parameters which can be used to customize the behavior of the driver. 10 | By default, the GPU Operator loads the kernel modules with default values. 11 | On a machine with the driver already installed, you can list the parameter names and values with the ``cat /proc/driver/nvidia/params`` command. 12 | You can pass custom parameters to the kernel modules that get loaded as part of the 13 | NVIDIA Driver installation (``nvidia``, ``nvidia-modeset``, ``nvidia-uvm``, and ``nvidia-peermem``). 14 | 15 | Configure Custom Driver Parameters 16 | ----------------------------------- 17 | 18 | To pass custom parameters, execute the following steps. 19 | 20 | #. 
Create a configuration file named ``.conf``, where ```` is the name of the kernel module the parameters are for. 21 | The file should contain parameters as key-value pairs -- one parameter per line. 22 | 23 | The following example shows the GPU firmware logging parameter being passed to the ``nvidia`` module. 24 | 25 | .. code-block:: console 26 | 27 | $ cat nvidia.conf 28 | NVreg_EnableGpuFirmwareLogs=2 29 | 30 | #. Create a ``ConfigMap`` for the configuration file. 31 | If multiple modules are being configured, pass multiple files when creating the ``ConfigMap``. 32 | 33 | .. code-block:: console 34 | 35 | $ kubectl create configmap kernel-module-params -n gpu-operator --from-file=nvidia.conf=./nvidia.conf 36 | 37 | #. Install the GPU Operator and set ``driver.kernelModuleConfig.name`` to the name of the ``ConfigMap`` 38 | containing the kernel module parameters. 39 | 40 | .. code-block:: console 41 | 42 | $ helm install --wait --generate-name \ 43 | -n gpu-operator --create-namespace \ 44 | nvidia/gpu-operator \ 45 | --version=${version} \ 46 | --set driver.kernelModuleConfig.name="kernel-module-params" 47 | 48 | ----------------------------------- 49 | Example using ``nvidia-uvm`` module 50 | ----------------------------------- 51 | 52 | This example shows the Heterogeneous Memory Management (HMM) being disabled in the ``nvidia-uvm`` module. 53 | Refer to `Simplifying GPU Application Development with Heterogeneous Memory Management `_ for more information about HMM. 54 | 55 | #. Create a configuration file named ``nvidia-uvm.conf``: 56 | 57 | .. code-block:: console 58 | 59 | $ cat nvidia-uvm.conf 60 | uvm_disable_hmm=1 61 | 62 | 63 | #. Create a ``ConfigMap`` for the configuration file. 64 | If multiple modules are being configured, pass multiple files when creating the ``ConfigMap``. 65 | 66 | .. code-block:: console 67 | 68 | $ kubectl create configmap kernel-module-params -n gpu-operator --from-file=nvidia-uvm.conf=./nvidia-uvm.conf 69 | 70 | #. Install the GPU Operator and set ``driver.kernelModuleConfig.name`` to the name of the ``ConfigMap`` 71 | containing the kernel module parameters. 72 | 73 | .. code-block:: console 74 | 75 | $ helm install --wait --generate-name \ 76 | -n gpu-operator --create-namespace \ 77 | nvidia/gpu-operator \ 78 | --version=${version} \ 79 | --set driver.kernelModuleConfig.name="kernel-module-params" 80 | 81 | #. Verify the parameter has been correctly applied, go to ``/sys/module/nvidia_uvm/parameters/`` on the node: 82 | 83 | .. code-block:: console 84 | 85 | $ ls /sys/module/nvidia_uvm/parameters/ 86 | 87 | *Example Output* 88 | 89 | .. code-block:: output 90 | 91 | ... 92 | uvm_disable_hmm uvm_perf_access_counter_migration_enable uvm_perf_prefetch_min_faults 93 | uvm_downgrade_force_membar_sys uvm_perf_access_counter_threshold uvm_perf_prefetch_threshold 94 | ... 95 | 96 | Then check the value of the parameter: 97 | 98 | .. code-block:: console 99 | 100 | $ cat /sys/module/nvidia_uvm/parameters/uvm_disable_hmm 101 | 102 | *Example Output* 103 | 104 | .. code-block:: output 105 | 106 | Y -------------------------------------------------------------------------------- /gpu-operator/dra-gpus.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | ########################## 6 | NVIDIA DRA Driver for GPUs 7 | ########################## 8 | 9 | .. 
_dra_docs_gpus: 10 | 11 | ************** 12 | GPU allocation 13 | ************** 14 | 15 | Compared to `traditional GPU allocation `_ using coarse-grained count-based requests, the GPU allocation side of this driver enables fine-grained control and powerful features long desired by the community, such as: 16 | 17 | #. Controlled sharing of individual GPUs between multiple pods and/or containers. 18 | #. GPU selection via complex constraints expressed via `CEL `_. 19 | #. Dynamic partitioning. 20 | 21 | To learn more about this part of the driver and about what we are planning to build in the future, have a look at `these release notes `_. 22 | 23 | While the GPU allocation features of this driver can be tried out, they are not yet officially supported. 24 | Hence, the GPU kubelet plugin is currently disabled by default in the Helm chart installation. 25 | 26 | For documentation on how to use and test the current set of GPU allocation features, please head over to the `demo section `_ of the driver's README and to its `quickstart directory `_. 27 | 28 | .. note:: 29 | This part of the NVIDIA DRA Driver for GPUs is in **Technology Preview**. 30 | It is not yet supported in production environments and not yet functionally complete. 31 | Generally speaking, Technology Preview features provide early access to upcoming product features, enabling users to test functionality and provide feedback during the development process. 32 | Technology Preview releases may not have full documentation, and testing is limited. 33 | -------------------------------------------------------------------------------- /gpu-operator/graphics/gpu-operator-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-operator/graphics/gpu-operator-demo.gif -------------------------------------------------------------------------------- /gpu-operator/graphics/nvidia-gpu-operator-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-operator/graphics/nvidia-gpu-operator-image.jpg -------------------------------------------------------------------------------- /gpu-operator/graphics/upgrade-controller-state-machine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-operator/graphics/upgrade-controller-state-machine.png -------------------------------------------------------------------------------- /gpu-operator/index.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ..
toctree:: 20 | :caption: NVIDIA GPU Operator 21 | :titlesonly: 22 | :hidden: 23 | 24 | About the Operator 25 | Install 26 | Upgrade 27 | Uninstall 28 | Platform Support 29 | Release Notes 30 | Troubleshooting 31 | gpu-driver-upgrades.rst 32 | install-gpu-operator-vgpu.rst 33 | install-gpu-operator-nvaie.rst 34 | Security Considerations 35 | 36 | 37 | 38 | .. toctree:: 39 | :caption: Advanced Operator Configuration 40 | :titlesonly: 41 | :hidden: 42 | 43 | Multi-Instance GPU 44 | Time-Slicing GPUs 45 | gpu-operator-rdma.rst 46 | Outdated Kernels 47 | Custom GPU Driver Parameters 48 | precompiled-drivers.rst 49 | GPU Driver CRD 50 | Container Device Interface (CDI) Support 51 | 52 | .. toctree:: 53 | :caption: Sandboxed Workloads 54 | :titlesonly: 55 | :hidden: 56 | 57 | KubeVirt 58 | 59 | .. toctree:: 60 | :caption: Specialized Networks 61 | :titlesonly: 62 | :hidden: 63 | 64 | HTTP Proxy 65 | Air-Gapped Network 66 | Service Mesh 67 | 68 | .. toctree:: 69 | :caption: CSP configurations 70 | :titlesonly: 71 | :hidden: 72 | 73 | Amazon EKS 74 | Azure AKS 75 | Google GKE 76 | 77 | .. toctree:: 78 | :caption: NVIDIA DRA Driver for GPUs 79 | :titlesonly: 80 | :hidden: 81 | 82 | Introduction & Installation 83 | GPUs 84 | ComputeDomains 85 | 86 | .. include:: overview.rst 87 | -------------------------------------------------------------------------------- /gpu-operator/install-gpu-operator-outdated-kernels.rst: -------------------------------------------------------------------------------- 1 | .. Date: Aug 2 2021 2 | .. Author: cdesiniotis 3 | 4 | .. _install-gpu-operator-outdated-kernels: 5 | 6 | Considerations when Installing with Outdated Kernels in Cluster 7 | *************************************************************** 8 | 9 | The ``driver`` container deployed as part of the GPU Operator requires certain packages to be available as part of the driver installation. 10 | On GPU nodes where the running kernel is not the latest, the ``driver`` container may fail to find the right version of these packages 11 | (e.g. kernel-headers, kernel-devel) that correspond to the running kernel version. In the ``driver`` container logs, you will most likely 12 | see the following error message: ``Could not resolve Linux kernel version``. 13 | 14 | In general, upgrading your system to the latest kernel should fix this issue. But if this is not an option, the following is a 15 | workaround to successfully deploy the GPU Operator when GPU nodes in your cluster may not be running the latest kernel. 16 | 17 | Add Archived Package Repositories 18 | ================================= 19 | 20 | The workaround is to find the package archive containing packages for your outdated kernel and to add this repository to the package 21 | manager running inside the ``driver`` container. To achieve this, we can simply mount a repository list file into the ``driver`` container using a ``ConfigMap``. 22 | The ``ConfigMap`` containing the repository list file needs to be created in the ``gpu-operator`` namespace. 23 | 24 | Let us demonstrate this workaround via an example. The system used in this example is running CentOS 7 with an outdated kernel: 25 | 26 | .. code-block:: console 27 | 28 | $ uname -r 29 | 3.10.0-1062.12.1.el7.x86_64 30 | 31 | The official archive for older CentOS packages is https://vault.centos.org/. Typically, most archived CentOS repositories 32 | are found in ``/etc/yum.repos.d/CentOS-Vault.repo`` but they are disabled by default. 
If the appropriate archive repository 33 | was enabled, then the ``driver`` container would resolve the kernel version and be able to install the correct versions 34 | of the prerequisite packages. 35 | 36 | We can simply drop in a replacement of ``/etc/yum.repos.d/CentOS-Vault.repo`` to ensure the appropriate CentOS archive is enabled. 37 | For the kernel running in this example, the ``CentOS-7.7.1908`` archive contains the kernel-headers version we are looking for. 38 | Here is our example drop-in replacement file: 39 | 40 | .. code-block:: 41 | 42 | [C7.7.1908-base] 43 | name=CentOS-7.7.1908 - Base 44 | baseurl=http://vault.centos.org/7.7.1908/os/$basearch/ 45 | gpgcheck=1 46 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 47 | enabled=1 48 | 49 | [C7.7.1908-updates] 50 | name=CentOS-7.7.1908 - Updates 51 | baseurl=http://vault.centos.org/7.7.1908/updates/$basearch/ 52 | gpgcheck=1 53 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 54 | enabled=1 55 | 56 | Once the repo list file is created, we can create a ``ConfigMap`` for it: 57 | 58 | .. code-block:: console 59 | 60 | $ kubectl create configmap repo-config -n gpu-operator --from-file= 61 | 62 | Once the ``ConfigMap`` is created using the above command, update ``values.yaml`` with this information, to let the GPU Operator mount the repo configuration 63 | within the ``driver`` container to pull required packages. 64 | 65 | For Ubuntu: 66 | 67 | .. code-block:: yaml 68 | 69 | driver: 70 | repoConfig: 71 | configMapName: repo-config 72 | destinationDir: /etc/apt/sources.list.d 73 | 74 | For RHEL/Centos/RHCOS: 75 | 76 | .. code-block:: yaml 77 | 78 | driver: 79 | repoConfig: 80 | configMapName: repo-config 81 | destinationDir: /etc/yum.repos.d 82 | 83 | Deploy GPU Operator with updated ``values.yaml``: 84 | 85 | .. code-block:: console 86 | 87 | $ helm install --wait --generate-name \ 88 | -n gpu-operator --create-namespace \ 89 | nvidia/gpu-operator \ 90 | --version=${version} \ 91 | -f values.yaml 92 | 93 | 94 | Check the status of the pods to ensure all the containers are running: 95 | 96 | .. code-block:: console 97 | 98 | $ kubectl get pods -n gpu-operator 99 | -------------------------------------------------------------------------------- /gpu-operator/install-gpu-operator-proxy.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | .. Date: Sep 16 2021 20 | .. Author: cdesiniotis 21 | 22 | .. _install-gpu-operator-proxy: 23 | 24 | Install GPU Operator in Proxy Environments 25 | ****************************************** 26 | 27 | Introduction 28 | ============ 29 | 30 | This page describes how to successfully deploy the GPU Operator in clusters behind an HTTP proxy. 
31 | By default, the GPU Operator requires internet access for the following reasons: 32 | 33 | 1) Container images need to be pulled during GPU Operator installation. 34 | 2) The ``driver`` container needs to download several OS packages prior to driver installation. 35 | 36 | .. tip:: 37 | Using :doc:`precompiled-drivers` removes the need for the ``driver`` containers to 38 | download operating system packages. 39 | 40 | To address these requirements, all Kubernetes nodes as well as the ``driver`` container need proper configuration 41 | in order to direct traffic through the proxy. 42 | 43 | This document demonstrates how to configure the GPU Operator so that the ``driver`` container can successfully 44 | download packages behind a HTTP proxy. Since configuring Kubernetes/container runtime components to use 45 | a proxy is not specific to the GPU Operator, we do not include those instructions here. 46 | 47 | The instructions for Openshift are different, so skip the section titled :ref:`proxy_config_openshift` if you are not running Openshift. 48 | 49 | Prerequisites 50 | ============= 51 | 52 | * Kubernetes cluster is configured with HTTP proxy settings (container runtime should be enabled with HTTP proxy) 53 | 54 | .. _proxy_config_openshift: 55 | 56 | HTTP Proxy Configuration for Openshift 57 | ====================================== 58 | 59 | For Openshift, it is recommended to use the cluster-wide Proxy object to provide proxy information for the cluster. 60 | Follow the procedure described in `Configuring the cluster-wide proxy `_ 61 | from Red Hat Openshift public documentation. The GPU Operator will automatically inject proxy related ENV into the ``driver`` container 62 | based on information present in the cluster-wide Proxy object. 63 | 64 | HTTP Proxy Configuration 65 | ======================== 66 | 67 | First, get the ``values.yaml`` file used for GPU Operator configuration: 68 | 69 | .. code-block:: console 70 | 71 | $ curl -sO https://raw.githubusercontent.com/NVIDIA/gpu-operator/${version}/deployments/gpu-operator/values.yaml 72 | 73 | Specify ``driver.env`` in ``values.yaml`` with appropriate HTTP_PROXY, HTTPS_PROXY, and NO_PROXY environment variables 74 | (in both uppercase and lowercase). 75 | 76 | .. code-block:: yaml 77 | 78 | driver: 79 | env: 80 | - name: HTTPS_PROXY 81 | value: http:// 82 | - name: HTTP_PROXY 83 | value: http:// 84 | - name: NO_PROXY 85 | value: 86 | - name: https_proxy 87 | value: http:// 88 | - name: http_proxy 89 | value: http:// 90 | - name: no_proxy 91 | value: 92 | 93 | .. note:: 94 | 95 | * Proxy related ENV are automatically injected by GPU Operator into the ``driver`` container to indicate proxy information used when downloading necessary packages. 96 | * If HTTPS Proxy server is setup then change the values of HTTPS_PROXY and https_proxy to use ``https`` instead. 97 | 98 | Deploy GPU Operator 99 | =================== 100 | 101 | Download and deploy GPU Operator Helm Chart with the updated ``values.yaml``. 102 | 103 | Fetch the chart from the NGC repository: 104 | 105 | .. code-block:: console 106 | 107 | $ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/gpu-operator-${version}.tgz 108 | 109 | Install the GPU Operator with updated ``values.yaml``: 110 | 111 | .. code-block:: console 112 | 113 | $ helm install --wait gpu-operator \ 114 | -n gpu-operator --create-namespace \ 115 | gpu-operator-${version}.tgz \ 116 | -f values.yaml 117 | 118 | Check the status of the pods to ensure all the containers are running: 119 | 120 | .. 
code-block:: console 121 | 122 | $ kubectl get pods -n gpu-operator 123 | -------------------------------------------------------------------------------- /gpu-operator/install-gpu-operator-service-mesh.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ###################################### 20 | Install GPU Operator with Service Mesh 21 | ###################################### 22 | 23 | 24 | ***************************************** 25 | Special Considerations for Service Meshes 26 | ***************************************** 27 | 28 | You can use NVIDIA GPU Operator in a cluster that uses a service mesh provided by Istio CNI or Linkerd CNI. 29 | 30 | The typical consideration for using the Operator with a service mesh is that the ``k8s-driver-manager`` init container 31 | for the ``driver`` container needs network access to the Kubernetes API server of the cluster. 32 | 33 | The data plane---implemented by Istio CNI or Linkerd CNI as proxies running as sidecar containers---must be running for any pod networking to work. 34 | The proxy sidecar containers start only after the init phase of the pod, so init containers are not able to communicate with the API server. 35 | 36 | To address the connectivity challenge, NVIDIA recommends disabling injection for the GPU Operator namespace. 37 | Refer to the following documentation for more information: 38 | 39 | - `Controlling the injection policy `_ 40 | in the Istio documentation. 41 | - `Overriding injection `_ 42 | in the Linkerd documentation. 43 | 44 | 45 | **************************************** 46 | Label the Namespace to Disable Injection 47 | **************************************** 48 | 49 | - Label the Operator namespace to prevent automatic injection: 50 | 51 | .. code-block:: console 52 | 53 | $ kubectl label namespace gpu-operator istio-injection=disabled 54 | 55 | Or, for Linkerd: 56 | 57 | .. code-block:: console 58 | 59 | $ kubectl label namespace gpu-operator linkerd.io/inject=disabled 60 | 61 | 62 | If the GPU Operator is not already installed, refer to 63 | :doc:`getting-started` 64 | for information about custom options and common installation scenarios. 
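As a quick check, you can confirm that the label was applied before installing or upgrading the Operator. This sketch assumes the Operator namespace is ``gpu-operator``, as in the preceding commands:

.. code-block:: console

   $ kubectl get namespace gpu-operator --show-labels

The output should include ``istio-injection=disabled`` or ``linkerd.io/inject=disabled``, depending on the service mesh in use.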
65 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/amazon-eks-cluster-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: eksctl.io/v1alpha5 2 | kind: ClusterConfig 3 | metadata: 4 | name: demo-cluster 5 | region: us-west-2 6 | version: "1.25" 7 | nodeGroups: 8 | - name: demo-gpu-workers 9 | instanceType: g4dn.xlarge 10 | ami: ami-0770ab88ec35aa875 11 | amiFamily: Ubuntu2004 12 | minSize: 1 13 | desiredCapacity: 3 14 | maxSize: 3 15 | volumeSize: 100 16 | overrideBootstrapCommand: | 17 | #!/bin/bash 18 | source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh 19 | /etc/eks/bootstrap.sh ${CLUSTER_NAME} --container-runtime containerd --kubelet-extra-args "--node-labels=${NODE_LABELS}" 20 | ssh: 21 | allow: true 22 | publicKeyPath: ~/.ssh/id_rsa.pub -------------------------------------------------------------------------------- /gpu-operator/manifests/input/custom-mig-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: custom-mig-config 5 | data: 6 | config.yaml: | 7 | version: v1 8 | mig-configs: 9 | all-disabled: 10 | - devices: all 11 | mig-enabled: false 12 | 13 | five-1g-one-2g: 14 | - devices: all 15 | mig-enabled: true 16 | mig-devices: 17 | "1g.10gb": 5 18 | "2g.20gb": 1 19 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/google-gke-gpu-operator-quota.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ResourceQuota 3 | metadata: 4 | name: gpu-operator-quota 5 | spec: 6 | hard: 7 | pods: 100 8 | scopeSelector: 9 | matchExpressions: 10 | - operator: In 11 | scopeName: PriorityClass 12 | values: 13 | - system-node-critical 14 | - system-cluster-critical -------------------------------------------------------------------------------- /gpu-operator/manifests/input/gpu-direct-rdma-demo-pod-1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: demo-pod-1 5 | annotations: 6 | k8s.v1.cni.cncf.io/networks: demo-macvlannetwork 7 | # If a network with static IPAM is used replace network annotation with the below. 8 | # k8s.v1.cni.cncf.io/networks: '[ 9 | # { "name": "rdma-net", 10 | # "ips": ["192.168.111.101/24"], 11 | # "gateway": ["192.168.111.1"] 12 | # } 13 | # ]' 14 | spec: 15 | nodeSelector: 16 | # Note: Replace hostname or remove selector altogether 17 | kubernetes.io/hostname: nvnode1 18 | restartPolicy: OnFailure 19 | containers: 20 | - image: mellanox/cuda-perftest 21 | name: rdma-gpu-test-ctr 22 | securityContext: 23 | capabilities: 24 | add: [ "IPC_LOCK" ] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 1 28 | rdma/rdma_shared_device_a: 1 29 | requests: 30 | nvidia.com/gpu: 1 31 | rdma/rdma_shared_device_a: 1 -------------------------------------------------------------------------------- /gpu-operator/manifests/input/gpu-direct-rdma-demo-pod-2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: demo-pod-2 5 | annotations: 6 | k8s.v1.cni.cncf.io/networks: demo-macvlannetwork 7 | # If a network with static IPAM is used replace network annotation with the below. 
8 | # k8s.v1.cni.cncf.io/networks: '[ 9 | # { "name": "rdma-net", 10 | # "ips": ["192.168.111.101/24"], 11 | # "gateway": ["192.168.111.1"] 12 | # } 13 | # ]' 14 | spec: 15 | nodeSelector: 16 | # Note: Replace hostname or remove selector altogether 17 | kubernetes.io/hostname: nvnode2 18 | restartPolicy: OnFailure 19 | containers: 20 | - image: mellanox/cuda-perftest 21 | name: rdma-gpu-test-ctr 22 | securityContext: 23 | capabilities: 24 | add: [ "IPC_LOCK" ] 25 | resources: 26 | limits: 27 | nvidia.com/gpu: 1 28 | rdma/rdma_shared_device_a: 1 29 | requests: 30 | nvidia.com/gpu: 1 31 | rdma/rdma_shared_device_a: 1 -------------------------------------------------------------------------------- /gpu-operator/manifests/input/mig-cm-values.yaml: -------------------------------------------------------------------------------- 1 | migManager: 2 | config: 3 | name: custom-mig-config 4 | create: true 5 | data: 6 | config.yaml: |- 7 | version: v1 8 | mig-configs: 9 | all-disabled: 10 | - devices: all 11 | mig-enabled: false 12 | custom-mig: 13 | - devices: [0] 14 | mig-enabled: true 15 | mig-devices: 16 | "1g.10gb": 2 17 | "2g.20gb": 2 18 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-all.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-all 5 | spec: 6 | driverType: gpu 7 | image: driver 8 | imagePullPolicy: IfNotPresent 9 | imagePullSecrets: [] 10 | manager: {} 11 | rdma: 12 | enabled: false 13 | useHostMofed: false 14 | gds: 15 | enabled: false 16 | repository: nvcr.io/nvidia 17 | startupProbe: 18 | failureThreshold: 120 19 | initialDelaySeconds: 60 20 | periodSeconds: 10 21 | timeoutSeconds: 60 22 | usePrecompiled: false 23 | version: 535.104.12 24 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-demo-gold.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-gold 5 | spec: 6 | driverType: gpu 7 | gdrcopy: 8 | enabled: false 9 | repository: nvcr.io/nvidia/cloud-native 10 | image: gdrdrv 11 | version: v2.4.1 12 | imagePullPolicy: IfNotPresent 13 | imagePullSecrets: [] 14 | env: [] 15 | args: [] 16 | image: driver 17 | imagePullPolicy: IfNotPresent 18 | imagePullSecrets: [] 19 | kernelModuleType: auto 20 | manager: {} 21 | nodeSelector: 22 | driver.config: "gold" 23 | rdma: 24 | enabled: false 25 | useHostMofed: false 26 | gds: 27 | enabled: false 28 | repository: nvcr.io/nvidia 29 | startupProbe: 30 | failureThreshold: 120 31 | initialDelaySeconds: 60 32 | periodSeconds: 10 33 | timeoutSeconds: 60 34 | usePrecompiled: false 35 | version: 535.104.12 36 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-driver-multiple.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-gold 5 | spec: 6 | driverType: gpu 7 | env: [] 8 | image: driver 9 | imagePullPolicy: IfNotPresent 10 | imagePullSecrets: [] 11 | manager: {} 12 | nodeSelector: 13 | driver.config: "gold" 14 | repository: nvcr.io/nvidia 15 | version: "535.104.12" 16 | --- 17 | apiVersion: nvidia.com/v1alpha1 18 | kind: NVIDIADriver 19 | metadata: 20 | name: demo-silver 21 | spec: 22 | 
driverType: gpu 23 | env: [] 24 | image: driver 25 | imagePullPolicy: IfNotPresent 26 | imagePullSecrets: [] 27 | manager: {} 28 | nodeSelector: 29 | driver.config: "silver" 30 | repository: nvcr.io/nvidia 31 | version: "470.141.10" 32 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-precompiled-all.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-precomp-all 5 | spec: 6 | driverType: gpu 7 | env: [] 8 | image: driver 9 | imagePullPolicy: IfNotPresent 10 | imagePullSecrets: [] 11 | manager: {} 12 | nodeSelector: {} 13 | repository: nvcr.io/nvidia 14 | resources: {} 15 | usePrecompiled: true 16 | version: "535" 17 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/nvd-precompiled-some.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nvidia.com/v1alpha1 2 | kind: NVIDIADriver 3 | metadata: 4 | name: demo-precomp 5 | spec: 6 | driverType: gpu 7 | env: [] 8 | image: driver 9 | imagePullPolicy: IfNotPresent 10 | imagePullSecrets: [] 11 | manager: {} 12 | nodeSelector: 13 | driver.precompiled: "true" 14 | driver.version: "535" 15 | repository: nvcr.io/nvidia 16 | resources: {} 17 | usePrecompiled: true 18 | version: "535" 19 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/tf-notebook.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: tf-notebook 6 | labels: 7 | app: tf-notebook 8 | spec: 9 | type: NodePort 10 | ports: 11 | - port: 80 12 | name: http 13 | targetPort: 8888 14 | nodePort: 30001 15 | selector: 16 | app: tf-notebook 17 | --- 18 | apiVersion: v1 19 | kind: Pod 20 | metadata: 21 | name: tf-notebook 22 | labels: 23 | app: tf-notebook 24 | spec: 25 | securityContext: 26 | fsGroup: 0 27 | containers: 28 | - name: tf-notebook 29 | image: tensorflow/tensorflow:latest-gpu-jupyter 30 | resources: 31 | limits: 32 | nvidia.com/gpu: 1 33 | ports: 34 | - containerPort: 8888 35 | name: notebook -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-config-all.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: time-slicing-config-all 5 | data: 6 | any: |- 7 | version: v1 8 | flags: 9 | migStrategy: none 10 | sharing: 11 | timeSlicing: 12 | resources: 13 | - name: nvidia.com/gpu 14 | replicas: 4 15 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-config-fine.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: time-slicing-config-fine 5 | data: 6 | a100-40gb: |- 7 | version: v1 8 | flags: 9 | migStrategy: mixed 10 | sharing: 11 | timeSlicing: 12 | resources: 13 | - name: nvidia.com/gpu 14 | replicas: 8 15 | - name: nvidia.com/mig-1g.5gb 16 | replicas: 2 17 | - name: nvidia.com/mig-2g.10gb 18 | replicas: 2 19 | - name: nvidia.com/mig-3g.20gb 20 | replicas: 3 21 | - name: nvidia.com/mig-7g.40gb 22 | replicas: 7 23 | tesla-t4: |- 24 | version: v1 25 | flags: 26 | migStrategy: none 27 | sharing: 
28 | timeSlicing: 29 | resources: 30 | - name: nvidia.com/gpu 31 | replicas: 4 32 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-config-sample.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: time-slicing-config 5 | data: 6 | any: |- 7 | version: v1 8 | flags: 9 | migStrategy: none 10 | sharing: 11 | timeSlicing: 12 | renameByDefault: false 13 | failRequestsGreaterThanOne: false 14 | resources: 15 | - name: nvidia.com/gpu 16 | replicas: 4 17 | -------------------------------------------------------------------------------- /gpu-operator/manifests/input/time-slicing-verification.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: time-slicing-verification 5 | labels: 6 | app: time-slicing-verification 7 | spec: 8 | replicas: 5 9 | selector: 10 | matchLabels: 11 | app: time-slicing-verification 12 | template: 13 | metadata: 14 | labels: 15 | app: time-slicing-verification 16 | spec: 17 | tolerations: 18 | - key: nvidia.com/gpu 19 | operator: Exists 20 | effect: NoSchedule 21 | hostPID: true 22 | containers: 23 | - name: cuda-sample-vector-add 24 | image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04" 25 | command: ["/bin/bash", "-c", "--"] 26 | args: 27 | - while true; do /cuda-samples/vectorAdd; done 28 | resources: 29 | limits: 30 | nvidia.com/gpu: 1 31 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/cdi-get-pods-restart.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | gpu-feature-discovery-qnw2q 1/1 Running 0 47h 3 | gpu-operator-6d59774ff-hznmr 1/1 Running 0 2d 4 | gpu-operator-node-feature-discovery-master-6d6649d597-7l8bj 1/1 Running 0 2d 5 | gpu-operator-node-feature-discovery-worker-v86vj 1/1 Running 0 2d 6 | nvidia-container-toolkit-daemonset-2768s 1/1 Running 0 2m11s 7 | nvidia-cuda-validator-ls4vc 0/1 Completed 0 47h 8 | nvidia-dcgm-exporter-fxp9h 1/1 Running 0 47h 9 | nvidia-device-plugin-daemonset-dvp4v 1/1 Running 0 2m26s 10 | nvidia-device-plugin-validator-kvxbs 0/1 Completed 0 47h 11 | nvidia-driver-daemonset-m86r7 1/1 Running 0 2d 12 | nvidia-operator-validator-xg98r 1/1 Running 0 47h 13 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/common-cuda-vectoradd-logs.txt: -------------------------------------------------------------------------------- 1 | [Vector addition of 50000 elements] 2 | Copy input data from the host memory to the CUDA device 3 | CUDA kernel launch with 196 blocks of 256 threads 4 | Copy output data from the CUDA device to the host memory 5 | Test PASSED 6 | Done 7 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/mig-get-pods.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | gpu-feature-discovery-qmwb2 1/1 Running 0 14m 3 | gpu-operator-7bbf8bb6b7-xz664 1/1 Running 0 14m 4 | gpu-operator-node-feature-discovery-gc-79d6d968bb-sg4t6 1/1 Running 0 14m 5 | gpu-operator-node-feature-discovery-master-6d9f8d497c-7cwrp 1/1 Running 0 14m 6 | gpu-operator-node-feature-discovery-worker-x5z62 1/1 Running 0 14m 7 | 
nvidia-container-toolkit-daemonset-pkcpr 1/1 Running 0 14m 8 | nvidia-cuda-validator-wt6bc 0/1 Completed 0 12m 9 | nvidia-dcgm-exporter-zsskv 1/1 Running 0 14m 10 | nvidia-device-plugin-daemonset-924x6 1/1 Running 0 14m 11 | nvidia-driver-daemonset-klj5s 1/1 Running 0 14m 12 | nvidia-mig-manager-8d6wz 1/1 Running 0 12m 13 | nvidia-operator-validator-fnsmk 1/1 Running 0 14m 14 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/mig-mixed-nvidia-smi.txt: -------------------------------------------------------------------------------- 1 | GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-b4895dbf-9350-2524-a89b-98161ddd9fe4) 2 | MIG 3g.40gb Device 0: (UUID: MIG-7089d0f3-293f-58c9-8f8c-5ea666eedbde) 3 | MIG 2g.20gb Device 1: (UUID: MIG-56c30729-347f-5dd6-8da0-c3cc59e969e0) 4 | MIG 1g.10gb Device 2: (UUID: MIG-9d14fb21-4ae1-546f-a636-011582899c39) 5 | MIG 1g.10gb Device 3: (UUID: MIG-0f709664-740c-52b0-ae79-3e4c9ede6d3b) 6 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/mig-nvidia-smi.txt: -------------------------------------------------------------------------------- 1 | GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-b4895dbf-9350-2524-a89b-98161ddd9fe4) 2 | MIG 1g.10gb Device 0: (UUID: MIG-3f6f389f-b0cc-5e5c-8e32-eaa8fd067902) 3 | MIG 1g.10gb Device 1: (UUID: MIG-35f93699-4b53-5a19-8289-80b8418eec60) 4 | MIG 1g.10gb Device 2: (UUID: MIG-9d14fb21-4ae1-546f-a636-011582899c39) 5 | MIG 1g.10gb Device 3: (UUID: MIG-0f709664-740c-52b0-ae79-3e4c9ede6d3b) 6 | MIG 1g.10gb Device 4: (UUID: MIG-5d23f73a-d378-50ac-a6f5-3bf5184773bb) 7 | MIG 1g.10gb Device 5: (UUID: MIG-6cea15c7-8a56-578c-b965-0e73cb6dfc10) 8 | MIG 1g.10gb Device 6: (UUID: MIG-981c86e9-3607-57d7-9426-295347e4b925) 9 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/precomp-driver-conventional-running.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | nvidia-driver-daemonset-qwprp 1/1 Running 0 10m 3 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/precomp-driver-running.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | nvidia-driver-daemonset-5.15.0-69-generic-ubuntu22.04-thbts 1/1 Running 0 44s 3 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/precomp-driver-terminating.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | pod/gpu-feature-discovery-pzzr8 2/2 Running 0 19m 3 | pod/gpu-operator-859cb64846-57hfn 1/1 Running 0 47m 4 | pod/gpu-operator-node-feature-discovery-master-6d6649d597-7l8bj 1/1 Running 0 10d 5 | pod/gpu-operator-node-feature-discovery-worker-v86vj 1/1 Running 0 10d 6 | pod/nvidia-container-toolkit-daemonset-6ltbv 1/1 Running 0 19m 7 | pod/nvidia-cuda-validator-62w6r 0/1 Completed 0 17m 8 | pod/nvidia-dcgm-exporter-fh5wz 1/1 Running 0 19m 9 | pod/nvidia-device-plugin-daemonset-rwslh 2/2 Running 0 19m 10 | pod/nvidia-device-plugin-validator-gq4ww 0/1 Completed 0 17m 11 | pod/nvidia-driver-daemonset-xqrxk 1/1 Terminating 0 20m 12 | pod/nvidia-operator-validator-78mzv 1/1 Running 0 19m 13 | 14 | -------------------------------------------------------------------------------- 
/gpu-operator/manifests/output/time-slicing-get-events.txt: -------------------------------------------------------------------------------- 1 | LAST SEEN TYPE REASON OBJECT MESSAGE 2 | 33s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container toolkit-validation 3 | 33s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container toolkit-validation 4 | 33s Normal Started pod/gpu-feature-discovery-rvlg9 Started container toolkit-validation 5 | 33s Normal Created pod/gpu-feature-discovery-rvlg9 Created container toolkit-validation 6 | 33s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1" already present on machine 7 | 33s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1" already present on machine 8 | 32s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container config-manager-init 9 | 32s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 10 | 32s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 11 | 32s Normal Created pod/gpu-feature-discovery-rvlg9 Created container config-manager-init 12 | 32s Normal Started pod/gpu-feature-discovery-rvlg9 Started container config-manager-init 13 | 32s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container config-manager-init 14 | 31s Normal Created pod/gpu-feature-discovery-rvlg9 Created container config-manager 15 | 31s Normal Started pod/gpu-feature-discovery-rvlg9 Started container gpu-feature-discovery 16 | 31s Normal Created pod/gpu-feature-discovery-rvlg9 Created container gpu-feature-discovery 17 | 31s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/gpu-feature-discovery:v0.7.0-ubi8" already present on machine 18 | 31s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container config-manager 19 | 31s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container config-manager 20 | 31s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 21 | 31s Normal Started pod/nvidia-device-plugin-daemonset-cffds Started container nvidia-device-plugin 22 | 31s Normal Created pod/nvidia-device-plugin-daemonset-cffds Created container nvidia-device-plugin 23 | 31s Normal Pulled pod/nvidia-device-plugin-daemonset-cffds Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 24 | 31s Normal Pulled pod/gpu-feature-discovery-rvlg9 Container image "nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8" already present on machine 25 | 31s Normal Started pod/gpu-feature-discovery-rvlg9 Started container config-manager 26 | -------------------------------------------------------------------------------- /gpu-operator/manifests/output/time-slicing-get-pods.txt: -------------------------------------------------------------------------------- 1 | NAME READY STATUS RESTARTS AGE 2 | time-slicing-verification-7cdc7f87c5-lkd9d 1/1 Running 0 23s 3 | time-slicing-verification-7cdc7f87c5-rrzq7 1/1 Running 0 23s 4 | time-slicing-verification-7cdc7f87c5-s8qwk 1/1 Running 0 23s 5 | time-slicing-verification-7cdc7f87c5-xhmb7 1/1 Running 0 23s 6 | time-slicing-verification-7cdc7f87c5-zsncp 1/1 Running 0 23s 7 | 
-------------------------------------------------------------------------------- /gpu-operator/manifests/output/time-slicing-logs-pods.txt: -------------------------------------------------------------------------------- 1 | Found 5 pods, using pod/time-slicing-verification-7cdc7f87c5-s8qwk 2 | [Vector addition of 50000 elements] 3 | Copy input data from the host memory to the CUDA device 4 | CUDA kernel launch with 196 blocks of 256 threads 5 | Copy output data from the CUDA device to the host memory 6 | Test PASSED 7 | Done 8 | [Vector addition of 50000 elements] 9 | Copy input data from the host memory to the CUDA device 10 | CUDA kernel launch with 196 blocks of 256 threads 11 | Copy output data from the CUDA device to the host memory 12 | ... 13 | -------------------------------------------------------------------------------- /gpu-operator/overview.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | 20 | ***************************** 21 | About the NVIDIA GPU Operator 22 | ***************************** 23 | 24 | .. image:: graphics/nvidia-gpu-operator-image.jpg 25 | :width: 600 26 | 27 | Kubernetes provides access to special hardware resources such as NVIDIA GPUs, NICs, Infiniband adapters and other devices 28 | through the `device plugin framework `_. 29 | However, configuring and managing nodes with these hardware resources requires 30 | configuration of multiple software components such as drivers, container runtimes or other libraries which are difficult 31 | and prone to errors. The NVIDIA GPU Operator uses the `operator framework `_ 32 | within Kubernetes to automate the management of all NVIDIA software components needed to provision GPU. These components include the NVIDIA drivers (to enable CUDA), 33 | Kubernetes device plugin for GPUs, the `NVIDIA Container Toolkit `_, 34 | automatic node labeling using `GFD `_, `DCGM `_ based monitoring and others. 35 | 36 | 37 | .. card:: Red Hat OpenShift Container Platform 38 | 39 | For information about installing, managing, and upgrading the Operator, 40 | refer to :external+ocp:doc:`index`. 41 | 42 | Information about supported versions is available in :ref:`Supported Operating Systems and Kubernetes Platforms`. 43 | 44 | 45 | About This Documentation 46 | ======================== 47 | 48 | Browse through the following documents for getting started, platform support and release notes. 49 | 50 | Getting Started 51 | --------------- 52 | 53 | The :ref:`operator-install-guide` guide includes information on installing the GPU Operator in a Kubernetes cluster. 54 | 55 | Release Notes 56 | --------------- 57 | 58 | Refer to :ref:`operator-release-notes` for information about releases. 
59 | 60 | Platform Support 61 | ------------------ 62 | 63 | The :ref:`operator-platform-support` describes the supported platform configurations. 64 | 65 | Licenses and Contributing 66 | ========================= 67 | 68 | .. _pstai: https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/ 69 | .. |pstai| replace:: Product-Specific Terms for NVIDIA AI Products 70 | 71 | The NVIDIA GPU Operator source code is licensed under `Apache 2.0 `__ and 72 | contributions are accepted with a DCO. Refer to the `contributing `_ document for 73 | more information on how to contribute and the release artifacts. 74 | 75 | The base images used by the software might include software that is licensed under open-source licenses such as GPL. 76 | The source code for these components is archived on the CUDA opensource `index `_. 77 | 78 | The following table identifieis the licenses for the Operator and software components. 79 | By installing and using the GPU Operator, you accept the terms and conditions of these licenses. 80 | 81 | .. list-table:: 82 | :header-rows: 1 83 | :widths: 30 10 60 84 | 85 | * - Component 86 | - Artifact Type 87 | - Artifact Licenses 88 | 89 | * - NVIDIA GPU Operator 90 | - Helm Chart 91 | - `Apache 2.0 `__ 92 | 93 | * - NVIDIA GPU Operator 94 | - Image 95 | - |pstai|_ 96 | 97 | * - NVIDIA GPU Feature Discovery 98 | - Image 99 | - |pstai|_ 100 | 101 | * - NVIDIA GPU Driver 102 | - Image 103 | - `License for Customer Use of NVIDIA Software `__ 104 | 105 | |pstai|_ 106 | 107 | * - NVIDIA Container Toolkit 108 | - Image 109 | - |pstai|_ 110 | 111 | * - NVIDIA Kubernetes Device Plugin 112 | - Image 113 | - |pstai|_ 114 | 115 | * - NVIDIA MIG Manager for Kubernetes 116 | - Image 117 | - |pstai|_ 118 | 119 | * - Validator for NVIDIA GPU Operator 120 | - Image 121 | - |pstai|_ 122 | 123 | * - NVIDIA DCGM 124 | - Image 125 | - |pstai|_ 126 | 127 | * - NVIDIA DCGM Exporter 128 | - Image 129 | - |pstai|_ 130 | 131 | * - NVIDIA Driver Manager for Kubernetes 132 | - Image 133 | - |pstai|_ 134 | 135 | * - NVIDIA KubeVirt GPU Device Plugin 136 | - Image 137 | - |pstai|_ 138 | 139 | * - NVIDIA vGPU Device Manager 140 | - Image 141 | - |pstai|_ 142 | 143 | * - NVIDIA GDS Driver 144 | - Image 145 | - `License for Customer Use of NVIDIA Software `__ 146 | 147 | |pstai|_ 148 | 149 | * - NVIDIA Confidential Computing 150 | Manager for Kubernetes 151 | - Image 152 | - |pstai|_ 153 | 154 | * - NVIDIA Kata Manager for Kubernetes 155 | - Image 156 | - |pstai|_ 157 | 158 | * - NVIDIA GDRCopy Driver 159 | - Image 160 | - |pstai|_ -------------------------------------------------------------------------------- /gpu-operator/security.rst: -------------------------------------------------------------------------------- 1 | 2 | ***************************** 3 | Security Considerations 4 | ***************************** 5 | 6 | 7 | Pod Security Context of the Operator and Operands 8 | ================================================= 9 | 10 | Several of the NVIDIA GPU Operator operands, such as the driver containers and container toolkit, 11 | require the following elevated privileges: 12 | 13 | - ``privileged: true`` 14 | - ``hostPID: true`` 15 | - ``hostIPC: true`` 16 | 17 | The elevated privileges are required for the following reasons: 18 | 19 | - Access to the host file system and hardware devices, such as NVIDIA GPUs. 20 | - Restart system services such as containerd. 21 | - Loading and unloading kernel modules. 
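For illustration only, the following fragment sketches the shape of these settings in a pod specification. It is not the exact manifest that the Operator renders; the details vary by component and release:

.. code-block:: yaml

   # Illustrative sketch only -- not the manifest rendered by the Operator.
   spec:
     hostPID: true
     hostIPC: true
     containers:
       - name: example-operand        # hypothetical container name
         securityContext:
           privileged: true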
22 | 23 | Only the Kubernetes cluster administrator needs to access or manage the Operator namespace. 24 | As a best practice, establish proper security policies and prevent any other users from accessing the Operator namespace. 25 | 26 | 27 | CVEs 28 | ================================================= 29 | 30 | The following is a list of known CVEs in the GPU Operator or its operands. 31 | To view any published security bulletins for NVIDIA products published security bulletins for NVIDIA products, refer to the NVIDIA product security page at https://www.nvidia.com/en-us/security/. 32 | 33 | .. list-table:: CVEs 34 | :widths: 20 45 35 35 | :header-rows: 1 36 | 37 | * - CVE ID 38 | - Affected Components 39 | - Fixed Version 40 | 41 | * - `NVIDIA CVE-2025-23359 `_ 42 | - NVIDIA Container Toolkit, all versions up to and including 1.17.3 43 | 44 | NVIDIA GPU Operator, all versions up to and including 24.9.1 45 | - NVIDIA Container Toolkit 1.17.4 46 | 47 | NVIDIA GPU Operator 24.9.2 48 | 49 | * - `NVIDIA CVE-2024-0135 `_ 50 | - NVIDIA Container Toolkit, all versions up to and including 1.17.2 51 | 52 | NVIDIA GPU Operator, all versions up to and including 24.9.0 53 | - NVIDIA Container Toolkit 1.17.3 54 | 55 | NVIDIA GPU Operator 24.9.1 56 | 57 | * - `NVIDIA CVE-2024-0136 `_ 58 | - NVIDIA Container Toolkit, all versions up to and including 1.17.2 59 | 60 | NVIDIA GPU Operator, all versions up to and including 24.9.0 61 | - NVIDIA Container Toolkit 1.17.3 62 | 63 | NVIDIA GPU Operator 24.9.1 64 | 65 | * - `NVIDIA CVE-2024-0137 `_ 66 | - NVIDIA Container Toolkit, all versions up to and including 1.17.2 67 | 68 | NVIDIA GPU Operator, all versions up to and including 24.9.0 69 | - NVIDIA Container Toolkit 1.17.3 70 | 71 | NVIDIA GPU Operator 24.9.1 72 | 73 | * - `NVIDIA CVE-2024-0134 `_ 74 | - NVIDIA Container Toolkit, all versions up to and including 1.16.2 75 | 76 | NVIDIA GPU Operator, all versions up to and including 24.6.2 77 | - NVIDIA Container Toolkit 1.17.0 78 | 79 | NVIDIA GPU Operator 24.9.0 80 | 81 | * - `NVIDIA CVE-2024-0132 `_ 82 | - NVIDIA Container Toolkit, all versions up to and including 1.16.1 83 | 84 | NVIDIA GPU Operator, all versions up to and including 24.6.1 85 | - NVIDIA Container Toolkit 1.16.2 86 | 87 | NVIDIA GPU Operator 24.6.2 88 | * - `NVIDIA CVE-2024-0133 `_ 89 | - NVIDIA Container Toolkit, all versions up to and including 1.16.1 90 | 91 | NVIDIA GPU Operator, all versions up to and including 24.6.1 92 | - NVIDIA Container Toolkit 1.16.2 93 | 94 | NVIDIA GPU Operator 24.6.2 95 | 96 | Report a Vulnerability 97 | ----------------------------- 98 | 99 | For details on reporting a suspected vulnerability, refer to the `GPU Operator Security policies `_ page. 100 | -------------------------------------------------------------------------------- /gpu-operator/uninstall.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ############################# 20 | Uninstalling the GPU Operator 21 | ############################# 22 | 23 | Perform the following steps to uninstall the Operator. 24 | 25 | #. Optional: List and delete NVIDIA driver custom resources. 26 | 27 | .. code-block:: console 28 | 29 | $ kubectl get nvidiadrivers 30 | 31 | *Example Output* 32 | 33 | .. code-block:: output 34 | 35 | NAME STATUS AGE 36 | demo-gold ready 2023-10-16T17:57:12Z 37 | demo-silver ready 2023-10-16T17:57:12Z 38 | 39 | .. code-block:: console 40 | 41 | $ kubectl delete nvidiadriver demo-gold 42 | $ kubectl delete nvidiadriver demo-silver 43 | 44 | .. code-block:: console 45 | 46 | $ kubectl delete crd nvidiadrivers.nvidia.com 47 | 48 | #. Delete the Operator: 49 | 50 | .. code-block:: console 51 | 52 | $ helm delete -n gpu-operator $(helm list -n gpu-operator | grep gpu-operator | awk '{print $1}') 53 | 54 | #. Optional: List the pods in the Operator namespace to confirm the pods are deleted or in the process of deleting: 55 | 56 | .. code-block:: console 57 | 58 | $ kubectl get pods -n gpu-operator 59 | 60 | *Example Output* 61 | 62 | .. code-block:: output 63 | 64 | No resources found. 65 | 66 | By default, Helm does not `support deleting existing CRDs `__ 67 | when you delete the chart. 68 | As a result, the ``clusterpolicy`` CRD and ``nvidiadrivers`` CRD will still remain, by default. 69 | 70 | .. code-block:: console 71 | 72 | $ kubectl get crd clusterpolicies.nvidia.com 73 | 74 | To overcome this, the Operator uses a `post-delete hook `__ 75 | to perform the CRD cleanup. 76 | The ``operator.cleanupCRD`` chart parameter is added to enable this hook. 77 | This parameter is disabled by default. 78 | You can enable the hook by specifying ``--set operator.cleanupCRD=true`` during install or upgrade to perform automatic CRD cleanup on chart deletion. 79 | 80 | Alternatively, you can delete the custom resource definition: 81 | 82 | .. code-block:: console 83 | 84 | $ kubectl delete crd clusterpolicies.nvidia.com 85 | 86 | .. note:: 87 | 88 | * After uninstalling the Operator, the NVIDIA driver modules might still be loaded. 89 | Either reboot the node or unload them using the following command: 90 | 91 | .. code-block:: console 92 | 93 | $ sudo rmmod nvidia_modeset nvidia_uvm nvidia 94 | 95 | * Helm hooks used with the GPU Operator use the Operator image itself. 96 | If the Operator image cannot be pulled successfully (either due to network error or an invalid NGC registry secret in case of NVAIE), hooks will fail. 97 | In this case, delete the chart and specify the ``--no-hooks`` argument to avoid hanging on hook failures. 
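For example, the following sketch reuses the release-name lookup from the earlier deletion step and adds the ``--no-hooks`` argument:

.. code-block:: console

   $ helm delete --no-hooks -n gpu-operator \
       $(helm list -n gpu-operator | grep gpu-operator | awk '{print $1}')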
98 | -------------------------------------------------------------------------------- /gpu-operator/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "25.10", 3 | "versions": 4 | [ 5 | { 6 | "version": "25.10" 7 | }, 8 | { 9 | "version": "25.3" 10 | }, 11 | { 12 | "version": "24.9" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /gpu-operator/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../25.10", 5 | "version": "25.10" 6 | }, 7 | { 8 | "url": "../25.3", 9 | "version": "25.3" 10 | }, 11 | { 12 | "url": "../24.9", 13 | "version": "24.9" 14 | } 15 | ] 16 | -------------------------------------------------------------------------------- /gpu-telemetry/about-telemetry.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings (h1/h2/h3/h4/h5) are # * = - 18 | 19 | ################### 20 | About GPU Telemetry 21 | ################### 22 | 23 | Monitoring stacks usually consist of a collector, a time-series database to store metrics and a visualization layer. 24 | A popular open-source stack is `Prometheus `_ used along with `Grafana `_ as 25 | the visualization tool to create rich dashboards. Prometheus also includes an `Alertmanager `_, 26 | to create and manage alerts. Prometheus is deployed along with `kube-state-metrics `_ and 27 | `node_exporter `_ to expose cluster-level metrics for Kubernetes API objects and node-level 28 | metrics such as CPU utilization. 29 | 30 | An architecture of Prometheus is shown in the figure below: 31 | 32 | .. image:: https://boxboat.com/2019/08/08/monitoring-kubernetes-with-prometheus/prometheus-architecture.png 33 | :width: 800 34 | 35 | 36 | To gather GPU telemetry in Kubernetes, its recommended to use DCGM Exporter. DCGM Exporter, based on `DCGM `_ exposes 37 | GPU metrics for Prometheus and can be visualized using Grafana. DCGM Exporter is architected to take advantage of 38 | ``KubeletPodResources`` `API `_ and exposes GPU metrics in a format that can be 39 | scraped by Prometheus. A ``ServiceMonitor`` is also included to expose endpoints. 
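As a point of reference, you can query the exporter's metrics endpoint directly from a GPU node. This is a minimal sketch that assumes the default DCGM Exporter listen port of ``9400``; metric names and labels vary by GPU and configuration:

.. code-block:: console

   $ curl -s localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL

The output lists Prometheus-format samples, such as the ``DCGM_FI_DEV_GPU_UTIL`` utilization gauge, that Prometheus scrapes through the ``ServiceMonitor``.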
40 | -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/001-dcgm-e2e-prom-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/001-dcgm-e2e-prom-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/002-dcgm-e2e-grafana-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/002-dcgm-e2e-grafana-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/003-dcgm-e2e-grafana-home-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/003-dcgm-e2e-grafana-home-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/004-dcgm-e2e-grafana-manage-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/004-dcgm-e2e-grafana-manage-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/005-dcgm-e2e-grafana-import-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/005-dcgm-e2e-grafana-import-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/006-dcgm-e2e-grafana-import-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/006-dcgm-e2e-grafana-import-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/007-dcgm-e2e-grafana-import-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/007-dcgm-e2e-grafana-import-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/008-dcgm-e2e-grafana-dashboard-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/008-dcgm-e2e-grafana-dashboard-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/009-dcgm-e2e-deepstream-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/009-dcgm-e2e-deepstream-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/010-dcgm-e2e-deepstream-screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/010-dcgm-e2e-deepstream-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-e2e/011-dcgm-e2e-prom-dashboard-metrics-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-e2e/011-dcgm-e2e-prom-dashboard-metrics-screenshot.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-exporter-bare-metal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-exporter-bare-metal.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-exporter-containers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-exporter-containers.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm-exporter_embedded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm-exporter_embedded.png -------------------------------------------------------------------------------- /gpu-telemetry/graphics/dcgm_and_dcgm-exporter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/gpu-telemetry/graphics/dcgm_and_dcgm-exporter.png -------------------------------------------------------------------------------- /gpu-telemetry/index.rst: -------------------------------------------------------------------------------- 1 | .. license-header 2 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: Apache-2.0 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings (h1/h2/h3/h4/h5) are # * = - 18 | 19 | .. toctree:: 20 | :caption: DCGM Exporter 21 | :titlesonly: 22 | :hidden: 23 | 24 | about-telemetry 25 | dcgm-exporter 26 | integrating-telemetry-kubernetes 27 | kube-prometheus 28 | 29 | 30 | .. 
include:: about-telemetry.rst 31 | :start-line: 18 32 | -------------------------------------------------------------------------------- /gpu-telemetry/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /gpu-telemetry/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /kubernetes/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "1.0.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /kubernetes/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /mig/mig.rst: -------------------------------------------------------------------------------- 1 | .. Date: April 26 2021 2 | .. Author: pramarao 3 | 4 | .. headings (h1/h2/h3/h4/h5) are # * - = 5 | 6 | .. _mig-landing: 7 | 8 | #################### 9 | Multi-Instance GPU 10 | #################### 11 | 12 | ************* 13 | Introduction 14 | ************* 15 | 16 | The new Multi-Instance GPU (MIG) feature allows GPUs based on the NVIDIA Ampere architecture 17 | (such as NVIDIA A100) to be securely partitioned into up to seven separate GPU Instances for 18 | CUDA applications, providing multiple users with separate GPU resources for optimal GPU 19 | utilization. This feature is particularly beneficial for workloads that do not fully saturate 20 | the GPU’s compute capacity and therefore users may want to run different workloads in parallel 21 | to maximize utilization. 22 | 23 | Refer to the `MIG User Guide `_ 24 | for more details on the technical concepts, setting up and using MIG on NVIDIA Ampere GPUs. 
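As a brief sketch of what partitioning looks like on a single node (the GPU index and profile IDs below are examples only and vary by GPU model; the MIG User Guide is the authoritative reference), MIG is managed with ``nvidia-smi``:

.. code-block:: console

   $ sudo nvidia-smi -i 0 -mig 1       # enable MIG mode on GPU 0
   $ sudo nvidia-smi mig -lgip         # list the GPU instance profiles that the GPU supports
   $ sudo nvidia-smi mig -cgi 9,9 -C   # create GPU instances (and default compute instances) from listed profile IDs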
25 | 26 | 27 | -------------------------------------------------------------------------------- /openshift/appendix-ocp.rst: -------------------------------------------------------------------------------- 1 | .. Date: November 17 2021 2 | .. Author: kquinn 3 | 4 | .. _ocp-appendix: 5 | 6 | ********** 7 | Appendix 8 | ********** 9 | 10 | .. _cluster-entitlement: 11 | 12 | Entitled NVIDIA Driver Builds No Longer Supported 13 | ================================================= 14 | 15 | Introduction 16 | ------------- 17 | 18 | .. important:: 19 | 20 | **Entitled NVIDIA driver builds are deprecated and not supported starting with Red Hat OpenShift 4.10.** 21 | 22 | The Driver Toolkit (DTK) enables entitlement-free deployments of the GPU Operator. In the past, entitled builds were used pre-DTK and for some OpenShift versions where Driver Toolkit images were broken. 23 | 24 | If you encounter the :ref:`"broken driver toolkit detected" ` warning on OpenShift 4.10 or later, you should :ref:`troubleshoot ` to find the root cause instead of falling back to entitled driver builds. 25 | 26 | If the broken DTK warning is encountered on an older version of OpenShift, refer to the documentation for an older version of the NVIDIA GPU Operator to enable entitled builds. Keep in mind that older versions of OpenShift might no longer be supported. 27 | 28 | .. _broken-dtk-troubleshooting: 29 | 30 | Troubleshooting Broken Driver Toolkit Errors 31 | -------------------------------------------- 32 | 33 | The most likely reason for the broken DTK message is Node Feature Discovery (NFD) not working correctly. NFD might be disabled, failing, or not updating the kernel version label for other reasons. Another cause might be a missing or incomplete DTK image stream, for example, because of broken mirroring. 34 | 35 | Follow these steps for initial troubleshooting of Node Feature Discovery: 36 | 37 | #. **Check Node Feature Discovery (NFD) status:** 38 | 39 | .. code-block:: console 40 | 41 | $ oc get pods -n openshift-nfd 42 | 43 | Ensure NFD pods are running and healthy. If NFD is not deployed or is failing, this can cause DTK issues. 44 | 45 | #. **Verify kernel version labels are present and correct:** 46 | 47 | .. code-block:: console 48 | 49 | $ oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{":\t"}{.metadata.labels.feature\.node\.kubernetes\.io/kernel-version\.full}{"\n"}{end}' 50 | 51 | Ensure nodes have proper kernel version labels that match the current OpenShift version of the cluster. 52 | 53 | #. **Check Driver Toolkit image stream:** 54 | 55 | .. code-block:: console 56 | 57 | $ oc get -n openshift is/driver-toolkit 58 | 59 | Verify the driver-toolkit image stream exists and has the correct tags that correspond to the current OpenShift version. 60 | 61 | For additional troubleshooting resources: 62 | 63 | * `Node Feature Discovery documentation `_. 64 | * `Red Hat Node Feature Discovery Operator documentation `_ 65 | * `OpenShift Driver Toolkit documentation `_ 66 | * `OpenShift Driver Toolkit GitHub repository `_ 67 | * `OpenShift troubleshooting guide `_ 68 | -------------------------------------------------------------------------------- /openshift/clean-up.rst: -------------------------------------------------------------------------------- 1 | .. Date: September 01 2021 2 | .. Author: kquinn 3 | 4 | .. 
_clean-up: 5 | 6 | ***************************************** 7 | Cleanup 8 | ***************************************** 9 | This section describes how to clean up (remove) the GPU Operator if it is no longer needed. 10 | 11 | #. Delete the NVIDIA GPU Operator from the cluster following the guidance outlined in `Deleting Operators from a cluster `_. 12 | 13 | #. Delete the cluster policy by using the OpenShift Container Platform CLI. 14 | 15 | .. code-block:: console 16 | 17 | $ oc delete crd clusterpolicies.nvidia.com 18 | 19 | .. code-block:: console 20 | 21 | customresourcedefinition.apiextensions.k8s.io "clusterpolicies.nvidia.com" deleted 22 | -------------------------------------------------------------------------------- /openshift/download/0003-cluster-wide-machineconfigs.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: machineconfiguration.openshift.io/v1 2 | kind: MachineConfig 3 | metadata: 4 | labels: 5 | machineconfiguration.openshift.io/role: worker 6 | name: 50-rhsm-conf 7 | spec: 8 | config: 9 | ignition: 10 | version: 2.2.0 11 | storage: 12 | files: 13 | - contents: 14 | source: data:text/plain;charset=utf-8;base64,IyBSZWQgSGF0IFN1YnNjcmlwdGlvbiBNYW5hZ2VyIENvbmZpZ3VyYXRpb24gRmlsZToKCiMgVW5pZmllZCBFbnRpdGxlbWVudCBQbGF0Zm9ybSBDb25maWd1cmF0aW9uCltzZXJ2ZXJdCiMgU2VydmVyIGhvc3RuYW1lOgpob3N0bmFtZSA9IHN1YnNjcmlwdGlvbi5yaHNtLnJlZGhhdC5jb20KCiMgU2VydmVyIHByZWZpeDoKcHJlZml4ID0gL3N1YnNjcmlwdGlvbgoKIyBTZXJ2ZXIgcG9ydDoKcG9ydCA9IDQ0MwoKIyBTZXQgdG8gMSB0byBkaXNhYmxlIGNlcnRpZmljYXRlIHZhbGlkYXRpb246Cmluc2VjdXJlID0gMAoKIyBTZXQgdGhlIGRlcHRoIG9mIGNlcnRzIHdoaWNoIHNob3VsZCBiZSBjaGVja2VkCiMgd2hlbiB2YWxpZGF0aW5nIGEgY2VydGlmaWNhdGUKc3NsX3ZlcmlmeV9kZXB0aCA9IDMKCiMgYW4gaHR0cCBwcm94eSBzZXJ2ZXIgdG8gdXNlCnByb3h5X2hvc3RuYW1lID0KCiMgVGhlIHNjaGVtZSB0byB1c2UgZm9yIHRoZSBwcm94eSB3aGVuIHVwZGF0aW5nIHJlcG8gZGVmaW5pdGlvbnMsIGlmIG5lZWRlZAojIGUuZy4gaHR0cCBvciBodHRwcwpwcm94eV9zY2hlbWUgPSBodHRwCgojIHBvcnQgZm9yIGh0dHAgcHJveHkgc2VydmVyCnByb3h5X3BvcnQgPQoKIyB1c2VyIG5hbWUgZm9yIGF1dGhlbnRpY2F0aW5nIHRvIGFuIGh0dHAgcHJveHksIGlmIG5lZWRlZApwcm94eV91c2VyID0KCiMgcGFzc3dvcmQgZm9yIGJhc2ljIGh0dHAgcHJveHkgYXV0aCwgaWYgbmVlZGVkCnByb3h5X3Bhc3N3b3JkID0KCiMgaG9zdC9kb21haW4gc3VmZml4IGJsYWNrbGlzdCBmb3IgcHJveHksIGlmIG5lZWRlZApub19wcm94eSA9CgpbcmhzbV0KIyBDb250ZW50IGJhc2UgVVJMOgpiYXNldXJsID0gaHR0cHM6Ly9jZG4ucmVkaGF0LmNvbQoKIyBSZXBvc2l0b3J5IG1ldGFkYXRhIEdQRyBrZXkgVVJMOgpyZXBvbWRfZ3BnX3VybCA9CgojIFNlcnZlciBDQSBjZXJ0aWZpY2F0ZSBsb2NhdGlvbjoKY2FfY2VydF9kaXIgPSAvZXRjL3Joc20vY2EvCgojIERlZmF1bHQgQ0EgY2VydCB0byB1c2Ugd2hlbiBnZW5lcmF0aW5nIHl1bSByZXBvIGNvbmZpZ3M6CnJlcG9fY2FfY2VydCA9ICUoY2FfY2VydF9kaXIpc3JlZGhhdC11ZXAucGVtCgojIFdoZXJlIHRoZSBjZXJ0aWZpY2F0ZXMgc2hvdWxkIGJlIHN0b3JlZApwcm9kdWN0Q2VydERpciA9IC9ldGMvcGtpL3Byb2R1Y3QKZW50aXRsZW1lbnRDZXJ0RGlyID0gL2V0Yy9wa2kvZW50aXRsZW1lbnQKY29uc3VtZXJDZXJ0RGlyID0gL2V0Yy9wa2kvY29uc3VtZXIKCiMgTWFuYWdlIGdlbmVyYXRpb24gb2YgeXVtIHJlcG9zaXRvcmllcyBmb3Igc3Vic2NyaWJlZCBjb250ZW50OgptYW5hZ2VfcmVwb3MgPSAxCgojIFJlZnJlc2ggcmVwbyBmaWxlcyB3aXRoIHNlcnZlciBvdmVycmlkZXMgb24gZXZlcnkgeXVtIGNvbW1hbmQKZnVsbF9yZWZyZXNoX29uX3l1bSA9IDAKCiMgSWYgc2V0IHRvIHplcm8sIHRoZSBjbGllbnQgd2lsbCBub3QgcmVwb3J0IHRoZSBwYWNrYWdlIHByb2ZpbGUgdG8KIyB0aGUgc3Vic2NyaXB0aW9uIG1hbmFnZW1lbnQgc2VydmljZS4KcmVwb3J0X3BhY2thZ2VfcHJvZmlsZSA9IDEKCiMgVGhlIGRpcmVjdG9yeSB0byBzZWFyY2ggZm9yIHN1YnNjcmlwdGlvbiBtYW5hZ2VyIHBsdWdpbnMKcGx1Z2luRGlyID0gL3Vzci9zaGFyZS9yaHNtLXBsdWdpbnMKCiMgVGhlIGRpcmVjdG9yeSB0byBzZWFyY2ggZm9yIHBsdWdpbiBjb25maWd1cmF0aW9uIGZpbGVzCnBsdWdpbkNvbmZEaXIgPSAvZXRjL3Joc20vcGx1Z2luY29uZi5kCgojIE1hbmFnZSBhdXRv
bWF0aWMgZW5hYmxpbmcgb2YgeXVtL2RuZiBwbHVnaW5zIChwcm9kdWN0LWlkLCBzdWJzY3JpcHRpb24tbWFuYWdlcikKYXV0b19lbmFibGVfeXVtX3BsdWdpbnMgPSAxCgojIFJ1biB0aGUgcGFja2FnZSBwcm9maWxlIG9uIGVhY2ggeXVtL2RuZiB0cmFuc2FjdGlvbgpwYWNrYWdlX3Byb2ZpbGVfb25fdHJhbnMgPSAwCgojIElub3RpZnkgaXMgdXNlZCBmb3IgbW9uaXRvcmluZyBjaGFuZ2VzIGluIGRpcmVjdG9yaWVzIHdpdGggY2VydGlmaWNhdGVzLgojIEN1cnJlbnRseSBvbmx5IHRoZSAvZXRjL3BraS9jb25zdW1lciBkaXJlY3RvcnkgaXMgbW9uaXRvcmVkIGJ5IHRoZQojIHJoc20uc2VydmljZS4gV2hlbiB0aGlzIGRpcmVjdG9yeSBpcyBtb3VudGVkIHVzaW5nIGEgbmV0d29yayBmaWxlIHN5c3RlbQojIHdpdGhvdXQgaW5vdGlmeSBub3RpZmljYXRpb24gc3VwcG9ydCAoZS5nLiBORlMpLCB0aGVuIGRpc2FibGluZyBpbm90aWZ5CiMgaXMgc3Ryb25nbHkgcmVjb21tZW5kZWQuIFdoZW4gaW5vdGlmeSBpcyBkaXNhYmxlZCwgcGVyaW9kaWNhbCBkaXJlY3RvcnkKIyBwb2xsaW5nIGlzIHVzZWQgaW5zdGVhZC4KaW5vdGlmeSA9IDEKCltyaHNtY2VydGRdCiMgSW50ZXJ2YWwgdG8gcnVuIGNlcnQgY2hlY2sgKGluIG1pbnV0ZXMpOgpjZXJ0Q2hlY2tJbnRlcnZhbCA9IDI0MAojIEludGVydmFsIHRvIHJ1biBhdXRvLWF0dGFjaCAoaW4gbWludXRlcyk6CmF1dG9BdHRhY2hJbnRlcnZhbCA9IDE0NDAKIyBJZiBzZXQgdG8gemVybywgdGhlIGNoZWNrcyBkb25lIGJ5IHRoZSByaHNtY2VydGQgZGFlbW9uIHdpbGwgbm90IGJlIHNwbGF5ZWQgKHJhbmRvbWx5IG9mZnNldCkKc3BsYXkgPSAxCiMgSWYgc2V0IHRvIDEsIHJoc21jZXJ0ZCB3aWxsIG5vdCBleGVjdXRlLgpkaXNhYmxlID0gMAoKW2xvZ2dpbmddCmRlZmF1bHRfbG9nX2xldmVsID0gSU5GTwojIHN1YnNjcmlwdGlvbl9tYW5hZ2VyID0gREVCVUcKIyBzdWJzY3JpcHRpb25fbWFuYWdlci5tYW5hZ2VyY2xpID0gREVCVUcKIyByaHNtID0gREVCVUcKIyByaHNtLmNvbm5lY3Rpb24gPSBERUJVRwojIHJoc20tYXBwID0gREVCVUcKIyByaHNtLWFwcC5yaHNtZCA9IERFQlVHCg== 15 | filesystem: root 16 | mode: 0644 17 | path: /etc/rhsm/rhsm.conf 18 | --- 19 | apiVersion: machineconfiguration.openshift.io/v1 20 | kind: MachineConfig 21 | metadata: 22 | labels: 23 | machineconfiguration.openshift.io/role: worker 24 | name: 50-entitlement-pem 25 | spec: 26 | config: 27 | ignition: 28 | version: 2.2.0 29 | storage: 30 | files: 31 | - contents: 32 | source: data:text/plain;charset=utf-8;base64,BASE64_ENCODED_PEM_FILE 33 | filesystem: root 34 | mode: 0644 35 | path: /etc/pki/entitlement/entitlement.pem 36 | --- 37 | apiVersion: machineconfiguration.openshift.io/v1 38 | kind: MachineConfig 39 | metadata: 40 | labels: 41 | machineconfiguration.openshift.io/role: worker 42 | name: 50-entitlement-key-pem 43 | spec: 44 | config: 45 | ignition: 46 | version: 2.2.0 47 | storage: 48 | files: 49 | - contents: 50 | source: data:text/plain;charset=utf-8;base64,BASE64_ENCODED_PEM_FILE 51 | filesystem: root 52 | mode: 0644 53 | path: /etc/pki/entitlement/entitlement-key.pem 54 | -------------------------------------------------------------------------------- /openshift/enable-gpu-monitoring-dashboard.rst: -------------------------------------------------------------------------------- 1 | .. Date: August 27 2023 2 | .. Author: empovit 3 | 4 | .. _enable-gpu-monitoring-dashboard: 5 | 6 | ##################################### 7 | Enabling the GPU Monitoring Dashboard 8 | ##################################### 9 | 10 | The GPU Operator exposes GPU telemetry for Prometheus by using the NVIDIA DCGM Exporter. 11 | These metrics can be visualized using a monitoring dashboard based on Grafana. 12 | 13 | Perform the following procedure to add the dashboard to the **Observe** section of the OpenShift Container Platform web console. 14 | 15 | 16 | ************* 17 | Prerequisites 18 | ************* 19 | 20 | * Your cluster uses OpenShift Container Platform 4.10 or higher. 21 | * You have access to the cluster as a user with the ``cluster-admin`` cluster role. 
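One way to confirm these prerequisites before you begin (shown for illustration only; any equivalent check works) is with the OpenShift CLI:

.. code-block:: console

   $ oc version
   $ oc auth can-i '*' '*' --all-namespaces

The second command returns ``yes`` when the logged-in user has cluster-wide administrative permissions.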
22 | 23 | 24 | ********************************************** 25 | Configuring the NVIDIA DCGM Exporter Dashboard 26 | ********************************************** 27 | 28 | #. Download the latest NVIDIA DCGM Exporter Dashboard from the DCGM Exporter repository on GitHub: 29 | 30 | .. code-block:: console 31 | 32 | $ curl -LfO https://github.com/NVIDIA/dcgm-exporter/raw/main/grafana/dcgm-exporter-dashboard.json 33 | 34 | #. Create a config map from the downloaded file in the ``openshift-config-managed`` namespace: 35 | 36 | .. code-block:: console 37 | 38 | $ oc create configmap nvidia-dcgm-exporter-dashboard -n openshift-config-managed --from-file=dcgm-exporter-dashboard.json 39 | 40 | #. Label the config map to expose the dashboard in the **Administrator** perspective of the web console: 41 | 42 | .. code-block:: console 43 | 44 | $ oc label configmap nvidia-dcgm-exporter-dashboard -n openshift-config-managed "console.openshift.io/dashboard=true" 45 | 46 | #. Optional: Label the config map to expose the dashboard in the **Developer** perspecitive of the web console: 47 | 48 | .. code-block:: console 49 | 50 | $ oc label configmap nvidia-dcgm-exporter-dashboard -n openshift-config-managed "console.openshift.io/odc-dashboard=true" 51 | 52 | #. View the created resource and verify the labels: 53 | 54 | .. code-block:: console 55 | 56 | $ oc -n openshift-config-managed get cm nvidia-dcgm-exporter-dashboard --show-labels 57 | 58 | 59 | ################### 60 | Viewing GPU Metrics 61 | ################### 62 | 63 | - In the OpenShift Container Platform web console from the side menu, switch to the **Administrator** perspective, then navigate to 64 | **Observe** > **Dashboards** and select **NVIDIA DCGM Exporter Dashboard** from the **Dashboard** list. 65 | 66 | If the dashboard was added to the **Developer** perspective, in the OpenShift Container Platform web console from the side menu, switch to 67 | the **Developer** perspective, navigate to **Observe** > **Dashboard** and select **NVIDIA DCGM Exporter Dashboard** from the **Dashboard** list. 68 | 69 | The **NVIDIA DCGM Exporter Dashboard** displays the GPU-related graphs. 70 | 71 | .. image:: graphics/gpu_dashboards.png 72 | 73 | The provided Grafana dashboard includes a default set of DCGM metrics. 74 | You can create and deploy a custom dashboard definition in Grafana 6.x format. 75 | 76 | 77 | *********************************** 78 | Default NVIDIA DCGM Exporter Graphs 79 | *********************************** 80 | 81 | The following table provides a brief description of the graphs on the default dashboard. 82 | 83 | +--------------------------+------------------------------------------------------------+ 84 | | Graph | Description | 85 | +==========================+============================================================+ 86 | | GPU Temperature | GPU temperature in Celsius. | 87 | +--------------------------+------------------------------------------------------------+ 88 | | GPU Avg. Temp | Average GPU temperature in Celsius. | 89 | +--------------------------+------------------------------------------------------------+ 90 | | GPU Power Usage | Power usage in watts for each GPU. | 91 | +--------------------------+------------------------------------------------------------+ 92 | | GPU Power Total | Total power usage in watts. | 93 | +--------------------------+------------------------------------------------------------+ 94 | | GPU SM Clocks | SM clock frequency in hertz. 
| 95 | +--------------------------+------------------------------------------------------------+ 96 | | GPU Utilization | GPU utilization, percent. | 97 | +--------------------------+------------------------------------------------------------+ 98 | | GPU Framebuffer Mem Used | Frame buffer memory used in MB. | 99 | +--------------------------+------------------------------------------------------------+ 100 | | Tensor Core Utilization | Ratio of cycles the tensor (HMMA) pipe is active, percent. | 101 | +--------------------------+------------------------------------------------------------+ 102 | -------------------------------------------------------------------------------- /openshift/get-entitlement.rst: -------------------------------------------------------------------------------- 1 | .. Date: Sept 07 2021 2 | .. Author: kquinn 3 | 4 | .. _get-entitlement: 5 | 6 | #################################################### 7 | Entitled Driver Builds No Longer Supported 8 | #################################################### 9 | 10 | .. important:: 11 | 12 | **Entitled NVIDIA driver builds are deprecated and not supported.** 13 | 14 | If you encounter issues with the NVIDIA GPU driver build that might require entitlement, refer to the Driver Toolkit (DTK) troubleshooting section: :ref:`broken-dtk-troubleshooting`. 15 | -------------------------------------------------------------------------------- /openshift/graphics/Mig-profile-A100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/Mig-profile-A100.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-image-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-image-version.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-repository.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-repository.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-state-ready.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-state-ready.png -------------------------------------------------------------------------------- /openshift/graphics/cluster-policy-suceed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster-policy-suceed.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_2.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_3.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_4.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_5.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_6.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_entitlement_attachsub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_entitlement_attachsub.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy2.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_3.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_4.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_configure_vgpu.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_configure_vgpu.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_enable_sandbox_workloads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_enable_sandbox_workloads.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_suceed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_suceed.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_vGPU_confg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_vGPU_confg.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_vgpu_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_vgpu_1.png -------------------------------------------------------------------------------- /openshift/graphics/cluster_policy_vgpu_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/cluster_policy_vgpu_2.png -------------------------------------------------------------------------------- /openshift/graphics/create_cluster_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_cluster_policy.png -------------------------------------------------------------------------------- /openshift/graphics/create_config_map1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_config_map1.png -------------------------------------------------------------------------------- /openshift/graphics/create_project_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_project_1.png -------------------------------------------------------------------------------- /openshift/graphics/create_project_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/create_project_2.png -------------------------------------------------------------------------------- /openshift/graphics/createclusterpolicy2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/createclusterpolicy2.png -------------------------------------------------------------------------------- /openshift/graphics/createclusterpolicy3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/createclusterpolicy3.png -------------------------------------------------------------------------------- /openshift/graphics/created_pull-secret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/created_pull-secret.png -------------------------------------------------------------------------------- /openshift/graphics/disconnected_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/disconnected_cluster.png -------------------------------------------------------------------------------- /openshift/graphics/driver_toolkit_alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/driver_toolkit_alert.png -------------------------------------------------------------------------------- /openshift/graphics/enable-gpu-direct-rdma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/enable-gpu-direct-rdma.png -------------------------------------------------------------------------------- /openshift/graphics/entitlement_hypervisor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/entitlement_hypervisor.png -------------------------------------------------------------------------------- /openshift/graphics/gpu-operator-certified-cli-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/gpu-operator-certified-cli-install.png -------------------------------------------------------------------------------- /openshift/graphics/gpu_dashboards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/gpu_dashboards.png -------------------------------------------------------------------------------- /openshift/graphics/locate-cluster-acm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/locate-cluster-acm.png -------------------------------------------------------------------------------- /openshift/graphics/mig-mixed-profile-A100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/mig-mixed-profile-A100.png -------------------------------------------------------------------------------- /openshift/graphics/mig_strategy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/mig_strategy.png -------------------------------------------------------------------------------- /openshift/graphics/navigate_to_cluster_policy.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/navigate_to_cluster_policy.png -------------------------------------------------------------------------------- /openshift/graphics/nvaie2.3_cluster_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/nvaie2.3_cluster_policy.png -------------------------------------------------------------------------------- /openshift/graphics/ocp_main_console_alerts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/ocp_main_console_alerts.png -------------------------------------------------------------------------------- /openshift/graphics/pci_passthrough.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/pci_passthrough.png -------------------------------------------------------------------------------- /openshift/graphics/precompiled_driver_config_repository.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/precompiled_driver_config_repository.png -------------------------------------------------------------------------------- /openshift/graphics/precompiled_driver_config_version_and_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/precompiled_driver_config_version_and_image.png -------------------------------------------------------------------------------- /openshift/graphics/pull-secret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/pull-secret.png -------------------------------------------------------------------------------- /openshift/graphics/secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/secrets.png -------------------------------------------------------------------------------- /openshift/graphics/secrets_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/secrets_2.png -------------------------------------------------------------------------------- /openshift/graphics/vmx_secure_boot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/openshift/graphics/vmx_secure_boot.png -------------------------------------------------------------------------------- /openshift/index.rst: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | NVIDIA GPU Operator on Red Hat OpenShift Container Platform 3 | ########################################################### 4 | 5 | .. 
toctree:: 6 | :titlesonly: 7 | 8 | Introduction 9 | Prerequisites 10 | Installation and Upgrade Overview 11 | NFD Operator Installation 12 | GPU Operator Installation 13 | nvaie-with-ocp.rst 14 | mig-ocp.rst 15 | clean-up.rst 16 | mirror-gpu-ocp-disconnected.rst 17 | enable-gpu-monitoring-dashboard.rst 18 | time-slicing-gpus-in-openshift.rst 19 | openshift-virtualization.rst 20 | gpu-operator-with-precompiled-drivers.rst 21 | troubleshooting-gpu-ocp.rst 22 | appendix-ocp.rst 23 | -------------------------------------------------------------------------------- /openshift/install-nfd.rst: -------------------------------------------------------------------------------- 1 | .. Date: Nov 15 2021 2 | .. Author: kquinn 3 | 4 | .. _install-nfd: 5 | 6 | ########################################################### 7 | Installing the Node Feature Discovery Operator on OpenShift 8 | ########################################################### 9 | 10 | ********* 11 | Procedure 12 | ********* 13 | 14 | The Node Feature Discovery (NFD) Operator is a prerequisite for the **NVIDIA GPU Operator**. Install the NFD Operator using the Red Hat OperatorHub catalog in the OpenShift Container Platform web console. 15 | 16 | #. Follow the Red Hat documentation guidance in the `Node Feature Discovery Operator guide `_ to install the Node Feature Discovery Operator. 17 | 18 | #. Verify the Node Feature Discovery Operator is running: 19 | 20 | .. code-block:: console 21 | 22 | $ oc get pods -n openshift-nfd 23 | 24 | .. code-block:: console 25 | 26 | NAME READY STATUS RESTARTS AGE 27 | nfd-controller-manager-7f86ccfb58-nqgxm 2/2 Running 0 11m 28 | 29 | #. When the Node Feature Discovery is installed, create an instance of Node Feature Discovery using the **NodeFeatureDiscovery** tab: 30 | 31 | #. Click **Operators** > **Installed Operators** from the side menu. 32 | 33 | #. Find the **Node Feature Discovery** entry. 34 | 35 | #. Click **NodeFeatureDiscovery** under the **Provided APIs** field. 36 | 37 | #. Click **Create NodeFeatureDiscovery**. 38 | 39 | #. In the following screen, click **Create**. This starts the Node Feature Discovery Operator that proceeds to label the nodes in the cluster that have GPUs. 40 | 41 | .. note:: The values prepopulated by the OperatorHub are valid for the GPU Operator. 42 | 43 | ************************************************************************* 44 | Verify that the Node Feature Discovery Operator is functioning correctly 45 | ************************************************************************* 46 | 47 | The Node Feature Discovery Operator uses vendor PCI IDs to identify hardware in a node. NVIDIA uses the PCI ID ``10de``. Use the OpenShift Container Platform web console or the CLI to verify that the Node Feature Discovery Operator is functioning correctly. 48 | 49 | 50 | #. In the OpenShift Container Platform web console, click **Compute** > **Nodes** from the side menu. 51 | 52 | #. Select a worker node that contains a GPU. 53 | 54 | #. Click the **Details** tab. 55 | 56 | #. Under **Node Labels**, verify that the following label is present: 57 | 58 | .. code-block:: console 59 | 60 | feature.node.kubernetes.io/pci-10de.present=true 61 | 62 | .. note:: ``0x10de`` is the PCI vendor ID assigned to NVIDIA. 63 | 64 | #. Verify that the GPU device (``pci-10de``) is discovered on the GPU node: 65 | 66 | .. code-block:: console 67 | 68 | $ oc describe node | egrep 'Roles|pci' | grep -v master 69 | 70 | .. 
code-block:: console 71 | 72 | Roles: worker 73 | feature.node.kubernetes.io/pci-10de.present=true 74 | feature.node.kubernetes.io/pci-1d0f.present=true 75 | Roles: worker 76 | feature.node.kubernetes.io/pci-1013.present=true 77 | feature.node.kubernetes.io/pci-8086.present=true 78 | Roles: worker 79 | feature.node.kubernetes.io/pci-1013.present=true 80 | feature.node.kubernetes.io/pci-8086.present=true 81 | Roles: worker 82 | feature.node.kubernetes.io/pci-1013.present=true 83 | feature.node.kubernetes.io/pci-8086.present=true 84 | -------------------------------------------------------------------------------- /openshift/introduction.rst: -------------------------------------------------------------------------------- 1 | .. Date: Oct 24 2022 2 | .. Author: kquinn 3 | 4 | .. _essug: https://docs.nvidia.com/enterprise-support-and-services-user-guide/about-this-user-guide/index.html 5 | .. |essug| replace:: *NVIDIA Enterprise Support and Services User Guide* 6 | 7 | .. _openshift-introduction: 8 | 9 | ************************************************ 10 | Introduction to NVIDIA GPU Operator on OpenShift 11 | ************************************************ 12 | 13 | Kubernetes is an open-source platform for automating the deployment, scaling, and managing of containerized applications. 14 | 15 | Red Hat OpenShift Container Platform is a security-centric and enterprise-grade hardened Kubernetes platform for deploying and managing Kubernetes clusters at scale, developed and supported by Red Hat. 16 | Red Hat OpenShift Container Platform includes enhancements to Kubernetes so users can easily configure and use GPU resources for accelerating workloads like deep learning. 17 | 18 | The NVIDIA GPU Operator uses the operator framework within Kubernetes to automate the management of all NVIDIA software components needed to provision GPU. These components include the NVIDIA drivers (to enable CUDA), 19 | Kubernetes device plugin for GPUs, the `NVIDIA Container Toolkit `_, 20 | automatic node labeling using `GFD `_, `DCGM `_-based monitoring, and others. 21 | 22 | For guidance on the specific NVIDIA support entitlement needs, 23 | refer |essug|_ if you have an NVIDIA AI Enterprise entitlement. 24 | Otherwise, refer to the `Obtaining Support from NVIDIA `_ 25 | Red Hat Knowledgebase article. 26 | -------------------------------------------------------------------------------- /openshift/prerequisites.rst: -------------------------------------------------------------------------------- 1 | .. Date: November 26 2021 2 | .. Author: kquinn 3 | 4 | ******************************************* 5 | Prerequisites for GPU Operator on OpenShift 6 | ******************************************* 7 | 8 | Before following the steps in this guide, ensure that your environment has: 9 | 10 | * A working OpenShift cluster up and running with a GPU worker node. Refer to the `OpenShift Container Platform installation overview `_ for installation guidance. 11 | Refer to :external+gpuop:ref:`Container Platforms ` for the support matrix of the GPU Operator releases and the supported container platforms for more information. 12 | * Access to the OpenShift cluster as a ``cluster-admin`` to perform the necessary steps. 13 | * OpenShift CLI (``oc``) installed. 
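As a quick sanity check of these prerequisites (commands shown for illustration only), confirm that ``oc`` is installed and that you can reach the cluster:

.. code-block:: console

   $ oc whoami
   $ oc get nodes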
14 | -------------------------------------------------------------------------------- /openshift/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "25.10", 3 | "versions": 4 | [ 5 | { 6 | "version": "25.10" 7 | }, 8 | { 9 | "version": "25.3" 10 | }, 11 | { 12 | "version": "24.9" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /openshift/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../25.10", 5 | "version": "25.10" 6 | }, 7 | { 8 | "url": "../25.3", 9 | "version": "25.3" 10 | }, 11 | { 12 | "url": "../24.9", 13 | "version": "24.9" 14 | } 15 | ] 16 | -------------------------------------------------------------------------------- /partner-validated/PARTNER-VALIDATED-TEMPLATE.rst: -------------------------------------------------------------------------------- 1 | .. headings # #, * *, =, -, ^, " 2 | 3 | .. |prod-name-long| replace:: Your Product Name v1.0 4 | .. |prod-name-short| replace:: YPN 5 | 6 | ######################################### 7 | NVIDIA GPU Operator with |prod-name-long| 8 | ######################################### 9 | 10 | 11 | ********************************************* 12 | About the GPU Operator with |prod-name-short| 13 | ********************************************* 14 | 15 | Use this section of the documentation to describe the benefits that customers 16 | can experience by using the NVIDIA GPU Operator and the product together. 17 | 18 | Providing a summary of the competitive advantages that your product provides 19 | is appropriate. 20 | 21 | Providing a URL to your product documentation so readers can learn more about 22 | your product is also appropriate. 23 | 24 | 25 | ****************************** 26 | Validated Configuration Matrix 27 | ****************************** 28 | 29 | Identify the hardware baseline that was used to self-validate your product with 30 | the Operator. 31 | 32 | .. rubric:: Example 33 | 34 | .. list-table:: 35 | :header-rows: 1 36 | 37 | * - 38 | - | NVIDIA 39 | | GPU Operator 40 | - | Operating 41 | | System 42 | - | Container 43 | | Runtime 44 | - Kubernetes 45 | - Helm 46 | - NVIDIA GPU 47 | - Hardware Model 48 | 49 | * - |prod-name-long| 50 | - v23.3.1 51 | - | Ubuntu 22.04 52 | | Ubuntu 20.04 53 | - containerd v1.6 54 | - 1.25, 1.26 55 | - v3 56 | - | NVIDIA HGX H100 57 | | NVIDIA H100 58 | | NVIDIA A100 59 | - | Dell PowerEdge R740 60 | | 2 $\times$ Intel Xeon Silver 2.2 GHz 61 | | 64GB RAM, 1TB NVMe 62 | 63 | Include at least the following pieces of information: 64 | 65 | * **Product name.** 66 | Specify your product name and version. 67 | 68 | * **GPU Operator version.** 69 | Specify the version of the NVIDIA GPU Operator that you self-validated. 70 | 71 | * **Operating system.** 72 | Specify the operating system name and version that you self-validated. 73 | 74 | * **Container runtime.** 75 | Specify the container runtime name and version. 76 | Refer to the 77 | `Supported Container Runtimes `_ 78 | section of the platform support page. 79 | 80 | * **Kubernetes version.** 81 | Specify the Kubernetes version, such as ``1.25``, that your product uses. 82 | 83 | * **Helm version.** 84 | Specify the version of Helm that you used with your product to self-validate. 85 | If Helm is not used to install the NVIDIA GPU Operator, identify the product 86 | and version that you used for installation. 
87 | 88 | * **NVIDIA GPU model.** 89 | Use the same product model name that is provided in the 90 | `Supported NVIDIA GPUs and Systems `_ 91 | section of the platform support page. 92 | 93 | * **Hardware model.** 94 | Including a summary of the CPU model, number of CPUs, memory, and other 95 | popular specifications is appropriate. 96 | 97 | 98 | ************* 99 | Prerequisites 100 | ************* 101 | 102 | Specify the conditions that the customer must meet before beginning to install 103 | the NVIDIA GPU Operator. 104 | 105 | References to product documentation are appropriate. 106 | 107 | A few commands with brief example output that customers can run to verify their 108 | readiness is appropriate. 109 | 110 | A bulleted list is an effective presentation for simple and brief prerequisites 111 | information, but is not required. 112 | 113 | If the prerequisites are not simple and require running several commands to 114 | verify readiness to begin, organize the commands or requirements into stages 115 | and create a level 3 heading for each of the stages. 116 | 117 | 118 | ********* 119 | Procedure 120 | ********* 121 | 122 | You can keep the heading as Procedure, or you can replace with text similar to 123 | Configuring |prod-name-short| with the GPU Operator. 124 | 125 | If the procedure is in the range of 7 to 10 steps, then present them after 126 | the heading. 127 | 128 | If the procedure is more sophisticated, organize the steps into stages and 129 | create a level 3 heading for each of the stages. 130 | 131 | 132 | **************************************************** 133 | Verifying |prod-name-short| with the GPU Operator 134 | **************************************************** 135 | 136 | Optionally, include commands that the customer can run to verify that the 137 | installation is successful and that workloads can use the NVIDIA GPUs. 138 | 139 | 140 | *************** 141 | Getting Support 142 | *************** 143 | 144 | Indicate how end users can receive support from you regarding your product. 145 | 146 | * URL for product documentation. 147 | * Information to help an end user to open a support request with you. 148 | 149 | 150 | ******************* 151 | Related Information 152 | ******************* 153 | 154 | Provide URLs to product documentation, support forums, and so on. -------------------------------------------------------------------------------- /partner-validated/k0rdent.rst: -------------------------------------------------------------------------------- 1 | .. headings # #, * *, =, -, ^, " 2 | 3 | .. |prod-name-long| replace:: Mirantis k0rdent 4 | .. |prod-name-short| replace:: k0rdent 5 | 6 | ############################################# 7 | |prod-name-long| with the NVIDIA GPU Operator 8 | ############################################# 9 | 10 | 11 | ********************************************* 12 | About |prod-name-short| with the GPU Operator 13 | ********************************************* 14 | 15 | |prod-name-short| is as a "super control plane" designed to ensure the consistent provisioning and lifecycle 16 | management of Kubernetes clusters and the services that make them useful. The goal of the k0rdent project is 17 | to provide platform engineers with the means to deliver a distributed container management environment (DCME) 18 | and enable them to compose unique internal developer platforms (IDP) to support a diverse range of complex 19 | modern application workloads. 
20 | 21 | The NVIDIA GPU Operator uses the operator framework within Kubernetes to automate 22 | both the deployment and management of all NVIDIA software components needed to provision NVIDIA GPUs. 23 | These components include the NVIDIA GPU drivers to enable CUDA, Kubernetes device plugin for GPUs, 24 | the NVIDIA Container Toolkit, automatic node labeling using GFD, DCGM based monitoring and others. 25 | 26 | 27 | ****************************** 28 | Validated Configuration Matrix 29 | ****************************** 30 | 31 | |prod-name-long| has self-validated with the following components and versions: 32 | 33 | .. list-table:: 34 | :header-rows: 1 35 | 36 | * - Version 37 | - | NVIDIA 38 | | GPU 39 | | Operator 40 | - | Operating 41 | | System 42 | - | Container 43 | | Runtime 44 | - Kubernetes 45 | - Helm 46 | - NVIDIA GPU 47 | - Hardware Model 48 | 49 | * - k0rdent 0.2.0 / k0s v1.31.5+k0s 50 | - v24.9.2 51 | - | Ubuntu 22.04 52 | - containerd v1.7.24 with the NVIDIA Container Toolkit v1.17.4 53 | - 1.31.5 54 | - Helm v3 55 | - | 2x NVIDIA RTX 4000 SFF Ada 20GB GDDR6 (ECC) 56 | - | Supermicro SuperServer 6028U-E1CNR4T+ 57 | 58 | | 1000W Supermicro PWS-1K02A-1R 59 | 60 | | 2x Intel Xeon E5-2630v4, 10C/20T 2.2/3.1 GHz LGA 2011-3 25MB 85W 61 | 62 | | 32GB DDR4-2666 RDIMM, M393A4K40BB2-CTD6Q 63 | 64 | | NVMe 960GB PM983 NVMe M.2, MZ1LB960HAJQ-00007 65 | 66 | | 2 x NVIDIA RTX 4000 SFF Ada 20GB GDDR6 (ECC), 70W, PCIe 4.0x16, 4x 67 | 68 | | 4x Mini DisplayPort 1.4a 69 | 70 | 71 | ************* 72 | Prerequisites 73 | ************* 74 | 75 | * A running |prod-name-short| managed cluster with at least one control plane node and two worker nodes. 76 | The recommended configuration is at least three control plane nodes and at least two worker nodes. 77 | 78 | * At least one worker node with an NVIDIA GPU physically installed. 79 | The GPU Operator can locate the GPU and label the node accordingly. 80 | 81 | * The kubeconfig file for the |prod-name-short| managed cluster on the seed node. 82 | You can get the file from the |prod-name-short| control plane. 83 | 84 | * You have access to the |prod-name-short| cluster. 85 | 86 | 87 | ********* 88 | Procedure 89 | ********* 90 | 91 | Perform the following steps to prepare the |prod-name-short| cluster: 92 | 93 | #. Install template to k0rdent 94 | 95 | .. code-block:: console 96 | 97 | $ helm install gpu-operator oci://ghcr.io/k0rdent/catalog/charts/gpu-operator-service-template \ 98 | --version 24.9.2 -n kcm-system 99 | 100 | #. Verify service template: 101 | 102 | .. code-block:: console 103 | 104 | $ kubectl get servicetemplates -A 105 | 106 | *Example Output* 107 | 108 | .. code-block:: output 109 | 110 | NAMESPACE NAME VALID 111 | kcm-system gpu-operator-24-9-2 true 112 | 113 | #. Deploy service template to child cluster: 114 | 115 | .. 
87 | *********
88 | Procedure
89 | *********
90 | 
91 | Perform the following steps to prepare the |prod-name-short| cluster:
92 | 
93 | #. Install the service template to k0rdent:
94 | 
95 |    .. code-block:: console
96 | 
97 |       $ helm install gpu-operator oci://ghcr.io/k0rdent/catalog/charts/gpu-operator-service-template \
98 |           --version 24.9.2 -n kcm-system
99 | 
100 | #. Verify the service template:
101 | 
102 |    .. code-block:: console
103 | 
104 |       $ kubectl get servicetemplates -A
105 | 
106 |    *Example Output*
107 | 
108 |    .. code-block:: output
109 | 
110 |       NAMESPACE    NAME                  VALID
111 |       kcm-system   gpu-operator-24-9-2   true
112 | 
113 | #. Deploy the service template to a child cluster by creating the following ``MultiClusterService`` resource:
114 | 
115 |    .. code-block:: yaml
116 | 
117 |       apiVersion: k0rdent.mirantis.com/v1alpha1
118 |       kind: MultiClusterService
119 |       metadata:
120 |         name: gpu-operator
121 |       spec:
122 |         clusterSelector:
123 |           matchLabels:
124 |             group: demo
125 |         serviceSpec:
126 |           services:
127 |             - template: gpu-operator-24-9-2
128 |               name: gpu-operator
129 |               namespace: gpu-operator
130 |               values: |
131 |                 operator:
132 |                   defaultRuntime: containerd
133 |                 toolkit:
134 |                   env:
135 |                     - name: CONTAINERD_CONFIG
136 |                       value: /etc/k0s/containerd.d/nvidia.toml
137 |                     - name: CONTAINERD_SOCKET
138 |                       value: /run/k0s/containerd.sock
139 |                     - name: CONTAINERD_RUNTIME_CLASS
140 |                       value: nvidia
141 | 
142 | 
143 | The |prod-name-short| managed clusters that match the cluster selector will now have the NVIDIA GPU Operator deployed.
144 | 
145 | *************************************************
146 | Verifying |prod-name-short| with the GPU Operator
147 | *************************************************
148 | 
149 | Refer to :external+gpuop:ref:`running sample gpu applications` to verify the installation.
150 | 
151 | ***************
152 | Getting Support
153 | ***************
154 | 
155 | Refer to the k0rdent product documentation for information about working with k0rdent.
156 | 
157 | *******************
158 | Related Information
159 | *******************
160 | 
161 | * https://docs.k0rdent.io/v0.2.0/
162 | 
--------------------------------------------------------------------------------
/partner-validated/versions.json:
--------------------------------------------------------------------------------
1 | {
2 |     "versions":
3 |     [
4 |         {
5 |             "version": "1.0.0"
6 |         }
7 |     ]
8 | }
--------------------------------------------------------------------------------
/partner-validated/versions1.json:
--------------------------------------------------------------------------------
1 | [
2 |     {
3 |         "preferred": "true",
4 |         "url": "../1.0.0",
5 |         "version": "1.0.0"
6 |     }
7 | ]
--------------------------------------------------------------------------------
/playground/dind.rst:
--------------------------------------------------------------------------------
1 | .. Date: November 13 2020
2 | .. Author: pramarao
3 | 
4 | .. _dind:
5 | 
6 | ##################
7 | Docker-in-Docker
8 | ##################
9 | 
10 | You can also run GPU containers with Docker-in-Docker (dind). Just mount the Docker socket into the container and then
11 | specify the CUDA container that you want to run:
12 | 
13 | .. code-block:: console
14 | 
15 |    $ sudo docker run -v /var/run/docker.sock:/var/run/docker.sock \
16 |        docker run --rm --gpus all \
17 |        nvidia/cuda:11.0-base \
18 |        nvidia-smi
19 | 
20 | With the resulting output:
21 | 
22 | .. code-block:: console
23 | 
24 |    +-----------------------------------------------------------------------------+
25 |    | NVIDIA-SMI 455.45.01 Driver Version: 455.45.01 CUDA Version: 11.1 |
26 |    |-------------------------------+----------------------+----------------------+
27 |    | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
28 |    | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
29 |    | | | MIG M.
| 30 | |===============================+======================+======================| 31 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 32 | | N/A 31C P8 9W / 70W | 0MiB / 15109MiB | 0% Default | 33 | | | | N/A | 34 | +-------------------------------+----------------------+----------------------+ 35 | 36 | +-----------------------------------------------------------------------------+ 37 | | Processes: | 38 | | GPU GI CI PID Type Process name GPU Memory | 39 | | ID ID Usage | 40 | |=============================================================================| 41 | | No running processes found | 42 | +-----------------------------------------------------------------------------+ 43 | 44 | Or launch an interactive session within an interactive session, Inception style! 45 | 46 | .. code-block:: console 47 | 48 | $ sudo docker run -ti -v /var/run/docker.sock:/var/run/docker.sock docker 49 | 50 | .. code-block:: console 51 | 52 | / # docker run -it --gpus all nvidia/cuda:11.1-base 53 | Unable to find image 'nvidia/cuda:11.1-base' locally 54 | 11.1-base: Pulling from nvidia/cuda 55 | 6a5697faee43: Pull complete 56 | ba13d3bc422b: Pull complete 57 | a254829d9e55: Pull complete 58 | f853e5702a31: Pull complete 59 | 29cfce72a460: Pull complete 60 | 4bb689f629d3: Pull complete 61 | Digest: sha256:6007208f8a1f626c0175260ebd46b1cbde10aab67e6d810fa593357b8199bfbe 62 | Status: Downloaded newer image for nvidia/cuda:11.1-base 63 | root@f29740c58731:/# nvidia-smi 64 | 65 | +-----------------------------------------------------------------------------+ 66 | | NVIDIA-SMI 455.45.01 Driver Version: 455.45.01 CUDA Version: 11.1 | 67 | |-------------------------------+----------------------+----------------------+ 68 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 69 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 70 | | | | MIG M. | 71 | |===============================+======================+======================| 72 | | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | 73 | | N/A 31C P8 9W / 70W | 0MiB / 15109MiB | 0% Default | 74 | | | | N/A | 75 | +-------------------------------+----------------------+----------------------+ 76 | 77 | +-----------------------------------------------------------------------------+ 78 | | Processes: | 79 | | GPU GI CI PID Type Process name GPU Memory | 80 | | ID ID Usage | 81 | |=============================================================================| 82 | | No running processes found | 83 | +-----------------------------------------------------------------------------+ 84 | 85 | What other cool stuff can you do? Send us details via GitHub `issues `_! 86 | 87 | -------------------------------------------------------------------------------- /playground/x-arch.rst: -------------------------------------------------------------------------------- 1 | .. Date: November 13 2020 2 | .. Author: pramarao 3 | 4 | .. _x-arch: 5 | 6 | ########################################## 7 | Running Cross-Architecture Containers 8 | ########################################## 9 | 10 | For many reasons, it is desirable to build and run containers for one CPU architecture (e.g. ``x86_64``) 11 | on another CPU architecture (e.g. ``Arm64``). 12 | 13 | ************************ 14 | Emulation Environment 15 | ************************ 16 | 17 | One solution would be to use an emulation environment using the `QEMU `_ emulator and Docker. 
18 | Using **QEMU**, `binfmt_misc `_ and the registration scripts via the
19 | `multiarch/qemu-user-static `_ project, we can run containers built for
20 | either *Arm64* or *POWER* architectures on *x86_64* servers or workstations.
21 | 
22 | Installing QEMU
23 | -----------------
24 | 
25 | Install the *qemu*, *binfmt-support*, and *qemu-user-static* packages. The *binfmt-support* package contains scripts to register binary
26 | formats with the kernel using the *binfmt_misc* module; and the *qemu-user-static* package registers binary formats that emulators can handle.
27 | 
28 | .. code-block:: console
29 | 
30 |    $ sudo apt-get install -y qemu \
31 |        binfmt-support \
32 |        qemu-user-static
33 | 
34 | Run the ``multiarch/qemu-user-static`` container to register the binary formats:
35 | 
36 | .. code-block:: console
37 | 
38 |    $ sudo docker run --rm --privileged \
39 |        multiarch/qemu-user-static \
40 |        --reset \
41 |        -p yes
42 | 
43 | Now, verify that the *binfmt* entries were registered on the system:
44 | 
45 | .. code-block:: console
46 | 
47 |    $ update-binfmts --display
48 | 
49 | .. code-block:: console
50 | 
51 |    ...
52 |    qemu-aarch64 (enabled):
53 |        package = qemu-user-static
54 |        type = magic
55 |        offset = 0
56 |        magic = \x7f\x45\x4c\x46\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb7\x00
57 |        mask = \xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff
58 |        interpreter = /usr/bin/qemu-aarch64-static
59 |        detector =
60 |    ...
61 | 
62 | Running Containers
63 | --------------------
64 | 
65 | The community maintains a number of Docker containers on DockerHub under `arm64v8 `_.
66 | Without an emulator, if you try running an ``arm64`` Alpine container on ``x86_64``, you will observe a format error from Docker.
67 | 
68 | This can be seen in the example below:
69 | 
70 | .. code-block:: console
71 | 
72 |    $ uname -m
73 |    x86_64
74 | 
75 | .. code-block:: console
76 | 
77 |    $ sudo docker run --rm arm64v8/alpine uname -m
78 | 
79 | .. code-block:: console
80 | 
81 |    standard_init_linux.go:211: exec user process caused "exec format error"
82 | 
83 | After installing the QEMU emulator and registering:
84 | 
85 | .. code-block:: console
86 | 
87 |    $ sudo docker run --rm arm64v8/alpine uname -m
88 | 
89 | .. code-block:: console
90 | 
91 |    aarch64
92 | 
93 | 
94 | 
95 | 
--------------------------------------------------------------------------------
/repo:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | 
5 | SCRIPT_DIR=$(dirname ${BASH_SOURCE})
6 | cd "$SCRIPT_DIR"
7 | 
8 | exec "tools/packman/python.sh" tools/repoman/repoman.py $@
9 | 
--------------------------------------------------------------------------------
/repo.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | 
3 | call "%~dp0tools\packman\python.bat" %~dp0tools\repoman\repoman.py %*
4 | if %errorlevel% neq 0 ( goto Error )
5 | 
6 | :Success
7 | exit /b 0
8 | 
9 | :Error
10 | exit /b %errorlevel%
11 | 
--------------------------------------------------------------------------------
/review/index.rst:
--------------------------------------------------------------------------------
1 | .. license-header
2 |    SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |    SPDX-License-Identifier: Apache-2.0
4 | 
5 |    Licensed under the Apache License, Version 2.0 (the "License");
6 |    you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | .. headings # #, * *, =, -, ^, " 18 | 19 | ################ 20 | Technical Review 21 | ################ 22 | 23 | Refer to the following URLs for the review HTML: 24 | 25 | * `NVIDIA Container Toolkit <./container-toolkit/latest/index.html>`__ 26 | * `NVIDIA Driver Containers <./driver-containers/latest/index.html>`__ 27 | * `NVIDIA GPU Operator <./gpu-operator/latest/index.html>`__ 28 | * `NVIDIA GPU Operator on Red Hat OpenShift Container Platform <./openshift/latest/index.html>`__ 29 | * `NVIDIA GPUs and Edge Computing <./edge/latest/index.html>`__ 30 | * `Partner-Validated Configurations <./partner-validated/latest/index.html>`__ 31 | -------------------------------------------------------------------------------- /review/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": 3 | [ 4 | { 5 | "version": "0.1.0" 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /review/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /scripts/create_archive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # WARNING: assumes you are running this script from the top-level directory (e.g. scripts/create_archive.sh) 4 | # Example: 5 | # PROJECT=gpu-operator VERSION=1.9.0 ./scripts/create_archive.sh 6 | 7 | set -e 8 | 9 | PROJECT=${PROJECT:?"Missing PROJECT to archive"} 10 | VERSION=${VERSION:?"Missing VERSION to archive"} 11 | ARCHIVE="${PROJECT}/archive/${VERSION}" 12 | 13 | # Create archive directory and copy over all current files/directories, excluding the archive directory itself 14 | rm -rf $ARCHIVE 15 | mkdir -p $ARCHIVE 16 | rsync -aq "${PROJECT}/" $ARCHIVE --exclude "archive/" 17 | 18 | # Find all labels in the project documentation and extract the label name. 19 | # Labels are in the format: ".. _label-name:" 20 | labels=$(grep -ohr --include \*.rst ".. _[^:]*" ${ARCHIVE} | cut -c 5- | xargs -n1 | sort | xargs) 21 | 22 | # For each label, append a version suffix and update any references to the label. 23 | for label in $labels; do 24 | echo "Updating all references to label: $label" 25 | find $ARCHIVE -name "*.rst" -exec sed -i '' "s/.. _${label}/&-${VERSION}/g" {} \; 26 | find $ARCHIVE -name "*.rst" -exec sed -i '' "s/:ref:\`.*${label}/&-${VERSION}/g" {} \; 27 | done 28 | 29 | -------------------------------------------------------------------------------- /secure-services-istio-keycloak/configure.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | # Configure RBAC 7 | 8 | ````{only} not publish_bsp 9 | ```{contents} 10 | :depth: 2 11 | :backlinks: none 12 | :local: true 13 | ``` 14 | ```` 15 | 16 | ## Inject Istio 17 | 18 | 1. Label the namespace to enable Istio injection. 
19 | 
20 |     ```console
21 |     kubectl label namespace <namespace> istio-injection=enabled --overwrite
22 |     ```
23 | 
24 |     Replace the `<namespace>` placeholder with your target namespace.
25 | 
26 | 2. Delete the existing pods to recreate them with Istio sidecar containers.
27 | 
28 |     ```console
29 |     kubectl delete pod $(kubectl get pods -n <namespace> | awk '{print $1}') -n <namespace>
30 |     ```
31 | 
32 | ## Deploy Manifests
33 | 
34 | 1. The following sample manifest deploys a gateway and ingress virtual service.
35 | 
36 |     - Update the target namespace for the virtual service resource.
37 |     - The sample manifest applies to NVIDIA NIM for LLMs. For other NVIDIA microservices, update the `match` and `route` for the microservice endpoints.
38 |     - For information about the microservice endpoints, refer to the following documents:
39 |       - [NIM Inference API Reference](https://docs.nvidia.com/nim/large-language-models/latest/api-reference.html)
40 |       - [NIM Embedding API Reference](https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/reference.html)
41 |       - [NIM ReRanking API Reference](https://docs.nvidia.com/nim/nemo-retriever/text-reranking/latest/reference.html)
42 | 
43 |     ```{literalinclude} ./manifests/istio-sample-manifest.yaml
44 |     :language: yaml
45 |     ```
46 | 
47 | 2. Apply the manifest.
48 | 
49 |     ```console
50 |     kubectl apply -f istio-sample-manifest.yaml
51 |     ```
52 | 
53 | 3. Determine the Istio ingress gateway node port.
54 | 
55 |     ```console
56 |     kubectl get svc -n istio-system | grep ingress
57 |     ```
58 | 
59 |     *Example Output*
60 | 
61 |     ```output
62 |     istio-ingressgateway   LoadBalancer   10.102.8.149   10.28.234.101   15021:32658/TCP,80:30611/TCP,443:31874/TCP,31400:30160/TCP,15443:32430/TCP   22h
63 |     ```
64 | 
65 | 4. List the worker IP addresses.
66 | 
67 |     ```console
68 |     for node in `kubectl get nodes | awk '{print $1}' | grep -v NAME`; do echo $node ' ' | tr -d '\n'; kubectl describe node $node | grep -i 'internalIP:' | awk '{print $2}'; done
69 |     ```
70 | 
71 |     *Example Output*
72 | 
73 |     ```output
74 |     nim-test-cluster-03-worker-nbhk9-56b4b888dd-8lpqd   10.120.199.16
75 |     nim-test-cluster-03-worker-nbhk9-56b4b888dd-hnrxr   10.120.199.23
76 |     ```
77 | 
78 | 5. The following manifest creates request authentication resources.
79 | 
80 |     - Update the target namespace.
81 |     - Modify the issuer in the manifest with one of the preceding IP addresses and the preceding Istio ingress gateway node port that is mapped to port 80.
82 | 
83 |     ```{literalinclude} ./manifests/requestAuthentication.yaml
84 |     :language: yaml
85 |     ```
86 | 
87 | 6. Apply the manifest.
88 | 
89 |     ```console
90 |     kubectl apply -f requestAuthentication.yaml
91 |     ```
92 | 
93 | 7. The following manifest creates an authorization policy resource.
94 | 
95 |     - Update the target namespace.
96 |     - Update the rules that apply to the target microservices.
97 | 
98 |     ```{literalinclude} ./manifests/authorizationPolicy.yaml
99 |     :language: yaml
100 |     ```
101 | 
102 | 8. Apply the manifest.
103 | 
104 |     ```console
105 |     kubectl apply -f authorizationPolicy.yaml
106 |     ```
107 | 
108 | 9. Create a token for Keycloak authentication.
109 |     Update the node IP address and ingress gateway node port.
110 | 
111 |     ```console
112 |     TOKEN=`curl -X POST -d "client_id=nvidia-nim" -d "username=nim" -d "password=nvidia123" -d "grant_type=password" "http://10.217.19.114:30611/realms/nvidia-nim-llm/protocol/openid-connect/token"| jq .access_token| tr -d '"' `
113 |     ```
114 | 
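    Optionally, you can decode the token payload to confirm that it carries the realm roles that the authorization policy checks (for example, `completions` or `chat`). The following is a minimal sketch that uses `python3` on the client; it assumes that `python3` is available and that the token is a standard three-part JWT with the roles in the `realm_access` claim.

    ```console
    python3 -c 'import base64, json, sys; p = sys.argv[1].split(".")[1]; p += "=" * (-len(p) % 4); print(json.dumps(json.loads(base64.urlsafe_b64decode(p)).get("realm_access", {}), indent=2))' "$TOKEN"
    ```
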
115 | 10. Verify access to the microservice from Keycloak through the Istio gateway.
116 | 
117 |     ```console
118 |     curl -v -X POST http://10.217.19.114:30611/v1/completions -H "Authorization: Bearer $TOKEN" -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ "model": "llama-2-13b-chat","prompt": "What is Kubernetes?","max_tokens": 16,"temperature": 1, "n": 1, "stream": false, "stop": "string", "frequency_penalty": 0.0 }'
119 |     ```
120 | 
121 |     Update the node IP address and ingress gateway port.
122 |     Update the model name if it is not `llama-2-13b-chat`.
123 | 
124 | 11. Generate some more traffic so that it can be visualized on the Kiali dashboard in the next step.
125 | 
126 |     ```console
127 |     for i in $(seq 1 100); do curl -X POST http://10.217.19.114:30611/v1/chat/completions -H 'accept: application/json' -H "Authorization: Bearer $TOKEN" -H 'Content-Type: application/json' -d '{"model": "llama-2-13b-chat","messages": [{"role": "system","content": "You are a helpful assistant."},{"role": "user", "content": "Hello!"}]}' -s -o /dev/null; done
128 |     ```
129 | 
130 | 12. Access the Kiali dashboard, specifying your client system IP address.
131 | 
132 |     ```console
133 |     istioctl dashboard kiali --address <system-ip>
134 |     ```
135 | 
136 |     Access the dashboard in a browser with the `<system-ip>` address and port `20001`.
137 | 
138 | ## Conclusion
139 | 
140 | This architecture offers a robust solution for deploying NVIDIA NeMo MicroServices in a secure, scalable, and efficient manner. Integrating advanced service mesh capabilities with OIDC authentication sets a new standard for building sophisticated AI-driven applications.
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-1.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-10.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-11.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-12.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-13.png
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/images/keycloak-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-14.png
-------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-15.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-16.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-2.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-3.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-4.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-5.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-6.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-7.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-8.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/keycloak-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/keycloak-9.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/images/reference-arch-01.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/secure-services-istio-keycloak/images/reference-arch-01.png -------------------------------------------------------------------------------- /secure-services-istio-keycloak/manifests/authorizationPolicy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.istio.io/v1beta1 2 | kind: AuthorizationPolicy 3 | metadata: 4 | name: nim-auth-policy 5 | namespace: 6 | spec: 7 | selector: 8 | matchLabels: 9 | app.kubernetes.io/name: inferencing 10 | rules: 11 | - from: 12 | - source: 13 | requestPrincipals: ["*"] 14 | to: 15 | - operation: 16 | methods: ["POST"] 17 | paths: ["/v1/completions*"] 18 | when: 19 | - key: request.auth.claims[realm_access][roles] 20 | values: ["completions"] 21 | - from: 22 | - source: 23 | requestPrincipals: ["*"] 24 | to: 25 | - operation: 26 | methods: ["POST"] 27 | paths: ["/v1/chat/completions*"] 28 | when: 29 | - key: request.auth.claims[realm_access][roles] 30 | values: ["chat"] -------------------------------------------------------------------------------- /secure-services-istio-keycloak/manifests/istio-sample-manifest.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: networking.istio.io/v1alpha3 3 | kind: Gateway 4 | metadata: 5 | name: rag-gateway 6 | namespace: istio-system 7 | spec: 8 | selector: 9 | istio: ingressgateway 10 | servers: 11 | - port: 12 | number: 80 13 | name: http2 14 | protocol: HTTP 15 | hosts: 16 | - "*" 17 | 18 | --- 19 | apiVersion: networking.istio.io/v1alpha3 20 | kind: VirtualService 21 | metadata: 22 | name: sample-vs 23 | namespace: 24 | spec: 25 | hosts: 26 | - "*" 27 | gateways: 28 | - istio-system/rag-gateway 29 | http: 30 | - match: 31 | - uri: 32 | prefix: /admin 33 | - uri: 34 | prefix: /resources 35 | - uri: 36 | prefix: /welcome 37 | - uri: 38 | prefix: /realms 39 | route: 40 | - destination: 41 | host: keycloak.default.svc.cluster.local 42 | port: 43 | number: 8080 44 | - match: 45 | - uri: 46 | prefix: /v1/completions 47 | - uri: 48 | prefix: /v1/chat/completions 49 | route: 50 | - destination: 51 | host: inferencing 52 | port: 53 | number: 8080 54 | -------------------------------------------------------------------------------- /secure-services-istio-keycloak/manifests/requestAuthentication.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: security.istio.io/v1beta1 3 | kind: RequestAuthentication 4 | metadata: 5 | name: nim-request-authentication 6 | namespace: 7 | spec: 8 | selector: 9 | matchLabels: 10 | app.kubernetes.io/name: inferencing 11 | jwtRules: 12 | - issuer: "http://10.176.21.249:30669/realms/nvidia-nim" 13 | jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs" 14 | forwardOriginalToken: true 15 | fromHeaders: 16 | - name: Authorization 17 | prefix: "Bearer" 18 | - issuer: "http://10.176.21.249/realms/nvidia-nim" 19 | jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs" 20 | forwardOriginalToken: true 21 | fromHeaders: 22 | - name: Authorization 23 | prefix: "Bearer" 24 | --- 25 | apiVersion: security.istio.io/v1beta1 26 | kind: RequestAuthentication 27 | metadata: 28 | name: nim-request-authentication-gw 29 | namespace: istio-system 30 | spec: 31 | selector: 32 | matchLabels: 33 | 
      istio: ingressgateway
34 |   jwtRules:
35 |   - issuer: "http://10.176.21.249:30669/realms/nvidia-nim"
36 |     jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs"
37 |     forwardOriginalToken: true
38 |     fromHeaders:
39 |     - name: Authorization
40 |       prefix: "Bearer"
41 |   - issuer: "http://10.176.21.249/realms/nvidia-nim"
42 |     jwksUri: "http://keycloak.default.svc.cluster.local:8080/realms/nvidia-nim/protocol/openid-connect/certs"
43 |     forwardOriginalToken: true
44 |     fromHeaders:
45 |     - name: Authorization
46 |       prefix: "Bearer"
47 | 
--------------------------------------------------------------------------------
/secure-services-istio-keycloak/platform-support.md:
--------------------------------------------------------------------------------
1 | 
5 | 
6 | # Platform Support
7 | 
8 | ````{only} not publish_bsp
9 | ```{contents}
10 | :depth: 2
11 | :backlinks: none
12 | :local: true
13 | ```
14 | ````
15 | 
16 | ## Operating Systems and Kubernetes Platforms
17 | 
18 | ```{list-table}
19 | :header-rows: 1
20 | :stub-columns: 1
21 | 
22 | * - Operating System
23 |   - Kubernetes
24 |   - Red Hat OpenShift
25 |   - VMware vSphere with Tanzu
26 | 
27 | * - Ubuntu 22.04
28 |   - 1.29---1.31
29 |   -
30 |   - 8.0 Update 2
31 | 
32 | * - Red Hat Core OS
33 |   -
34 |   - 4.16
35 |   -
36 | ```
37 | 
38 | ## Container Runtimes
39 | 
40 | ```{list-table}
41 | :header-rows: 1
42 | 
43 | * - Operating System
44 |   - containerd
45 |   - CRI-O
46 | 
47 | * - Ubuntu 22.04
48 |   - 1.6, 1.7
49 |   - 1.30
50 | 
51 | * - Red Hat Core OS
52 |   - None
53 |   - Yes [{sup}`1`](cri-o-ocp)
54 | ```
55 | 
56 | (cri-o-ocp)=
57 | {sup}`1` The CRI-O version that is shipped with the supported OpenShift Container Platform version is supported.
58 | 
59 | ## Command-Line Tools
60 | 
61 | ```{list-table}
62 | :header-rows: 1
63 | :widths: 30 70
64 | 
65 | * - Tool
66 |   - Installation Documentation
67 | 
68 | * - kubectl (match cluster version)
69 |   - Refer to
70 |     [Install Tools](https://kubernetes.io/docs/tasks/tools/)
71 |     in the Kubernetes documentation for more information.
72 | 
73 | * - Helm v3 and higher
74 |   - Refer to
75 |     [Install Helm](https://helm.sh/docs/intro/install/)
76 |     in the Helm documentation for more information.
77 | ```
78 | 
79 | ## Installed Components
80 | 
81 | ```{list-table}
82 | :header-rows: 1
83 | :widths: 30 70
84 | 
85 | * - Component
86 |   - Verified Version
87 | 
88 | * - Istio
89 |   - 1.23.2
90 |     Refer to [Istio Releases](https://github.com/istio/istio/tree/release-1.23)
91 |     for more information.
92 | 
93 | * - Keycloak
94 |   - 26.0.0
95 |     Refer to [Keycloak Releases](https://github.com/keycloak/keycloak/tree/release/26.0)
96 |     for more information.
97 | ``` -------------------------------------------------------------------------------- /secure-services-istio-keycloak/versions.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": "0.1.0", 3 | "versions": 4 | [ 5 | { 6 | "version": "0.1.0" 7 | } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /secure-services-istio-keycloak/versions1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "preferred": "true", 4 | "url": "../1.0.0", 5 | "version": "1.0.0" 6 | } 7 | ] -------------------------------------------------------------------------------- /templates/breadcrumbs.html: -------------------------------------------------------------------------------- 1 | {% extends '!components/breadcrumbs.html' %} 2 | 3 | {% set docs_home = "https://docs.nvidia.com" %} 4 | {% set home = docs_home + "/datacenter/cloud-native" %} 5 | 6 | {%- block breadcrumbs %} 7 | 28 | {%- endblock %} 29 | -------------------------------------------------------------------------------- /templates/last-updated.html: -------------------------------------------------------------------------------- 1 | {# Suppress the default last-updated template. #} 2 | -------------------------------------------------------------------------------- /tools/packman/bootstrap/download_file_from_url.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Copyright 2019 NVIDIA CORPORATION 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | #> 16 | 17 | param( 18 | [Parameter(Mandatory=$true)][string]$source=$null, 19 | [string]$output="out.exe" 20 | ) 21 | $filename = $output 22 | 23 | $triesLeft = 4 24 | $delay = 2 25 | do 26 | { 27 | $triesLeft -= 1 28 | 29 | try 30 | { 31 | Write-Host "Downloading from bootstrap.packman.nvidia.com ..." 32 | $wc = New-Object net.webclient 33 | $wc.Downloadfile($source, $fileName) 34 | exit 0 35 | } 36 | catch 37 | { 38 | Write-Host "Error downloading $source!" 39 | Write-Host $_.Exception|format-list -force 40 | if ($triesLeft) 41 | { 42 | Write-Host "Retrying in $delay seconds ..." 43 | Start-Sleep -seconds $delay 44 | } 45 | $delay = $delay * $delay 46 | } 47 | } while ($triesLeft -gt 0) 48 | # We only get here if the retries have been exhausted, remove any left-overs: 49 | if (Test-Path $fileName) 50 | { 51 | Remove-Item $fileName 52 | } 53 | exit 1 -------------------------------------------------------------------------------- /tools/packman/bootstrap/fetch_file_from_packman_bootstrap.cmd: -------------------------------------------------------------------------------- 1 | :: Copyright 2019 NVIDIA CORPORATION 2 | :: 3 | :: Licensed under the Apache License, Version 2.0 (the "License"); 4 | :: you may not use this file except in compliance with the License. 
5 | :: You may obtain a copy of the License at 6 | :: 7 | :: http://www.apache.org/licenses/LICENSE-2.0 8 | :: 9 | :: Unless required by applicable law or agreed to in writing, software 10 | :: distributed under the License is distributed on an "AS IS" BASIS, 11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | :: See the License for the specific language governing permissions and 13 | :: limitations under the License. 14 | 15 | :: You need to specify as input to this command 16 | @setlocal 17 | @set PACKAGE_NAME=%1 18 | @set TARGET_PATH=%2 19 | 20 | @echo Fetching %PACKAGE_NAME% ... 21 | 22 | @powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0download_file_from_url.ps1" ^ 23 | -source "http://bootstrap.packman.nvidia.com/%PACKAGE_NAME%" -output %TARGET_PATH% 24 | :: A bug in powershell prevents the errorlevel code from being set when using the -File execution option 25 | :: We must therefore do our own failure analysis, basically make sure the file exists: 26 | @if not exist %TARGET_PATH% goto ERROR_DOWNLOAD_FAILED 27 | 28 | @endlocal 29 | @exit /b 0 30 | 31 | :ERROR_DOWNLOAD_FAILED 32 | @echo Failed to download file from S3 33 | @echo Most likely because endpoint cannot be reached or file %PACKAGE_NAME% doesn't exist 34 | @endlocal 35 | @exit /b 1 -------------------------------------------------------------------------------- /tools/packman/config.packman.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /tools/packman/packman: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019-2023 NVIDIA CORPORATION 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | SAVED_SETTINGS="$-" 17 | set -eu 18 | 19 | if echo ${PM_VERBOSITY-} | grep -i "debug" > /dev/null ; then 20 | set -x 21 | else 22 | PM_CURL_SILENT="-s -S" 23 | PM_WGET_QUIET="--quiet" 24 | fi 25 | PM_PACKMAN_VERSION=7.5 26 | 27 | # This is necessary for newer macOS 28 | if [ `uname` == 'Darwin' ]; then 29 | export LC_ALL=en_US.UTF-8 30 | export LANG=en_US.UTF-8 31 | fi 32 | 33 | # We cannot rely on realpath, it isn't installed on macOS and some Linux distros 34 | get_abs_filename() { 35 | echo "$(cd "$(dirname "$1")" && pwd)/$(basename "$1")" 36 | } 37 | 38 | # Specify where packman command exists 39 | export PM_INSTALL_PATH="$(get_abs_filename "$(dirname "${BASH_SOURCE}")")" 40 | 41 | # The packages root may already be configured by the user 42 | if [ -z "${PM_PACKAGES_ROOT:-}" ]; then 43 | # Set variable temporarily in this process so that the following execution will work 44 | if [ `uname` == 'Darwin' ]; then 45 | export PM_PACKAGES_ROOT="${HOME}/Library/Application Support/packman-cache" 46 | else 47 | if [ -z "${XDG_CACHE_HOME:-}" ]; then 48 | export PM_PACKAGES_ROOT="${HOME}/.cache/packman" 49 | else 50 | export PM_PACKAGES_ROOT="${XDG_CACHE_HOME}/packman" 51 | fi 52 | fi 53 | fi 54 | 55 | # Ensure the packages root path exists: 56 | if [ ! -d "$PM_PACKAGES_ROOT" ]; then 57 | echo "Creating packman packages cache at $PM_PACKAGES_ROOT" 58 | mkdir -p -m a+rwx "$PM_PACKAGES_ROOT" 59 | fi 60 | 61 | fetch_file_from_s3() 62 | { 63 | SOURCE=$1 64 | SOURCE_URL=http://bootstrap.packman.nvidia.com/$SOURCE 65 | TARGET=$2 66 | echo "Fetching $SOURCE from bootstrap.packman.nvidia.com ..." 67 | if command -v wget >/dev/null 2>&1; then 68 | wget $PM_WGET_QUIET -O$TARGET $SOURCE_URL 69 | else 70 | curl -o $TARGET $SOURCE_URL $PM_CURL_SILENT 71 | fi 72 | } 73 | 74 | generate_temp_file_name() 75 | { 76 | if [ `uname` == "Darwin" ]; then 77 | local tmpfile=`mktemp -t packman` 78 | else 79 | local tmpfile=`mktemp -t packman.XXXXXXXX` 80 | fi 81 | echo "$tmpfile" 82 | } 83 | 84 | install_python() 85 | { 86 | PLATFORM=`uname` 87 | PROCESSOR=`uname -m` 88 | PYTHON_VERSION=3.10.5-1 89 | 90 | if [ $PLATFORM == 'Darwin' ]; then 91 | PYTHON_PACKAGE=$PYTHON_VERSION-macos-x86_64 92 | elif [ $PLATFORM == 'Linux' ] && [ $PROCESSOR == 'x86_64' ]; then 93 | PYTHON_PACKAGE=$PYTHON_VERSION-linux-x86_64 94 | elif [ $PLATFORM == 'Linux' ] && [ $PROCESSOR == 'aarch64' ]; then 95 | PYTHON_PACKAGE=$PYTHON_VERSION-linux-aarch64 96 | else 97 | echo "Operating system not supported" 98 | exit 1 99 | fi 100 | 101 | PYTHON_INSTALL_FOLDER="$PM_PACKAGES_ROOT/python/$PYTHON_PACKAGE" 102 | if [ ! -d "$PYTHON_INSTALL_FOLDER" ]; then 103 | mkdir -p "$PYTHON_INSTALL_FOLDER" 104 | fi 105 | 106 | export PM_PYTHON="$PYTHON_INSTALL_FOLDER/python" 107 | 108 | if [ ! -f "$PM_PYTHON" ]; then 109 | PYTHON_PACKAGE_TMP=$(generate_temp_file_name) 110 | fetch_file_from_s3 "python@$PYTHON_PACKAGE.tar.gz" "$PYTHON_PACKAGE_TMP" 111 | if [ "$?" -eq "0" ]; then 112 | echo "Unpacking python" 113 | tar -xf "$PYTHON_PACKAGE_TMP" -C "$PYTHON_INSTALL_FOLDER" 114 | rm "$PYTHON_PACKAGE_TMP" 115 | else 116 | echo "Failed downloading the Python interpreter" 117 | exit $? 
118 | fi 119 | fi 120 | } 121 | 122 | # Ensure python is available: 123 | if [ -z "${PM_PYTHON_EXT:-}" ]; then 124 | install_python 125 | else 126 | PM_PYTHON="$PM_PYTHON_EXT" 127 | fi 128 | 129 | # The packman module may be externally configured 130 | if [ -z "${PM_MODULE_DIR_EXT:-}" ]; then 131 | PM_MODULE_DIR="$PM_PACKAGES_ROOT/packman-common/$PM_PACKMAN_VERSION" 132 | else 133 | PM_MODULE_DIR="$PM_MODULE_DIR_EXT" 134 | fi 135 | export PM_MODULE="$PM_MODULE_DIR/run.py" 136 | 137 | # Ensure the packman package exists: 138 | if [ ! -f "$PM_MODULE" ]; then 139 | # Remove a previously corrupt packman-common if it's there 140 | if [ -d "$PM_MODULE_DIR" ]; then 141 | rm -rf "$PM_MODULE_DIR" 142 | fi 143 | PM_MODULE_PACKAGE="packman-common@$PM_PACKMAN_VERSION.zip" 144 | TARGET=$(generate_temp_file_name) 145 | # We always fetch packman from S3: 146 | fetch_file_from_s3 "$PM_MODULE_PACKAGE" "$TARGET" 147 | if [ "$?" -eq "0" ]; then 148 | echo "Unpacking ..." 149 | "$PM_PYTHON" -S -s -u -E "$PM_INSTALL_PATH/bootstrap/install_package.py" "$TARGET" "$PM_MODULE_DIR" 150 | rm "$TARGET" 151 | else 152 | echo "Failure while fetching packman module from S3!" 153 | exit 1 154 | fi 155 | fi 156 | 157 | # Generate temporary file name for environment variables: 158 | PM_VAR_PATH=`mktemp -u -t tmp.$$.pmvars.XXXXXX` 159 | 160 | if [ $# -ne 0 ] 161 | then 162 | PM_VAR_PATH_ARG=--var-path="$PM_VAR_PATH" 163 | fi 164 | 165 | "$PM_PYTHON" -S -s -u -E "$PM_MODULE" "$@" ${PM_VAR_PATH_ARG:-} 166 | exit_code=$? 167 | # Export the variables if the file was used and remove the file: 168 | if [ -f "$PM_VAR_PATH" ]; then 169 | while read -r line 170 | do 171 | if [ ${#line} -gt 0 ]; then 172 | export "$line" 173 | fi 174 | done < "$PM_VAR_PATH" 175 | rm -f "$PM_VAR_PATH" 176 | fi 177 | 178 | # avoid leaking -e and -u into the host script if they weren't originally set 179 | if [[ ! ( "$SAVED_SETTINGS" =~ e ) ]]; then 180 | set +e 181 | fi 182 | 183 | if [[ ! ( "$SAVED_SETTINGS" =~ u ) ]]; then 184 | set +u 185 | fi 186 | 187 | # Return the exit code from python 188 | if [ "$exit_code" != 0 ]; then 189 | exit "$exit_code" 190 | fi 191 | -------------------------------------------------------------------------------- /tools/packman/packman.cmd: -------------------------------------------------------------------------------- 1 | :: RUN_PM_MODULE must always be at the same spot for packman update to work (batch reloads file during update!) 
2 | :: [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx] 3 | :: Reset errorlevel status (don't inherit from caller) 4 | @call :ECHO_AND_RESET_ERROR 5 | 6 | :: You can remove this section if you do your own manual configuration of the dev machines 7 | call :CONFIGURE 8 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 9 | 10 | :: Everything below is mandatory 11 | if not defined PM_PYTHON goto :PYTHON_ENV_ERROR 12 | if not defined PM_MODULE goto :MODULE_ENV_ERROR 13 | 14 | set PM_VAR_PATH_ARG= 15 | 16 | if "%1"=="pull" goto :SET_VAR_PATH 17 | if "%1"=="install" goto :SET_VAR_PATH 18 | 19 | :RUN_PM_MODULE 20 | "%PM_PYTHON%" -S -s -u -E "%PM_MODULE%" %* %PM_VAR_PATH_ARG% 21 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 22 | 23 | :: Marshall environment variables into the current environment if they have been generated and remove temporary file 24 | if exist "%PM_VAR_PATH%" ( 25 | for /F "usebackq tokens=*" %%A in ("%PM_VAR_PATH%") do set "%%A" 26 | ) 27 | if %errorlevel% neq 0 ( goto :VAR_ERROR ) 28 | 29 | if exist "%PM_VAR_PATH%" ( 30 | del /F "%PM_VAR_PATH%" 31 | ) 32 | if %errorlevel% neq 0 ( goto :VAR_ERROR ) 33 | 34 | set PM_VAR_PATH= 35 | goto :eof 36 | 37 | :: Subroutines below 38 | :PYTHON_ENV_ERROR 39 | @echo User environment variable PM_PYTHON is not set! Please configure machine for packman or call configure.bat. 40 | exit /b 1 41 | 42 | :MODULE_ENV_ERROR 43 | @echo User environment variable PM_MODULE is not set! Please configure machine for packman or call configure.bat. 44 | exit /b 1 45 | 46 | :VAR_ERROR 47 | @echo Error while processing and setting environment variables! 48 | exit /b 1 49 | 50 | :: pad [xxxx] 51 | :ECHO_AND_RESET_ERROR 52 | @echo off 53 | if /I "%PM_VERBOSITY%"=="debug" ( 54 | @echo on 55 | ) 56 | exit /b 0 57 | 58 | :SET_VAR_PATH 59 | :: Generate temporary path for variable file 60 | for /f "delims=" %%a in ('%PM_PYTHON% -S -s -u -E -c "import tempfile;file = tempfile.NamedTemporaryFile(mode='w+t', delete=False);print(file.name)"') do (set PM_VAR_PATH=%%a) 61 | set PM_VAR_PATH_ARG=--var-path="%PM_VAR_PATH%" 62 | goto :RUN_PM_MODULE 63 | 64 | :CONFIGURE 65 | :: Must capture and set code page to work around issue #279, powershell invocation mutates console font 66 | :: This issue only happens in Windows CMD shell when using 65001 code page. Some Git Bash implementations 67 | :: don't support chcp so this workaround is a bit convoluted. 68 | :: Test for chcp: 69 | chcp > nul 2>&1 70 | if %errorlevel% equ 0 ( 71 | for /f "tokens=2 delims=:" %%a in ('chcp') do (set PM_OLD_CODE_PAGE=%%a) 72 | ) else ( 73 | call :ECHO_AND_RESET_ERROR 74 | ) 75 | :: trim leading space (this is safe even when PM_OLD_CODE_PAGE has not been set) 76 | set PM_OLD_CODE_PAGE=%PM_OLD_CODE_PAGE:~1% 77 | if "%PM_OLD_CODE_PAGE%" equ "65001" ( 78 | chcp 437 > nul 79 | set PM_RESTORE_CODE_PAGE=1 80 | ) 81 | call "%~dp0\bootstrap\configure.bat" 82 | set PM_CONFIG_ERRORLEVEL=%errorlevel% 83 | if defined PM_RESTORE_CODE_PAGE ( 84 | :: Restore code page 85 | chcp %PM_OLD_CODE_PAGE% > nul 86 | ) 87 | set PM_OLD_CODE_PAGE= 88 | set PM_RESTORE_CODE_PAGE= 89 | exit /b %PM_CONFIG_ERRORLEVEL% 90 | -------------------------------------------------------------------------------- /tools/packman/packmanconf.py: -------------------------------------------------------------------------------- 1 | # Use this file to bootstrap packman into your Python environment (3.7.x). 
Simply 2 | # add the path by doing sys.insert to where packmanconf.py is located and then execute: 3 | # 4 | # >>> import packmanconf 5 | # >>> packmanconf.init() 6 | # 7 | # It will use the configured remote(s) and the version of packman in the same folder, 8 | # giving you full access to the packman API via the following module 9 | # 10 | # >> import packmanapi 11 | # >> dir(packmanapi) 12 | 13 | import os 14 | import platform 15 | import sys 16 | 17 | 18 | def init(): 19 | """Call this function to initialize the packman configuration. 20 | 21 | Calls to the packman API will work after successfully calling this function. 22 | 23 | Note: 24 | This function only needs to be called once during the execution of your 25 | program. Calling it repeatedly is harmless but wasteful. 26 | Compatibility with your Python interpreter is checked and upon failure 27 | the function will report what is required. 28 | 29 | Example: 30 | >>> import packmanconf 31 | >>> packmanconf.init() 32 | >>> import packmanapi 33 | >>> packmanapi.set_verbosity_level(packmanapi.VERBOSITY_HIGH) 34 | """ 35 | major = sys.version_info[0] 36 | minor = sys.version_info[1] 37 | if major != 3 or minor != 10: 38 | raise RuntimeError( 39 | f"This version of packman requires Python 3.10.x, but {major}.{minor} was provided" 40 | ) 41 | conf_dir = os.path.dirname(os.path.abspath(__file__)) 42 | os.environ["PM_INSTALL_PATH"] = conf_dir 43 | packages_root = get_packages_root(conf_dir) 44 | version = get_version(conf_dir) 45 | module_dir = get_module_dir(conf_dir, packages_root, version) 46 | sys.path.insert(1, module_dir) 47 | 48 | 49 | def get_packages_root(conf_dir: str) -> str: 50 | root = os.getenv("PM_PACKAGES_ROOT") 51 | if not root: 52 | platform_name = platform.system() 53 | if platform_name == "Windows": 54 | drive, _ = os.path.splitdrive(conf_dir) 55 | root = os.path.join(drive, "packman-repo") 56 | elif platform_name == "Darwin": 57 | # macOS 58 | root = os.path.join( 59 | os.path.expanduser("~"), "/Library/Application Support/packman-cache" 60 | ) 61 | elif platform_name == "Linux": 62 | try: 63 | cache_root = os.environ["XDG_HOME_CACHE"] 64 | except KeyError: 65 | cache_root = os.path.join(os.path.expanduser("~"), ".cache") 66 | return os.path.join(cache_root, "packman") 67 | else: 68 | raise RuntimeError(f"Unsupported platform '{platform_name}'") 69 | # make sure the path exists: 70 | os.makedirs(root, exist_ok=True) 71 | return root 72 | 73 | 74 | def get_module_dir(conf_dir, packages_root: str, version: str) -> str: 75 | module_dir = os.path.join(packages_root, "packman-common", version) 76 | if not os.path.exists(module_dir): 77 | import tempfile 78 | 79 | tf = tempfile.NamedTemporaryFile(delete=False) 80 | target_name = tf.name 81 | tf.close() 82 | url = f"http://bootstrap.packman.nvidia.com/packman-common@{version}.zip" 83 | print(f"Downloading '{url}' ...") 84 | import urllib.request 85 | 86 | urllib.request.urlretrieve(url, target_name) 87 | from importlib.machinery import SourceFileLoader 88 | 89 | # import module from path provided 90 | script_path = os.path.join(conf_dir, "bootstrap", "install_package.py") 91 | ip = SourceFileLoader("install_package", script_path).load_module() 92 | print("Unpacking ...") 93 | ip.install_package(target_name, module_dir) 94 | os.unlink(tf.name) 95 | return module_dir 96 | 97 | 98 | def get_version(conf_dir: str): 99 | path = os.path.join(conf_dir, "packman") 100 | if not os.path.exists(path): # in dev repo fallback 101 | path += ".sh" 102 | with open(path, "rt", encoding="utf8") 
as launch_file: 103 | for line in launch_file.readlines(): 104 | if line.startswith("PM_PACKMAN_VERSION"): 105 | _, value = line.split("=") 106 | return value.strip() 107 | raise RuntimeError(f"Unable to find 'PM_PACKMAN_VERSION' in '{path}'") 108 | -------------------------------------------------------------------------------- /tools/packman/python.bat: -------------------------------------------------------------------------------- 1 | :: Copyright 2019-2020 NVIDIA CORPORATION 2 | :: 3 | :: Licensed under the Apache License, Version 2.0 (the "License"); 4 | :: you may not use this file except in compliance with the License. 5 | :: You may obtain a copy of the License at 6 | :: 7 | :: http://www.apache.org/licenses/LICENSE-2.0 8 | :: 9 | :: Unless required by applicable law or agreed to in writing, software 10 | :: distributed under the License is distributed on an "AS IS" BASIS, 11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | :: See the License for the specific language governing permissions and 13 | :: limitations under the License. 14 | 15 | @echo off 16 | setlocal enableextensions 17 | 18 | call "%~dp0\packman" init 19 | set "PYTHONPATH=%PM_MODULE_DIR%;%PYTHONPATH%" 20 | 21 | if not defined PYTHONNOUSERSITE ( 22 | set PYTHONNOUSERSITE=1 23 | ) 24 | 25 | REM For performance, default to unbuffered; however, allow overriding via 26 | REM PYTHONUNBUFFERED=0 since PYTHONUNBUFFERED on windows can truncate output 27 | REM when printing long strings 28 | if not defined PYTHONUNBUFFERED ( 29 | set PYTHONUNBUFFERED=1 30 | ) 31 | 32 | "%PM_PYTHON%" %* -------------------------------------------------------------------------------- /tools/packman/python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019-2020 NVIDIA CORPORATION 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | 19 | PACKMAN_CMD="$(dirname "${BASH_SOURCE}")/packman" 20 | if [ ! 
-f "$PACKMAN_CMD" ]; then 21 | PACKMAN_CMD="${PACKMAN_CMD}.sh" 22 | fi 23 | source "$PACKMAN_CMD" init 24 | export PYTHONPATH="${PM_MODULE_DIR}:${PYTHONPATH}" 25 | 26 | if [ -z "${PYTHONNOUSERSITE:-}" ]; then 27 | export PYTHONNOUSERSITE=1 28 | fi 29 | 30 | # For performance, default to unbuffered; however, allow overriding via 31 | # PYTHONUNBUFFERED=0 since PYTHONUNBUFFERED on windows can truncate output 32 | # when printing long strings 33 | if [ -z "${PYTHONUNBUFFERED:-}" ]; then 34 | export PYTHONUNBUFFERED=1 35 | fi 36 | 37 | # workaround for our python not shipping with certs 38 | if [[ -z ${SSL_CERT_DIR:-} ]]; then 39 | export SSL_CERT_DIR=/etc/ssl/certs/ 40 | fi 41 | 42 | "${PM_PYTHON}" "$@" 43 | -------------------------------------------------------------------------------- /tools/repoman/repoman.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import io 4 | import contextlib 5 | import packmanapi 6 | 7 | REPO_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..") 8 | REPO_DEPS_FILE = os.path.join(REPO_ROOT, "deps/repo-deps.packman.xml") 9 | 10 | 11 | def bootstrap(): 12 | """ 13 | Bootstrap all omni.repo modules. 14 | 15 | Pull with packman from repo.packman.xml and add them all to python sys.path to enable importing. 16 | """ 17 | with contextlib.redirect_stdout(io.StringIO()): 18 | deps = packmanapi.pull(REPO_DEPS_FILE) 19 | for dep_path in deps.values(): 20 | if dep_path not in sys.path: 21 | sys.path.append(dep_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | bootstrap() 26 | import omni.repo.man 27 | 28 | omni.repo.man.main(REPO_ROOT) 29 | -------------------------------------------------------------------------------- /work/dcgm-offline.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloud-native-docs/HEAD/work/dcgm-offline.inv --------------------------------------------------------------------------------