├── cmd ├── kubelet-gpu-plugin │ ├── test-claims │ │ ├── empty.json │ │ ├── invalid.json │ │ └── multi.json │ └── healthcare.go ├── kubelet-qat-plugin │ ├── main.go │ ├── deviceresources.go │ └── config.go ├── kubelet-gaudi-plugin │ ├── node_state_test.go │ └── main.go ├── qat-showdevice │ └── main.go └── goxpusmi │ └── main.go ├── deployments ├── gpu │ ├── kustomization.yaml │ ├── tests │ │ └── gpu-sample-app │ │ │ ├── kustomization.yaml │ │ │ └── gpu-sample-app.yaml │ ├── base │ │ ├── namespace.yaml │ │ ├── kustomization.yaml │ │ └── device-class.yaml │ ├── overlays │ │ ├── device-faker │ │ │ ├── kustomization.yaml │ │ │ ├── remove-sysfs.yaml │ │ │ └── device-faker.yaml │ │ ├── openshift │ │ │ ├── delete-v1-device-class.yaml │ │ │ ├── device-class.yaml │ │ │ ├── kustomization.yaml │ │ │ ├── clusterRole.yaml │ │ │ ├── clusterRoleBinding.yaml │ │ │ ├── README.md │ │ │ └── securityContextConstraints.yaml │ │ └── nfd_labeled_nodes │ │ │ ├── kustomization.yaml │ │ │ ├── add-nodeselector-intel-gpu.yaml │ │ │ ├── nfd-intel-gpu-device-rule.yaml │ │ │ └── nfd-intel-gpu-platform-labeling.yaml │ ├── intel-xpumanager │ │ ├── gpu-monitor-claim.yaml │ │ ├── xpumd-delete-limits.yaml │ │ ├── kustomization.yaml │ │ └── xpumd-add-dra-resource.yaml │ └── examples │ │ ├── resource-claim-template.yaml │ │ ├── claim-external-gpu.yaml │ │ ├── pod-for-claim-external-gpu.yaml │ │ ├── deployment-extended-resources.yaml │ │ ├── deployment-extended-resources-implicit.yaml │ │ ├── monitor-pod-inline.yaml │ │ ├── pod-inline-aligned-gpus.yaml │ │ ├── pod-inline-first-available.yaml │ │ ├── deployment-inline.yaml │ │ └── pod-inline-gpu.yaml ├── qat │ ├── kustomization.yaml │ ├── base │ │ ├── namespace.yaml │ │ ├── kustomization.yaml │ │ └── device-class.yaml │ ├── overlays │ │ ├── openshift │ │ │ ├── delete-v1-device-class.yaml │ │ │ ├── device-class.yaml │ │ │ ├── kustomization.yaml │ │ │ ├── clusterRole.yaml │ │ │ ├── clusterRoleBinding.yaml │ │ │ ├── README.md │ │ │ └── 
securityContextConstraints.yaml │ │ └── nfd_labeled_nodes │ │ │ ├── kustomization.yaml │ │ │ ├── add-nodeselector-intel-qat.yaml │ │ │ └── nfd-intel-qat-device-rule.yaml │ ├── tests │ │ ├── openssl-qat-engine │ │ │ ├── kustomization.yaml │ │ │ └── openssl-qat-engine.yaml │ │ ├── qatlib-sample-code │ │ │ ├── kustomization.yaml │ │ │ └── qatlib-sample-code.yaml │ │ ├── qat-dpdk-test │ │ │ ├── kustomization.yaml │ │ │ ├── modified-clusterconfig.yaml │ │ │ ├── crypto-perf.yaml │ │ │ └── compress-perf.yaml │ │ └── resource-claim-template.yaml │ └── examples │ │ ├── intel-qat-resource-driver-configuration.yaml │ │ └── deployment-inline.yaml └── gaudi │ ├── kustomization.yaml │ ├── base │ ├── namespace.yaml │ ├── kustomization.yaml │ └── device-class.yaml │ ├── overlays │ ├── device-faker │ │ ├── kustomization.yaml │ │ ├── remove-sysfs.yaml │ │ └── device-faker.yaml │ ├── openshift │ │ ├── delete-v1-device-class.yaml │ │ ├── device-class.yaml │ │ ├── kustomization.yaml │ │ ├── clusterRole.yaml │ │ ├── clusterRoleBinding.yaml │ │ ├── README.md │ │ └── securityContextConstraints.yaml │ └── nfd_labeled_nodes │ │ ├── kustomization.yaml │ │ ├── add-nodeselector-intel-gaudi.yaml │ │ └── nfd-intel-gaudi-device-rule.yaml │ └── examples │ ├── deployment-extended-resources.yaml │ ├── deployment-extended-resources-implicit.yaml │ ├── monitor-pod-inline.yaml │ ├── pod-inline.yaml │ └── deployment-inline.yaml ├── charts ├── intel-gaudi-resource-driver │ ├── templates │ │ ├── NOTES.txt │ │ ├── validating-admission-policy-binding.yaml │ │ ├── ocp-scc-clusterrole.yaml │ │ ├── serviceaccount.yaml │ │ ├── clusterrole.yaml │ │ ├── nfd.yaml │ │ ├── ocp-scc-clusterrolebinding.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── device-class.yaml │ │ ├── ocp-scc.yaml │ │ ├── validating-admission-policy.yaml │ │ ├── _helpers.tpl │ │ └── resource-driver.yaml │ ├── .helmignore │ ├── Chart.yaml │ ├── values.yaml │ └── README.md ├── intel-gpu-resource-driver │ ├── templates │ │ ├── NOTES.txt │ │ ├── 
validating-admission-policy-binding.yaml │ │ ├── ocp-scc-clusterrole.yaml │ │ ├── serviceaccount.yaml │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── ocp-scc-clusterrolebinding.yaml │ │ ├── device-class.yaml │ │ ├── ocp-scc.yaml │ │ ├── validating-admission-policy.yaml │ │ ├── _helpers.tpl │ │ ├── resource-driver.yaml │ │ └── node-feature-rules.yaml │ ├── .helmignore │ ├── Chart.yaml │ ├── values.yaml │ └── README.md └── intel-qat-resource-driver │ ├── templates │ ├── NOTES.txt │ ├── validating-admission-policy-binding.yaml │ ├── ocp-scc-clusterrole.yaml │ ├── serviceaccount.yaml │ ├── clusterrole.yaml │ ├── clusterrolebinding.yaml │ ├── ocp-scc-clusterrolebinding.yaml │ ├── device-class.yaml │ ├── nfd.yaml │ ├── ocp-scc.yaml │ ├── validating-admission-policy.yaml │ ├── _helpers.tpl │ └── resource-driver.yaml │ ├── Chart.yaml │ ├── values.yaml │ └── README.md ├── NOTICE ├── pkg ├── goxpusmi │ └── README.md ├── helpers │ ├── driver.go │ ├── device.go │ └── node_state.go ├── version │ └── version.go ├── gpu │ └── drm │ │ └── drm.go ├── gaudi │ └── device │ │ └── device_test.go └── fakesysfs │ └── fakesysfs.go ├── hack ├── tools.go ├── kind-config.yaml ├── boilerplate.go.txt └── clusterconfig.yaml ├── .gitignore ├── SECURITY.md ├── doc ├── gpu │ ├── generate-pngs.sh │ ├── allocation-immediate.puml │ ├── high-level-overview.puml │ ├── allocation-delayed.puml │ ├── BUILD.md │ ├── complete-overview.puml │ └── README.md ├── cdi-spec-generator │ ├── BUILD.md │ └── README.md ├── qat │ ├── BUILD.md │ ├── TESTING.md │ └── README.md └── gaudi │ ├── BUILD.md │ └── README.md ├── Dockerfile.device-faker ├── Dockerfile.gaudi ├── Dockerfile.qat ├── test └── e2e │ ├── utils │ └── utils.go │ ├── dra_suite_test.go │ └── gpu │ └── gpu.go ├── .golangci.yaml ├── gaudi.mk ├── Dockerfile.gaudi-test ├── gpu.mk ├── qat.mk ├── CONTRIBUTING.md ├── DEV.md ├── README.md └── Dockerfile.gpu /cmd/kubelet-gpu-plugin/test-claims/empty.json: 
-------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /deployments/gpu/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - base 3 | -------------------------------------------------------------------------------- /deployments/qat/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - base 3 | -------------------------------------------------------------------------------- /cmd/kubelet-gpu-plugin/test-claims/invalid.json: -------------------------------------------------------------------------------- 1 | {"foo":"bar",} 2 | -------------------------------------------------------------------------------- /deployments/gaudi/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - base 3 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name }}. -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name }}. -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name }}. 
-------------------------------------------------------------------------------- /deployments/gpu/tests/gpu-sample-app/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - gpu-sample-app.yaml 3 | -------------------------------------------------------------------------------- /deployments/qat/base/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: intel-qat-resource-driver 5 | -------------------------------------------------------------------------------- /deployments/gaudi/base/namespace.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: intel-gaudi-resource-driver 6 | -------------------------------------------------------------------------------- /deployments/gpu/base/namespace.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: intel-gpu-resource-driver 6 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | These contents may have been developed with support from one or more Intel-operated generative artificial intelligence solutions. 
2 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/device-faker/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../../base 3 | 4 | patches: 5 | - path: remove-sysfs.yaml 6 | - path: device-faker.yaml 7 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/device-faker/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../../base 3 | 4 | patches: 5 | - path: remove-sysfs.yaml 6 | - path: device-faker.yaml 7 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/openshift/delete-v1-device-class.yaml: -------------------------------------------------------------------------------- 1 | $patch: delete 2 | apiVersion: resource.k8s.io/v1 3 | kind: DeviceClass 4 | metadata: 5 | name: gpu.intel.com 6 | -------------------------------------------------------------------------------- /deployments/qat/overlays/openshift/delete-v1-device-class.yaml: -------------------------------------------------------------------------------- 1 | $patch: delete 2 | apiVersion: resource.k8s.io/v1 3 | kind: DeviceClass 4 | metadata: 5 | name: qat.intel.com 6 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/openshift/delete-v1-device-class.yaml: -------------------------------------------------------------------------------- 1 | $patch: delete 2 | apiVersion: resource.k8s.io/v1 3 | kind: DeviceClass 4 | metadata: 5 | name: gaudi.intel.com 6 | -------------------------------------------------------------------------------- /pkg/goxpusmi/README.md: -------------------------------------------------------------------------------- 1 | This is a Go binding for the libxpum API used by the Intel GPU DRA driver. 
2 | 3 | Headers and library are from public releases of [XPUManager project](https://github.com/intel/xpumanager/releases/). 4 | -------------------------------------------------------------------------------- /hack/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | // +build tools 3 | 4 | // This package imports things required by build scripts, to force `go mod` to see them as dependencies 5 | package tools 6 | 7 | import _ "k8s.io/code-generator" 8 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/openshift/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: gpu.intel.com 5 | spec: 6 | selectors: 7 | - cel: 8 | expression: device.driver == "gpu.intel.com" 9 | -------------------------------------------------------------------------------- /deployments/qat/overlays/openshift/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: qat.intel.com 5 | spec: 6 | selectors: 7 | - cel: 8 | expression: device.driver == "qat.intel.com" 9 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/openshift/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: gaudi.intel.com 5 | spec: 6 | selectors: 7 | - cel: 8 | expression: device.driver == "gaudi.intel.com" 9 | -------------------------------------------------------------------------------- /deployments/qat/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - device-class.yaml 3 | - 
namespace.yaml 4 | - resource-driver.yaml 5 | 6 | #images: 7 | # - name: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver 8 | # newTag: v0.4.0 9 | -------------------------------------------------------------------------------- /deployments/qat/overlays/nfd_labeled_nodes/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - ../../base 6 | - nfd-intel-qat-device-rule.yaml 7 | 8 | patches: 9 | - path: add-nodeselector-intel-qat.yaml 10 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/nfd_labeled_nodes/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - ../../base 6 | - nfd-intel-gaudi-device-rule.yaml 7 | 8 | patches: 9 | - path: add-nodeselector-intel-gaudi.yaml 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /vendor/ 3 | 4 | # macOS 5 | .DS_Store 6 | 7 | # files generated by editors 8 | .idea/ 9 | *.iml 10 | .vscode/ 11 | *.swp 12 | *.sublime-project 13 | *.sublime-workspace 14 | *~ 15 | *.o 16 | *.so 17 | *.out 18 | 19 | Chart.lock 20 | *.tgz 21 | 22 | *.patch 23 | -------------------------------------------------------------------------------- /deployments/qat/tests/openssl-qat-engine/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - openssl-qat-engine.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | images: 7 | - name: openssl-qat-engine:devel 8 | newName: intel/openssl-qat-engine 9 | newTag: devel 10 | 
-------------------------------------------------------------------------------- /deployments/qat/tests/qatlib-sample-code/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - qatlib-sample-code.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | images: 7 | - name: openssl-qat-engine:devel 8 | newName: intel/openssl-qat-engine 9 | newTag: devel 10 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/gpu-monitor-claim.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: intel-gpu-monitor-claim 5 | spec: 6 | metadata: 7 | labels: 8 | app: intel-gpu-monitor-claim 9 | spec: 10 | resourceClassName: intel-gpu-monitor 11 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - ../../base 6 | - nfd-intel-gpu-device-rule.yaml 7 | - nfd-intel-gpu-platform-labeling.yaml 8 | 9 | patches: 10 | - path: add-nodeselector-intel-gpu.yaml 11 | -------------------------------------------------------------------------------- /deployments/gaudi/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - device-class.yaml 3 | - namespace.yaml 4 | - resource-driver.yaml 5 | 6 | #images: 7 | # - name: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver 8 | # newName: registry.local/intel-gaudi-resource-driver 9 | # newTag: v0.6.0 10 | -------------------------------------------------------------------------------- 
/charts/intel-gpu-resource-driver/templates/validating-admission-policy-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gpu 5 | spec: 6 | policyName: resourceslices-policy-dra-kubelet-plugin-gpu 7 | validationActions: [Deny] 8 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/validating-admission-policy-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-qat 5 | spec: 6 | policyName: resourceslices-policy-dra-kubelet-plugin-qat 7 | validationActions: [Deny] 8 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/validating-admission-policy-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gaudi 5 | spec: 6 | policyName: resourceslices-policy-dra-kubelet-plugin-gaudi 7 | validationActions: [Deny] 8 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/openshift/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - clusterRole.yaml 6 | - clusterRoleBinding.yaml 7 | - ../../base 8 | - device-class.yaml 9 | - securityContextConstraints.yaml 10 | patches: 11 | - path: delete-v1-device-class.yaml 12 | 
-------------------------------------------------------------------------------- /deployments/gpu/examples/resource-claim-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: claim1 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | spec: 8 | devices: 9 | requests: 10 | - name: gpu 11 | exactly: 12 | deviceClassName: gpu.intel.com 13 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/openshift/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - clusterRole.yaml 6 | - clusterRoleBinding.yaml 7 | - ../../base 8 | - device-class.yaml 9 | - securityContextConstraints.yaml 10 | patches: 11 | - path: delete-v1-device-class.yaml 12 | -------------------------------------------------------------------------------- /deployments/qat/overlays/openshift/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - clusterRole.yaml 6 | - clusterRoleBinding.yaml 7 | - ../../base 8 | - device-class.yaml 9 | - securityContextConstraints.yaml 10 | patches: 11 | - path: delete-v1-device-class.yaml 12 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/add-nodeselector-intel-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | nodeSelector: 10 | intel.feature.node.kubernetes.io/gpu: "true" 11 | 
-------------------------------------------------------------------------------- /deployments/qat/overlays/nfd_labeled_nodes/add-nodeselector-intel-qat.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-qat-resource-driver-kubelet-plugin 5 | namespace: intel-qat-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | nodeSelector: 10 | intel.feature.node.kubernetes.io/qat: "true" 11 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/nfd_labeled_nodes/add-nodeselector-intel-gaudi.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: intel-gaudi-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | nodeSelector: 10 | intel.feature.node.kubernetes.io/gaudi: "true" 11 | -------------------------------------------------------------------------------- /deployments/gpu/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - device-class.yaml 3 | - namespace.yaml 4 | - resource-driver.yaml 5 | 6 | #images: 7 | # - name: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver 8 | # newName: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver 9 | # newTag: v0.9.0 10 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/xpumd-delete-limits.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-xpumanager 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: xpumd 10 | resources: 11 | limits: 12 | # gpu.intel.com/i915_monitoring: 1 13 | $patch: delete 14 | 
-------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/kustomization.yaml: -------------------------------------------------------------------------------- 1 | configMapGenerator: 2 | - files: 3 | - file.txt 4 | name: test-data 5 | 6 | resources: 7 | - crypto-perf.yaml 8 | - compress-perf.yaml 9 | 10 | apiVersion: kustomize.config.k8s.io/v1beta1 11 | kind: Kustomization 12 | images: 13 | - name: crypto-perf:devel 14 | newName: intel/crypto-perf 15 | newTag: devel 16 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: monitoring 2 | resources: 3 | - https://github.com/intel/xpumanager/deployment/kubernetes/daemonset/base/?ref=V1.2.39 4 | - gpu-monitor-claim.yaml 5 | patches: 6 | - path: xpumd-delete-limits.yaml 7 | target: 8 | kind: DaemonSet 9 | - path: xpumd-add-dra-resource.yaml 10 | target: 11 | kind: DaemonSet 12 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/openshift/clusterRole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: system:openshift:scc:intel-gpu-resource-driver 5 | rules: 6 | - apiGroups: 7 | - security.openshift.io 8 | resourceNames: 9 | - intel-gpu-resource-driver 10 | resources: 11 | - securitycontextconstraints 12 | verbs: 13 | - use 14 | -------------------------------------------------------------------------------- /deployments/qat/overlays/openshift/clusterRole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: system:openshift:scc:intel-qat-resource-driver 5 | rules: 6 | - apiGroups: 7 | 
- security.openshift.io 8 | resourceNames: 9 | - intel-qat-resource-driver 10 | resources: 11 | - securitycontextconstraints 12 | verbs: 13 | - use 14 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/openshift/clusterRole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: system:openshift:scc:intel-gaudi-resource-driver 5 | rules: 6 | - apiGroups: 7 | - security.openshift.io 8 | resourceNames: 9 | - intel-gaudi-resource-driver 10 | resources: 11 | - securitycontextconstraints 12 | verbs: 13 | - use 14 | -------------------------------------------------------------------------------- /deployments/gpu/examples/claim-external-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaim 3 | metadata: 4 | name: one-flex 5 | spec: 6 | devices: 7 | requests: 8 | - name: gpu 9 | exactly: 10 | deviceClassName: gpu.intel.com 11 | selectors: 12 | - cel: 13 | expression: device.attributes["gpu.intel.com"].family == 'Flex' 14 | -------------------------------------------------------------------------------- /hack/kind-config.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | featureGates: 4 | DRAExtendedResource: true 5 | DRADeviceTaints: true 6 | nodes: 7 | - role: control-plane 8 | kubeadmConfigPatches: 9 | - | 10 | kind: ClusterConfiguration 11 | apiServer: 12 | extraArgs: 13 | runtime-config: "resource.k8s.io/v1alpha3=true,resource.k8s.io/v1beta1=true" 14 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security 
vulnerabilities affecting our customers and 3 | providing clear guidance on the solution, impact, severity and mitigation. 4 | 5 | ## Reporting a Vulnerability 6 | Please report any security vulnerabilities in this project 7 | [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 8 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | # Common backup files 9 | *.swp 10 | *.bak 11 | *.tmp 12 | *.orig 13 | *~ 14 | # Various IDEs 15 | .project 16 | .idea/ 17 | *.tmproj 18 | .vscode/ 19 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | # Common backup files 9 | *.swp 10 | *.bak 11 | *.tmp 12 | *.orig 13 | *~ 14 | # Various IDEs 15 | .project 16 | .idea/ 17 | *.tmproj 18 | .vscode/ 19 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/ocp-scc-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 6 | rules: 7 | - apiGroups: 8 | - security.openshift.io 9 | resourceNames: 10 | - {{ .Values.openshift.sccName }} 11 | resources: 12 | - securitycontextconstraints 13 | verbs: 14 | - use 15 | {{- end }} 16 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/ocp-scc-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 6 | rules: 7 | - apiGroups: 8 | - security.openshift.io 9 | resourceNames: 10 | - {{ .Values.openshift.sccName }} 11 | resources: 12 | - securitycontextconstraints 13 | verbs: 14 | - use 15 | {{- end }} 16 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/ocp-scc-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 6 | rules: 7 | - apiGroups: 8 | - security.openshift.io 9 | resourceNames: 10 | - {{ .Values.openshift.sccName }} 
11 | resources: 12 | - securitycontextconstraints 13 | verbs: 14 | - use 15 | {{- end }} 16 | -------------------------------------------------------------------------------- /deployments/gpu/base/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: DeviceClass 3 | metadata: 4 | name: gpu.intel.com 5 | spec: 6 | selectors: 7 | - cel: 8 | expression: device.driver == "gpu.intel.com" 9 | # Available in K8s v1.34; requires the DRAExtendedResource feature gate to be enabled. Uncomment if needed. 10 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource 11 | #extendedResourceName: intel.com/gpu 12 | -------------------------------------------------------------------------------- /deployments/gpu/intel-xpumanager/xpumd-add-dra-resource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-xpumanager 5 | spec: 6 | template: 7 | spec: 8 | resourceClaims: 9 | - name: intel-gpu-resource 10 | source: 11 | resourceClaimTemplateName: intel-gpu-monitor-claim 12 | containers: 13 | - name: xpumd 14 | resources: 15 | claims: 16 | - name: intel-gpu-resource 17 | -------------------------------------------------------------------------------- /deployments/qat/base/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: DeviceClass 3 | metadata: 4 | name: qat.intel.com 5 | spec: 6 | selectors: 7 | - cel: 8 | expression: device.driver == "qat.intel.com" 9 | # Available in K8s v1.34; requires the DRAExtendedResource feature gate to be enabled. Uncomment if needed. 
10 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource 11 | #extendedResourceName: intel.com/qat 12 | 13 | -------------------------------------------------------------------------------- /deployments/gaudi/base/device-class.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: DeviceClass 3 | metadata: 4 | name: gaudi.intel.com 5 | spec: 6 | selectors: 7 | - cel: 8 | expression: device.driver == "gaudi.intel.com" 9 | # Available in K8s v1.34 requires feature gate enabled, uncomment if needed. 10 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource 11 | # extendedResourceName: intel.com/gaudi 12 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/openshift/clusterRoleBinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: system:openshift:scc:intel-gpu-resource-driver 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: system:openshift:scc:intel-gpu-resource-driver 9 | subjects: 10 | - kind: ServiceAccount 11 | name: intel-gpu-resource-driver-service-account 12 | namespace: intel-gpu-resource-driver 13 | -------------------------------------------------------------------------------- /deployments/qat/overlays/openshift/clusterRoleBinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: system:openshift:scc:intel-qat-resource-driver 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: system:openshift:scc:intel-qat-resource-driver 9 | subjects: 10 | - kind: ServiceAccount 11 | name: 
intel-qat-resource-driver-service-account 12 | namespace: intel-qat-resource-driver 13 | -------------------------------------------------------------------------------- /deployments/qat/examples/intel-qat-resource-driver-configuration.yaml: -------------------------------------------------------------------------------- 1 | kind: ConfigMap 2 | apiVersion: v1 3 | metadata: 4 | name: intel-qat-resource-driver-configuration 5 | namespace: intel-qat-resource-driver 6 | data: 7 | # Map of "PCI device address": "services" entries, in a map indexed by hostname 8 | qatdefaults.config: | 9 | { "host-name-here": 10 | { 11 | "0000:aa:00.0": "asym;sym", 12 | "0000:bb:00.0": "dc;sym" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/openshift/clusterRoleBinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: system:openshift:scc:intel-gaudi-resource-driver 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: system:openshift:scc:intel-gaudi-resource-driver 9 | subjects: 10 | - kind: ServiceAccount 11 | name: intel-gaudi-resource-driver-service-account 12 | namespace: intel-gaudi-resource-driver 13 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/device-faker/remove-sysfs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kubelet-plugin 11 | volumeMounts: 12 | - name: sysfs 13 | mountPath: /sysfs 14 | $patch: delete 15 | volumes: 16 | - name: sysfs 17 | $patch: delete 18 | --------------------------------------------------------------------------------
/deployments/gaudi/overlays/device-faker/remove-sysfs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: intel-gaudi-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kubelet-plugin 11 | volumeMounts: 12 | - name: sysfs 13 | mountPath: /sys 14 | $patch: delete 15 | volumes: 16 | - name: sysfs 17 | $patch: delete 18 | -------------------------------------------------------------------------------- /doc/gpu/generate-pngs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if ! type plantuml &> /dev/null; then 4 | echo "ERR: No plantuml found in PATH, plantuml is needed to produce PNG files" 5 | exit 1 6 | fi 7 | 8 | # source files are in script dir 9 | dir=${0%/*} 10 | 11 | for puml in "$dir"/*puml; do 12 | png="${puml%.puml}.png" 13 | # update if PNG missing or older than the source file 14 | if test "$puml" -nt "$png"; then 15 | echo "$puml" 16 | plantuml "$puml" "$png" 17 | fi 18 | done 19 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "intel-gpu-resource-driver.serviceAccountName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml .
| nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "intel-qat-resource-driver.serviceAccountName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-qat-resource-driver.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/nfd-intel-gpu-device-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-gpu-device-rule 5 | spec: 6 | rules: 7 | - name: intel.gpu.device 8 | labels: 9 | "intel.feature.node.kubernetes.io/gpu": "true" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | vendor: {op: In, value: ["8086"]} 14 | class: {op: In, value: ["0300", "0380"]} 15 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-gaudi-resource-driver.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . 
| nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/nfd_labeled_nodes/nfd-intel-gaudi-device-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-gaudi-device-rule 5 | spec: 6 | rules: 7 | - name: "intel.gaudi" 8 | labels: 9 | "intel.feature.node.kubernetes.io/gaudi": "true" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | vendor: {op: In, value: ["1da3"]} 14 | device: {op: In, value: ["1020", "1030"]} 15 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "intel-gpu-resource-driver.clusterRoleName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["nodes"] 9 | verbs: ["get"] 10 | - apiGroups: ["resource.k8s.io"] 11 | resources: ["resourceslices"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: ["resource.k8s.io"] 14 | resources: ["resourceclaims"] 15 | verbs: ["get"] 16 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "intel-gpu-resource-driver.clusterRoleBindingName" . 
}} 5 | namespace: {{ .Release.Namespace }} 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{ include "intel-gpu-resource-driver.serviceAccountName" . }} 9 | namespace: {{ .Release.Namespace }} 10 | roleRef: 11 | kind: ClusterRole 12 | name: {{ include "intel-gpu-resource-driver.clusterRoleName" . }} 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "intel-qat-resource-driver.clusterRoleName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["nodes"] 9 | verbs: ["get"] 10 | - apiGroups: ["resource.k8s.io"] 11 | resources: ["resourceslices"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: ["resource.k8s.io"] 14 | resources: ["resourceclaims"] 15 | verbs: ["get"] 16 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "intel-qat-resource-driver.clusterRoleBindingName" . }} 5 | namespace: {{ .Release.Namespace }} 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{ include "intel-qat-resource-driver.serviceAccountName" . }} 9 | namespace: {{ .Release.Namespace }} 10 | roleRef: 11 | kind: ClusterRole 12 | name: {{ include "intel-qat-resource-driver.clusterRoleName" . 
}} 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /doc/gpu/allocation-immediate.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | title "Immediate allocation" 3 | 4 | actor Actor 5 | participant ResourceClaim 6 | participant Pod 7 | participant Controller 8 | participant Plugin 9 | 10 | Actor -> ResourceClaim : deploy 11 | ResourceClaim -> Controller : notify 12 | note right of Controller 13 | the difference is here 14 | end note 15 | Controller -> Controller : find suitable nodes 16 | Controller -> Controller : Allocate on Node N 17 | Actor -> Pod : deploy 18 | Plugin -> ResourceClaim : prepare resource and mark Ready 19 | 20 | @enduml 21 | 22 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "intel-gaudi-resource-driver.clusterRoleName" . 
}} 5 | namespace: {{ .Release.Namespace }} 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["nodes"] 9 | verbs: ["get"] 10 | - apiGroups: ["resource.k8s.io"] 11 | resources: ["resourceslices"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: ["resource.k8s.io"] 14 | resources: ["resourceclaims"] 15 | verbs: ["get"] 16 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/nfd.yaml: -------------------------------------------------------------------------------- 1 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }} 2 | apiVersion: nfd.k8s-sigs.io/v1alpha1 3 | kind: NodeFeatureRule 4 | metadata: 5 | name: intel-gaudi-device-rule 6 | spec: 7 | rules: 8 | - name: "intel.gaudi" 9 | labels: 10 | "intel.feature.node.kubernetes.io/gaudi": "true" 11 | matchFeatures: 12 | - feature: pci.device 13 | matchExpressions: 14 | vendor: {op: In, value: ["1da3"]} 15 | device: {op: In, value: ["1020", "1030"]} 16 | {{- end }} 17 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/ocp-scc-clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 10 | subjects: 11 | - kind: ServiceAccount 12 | name: {{ include "intel-gaudi-resource-driver.serviceAccountName" . 
}} 13 | namespace: {{ .Release.Namespace }} 14 | {{- end }} 15 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/ocp-scc-clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 10 | subjects: 11 | - kind: ServiceAccount 12 | name: {{ include "intel-gpu-resource-driver.serviceAccountName" . }} 13 | namespace: {{ .Release.Namespace }} 14 | {{- end }} 15 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/ocp-scc-clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: system:openshift:scc:{{ .Values.openshift.sccName }} 10 | subjects: 11 | - kind: ServiceAccount 12 | name: {{ include "intel-qat-resource-driver.serviceAccountName" . }} 13 | namespace: {{ .Release.Namespace }} 14 | {{- end }} 15 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "intel-gaudi-resource-driver.clusterRoleBindingName" . 
}} 5 | namespace: {{ .Release.Namespace }} 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} 9 | namespace: {{ .Release.Namespace }} 10 | roleRef: 11 | kind: ClusterRole 12 | name: {{ include "intel-gaudi-resource-driver.clusterRoleName" . }} 13 | apiGroup: rbac.authorization.k8s.io 14 | -------------------------------------------------------------------------------- /doc/gpu/high-level-overview.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | 3 | allowmixing 4 | 5 | actor User 6 | 7 | component "resourceclaim0\n\nresourceClass: class0\nparametersRef:" as resclaim0 { 8 | component resclaimparams0 [ 9 | type: gpu, 10 | memory: 256, 11 | millicores: 100, 12 | count: 1, 13 | ] 14 | } 15 | 16 | component "resource-classes" { 17 | component "class0" { 18 | component "class0-parameters" 19 | } 20 | component "class1" { 21 | component "class1-parameters" 22 | } 23 | } 24 | 25 | left to right direction 26 | 27 | User --> resclaim0 : deploy 28 | 29 | @enduml 30 | 31 | -------------------------------------------------------------------------------- /cmd/kubelet-qat-plugin/main.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 | "os" 10 | 11 | "github.com/urfave/cli/v2" 12 | 13 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers" 14 | qat "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 15 | ) 16 | 17 | func main() { 18 | if err := helpers.NewApp(qat.DriverName, newDriver, []cli.Flag{}, nil).Run(os.Args); err != nil { 19 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) 20 | os.Exit(1) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /doc/gpu/allocation-delayed.puml: 
-------------------------------------------------------------------------------- 1 | @startuml 2 | title "Delayed allocation" 3 | 4 | actor Actor 5 | participant ResourceClaim 6 | participant Pod 7 | participant Controller 8 | participant Plugin 9 | 10 | Actor -> ResourceClaim : deploy 11 | ResourceClaim -> Controller : notify 12 | note right of Controller 13 | the difference is here 14 | end note 15 | Controller -> Controller : wait for first user 16 | Actor -> Pod : deploy 17 | Pod -> Controller : find suitable nodes 18 | Pod -> Controller : Allocate on Node N 19 | Plugin -> ResourceClaim : prepare resource and mark Ready 20 | 21 | @enduml 22 | 23 | -------------------------------------------------------------------------------- /deployments/gpu/examples/pod-for-claim-external-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: test-one-flex 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: with-resource 9 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 10 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 11 | resources: 12 | claims: 13 | - name: resource 14 | - name: without-resource 15 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 16 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 17 | resourceClaims: 18 | - name: resource 19 | resourceClaimName: one-flex 20 | -------------------------------------------------------------------------------- /deployments/gpu/examples/deployment-extended-resources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: dra-extended-resource-example 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: dra-extended-resource-example 10 | template: 11 | metadata: 12 | labels: 13 | app: dra-extended-resource-example 14 | spec: 15 | containers: 16 | - name: workload 17 | image: 
registry.k8s.io/e2e-test-images/busybox:1.29-2 18 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 300"] 19 | resources: 20 | limits: 21 | intel.com/gpu: 1 22 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/deployment-extended-resources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: dra-extended-resource-example 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: dra-extended-resource-example 10 | template: 11 | metadata: 12 | labels: 13 | app: dra-extended-resource-example 14 | spec: 15 | containers: 16 | - name: workload 17 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 18 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 300"] 19 | resources: 20 | limits: 21 | intel.com/gaudi: 1 22 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/device-class.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: resource.k8s.io/v1beta1 3 | {{- else }} 4 | apiVersion: resource.k8s.io/v1 5 | {{- end }} 6 | kind: DeviceClass 7 | metadata: 8 | name: gpu.intel.com 9 | 10 | spec: 11 | selectors: 12 | - cel: 13 | expression: device.driver == "gpu.intel.com" 14 | {{- if .Values.enableDRAExtendedResources }} 15 | # Available in K8s v1.34 requires feature gate enabled 16 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource 17 | extendedResourceName: {{ .Values.extendedResourceName }} 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/device-class.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | 
apiVersion: resource.k8s.io/v1beta1 3 | {{- else }} 4 | apiVersion: resource.k8s.io/v1 5 | {{- end }} 6 | kind: DeviceClass 7 | metadata: 8 | name: qat.intel.com 9 | 10 | spec: 11 | selectors: 12 | - cel: 13 | expression: device.driver == "qat.intel.com" 14 | {{- if .Values.enableDRAExtendedResources }} 15 | # Available in K8s v1.34 requires feature gate enabled 16 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource 17 | extendedResourceName: {{ .Values.extendedResourceName }} 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/device-class.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: resource.k8s.io/v1beta1 3 | {{- else }} 4 | apiVersion: resource.k8s.io/v1 5 | {{- end }} 6 | kind: DeviceClass 7 | metadata: 8 | name: gaudi.intel.com 9 | 10 | spec: 11 | selectors: 12 | - cel: 13 | expression: device.driver == "gaudi.intel.com" 14 | {{- if .Values.enableDRAExtendedResources }} 15 | # Available in K8s v1.34 requires feature gate enabled 16 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource 17 | extendedResourceName: {{ .Values.extendedResourceName }} 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /deployments/gpu/tests/gpu-sample-app/gpu-sample-app.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: gpu-sample-app 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | restartPolicy: Never 8 | containers: 9 | - name: with-resource 10 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 11 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 12 | resources: 13 | claims: 14 | - name: resource 15 | - name: 
without-resource 16 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 17 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 18 | resourceClaims: 19 | - name: resource 20 | resourceClaimTemplateName: claim1 21 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | -------------------------------------------------------------------------------- /deployments/gpu/examples/deployment-extended-resources-implicit.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: dra-implicit-extended-resource-example 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: dra-implicit-extended-resource-example 10 | template: 11 | metadata: 12 | labels: 13 | app: dra-implicit-extended-resource-example 14 | spec: 15 | containers: 16 | - name: workload 17 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 18 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 300"] 19 | resources: 20 | limits: 21 | deviceclass.resource.kubernetes.io/gpu.intel.com: 1 22 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/deployment-extended-resources-implicit.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: dra-implicit-extended-resource-example 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: dra-implicit-extended-resource-example 10 | template: 11 | metadata: 12 | labels: 13 | app: dra-implicit-extended-resource-example 14 | spec: 15 | containers: 16 | - name: workload 17 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 18 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 300"] 19 | resources: 20 | limits: 21 | deviceclass.resource.kubernetes.io/gaudi.intel.com: 1 22 | -------------------------------------------------------------------------------- /deployments/qat/tests/openssl-qat-engine/openssl-qat-engine.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: openssl-qat-engine-asym 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: 
openssl-qat-engine-asym 9 | image: openssl-qat-engine:devel 10 | imagePullPolicy: IfNotPresent 11 | command: ["testapp","-engine","qathwtest","-async_jobs","1","-c","1","-n","1","-nc","1","-v","-hw_algo","0x0029"] 12 | securityContext: 13 | readOnlyRootFilesystem: true 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | add: 17 | ["IPC_LOCK"] 18 | resources: 19 | claims: 20 | - name: qat-resource-asym 21 | resourceClaims: 22 | - name: qat-resource-asym 23 | resourceClaimTemplateName: qat-template-asym 24 | -------------------------------------------------------------------------------- /deployments/gpu/examples/monitor-pod-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: monitor-claim 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gpu 10 | exactly: 11 | deviceClassName: gpu.intel.com 12 | adminAccess: true 13 | allocationMode: "All" 14 | --- 15 | apiVersion: v1 16 | kind: Pod 17 | metadata: 18 | name: monitor-pod 19 | spec: 20 | restartPolicy: Never 21 | containers: 22 | - name: monitor 23 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 24 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 25 | resources: 26 | claims: 27 | - name: resource 28 | resourceClaims: 29 | - name: resource 30 | resourceClaimTemplateName: monitor-claim 31 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/monitor-pod-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: monitor-claim 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gaudi 10 | exactly: 11 | deviceClassName: gaudi.intel.com 12 | adminAccess: true 13 | allocationMode: "All" 14 | --- 15 | apiVersion: v1 16 | kind: Pod 17 | metadata: 18 | name: monitor-pod 19 | 
spec: 20 | restartPolicy: Never 21 | containers: 22 | - name: monitor 23 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 24 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 60"] 25 | resources: 26 | claims: 27 | - name: resource 28 | resourceClaims: 29 | - name: resource 30 | resourceClaimTemplateName: monitor-claim 31 | -------------------------------------------------------------------------------- /pkg/helpers/driver.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package helpers 18 | 19 | import "context" 20 | 21 | type Driver interface { 22 | Shutdown(ctx context.Context) error 23 | } 24 | -------------------------------------------------------------------------------- /doc/cdi-spec-generator/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel CDI Spec Generator 2 | A pre-compiled binary is already available for download, eliminating the need for manual building. See documentation [README.md](README.md#Releases) 3 | 4 | ## Prerequisites 5 | - Go 1.22 6 | 7 | ## Building 8 | 1. 
Clone the repository 9 | ```bash 10 | git clone https://github.com/intel/intel-resource-drivers-for-kubernetes.git 11 | cd intel-resource-drivers-for-kubernetes/cmd/cdi-specs-generator 12 | ``` 13 | 14 | 2. Build the executable 15 | ```bash 16 | go build -o intel-cdi-specs-generator main.go 17 | ``` 18 | This command will generate an executable named intel-cdi-specs-generator in the current directory. 19 | 20 | ## Verification 21 | To verify that the build was successful, you can check the version of the tool by running: 22 | ```bash 23 | intel-cdi-specs-generator --version 24 | ``` -------------------------------------------------------------------------------- /deployments/gpu/examples/pod-inline-aligned-gpus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaim 3 | metadata: 4 | name: same-pciroot-gpus 5 | spec: 6 | devices: 7 | requests: 8 | - name: gpu 9 | exactly: 10 | deviceClassName: gpu.intel.com 11 | count: 2 12 | constraints: 13 | - matchAttribute: "gpu.intel.com/pciRoot" 14 | --- 15 | apiVersion: v1 16 | kind: Pod 17 | metadata: 18 | name: test-inline-claim 19 | spec: 20 | restartPolicy: Never 21 | containers: 22 | - name: with-resource 23 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 24 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 25 | resources: 26 | claims: 27 | - name: resource 28 | - name: without-resource 29 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 30 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 31 | resourceClaims: 32 | - name: resource 33 | resourceClaimName: same-pciroot-gpus 34 | -------------------------------------------------------------------------------- /cmd/kubelet-gpu-plugin/test-claims/multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "uid1": { 3 | "Devices": [ 4 | { 5 | "requests": [ 6 | "request1" 7 | ], 8 | "poolname": "node1", 9 | "devicename":
"0000-af-00-1-0xabcd", 10 | "cdideviceids": [ 11 | "0000-af-00-1-0xabcd" 12 | ] 13 | } 14 | ] 15 | }, 16 | "uid2": { 17 | "Devices": [ 18 | { 19 | "requests": [ 20 | "request1" 21 | ], 22 | "poolname": "node1", 23 | "devicename": "0000-af-00-2-0xabcd", 24 | "cdideviceids": [ 25 | "0000-af-00-2-0xabcd" 26 | ] 27 | } 28 | ] 29 | }, 30 | "uid3": { 31 | "Devices": [ 32 | { 33 | "requests": [ 34 | "request1" 35 | ], 36 | "poolname": "node1", 37 | "devicename": "0000-af-00-3-0xabcd", 38 | "cdideviceids": [ 39 | "0000-af-00-3-0xabcd" 40 | ] 41 | } 42 | ] 43 | } 44 | } -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: intel-gpu-resource-driver-chart 3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Resource Driver 4 | 5 | type: application 6 | version: 0.9.1 7 | appVersion: v0.9.1 8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts 9 | 10 | dependencies: 11 | - name: node-feature-discovery 12 | alias: nfd 13 | version: "0.17.4" 14 | condition: nfd.enabled 15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts 16 | 17 | annotations: 18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 20 | org.opencontainers.image.version: "0.9.1" 21 | org.opencontainers.image.title: "Intel GPU Resource Driver" 22 | org.opencontainers.image.description: "This chart installs the Intel GPU resource driver on Kubernetes." 
23 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: intel-qat-resource-driver-chart 3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel QAT Resource Driver 4 | 5 | type: application 6 | version: 0.4.1 7 | appVersion: v0.4.1 8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts 9 | 10 | dependencies: 11 | - name: node-feature-discovery 12 | alias: nfd 13 | version: "0.17.4" 14 | condition: nfd.enabled 15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts 16 | 17 | annotations: 18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 20 | org.opencontainers.image.version: "0.4.1" 21 | org.opencontainers.image.title: "Intel QAT Resource Driver" 22 | org.opencontainers.image.description: "This chart installs the Intel QAT resource driver on Kubernetes."
23 | -------------------------------------------------------------------------------- /deployments/qat/overlays/nfd_labeled_nodes/nfd-intel-qat-device-rule.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-qat-device-rule 5 | spec: 6 | rules: 7 | - name: "intel.qat" 8 | labels: 9 | feature.node.kubernetes.io/qat: "true" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | vendor: {op: In, value: ["8086"]} 14 | device: {op: In, value: ["4940", "4941", "4944", "4946"]} 15 | class: {op: In, value: ["0b40"]} 16 | - feature: kernel.loadedmodule 17 | matchExpressions: 18 | intel_qat: {op: Exists} 19 | matchAny: 20 | - matchFeatures: 21 | - feature: kernel.loadedmodule 22 | matchExpressions: 23 | vfio_pci: {op: Exists} 24 | - matchFeatures: 25 | - feature: kernel.enabledmodule 26 | matchExpressions: 27 | vfio-pci: {op: Exists} 28 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: intel-gaudi-resource-driver-chart 3 | description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel Gaudi Resource Driver 4 | 5 | type: application 6 | version: 0.6.1 7 | appVersion: v0.6.1 8 | home: https://github.com/intel/intel-resource-drivers-for-kubernetes/charts 9 | 10 | dependencies: 11 | - name: node-feature-discovery 12 | alias: nfd 13 | version: "0.17.4" 14 | condition: nfd.enabled 15 | repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts 16 | 17 | annotations: 18 | org.opencontainers.image.url: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 19 | org.opencontainers.image.source: "https://github.com/intel/intel-resource-drivers-for-kubernetes" 20 | org.opencontainers.image.version: "0.6.1" 21 | 
org.opencontainers.image.title: "Intel Gaudi Resource Driver" 22 | org.opencontainers.image.description: "This chart installs the Intel Gaudi resource driver on Kubernetes." 23 | -------------------------------------------------------------------------------- /cmd/kubelet-qat-plugin/deviceresources.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | resourceapi "k8s.io/api/resource/v1" 9 | "k8s.io/klog/v2" 10 | 11 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 12 | ) 13 | 14 | func deviceResources(qatvfdevices device.VFDevices) *[]resourceapi.Device { 15 | resourcedevices := []resourceapi.Device{} 16 | 17 | for _, qatvfdevice := range qatvfdevices { 18 | services := qatvfdevice.Services() 19 | device := resourceapi.Device{ 20 | Name: qatvfdevice.UID(), 21 | Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{ 22 | "services": { 23 | StringValue: &services, 24 | }, 25 | }, 26 | } 27 | resourcedevices = append(resourcedevices, device) 28 | 29 | klog.V(5).Infof("Adding Device resource: name '%s', service '%s'", device.Name, *device.Attributes["services"].StringValue) 30 | } 31 | 32 | return &resourcedevices 33 | } 34 | -------------------------------------------------------------------------------- /Dockerfile.device-faker: -------------------------------------------------------------------------------- 1 | FROM golang:1.24.2@sha256:b51b7beeabe2e2d8438ba4295c59d584049873a480ba0e7b56d80db74b3e3a3a AS build 2 | ARG LOCAL_LICENSES 3 | WORKDIR /build 4 | COPY . . 
5 | 6 | RUN make bin/device-faker && \ 7 | mkdir -p /install_root && \ 8 | if [ -z "$LOCAL_LICENSES" ]; then \ 9 | make licenses; \ 10 | fi && \ 11 | cp -r licenses /install_root/ && \ 12 | cp bin/device-faker /install_root/ 13 | 14 | 15 | FROM alpine AS template 16 | COPY --from=build /install_root/device-faker /device-faker 17 | 18 | 19 | RUN mkdir -p /opt/templates && \ 20 | /device-faker gpu -n && \ 21 | mv /tmp/gpu-template-*.json /opt/templates/gpu-template.json && \ 22 | /device-faker gaudi -n && \ 23 | mv /tmp/gaudi-template-*.json /opt/templates/gaudi-template.json && \ 24 | chmod 644 /opt/templates/*.json 25 | 26 | FROM scratch 27 | LABEL description="Intel Device Faker" 28 | COPY --from=build /install_root/device-faker /device-faker 29 | COPY --from=template /opt/templates /opt/templates 30 | ENTRYPOINT ["/device-faker"] 31 | -------------------------------------------------------------------------------- /deployments/gpu/examples/pod-inline-first-available.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: claim1 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: ai-accelerator 10 | firstAvailable: 11 | - name: gpu 12 | deviceClassName: gpu.intel.com 13 | - name: gaudi 14 | deviceClassName: gaudi.intel.com 15 | --- 16 | apiVersion: v1 17 | kind: Pod 18 | metadata: 19 | name: test-inline-first-available 20 | spec: 21 | restartPolicy: Never 22 | containers: 23 | - name: with-resource 24 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 25 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 20"] 26 | resources: 27 | claims: 28 | - name: resource 29 | - name: without-resource 30 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 31 | command: ["sh", "-c", "ls -la /dev/ && sleep 20"] 32 | resourceClaims: 33 | - name: resource 34 | resourceClaimTemplateName: claim1 35 | 
-------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/nfd.yaml: -------------------------------------------------------------------------------- 1 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }} 2 | apiVersion: nfd.k8s-sigs.io/v1alpha1 3 | kind: NodeFeatureRule 4 | metadata: 5 | name: intel-qat-device-rule 6 | spec: 7 | rules: 8 | - name: "intel.qat" 9 | labels: 10 | feature.node.kubernetes.io/qat: "true" 11 | matchFeatures: 12 | - feature: pci.device 13 | matchExpressions: 14 | vendor: {op: In, value: ["8086"]} 15 | device: {op: In, value: ["4940", "4941", "4944", "4946"]} 16 | class: {op: In, value: ["0b40"]} 17 | - feature: kernel.loadedmodule 18 | matchExpressions: 19 | intel_qat: {op: Exists} 20 | matchAny: 21 | - matchFeatures: 22 | - feature: kernel.loadedmodule 23 | matchExpressions: 24 | vfio_pci: {op: Exists} 25 | - matchFeatures: 26 | - feature: kernel.enabledmodule 27 | matchExpressions: 28 | vfio-pci: {op: Exists} 29 | {{- end }} 30 | -------------------------------------------------------------------------------- /cmd/kubelet-gaudi-plugin/node_state_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "reflect" 21 | "testing" 22 | 23 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gaudi/device" 24 | ) 25 | 26 | func TestDeviceInfoDeepCopy(t *testing.T) { 27 | di := device.DeviceInfo{ 28 | UID: "f", 29 | Model: "ff", 30 | } 31 | 32 | dc := di.DeepCopy() 33 | 34 | if !reflect.DeepEqual(&di, dc) { 35 | t.Fatalf("device infos %v and %v do not match", di, dc) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /cmd/qat-showdevice/main.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 | 10 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 11 | ) 12 | 13 | func printPFDevice(pfdev *device.PFDevice) { 14 | fmt.Printf("PF device: %s\n", pfdev.Device) 15 | fmt.Printf("State: %s\n", pfdev.State.String()) 16 | fmt.Printf("Services: %s\n", pfdev.Services.String()) 17 | fmt.Printf("Num VFs: %d\n", pfdev.NumVFs) 18 | fmt.Printf("Max VFs: %d\n", pfdev.TotalVFs) 19 | 20 | for _, vfdev := range pfdev.AvailableDevices { 21 | fmt.Printf("\tVF UID %s: device %s, device node %s, IOMMU %s, driver %s\n", vfdev.UID(), vfdev.PCIDevice(), vfdev.DeviceNode(), vfdev.VFIommu, vfdev.Driver()) 22 | } 23 | } 24 | 25 | func main() { 26 | pfdevices, err := device.New() 27 | if err != nil { 28 | fmt.Printf("Error: %v\n", err) 29 | return 30 | } 31 | 32 | if len(pfdevices) == 0 { 33 | fmt.Printf("No PF devices found\n") 34 | return 35 | } 36 | 37 | for _, pfdev := range pfdevices { 38 | printPFDevice(pfdev) 39 | fmt.Printf("---\n\n") 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/pod-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 
resource.k8s.io/v1 2 | kind: ResourceClaim 3 | metadata: 4 | name: claim1 5 | spec: 6 | devices: 7 | requests: 8 | - name: gaudi 9 | exactly: 10 | deviceClassName: gaudi.intel.com 11 | ## 12 | ## if one is not enough 13 | # count: 2 14 | ## 15 | ## requesting particular series 16 | # selectors: 17 | # - cel: 18 | # expression: device.attributes["gaudi.intel.com"].model == 'Gaudi2' 19 | ## 20 | ## for monitoring 21 | # adminAccess: true 22 | # allocationMode: "All" 23 | --- 24 | apiVersion: v1 25 | kind: Pod 26 | metadata: 27 | name: test-inline-claim 28 | spec: 29 | restartPolicy: Never 30 | containers: 31 | - name: with-resource 32 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 33 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 60"] 34 | resources: 35 | claims: 36 | - name: resource 37 | - name: without-resource 38 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 39 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 40 | resourceClaims: 41 | - name: resource 42 | resourceClaimName: claim1 43 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/openshift/README.md: -------------------------------------------------------------------------------- 1 | # Overlay for installing Intel GPU resource driver in Red Hat OpenShift 2 | 3 | - supported RHOS version: 4.20+ 4 | 5 | # Installing 6 | 7 | ```shell 8 | kubectl apply -k deployments/gpu/overlays/openshift 9 | ``` 10 | 11 | The following warning could be shown during the installation: 12 | 13 | ```shell 14 | Warning: would violate PodSecurity "restricted:latest": privileged (container "kubelet-plugin" must not set securityContext.privileged=true), allowPrivilegeEscalation != false (container "kubelet-plugin" must set securityContext.allowPrivilegeEscalation=false), restricted volume types (volumes "plugins-registry", "plugins", "cdi", "varruncdi", "sysfs" use restricted volume type "hostPath"), runAsNonRoot != true (pod or container 
"kubelet-plugin" must set securityContext.runAsNonRoot=true), runAsUser=0 (container "kubelet-plugin" must not set runAsUser=0) 15 | 16 | ``` 17 | 18 | This happens when the SecurityContextConstraints gets created later than the DaemonSet, 19 | causing DaemonSet Pod creation to initially fail. Kubernetes will retry creating the Pods, 20 | and will eventually find the needed SecurityContextConstraints object. -------------------------------------------------------------------------------- /deployments/qat/overlays/openshift/README.md: -------------------------------------------------------------------------------- 1 | # Overlay for installing Intel QAT resource driver in Red Hat OpenShift 2 | 3 | - supported RHOS version: 4.20+ 4 | 5 | # Installing 6 | 7 | ```shell 8 | kubectl apply -k deployments/qat/overlays/openshift 9 | ``` 10 | 11 | The following warning could be shown during the installation: 12 | 13 | ```shell 14 | Warning: would violate PodSecurity "restricted:latest": privileged (container "kubelet-plugin" must not set securityContext.privileged=true), allowPrivilegeEscalation != false (container "kubelet-plugin" must set securityContext.allowPrivilegeEscalation=false), restricted volume types (volumes "plugins-registry", "plugins", "cdi", "varruncdi", "sysfs" use restricted volume type "hostPath"), runAsNonRoot != true (pod or container "kubelet-plugin" must set securityContext.runAsNonRoot=true), runAsUser=0 (container "kubelet-plugin" must not set runAsUser=0) 15 | 16 | ``` 17 | 18 | This happens when the SecurityContextConstraints gets created later than the DaemonSet, 19 | causing DaemonSet Pod creation to initially fail. Kubernetes will retry creating the Pods, 20 | and will eventually find the needed SecurityContextConstraints object. 
-------------------------------------------------------------------------------- /deployments/gaudi/overlays/openshift/README.md: -------------------------------------------------------------------------------- 1 | # Overlay for installing Intel Gaudi resource driver in Red Hat OpenShift 2 | 3 | - supported RHOS version: 4.20+ 4 | 5 | # Installing 6 | 7 | ```shell 8 | kubectl apply -k deployments/gaudi/overlays/openshift 9 | ``` 10 | 11 | The following warning could be shown during the installation: 12 | 13 | ```shell 14 | Warning: would violate PodSecurity "restricted:latest": privileged (container "kubelet-plugin" must not set securityContext.privileged=true), allowPrivilegeEscalation != false (container "kubelet-plugin" must set securityContext.allowPrivilegeEscalation=false), restricted volume types (volumes "plugins-registry", "plugins", "cdi", "varruncdi", "sysfs" use restricted volume type "hostPath"), runAsNonRoot != true (pod or container "kubelet-plugin" must set securityContext.runAsNonRoot=true), runAsUser=0 (container "kubelet-plugin" must not set runAsUser=0) 15 | 16 | ``` 17 | 18 | This happens when the SecurityContextConstraints gets created later than the DaemonSet, 19 | causing DaemonSet Pod creation to initially fail. Kubernetes will retry creating the Pods, 20 | and will eventually find the needed SecurityContextConstraints object. 
-------------------------------------------------------------------------------- /hack/clusterconfig.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeadm.k8s.io/v1beta4 2 | kind: InitConfiguration 3 | nodeRegistration: 4 | criSocket: unix:///var/run/containerd/containerd.sock 5 | imagePullPolicy: IfNotPresent 6 | --- 7 | apiVersion: kubeadm.k8s.io/v1beta4 8 | kind: ClusterConfiguration 9 | kubernetesVersion: v1.34.0 10 | apiServer: 11 | extraArgs: 12 | - name: feature-gates 13 | value: DRAExtendedResource=true,DRADeviceTaints=true,DRAAdminAccess=true,DRAPartitionableDevices=true,DRAResourceClaimDeviceStatus=true,ResourceHealthStatus=true,DRAPrioritizedList=true 14 | - name: runtime-config 15 | value: resource.k8s.io/v1beta1=true,resource.k8s.io/v1alpha3=true 16 | clusterName: kubernetes 17 | controllerManager: 18 | extraArgs: 19 | - name: feature-gates 20 | value: DRADeviceTaints=true,DRAPrioritizedList=true 21 | imageRepository: registry.k8s.io 22 | scheduler: 23 | extraArgs: 24 | - name: feature-gates 25 | value: DRAExtendedResource=true,DRADeviceTaints=true,DRAAdminAccess=true,DRAPartitionableDevices=true,DRAPrioritizedList=true 26 | --- 27 | apiVersion: kubelet.config.k8s.io/v1beta1 28 | kind: KubeletConfiguration 29 | featureGates: 30 | DRAExtendedResource: true 31 | ResourceHealthStatus: true 32 | -------------------------------------------------------------------------------- /Dockerfile.gaudi: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM golang:1.24.2@sha256:b51b7beeabe2e2d8438ba4295c59d584049873a480ba0e7b56d80db74b3e3a3a AS build 16 | ARG LOCAL_LICENSES 17 | WORKDIR /build 18 | COPY . . 19 | 20 | RUN make gaudi && \ 21 | mkdir -p /install_root && \ 22 | if [ -z "$LOCAL_LICENSES" ]; then \ 23 | make licenses; \ 24 | fi && \ 25 | cp -r licenses /install_root/ && \ 26 | cp bin/kubelet-gaudi-plugin /install_root/ 27 | 28 | FROM scratch 29 | LABEL description="Intel Gaudi resource driver for Kubernetes" 30 | 31 | COPY --from=build /install_root/ / 32 | CMD ["/kubelet-gaudi-plugin"] 33 | -------------------------------------------------------------------------------- /doc/gpu/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel GPU Resource Driver container image 2 | 3 | ## Platforms supported 4 | 5 | - Linux 6 | 7 | ## Prerequisites 8 | 9 | - Docker or Podman. 10 | 11 | ## Building 12 | 13 | `Makefile` automates this, only required tool is Docker or Podman. 
14 | To build the container image locally, from the root of this Git repository: 15 | ```bash 16 | make gpu-container-build 17 | ``` 18 | 19 | It is possible to specify custom registry, container image name, and version (tag) as separate 20 | variables to override any part of release container image URL in the build command, e.g.: 21 | ```bash 22 | REGISTRY=myregistry GPU_IMAGE_NAME=myimage GPU_IMAGE_VERSION=myversion make gpu-container-build 23 | ``` 24 | 25 | or whole resulting image URL (this will ignore REGISTRY, GPU_IMAGE_NAME, GPU_IMAGE_VERSION even if specified): 26 | ```bash 27 | GPU_IMAGE_TAG=myregistry/myimagename:myversion make gpu-container-build 28 | ``` 29 | 30 | To build the container image and push image to the destination registry straight away: 31 | ```bash 32 | REGISTRY=registry.local make gpu-container-push 33 | ``` 34 | or 35 | ```bash 36 | GPU_IMAGE_TAG=registry.local/intel-gpu-resource-driver:latest make gpu-container-push 37 | ``` 38 | -------------------------------------------------------------------------------- /doc/qat/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel® QAT Resource Driver container image 2 | 3 | ## Platforms supported 4 | 5 | - Linux 6 | 7 | ## Prerequisites 8 | 9 | - Docker or Podman. 10 | 11 | ## Building 12 | 13 | `Makefile` automates this, only required tool is Docker or Podman. 
14 | To build the container image locally, from the root of this Git repository: 15 | ```bash 16 | make qat-container-build 17 | ``` 18 | 19 | It is possible to specify custom registry, container image name, and version (tag) as separate 20 | variables to override any part of release container image URL in the build command, e.g.: 21 | ```bash 22 | REGISTRY=myregistry QAT_IMAGE_NAME=myimage QAT_IMAGE_VERSION=myversion make qat-container-build 23 | ``` 24 | 25 | or whole resulting image URL (this will ignore REGISTRY, QAT_IMAGE_NAME, QAT_IMAGE_VERSION even if specified): 26 | ```bash 27 | QAT_IMAGE_TAG=myregistry/myimagename:myversion make qat-container-build 28 | ``` 29 | 30 | To build the container image and push image to the destination registry straight away: 31 | ```bash 32 | REGISTRY=registry.local make qat-container-push 33 | ``` 34 | or 35 | ```bash 36 | QAT_IMAGE_TAG=registry.local/intel-qat-resource-driver:latest make qat-container-push 37 | ``` 38 | -------------------------------------------------------------------------------- /Dockerfile.qat: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM golang:1.24.2@sha256:b51b7beeabe2e2d8438ba4295c59d584049873a480ba0e7b56d80db74b3e3a3a AS build 16 | ARG LOCAL_LICENSES 17 | WORKDIR /build 18 | COPY . . 
19 | 20 | RUN make qat && \ 21 | mkdir -p /install_root && \ 22 | if [ -z "$LOCAL_LICENSES" ]; then \ 23 | make licenses; \ 24 | fi && \ 25 | cp -r licenses /install_root/ && \ 26 | cp bin/kubelet-qat-plugin /install_root/ && \ 27 | cp bin/qat-showdevice /install_root/ 28 | 29 | 30 | FROM scratch 31 | WORKDIR / 32 | LABEL description="Intel QAT resource driver for Kubernetes" 33 | 34 | COPY --from=build /install_root / 35 | -------------------------------------------------------------------------------- /doc/gaudi/BUILD.md: -------------------------------------------------------------------------------- 1 | # How to build Intel Gaudi Resource Driver container image 2 | 3 | ## Platforms supported 4 | 5 | - Linux 6 | 7 | ## Prerequisites 8 | 9 | - Docker or Podman. 10 | 11 | ## Building 12 | 13 | `Makefile` automates this, only required tool is Docker or Podman. 14 | To build the container image locally, from the root of this Git repository: 15 | ```bash 16 | make gaudi-container-build 17 | ``` 18 | 19 | It is possible to specify custom registry, container image name, and version (tag) as separate 20 | variables to override any part of release container image URL in the build command, e.g.: 21 | ```bash 22 | REGISTRY=myregistry GAUDI_IMAGE_NAME=myimage GAUDI_IMAGE_VERSION=myversion make gaudi-container-build 23 | ``` 24 | 25 | or whole resulting image URL (this will ignore REGISTRY, GAUDI_IMAGE_NAME, GAUDI_IMAGE_VERSION even if specified): 26 | ```bash 27 | GAUDI_IMAGE_TAG=myregistry/myimagename:myversion make gaudi-container-build 28 | ``` 29 | 30 | To build the container image and push image to the destination registry straight away: 31 | ```bash 32 | REGISTRY=registry.local make gaudi-container-push 33 | ``` 34 | or 35 | ```bash 36 | GAUDI_IMAGE_TAG=registry.local/intel-gaudi-resource-driver:latest make gaudi-container-push 37 | ``` 38 | -------------------------------------------------------------------------------- 
/deployments/gpu/overlays/openshift/securityContextConstraints.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.openshift.io/v1 2 | kind: SecurityContextConstraints 3 | metadata: 4 | name: intel-gpu-resource-driver 5 | allowHostDirVolumePlugin: true 6 | allowHostIPC: false 7 | allowHostNetwork: false 8 | allowHostPID: false 9 | allowHostPorts: false 10 | allowPrivilegeEscalation: true 11 | allowPrivilegedContainer: true 12 | allowedCapabilities: 13 | - '' 14 | allowedUnsafeSysctls: null 15 | defaultAddCapabilities: null 16 | fsGroup: 17 | type: RunAsAny 18 | groups: 19 | - system:cluster-admins 20 | - system:nodes 21 | - system:masters 22 | priority: null 23 | readOnlyRootFilesystem: false 24 | requiredDropCapabilities: ['ALL'] 25 | runAsUser: 26 | type: RunAsAny 27 | seLinuxContext: 28 | type: RunAsAny 29 | # OpenShift has runtime/default as a name of default profile. 30 | # See https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/security_and_compliance/seccomp-profiles#custom-seccomp-profile 31 | seccompProfiles: 32 | - 'runtime/default' 33 | supplementalGroups: 34 | type: RunAsAny 35 | userNamespaceLevel: AllowHostLevel 36 | users: 37 | - system:admin 38 | - system:serviceaccount:intel-gpu-resource-driver:intel-gpu-resource-driver-service-account 39 | volumes: 40 | - '*' 41 | -------------------------------------------------------------------------------- /deployments/qat/overlays/openshift/securityContextConstraints.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.openshift.io/v1 2 | kind: SecurityContextConstraints 3 | metadata: 4 | name: intel-qat-resource-driver 5 | allowHostDirVolumePlugin: true 6 | allowHostIPC: false 7 | allowHostNetwork: false 8 | allowHostPID: false 9 | allowHostPorts: false 10 | allowPrivilegeEscalation: true 11 | allowPrivilegedContainer: true 12 | allowedCapabilities: 13 | - '' 14 | 
allowedUnsafeSysctls: null 15 | defaultAddCapabilities: null 16 | fsGroup: 17 | type: RunAsAny 18 | groups: 19 | - system:cluster-admins 20 | - system:nodes 21 | - system:masters 22 | priority: null 23 | readOnlyRootFilesystem: false 24 | requiredDropCapabilities: ['ALL'] 25 | runAsUser: 26 | type: RunAsAny 27 | seLinuxContext: 28 | type: RunAsAny 29 | # OpenShift has runtime/default as a name of default profile. 30 | # See https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/security_and_compliance/seccomp-profiles#custom-seccomp-profile 31 | seccompProfiles: 32 | - 'runtime/default' 33 | supplementalGroups: 34 | type: RunAsAny 35 | userNamespaceLevel: AllowHostLevel 36 | users: 37 | - system:admin 38 | - system:serviceaccount:intel-qat-resource-driver:intel-qat-resource-driver-service-account 39 | volumes: 40 | - '*' 41 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/openshift/securityContextConstraints.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: security.openshift.io/v1 2 | kind: SecurityContextConstraints 3 | metadata: 4 | name: intel-gaudi-resource-driver 5 | allowHostDirVolumePlugin: true 6 | allowHostIPC: false 7 | allowHostNetwork: false 8 | allowHostPID: false 9 | allowHostPorts: false 10 | allowPrivilegeEscalation: true 11 | allowPrivilegedContainer: true 12 | allowedCapabilities: 13 | - '' 14 | allowedUnsafeSysctls: null 15 | defaultAddCapabilities: null 16 | fsGroup: 17 | type: RunAsAny 18 | groups: 19 | - system:cluster-admins 20 | - system:nodes 21 | - system:masters 22 | priority: null 23 | readOnlyRootFilesystem: false 24 | requiredDropCapabilities: ['ALL'] 25 | runAsUser: 26 | type: RunAsAny 27 | seLinuxContext: 28 | type: RunAsAny 29 | # OpenShift has runtime/default as a name of default profile. 
30 | # See https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/security_and_compliance/seccomp-profiles#custom-seccomp-profile 31 | seccompProfiles: 32 | - 'runtime/default' 33 | supplementalGroups: 34 | type: RunAsAny 35 | userNamespaceLevel: AllowHostLevel 36 | users: 37 | - system:admin 38 | - system:serviceaccount:intel-gaudi-resource-driver:intel-gaudi-resource-driver-service-account 39 | volumes: 40 | - '*' 41 | -------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/modified-clusterconfig.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeadm.k8s.io/v1beta4 2 | kind: InitConfiguration 3 | nodeRegistration: 4 | criSocket: unix:///var/run/containerd/containerd.sock 5 | imagePullPolicy: IfNotPresent 6 | --- 7 | apiVersion: kubeadm.k8s.io/v1beta4 8 | kind: ClusterConfiguration 9 | kubernetesVersion: v1.34.0 10 | apiServer: 11 | extraArgs: 12 | - name: feature-gates 13 | value: DRAExtendedResource=true,DRADeviceTaints=true,DRAAdminAccess=true,DRAPartitionableDevices=true,DRAResourceClaimDeviceStatus=true,ResourceHealthStatus=true 14 | clusterName: kubernetes 15 | controllerManager: 16 | extraArgs: 17 | - name: feature-gates 18 | value: DRADeviceTaints=true 19 | imageRepository: registry.k8s.io 20 | scheduler: 21 | extraArgs: 22 | - name: feature-gates 23 | value: DRAExtendedResource=true,DRADeviceTaints=true,DRAAdminAccess=true,DRAPartitionableDevices=true 24 | --- 25 | apiVersion: kubelet.config.k8s.io/v1beta1 26 | kind: KubeletConfiguration 27 | featureGates: 28 | DRAExtendedResource: true 29 | ResourceHealthStatus: true 30 | cpuManagerPolicy: static 31 | kubeReserved: 32 | cpu: "1" 33 | memory: "2Gi" 34 | ephemeral-storage: "1Gi" 35 | systemReserved: 36 | cpu: "1" 37 | memory: "2Gi" 38 | ephemeral-storage: "1Gi" 39 | -------------------------------------------------------------------------------- 
/deployments/gpu/examples/deployment-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: gpu-4g 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gpu 10 | exactly: 11 | deviceClassName: gpu.intel.com 12 | selectors: 13 | - cel: 14 | expression: device.capacity["gpu.intel.com"].memory.compareTo(quantity("4Gi")) >= 0 15 | 16 | --- 17 | apiVersion: apps/v1 18 | kind: Deployment 19 | metadata: 20 | name: gpu-test 21 | labels: 22 | app: inline-gpu-deployment 23 | spec: 24 | replicas: 1 25 | selector: 26 | matchLabels: 27 | app: inline-gpu-deployment 28 | template: 29 | metadata: 30 | labels: 31 | app: inline-gpu-deployment 32 | spec: 33 | containers: 34 | - name: with-resource 35 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 36 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 60"] 37 | resources: 38 | claims: 39 | - name: resource 40 | - name: without-resource 41 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 42 | command: ["sh", "-c", "ls -la /dev/ && sleep 60"] 43 | resourceClaims: 44 | - name: resource 45 | resourceClaimTemplateName: gpu-4g 46 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/ocp-scc.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: security.openshift.io/v1 3 | kind: SecurityContextConstraints 4 | metadata: 5 | name: {{ .Values.openshift.sccName }} 6 | allowHostDirVolumePlugin: true 7 | allowHostIPC: false 8 | allowHostNetwork: false 9 | allowHostPID: false 10 | allowHostPorts: false 11 | allowPrivilegeEscalation: true 12 | allowPrivilegedContainer: true 13 | allowedCapabilities: 14 | - '' 15 | allowedUnsafeSysctls: null 16 | defaultAddCapabilities: null 17 | fsGroup: 18 | type: RunAsAny 19 | groups: 20 | - 
system:cluster-admins 21 | - system:nodes 22 | - system:masters 23 | priority: null 24 | readOnlyRootFilesystem: false 25 | requiredDropCapabilities: ['ALL'] 26 | runAsUser: 27 | type: RunAsAny 28 | seLinuxContext: 29 | type: RunAsAny 30 | # OpenShift has runtime/default as a name of default profile. 31 | # See https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/security_and_compliance/seccomp-profiles#custom-seccomp-profile 32 | seccompProfiles: 33 | - 'runtime/default' 34 | supplementalGroups: 35 | type: RunAsAny 36 | userNamespaceLevel: AllowHostLevel 37 | users: 38 | - system:admin 39 | - system:serviceaccount:{{ .Release.Namespace }}:{{ include "intel-gaudi-resource-driver.serviceAccountName" . }} 40 | volumes: 41 | - '*' 42 | {{- end }} 43 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/ocp-scc.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: security.openshift.io/v1 3 | kind: SecurityContextConstraints 4 | metadata: 5 | name: {{ .Values.openshift.sccName }} 6 | allowHostDirVolumePlugin: true 7 | allowHostIPC: false 8 | allowHostNetwork: false 9 | allowHostPID: false 10 | allowHostPorts: false 11 | allowPrivilegeEscalation: true 12 | allowPrivilegedContainer: true 13 | allowedCapabilities: 14 | - '' 15 | allowedUnsafeSysctls: null 16 | defaultAddCapabilities: null 17 | fsGroup: 18 | type: RunAsAny 19 | groups: 20 | - system:cluster-admins 21 | - system:nodes 22 | - system:masters 23 | priority: null 24 | readOnlyRootFilesystem: false 25 | requiredDropCapabilities: ['ALL'] 26 | runAsUser: 27 | type: RunAsAny 28 | seLinuxContext: 29 | type: RunAsAny 30 | # OpenShift has runtime/default as a name of default profile. 
31 | # See https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/security_and_compliance/seccomp-profiles#custom-seccomp-profile 32 | seccompProfiles: 33 | - 'runtime/default' 34 | supplementalGroups: 35 | type: RunAsAny 36 | userNamespaceLevel: AllowHostLevel 37 | users: 38 | - system:admin 39 | - system:serviceaccount:{{ .Release.Namespace }}:{{ include "intel-gpu-resource-driver.serviceAccountName" . }} 40 | volumes: 41 | - '*' 42 | {{- end }} 43 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/ocp-scc.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.openshift.enabled }} 2 | apiVersion: security.openshift.io/v1 3 | kind: SecurityContextConstraints 4 | metadata: 5 | name: {{ .Values.openshift.sccName }} 6 | allowHostDirVolumePlugin: true 7 | allowHostIPC: false 8 | allowHostNetwork: false 9 | allowHostPID: false 10 | allowHostPorts: false 11 | allowPrivilegeEscalation: true 12 | allowPrivilegedContainer: true 13 | allowedCapabilities: 14 | - '' 15 | allowedUnsafeSysctls: null 16 | defaultAddCapabilities: null 17 | fsGroup: 18 | type: RunAsAny 19 | groups: 20 | - system:cluster-admins 21 | - system:nodes 22 | - system:masters 23 | priority: null 24 | readOnlyRootFilesystem: false 25 | requiredDropCapabilities: ['ALL'] 26 | runAsUser: 27 | type: RunAsAny 28 | seLinuxContext: 29 | type: RunAsAny 30 | # OpenShift has runtime/default as a name of default profile. 31 | # See https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/security_and_compliance/seccomp-profiles#custom-seccomp-profile 32 | seccompProfiles: 33 | - 'runtime/default' 34 | supplementalGroups: 35 | type: RunAsAny 36 | userNamespaceLevel: AllowHostLevel 37 | users: 38 | - system:admin 39 | - system:serviceaccount:{{ .Release.Namespace }}:{{ include "intel-qat-resource-driver.serviceAccountName" . 
}} 40 | volumes: 41 | - '*' 42 | {{- end }} 43 | -------------------------------------------------------------------------------- /deployments/gaudi/examples/deployment-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: two-gaudi3 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gaudi 10 | exactly: 11 | deviceClassName: gaudi.intel.com 12 | count: 2 13 | selectors: 14 | - cel: 15 | expression: device.attributes["gaudi.intel.com"].model == 'Gaudi3' 16 | 17 | --- 18 | apiVersion: apps/v1 19 | kind: Deployment 20 | metadata: 21 | name: gaudi-test 22 | labels: 23 | app: inline-gaudi-deployment 24 | spec: 25 | replicas: 1 26 | selector: 27 | matchLabels: 28 | app: inline-gaudi-deployment 29 | template: 30 | metadata: 31 | labels: 32 | app: inline-gaudi-deployment 33 | spec: 34 | containers: 35 | - name: with-resource 36 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 37 | command: ["sh", "-c", "ls -la /dev/accel/ && sleep 300"] 38 | resources: 39 | claims: 40 | - name: resource 41 | - name: without-resource 42 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 43 | command: ["sh", "-c", "ls -la /dev/ && sleep 300"] 44 | resourceClaims: 45 | - name: resource 46 | resourceClaimTemplateName: two-gaudi3 47 | -------------------------------------------------------------------------------- /doc/qat/TESTING.md: -------------------------------------------------------------------------------- 1 | # Test Cases 2 | 3 | ## Intel® QAT Device Plugin 4 | There are test cases made for [Intel® QAT Device Plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/qat_plugin/README.md). 5 | It is possible to run those images using this resource driver. Those images are 6 | available in the following links. 
7 | 8 | - [qatlib-sample-code](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/openssl-qat-engine) 9 | - [qat-dpdk-test](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/crypto-perf) 10 | 11 | Build the images in your environment, create a resourceClaimTemplate and run 12 | the pods with the following commands. 13 | ``` 14 | kubectl apply -f deployments/qat/tests/resource-claim-template.yaml 15 | kubectl apply -k deployments/qat/tests/qatlib-sample-code 16 | kubectl apply -k deployments/qat/tests/qat-dpdk-test 17 | ``` 18 | All cases include both crypto and compress tests. 19 | 20 | To run `qat-dpdk-test`, the cluster must have `CPU Manager Policy` set to `static` 21 | in its kubelet configuration. In addition, the `hugepages-2Mi` resource must be 22 | available. 23 | 24 | There is an example [cluster configuration yaml](../../deployments/qat/tests/qat-dpdk-test/modified-clusterconfig.yaml) 25 | that sets the CPU manager policy to static. Re-create the cluster with this 26 | configuration enabled.
27 | -------------------------------------------------------------------------------- /deployments/qat/tests/qatlib-sample-code/qatlib-sample-code.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: qatlib-sample-code-sym 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: qatlib-sample-code-sym 9 | image: openssl-qat-engine:devel 10 | imagePullPolicy: IfNotPresent 11 | command: ["cpa_sample_code", "runTests=1"] 12 | securityContext: 13 | readOnlyRootFilesystem: true 14 | allowPrivilegeEscalation: false 15 | capabilities: 16 | add: 17 | ["IPC_LOCK"] 18 | resources: 19 | claims: 20 | - name: qat-resource-sym 21 | resourceClaims: 22 | - name: qat-resource-sym 23 | resourceClaimTemplateName: qat-template-sym 24 | --- 25 | apiVersion: v1 26 | kind: Pod 27 | metadata: 28 | name: qatlib-sample-code-dc 29 | spec: 30 | restartPolicy: Never 31 | containers: 32 | - name: qatlib-sample-code-dc 33 | image: openssl-qat-engine:devel 34 | imagePullPolicy: IfNotPresent 35 | command: ["cpa_sample_code", "runTests=32"] 36 | securityContext: 37 | readOnlyRootFilesystem: true 38 | allowPrivilegeEscalation: false 39 | capabilities: 40 | add: 41 | ["IPC_LOCK"] 42 | resources: 43 | claims: 44 | - name: qat-resource-dc 45 | resourceClaims: 46 | - name: qat-resource-dc 47 | resourceClaimTemplateName: qat-template-dc 48 | -------------------------------------------------------------------------------- /deployments/gpu/examples/pod-inline-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: claim1 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: gpu 10 | exactly: 11 | deviceClassName: gpu.intel.com 12 | ## 13 | ## if one is not enough 14 | # count: 2 15 | ## 16 | ## requesting particular series 17 | # selectors: 18 | # - cel: 19 | # expression: 
device.attributes["gpu.intel.com"].driver == 'i915' 20 | # - cel: 21 | # expression: device.capacity["gpu.intel.com"].memory.compareTo(quantity("4Gi")) >= 0 22 | 23 | ## for monitoring 24 | # adminAccess: true # requires 'resource.kubernetes.io/admin-access: true' label 25 | # allocationMode: "All" 26 | --- 27 | apiVersion: v1 28 | kind: Pod 29 | metadata: 30 | name: test-inline-claim 31 | spec: 32 | restartPolicy: Never 33 | containers: 34 | - name: with-resource 35 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 36 | command: ["sh", "-c", "ls -la /dev/dri/ && sleep 20"] 37 | resources: 38 | claims: 39 | - name: resource 40 | - name: without-resource 41 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 42 | command: ["sh", "-c", "ls -la /dev/ && sleep 20"] 43 | resourceClaims: 44 | - name: resource 45 | resourceClaimTemplateName: claim1 46 | -------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/crypto-perf.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: Pod 3 | apiVersion: v1 4 | metadata: 5 | name: qat-dpdk-test-crypto-perf 6 | spec: 7 | containers: 8 | - name: crypto-perf 9 | image: crypto-perf:devel 10 | imagePullPolicy: IfNotPresent 11 | env: 12 | - name: TESTCMD 13 | value: "crypto" 14 | - name: PTEST 15 | value: "--ptest throughput --devtype crypto_qat --optype cipher-only --cipher-algo aes-cbc --cipher-op encrypt --cipher-key-sz 16 --total-ops 10000000 --burst-sz 32 --buffer-sz 64" 16 | volumeMounts: 17 | - mountPath: /dev/hugepages 18 | name: hugepage 19 | - mountPath: /var/run/dpdk 20 | name: dpdk-runtime 21 | resources: 22 | claims: 23 | - name: qat-resource-sym 24 | requests: 25 | cpu: "3" 26 | memory: "128Mi" 27 | hugepages-2Mi: "128Mi" 28 | limits: 29 | cpu: "3" 30 | memory: "128Mi" 31 | hugepages-2Mi: "128Mi" 32 | securityContext: 33 | readOnlyRootFilesystem: true 34 | allowPrivilegeEscalation: false 35 | 
capabilities: 36 | add: 37 | ["IPC_LOCK"] 38 | restartPolicy: Never 39 | volumes: 40 | - name: dpdk-runtime 41 | emptyDir: 42 | medium: Memory 43 | - name: hugepage 44 | emptyDir: 45 | medium: HugePages 46 | resourceClaims: 47 | - name: qat-resource-sym 48 | resourceClaimTemplateName: qat-template-sym 49 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicy 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gpu 5 | spec: 6 | failurePolicy: Fail 7 | matchConstraints: 8 | resourceRules: 9 | - apiGroups: ["resource.k8s.io"] 10 | apiVersions: ["v1"] 11 | operations: ["CREATE", "UPDATE", "DELETE"] 12 | resources: ["resourceslices"] 13 | matchConditions: 14 | - name: isRestrictedUser 15 | expression: >- 16 | request.userInfo.username == "system:serviceaccount:{{ .Release.Namespace }}:{{ include "intel-gpu-resource-driver.serviceAccountName" . }}" 17 | variables: 18 | - name: userNodeName 19 | expression: >- 20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 21 | - name: objectNodeName 22 | expression: >- 23 | (request.operation == "DELETE" ? 
oldObject : object).spec.?nodeName.orValue("") 24 | validations: 25 | - expression: variables.userNodeName != "" 26 | message: >- 27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 28 | - expression: variables.userNodeName == variables.objectNodeName 29 | messageExpression: >- 30 | "this user running on node '"+variables.userNodeName+"' may not modify " + 31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 32 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/validating-admission-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicy 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-qat 5 | spec: 6 | failurePolicy: Fail 7 | matchConstraints: 8 | resourceRules: 9 | - apiGroups: ["resource.k8s.io"] 10 | apiVersions: ["v1"] 11 | operations: ["CREATE", "UPDATE", "DELETE"] 12 | resources: ["resourceslices"] 13 | matchConditions: 14 | - name: isRestrictedUser 15 | expression: >- 16 | request.userInfo.username == "system:serviceaccount:{{ .Release.Namespace }}:{{ include "intel-qat-resource-driver.serviceAccountName" . }}" 17 | variables: 18 | - name: userNodeName 19 | expression: >- 20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 21 | - name: objectNodeName 22 | expression: >- 23 | (request.operation == "DELETE" ? 
oldObject : object).spec.?nodeName.orValue("") 24 | validations: 25 | - expression: variables.userNodeName != "" 26 | message: >- 27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 28 | - expression: variables.userNodeName == variables.objectNodeName 29 | messageExpression: >- 30 | "this user running on node '"+variables.userNodeName+"' may not modify " + 31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 32 | -------------------------------------------------------------------------------- /doc/cdi-spec-generator/README.md: -------------------------------------------------------------------------------- 1 | # Intel CDI Spec Generator 2 | 3 | ## Overview 4 | The Intel CDI Specs Generator is a command line tool to generate Container Device Interface (CDI) specifications for supported accelerators. 5 | 6 | ## Prerequisites 7 | - Administrative privileges on the system to write CDI specs. 8 | 9 | ## Usage 10 | Execute the built executable with the type of device you wish to generate CDI specs for: 11 | ```bash 12 | intel-cdi-specs-generator 13 | ``` 14 | 15 | Supported device types: 16 | - gpu: Use this option to generate CDI specs for Intel GPUs. 17 | - gaudi: Use this option to generate CDI specs for Intel Gaudi accelerators. 18 | 19 | ## Display Version 20 | To display the version of the binary, use the following command: 21 | ```bash 22 | intel-cdi-specs-generator --version 23 | ``` 24 | 25 | ## Example Usage 26 | To generate CDI specifications for GPUs, run the tool with gpu as an argument: 27 | ```bash 28 | intel-cdi-specs-generator gpu 29 | ``` 30 | This command will detect supported GPUs on the system, and ensure that there is a CDI device record for each of them. 
31 | 32 | 33 | ## Building 34 | - [How to build CDI Spec Generator](BUILD.md) 35 | 36 | ## Releases 37 | The binary is available for download in the releases section: 38 | - [Intel Resource Drivers for Kubernetes releases](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases) 39 | - [CDI Spec Generator v0.1.0](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases/tag/specs-generator-v0.1.0) 40 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicy 3 | metadata: 4 | name: resourceslices-policy-dra-kubelet-plugin-gaudi 5 | spec: 6 | failurePolicy: Fail 7 | matchConstraints: 8 | resourceRules: 9 | - apiGroups: ["resource.k8s.io"] 10 | apiVersions: ["v1"] 11 | operations: ["CREATE", "UPDATE", "DELETE"] 12 | resources: ["resourceslices"] 13 | matchConditions: 14 | - name: isRestrictedUser 15 | expression: >- 16 | request.userInfo.username == "system:serviceaccount:{{ .Release.Namespace }}:{{ include "intel-gaudi-resource-driver.serviceAccountName" . }}" 17 | variables: 18 | - name: userNodeName 19 | expression: >- 20 | request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('') 21 | - name: objectNodeName 22 | expression: >- 23 | (request.operation == "DELETE" ? 
oldObject : object).spec.?nodeName.orValue("") 24 | validations: 25 | - expression: variables.userNodeName != "" 26 | message: >- 27 | no node association found for user, this user must run in a pod on a node and ServiceAccountTokenPodNodeInfo must be enabled 28 | - expression: variables.userNodeName == variables.objectNodeName 29 | messageExpression: >- 30 | "this user running on node '"+variables.userNodeName+"' may not modify " + 31 | (variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'") 32 | -------------------------------------------------------------------------------- /deployments/qat/tests/qat-dpdk-test/compress-perf.yaml: -------------------------------------------------------------------------------- 1 | kind: Pod 2 | apiVersion: v1 3 | metadata: 4 | name: qat-dpdk-test-compress-perf 5 | spec: 6 | containers: 7 | - name: compress-perf 8 | image: crypto-perf:devel 9 | imagePullPolicy: IfNotPresent 10 | env: 11 | - name: TESTCMD 12 | value: "compress" 13 | - name: PTEST 14 | value: "--driver-name compress_qat --input-file /var/data/file.txt --seg-sz 8192 --compress-level 1:1:9 --num-iter 10 --extended-input-sz 1048576 --max-num-sgl-segs 16 --huffman-enc fixed" 15 | volumeMounts: 16 | - mountPath: /dev/hugepages 17 | name: hugepage 18 | - mountPath: /var/run/dpdk 19 | name: dpdk-runtime 20 | - mountPath: /var/data/ 21 | name: testfile 22 | resources: 23 | claims: 24 | - name: qat-resource-dc 25 | requests: 26 | cpu: "3" 27 | memory: "128Mi" 28 | hugepages-2Mi: "128Mi" 29 | limits: 30 | cpu: "3" 31 | memory: "128Mi" 32 | hugepages-2Mi: "128Mi" 33 | securityContext: 34 | readOnlyRootFilesystem: true 35 | allowPrivilegeEscalation: false 36 | capabilities: 37 | add: 38 | ["IPC_LOCK"] 39 | restartPolicy: Never 40 | volumes: 41 | - name: dpdk-runtime 42 | emptyDir: 43 | medium: Memory 44 | - name: hugepage 45 | emptyDir: 46 | medium: HugePages 47 | - name: testfile 48 | configMap: 49 | name: test-data 
// LocateRepoFile locates a file inside this repository.
//
// repopath is a path relative to the repository root. Candidates are probed in
// order and the first one that exists is returned:
//
//  1. $PLUGINS_REPO_DIR/repopath, when PLUGINS_REPO_DIR is set and non-empty;
//  2. <working directory>/repopath;
//  3. <working directory>/../../repopath.
//
// A candidate counts as found only when os.Stat succeeds on it, so a path that
// cannot be inspected (for example because one of its parent components is a
// regular file) is skipped instead of being reported as a hit. An error is
// returned when the working directory cannot be determined or when none of the
// candidates exists; in both cases the returned path is empty.
func LocateRepoFile(repopath string) (string, error) {
	// exists reports whether path can be stat'ed successfully.
	exists := func(path string) bool {
		_, err := os.Stat(path)
		return err == nil
	}

	if root := os.Getenv("PLUGINS_REPO_DIR"); root != "" {
		if path := filepath.Join(root, repopath); exists(path) {
			return path, nil
		}
	}

	currentDir, err := os.Getwd()
	if err != nil {
		return "", err
	}

	// Fall back to the working directory and to the directory two levels above it.
	for _, base := range []string{currentDir, filepath.Join(currentDir, "..", "..")} {
		if path := filepath.Join(base, repopath); exists(path) {
			return path, nil
		}
	}

	return "", errors.New("no file found, try to define PLUGINS_REPO_DIR pointing to the root of the repository")
}
43 | func GetPodLogs(ctx context.Context, f *framework.Framework, podName, containerName string) string { 44 | log, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, podName, containerName) 45 | if err != nil { 46 | return fmt.Sprintf("unable to get log from pod: %v", err) 47 | } 48 | 49 | return fmt.Sprintf("log output of the container %s in the pod %s:%s", containerName, podName, log) 50 | } 51 | -------------------------------------------------------------------------------- /pkg/version/version.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package version 18 | 19 | import ( 20 | "runtime" 21 | 22 | "k8s.io/klog/v2" 23 | ) 24 | 25 | // These are set during build time via -ldflags. 26 | var ( 27 | version = "N/A" 28 | gitCommit = "N/A" 29 | buildDate = "N/A" 30 | ) 31 | 32 | // GetVersion returns the version information of the driver. 
// PrintDriverVersion logs, via klog at info level, the given apiGroupName as
// the driver name together with the driver version, git commit, build date,
// Go runtime version, compiler and target platform (GOOS/GOARCH).
func PrintDriverVersion(apiGroupName string) {
	klog.Infof(`
Driver Name: %v,
Driver Version: %v,
Git Commit: %v,
Build Date: %v,
Go Version: %v,
Compiler: %v,
Platform: %v/%v`,
		apiGroupName,
		version,
		gitCommit,
		buildDate,
		runtime.Version(),
		runtime.Compiler,
		runtime.GOOS,
		runtime.GOARCH,
	)
}

// GetVersion returns the value of the package-level version variable.
func GetVersion() string {
	return version
}

// GetGitCommit returns the value of the package-level gitCommit variable.
func GetGitCommit() string {
	return gitCommit
}

// GetBuildDate returns the value of the package-level buildDate variable.
func GetBuildDate() string {
	return buildDate
}
85 | -------------------------------------------------------------------------------- /doc/gaudi/README.md: -------------------------------------------------------------------------------- 1 | # Intel Gaudi resource driver for Kubernetes 2 | 3 | CAUTION: This is a beta / non-production software, do not use on production clusters. 4 | 5 | ## About resource driver 6 | 7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates 8 | the resources and resource driver's kubelet-plugin ensures that the allocated devices are prepared 9 | and available for Pods. 10 | 11 | DRA API graduated to GA with v1 resource.k8s.io API in K8s v1.34, backwards compatibility may vary 12 | depending on features enabled. 13 | 14 | ## Supported Kubernetes Versions 15 | 16 | Supported Kubernetes versions are listed below: 17 | 18 | | Branch | Kubernetes branch/version | Status | DRA | 19 | |:------------------|:--------------------------------|:------------|:-------------------------------| 20 | | v0.1.0 | Kubernetes v1.27 ~ v1.30 | supported | Classic, Structured Parameters | 21 | | v0.2.0 | Kubernetes v1.31 | unsupported | Structured Parameters | 22 | | v0.3.0 | Kubernetes v1.32+ | unsupported | Structured Parameters | 23 | | v0.4.0 | Kubernetes v1.32+ | unsupported | Structured Parameters | 24 | | v0.5.0 | Kubernetes v1.33-v1.34 | unsupported | Structured Parameters | 25 | | v0.6.0 | Kubernetes v1.32+ | supported | Structured Parameters | 26 | 27 | ## Documentation 28 | 29 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md) 30 | - [How to deploy and use Intel Gaudi resource driver](USAGE.md) 31 | - Optional: [How to build Intel Gaudi resource driver container image](BUILD.md) 32 | -------------------------------------------------------------------------------- /doc/gpu/complete-overview.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | 3 | left to right direction 4 
| allowmixing 5 | 6 | 7 | component "CRD resource-classes" { 8 | component "resource-class0" { 9 | component "CRD resource-class0-parameters" 10 | } 11 | component "resource-class1" { 12 | component "CRD resource-class1-parameters" 13 | } 14 | } 15 | 16 | component "CRD nodeallocationstats" as crdnas { 17 | cloud "node0" as nasnode0 { 18 | component "allocatable GPUs" as allocatable 19 | component "claim-requests" as requests 20 | component "claim-allocations" as allocations 21 | } 22 | } 23 | 24 | node "control-plane" as cp { 25 | component "Scheduler / DRA-controller" as scheduler 26 | component "R-D controller" as rdcontroller 27 | component "API" as api 28 | } 29 | 30 | node "node0" as wn { 31 | component "Pod" as pod 32 | component "R-D kubelet-plugin" as rdplugin 33 | } 34 | 35 | component "resourceclaim0\n\nresource-class0\nparametersRef:" as resclaim0 { 36 | component resclaimparams0 [ 37 | type: gpu, 38 | memory: 256, 39 | millicores: 100 40 | ] 41 | } 42 | 43 | package "Pod.yaml" as podyaml { 44 | } 45 | 46 | package "ResourceClaim.yaml" as resclaimyaml { 47 | } 48 | 49 | podyaml ..> api : deploy 50 | resclaimyaml ..> api : deploy 51 | 52 | cloud "Schedule Pod" as schedulepod { 53 | } 54 | 55 | api ..> schedulepod 56 | schedulepod ..> scheduler 57 | rdplugin --> allocatable : 0. populate & sync with CDI/CRD 58 | api --> resclaim0 : 1. create 59 | resclaim0 --> rdcontroller : 2. notify 60 | rdcontroller --> requests : 3. create 61 | rdcontroller --> requests : 3. create 62 | crdnas --> rdplugin : 4. allocate and update 63 | 64 | scheduler <=> rdcontroller : unsuitableNodes 65 | rdcontroller --> nasnode0 : enough resources? 66 | 67 | @enduml 68 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for intel-gpu-resource-driver. 
2 | nameOverride: "" 3 | fullnameOverride: "" 4 | selectorLabelsOverride: {} 5 | 6 | # To enable DRA resources allocation as extended resources. K8s v1.33-v1.36 requires FeatureGate. 7 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource . 8 | enableDRAExtendedResources: false 9 | extendedResourceName: intel.com/gpu 10 | 11 | imagePullSecrets: [] 12 | image: 13 | repository: ghcr.io/intel/intel-resource-drivers-for-kubernetes 14 | name: intel-gpu-resource-driver 15 | pullPolicy: IfNotPresent 16 | tag: "v0.9.1" 17 | 18 | serviceAccount: 19 | create: true 20 | annotations: {} 21 | name: "" 22 | automount: true 23 | 24 | openshift: 25 | enabled: false 26 | sccName: intel-gpu-resource-driver 27 | 28 | kubeletPlugin: 29 | podAnnotations: {} 30 | nodeSelector: {} 31 | # Label used when nfd.enabled is true. 32 | # Changes to this are ignored when .Values.nodeFeatureRules.enabled or .Values.nfd.enabled . 33 | #intel.feature.node.kubernetes.io/gpu: "true" 34 | tolerations: 35 | - key: node-role.kubernetes.io/master 36 | operator: Exists 37 | effect: NoSchedule 38 | - key: node-role.kubernetes.io/control-plane 39 | operator: Exists 40 | effect: NoSchedule 41 | # Refer to the official documentation for Node Feature Discovery (NFD) 42 | # regarding node tainting: 43 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting 44 | - key: "node.kubernetes.io/gpu" 45 | operator: "Exists" 46 | effect: "NoSchedule" 47 | affinity: {} 48 | 49 | cdi: 50 | staticPath: /etc/cdi 51 | dynamicPath: /var/run/cdi 52 | 53 | nodeFeatureRules: 54 | enabled: false 55 | 56 | nfd: 57 | enabled: false # change to true to install NFD to the cluster 58 | nameOverride: intel-gpu-nfd 59 | enableNodeFeatureApi: true 60 | -------------------------------------------------------------------------------- /cmd/kubelet-qat-plugin/config.go: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 
Intel Corporation 2 | * SPDX-License-Identifier: Apache-2.0 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "encoding/json" 9 | "fmt" 10 | "os" 11 | 12 | "k8s.io/klog/v2" 13 | 14 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/qat/device" 15 | ) 16 | 17 | const defaultConfigFile = "/defaults/qatdefaults.config" 18 | 19 | func readConfigFile(hostname string) (map[string]string, error) { 20 | configBytes, err := os.ReadFile(defaultConfigFile) 21 | if err != nil { 22 | return nil, err 23 | } 24 | 25 | var configFile map[string]map[string]string 26 | if err := json.Unmarshal(configBytes, &configFile); err != nil { 27 | return nil, err 28 | } 29 | 30 | hostConfig, exists := configFile[hostname] 31 | if !exists { 32 | return nil, fmt.Errorf("no config for host '%s' found", hostname) 33 | } 34 | 35 | return hostConfig, nil 36 | } 37 | 38 | func getDefaultConfiguration(hostname string, q device.QATDevices) error { 39 | serviceconfig, err := readConfigFile(hostname) 40 | if err != nil { 41 | klog.Infof("Could not read default config file - leaving unconfigured: %v", err) 42 | return nil 43 | } 44 | 45 | klog.V(5).Infof("Default config for host '%s':", hostname) 46 | for _, pf := range q { 47 | if servicestr, exists := serviceconfig[pf.Device]; exists { 48 | var services device.Services 49 | var err error 50 | 51 | if services, err = device.StringToServices(servicestr); err != nil { 52 | klog.Warningf("Error parsing default config services for PF device '%s': %v", pf.Device, err) 53 | continue 54 | } 55 | 56 | if err := pf.SetServices([]device.Services{services}); err != nil { 57 | klog.Warningf("Error configuring services '%s' for PF device '%s': %v", services.String(), pf.Device, err) 58 | continue 59 | } 60 | 61 | klog.V(5).Infof("PF device '%s' configured with services %s'", pf.Device, services.String()) 62 | } 63 | } 64 | 65 | return nil 66 | } 67 | -------------------------------------------------------------------------------- 
/charts/intel-gpu-resource-driver/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* Define common helpers */}} 2 | {{- define "intel-gpu-resource-driver.chart" -}} 3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 4 | {{- end }} 5 | 6 | {{/* Define the base name for the driver */}} 7 | {{- define "intel-gpu-resource-driver.baseName" -}} 8 | intel-gpu-resource-driver 9 | {{- end }} 10 | 11 | {{- define "intel-gpu-resource-driver.name" -}} 12 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 13 | {{- end }} 14 | 15 | {{- define "intel-gpu-resource-driver.fullname" -}} 16 | {{- if .Values.fullnameOverride -}} 17 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 18 | {{- else -}} 19 | {{- printf "%s-%s" (include "intel-gpu-resource-driver.baseName" .) .Release.Name | trunc 63 | trimSuffix "-" -}} 20 | {{- end -}} 21 | {{- end }} 22 | 23 | {{/* Labels for templates */}} 24 | {{- define "intel-gpu-resource-driver.labels" -}} 25 | helm.sh/chart: {{ include "intel-gpu-resource-driver.chart" . }} 26 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 27 | app.kubernetes.io/managed-by: {{ .Release.Service }} 28 | {{- end }} 29 | 30 | {{- define "intel-gpu-resource-driver.clusterRoleName" -}} 31 | {{- printf "%s-role" (include "intel-gpu-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 32 | {{- end }} 33 | 34 | {{- define "intel-gpu-resource-driver.clusterRoleBindingName" -}} 35 | {{- printf "%s-rolebinding" (include "intel-gpu-resource-driver.baseName" .) 
| trunc 63 | trimSuffix "-" }} 36 | {{- end }} 37 | 38 | {{- define "intel-gpu-resource-driver.serviceAccountName" -}} 39 | {{- if .Values.serviceAccount.create -}} 40 | {{- default "intel-gpu-sa" .Values.serviceAccount.name -}} 41 | {{- end -}} 42 | {{- end }} 43 | 44 | {{/* Define full image name */}} 45 | {{- define "intel-gpu-resource-driver.fullimage" -}} 46 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}} 47 | {{- end }} 48 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* Define common helpers */}} 2 | {{- define "intel-qat-resource-driver.chart" -}} 3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 4 | {{- end }} 5 | 6 | {{/* Define the base name for the driver */}} 7 | {{- define "intel-qat-resource-driver.baseName" -}} 8 | intel-qat-resource-driver 9 | {{- end }} 10 | 11 | {{- define "intel-qat-resource-driver.name" -}} 12 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 13 | {{- end }} 14 | 15 | {{- define "intel-qat-resource-driver.fullname" -}} 16 | {{- if .Values.fullnameOverride -}} 17 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 18 | {{- else -}} 19 | {{- printf "%s-%s" (include "intel-qat-resource-driver.baseName" .) .Release.Name | trunc 63 | trimSuffix "-" -}} 20 | {{- end -}} 21 | {{- end }} 22 | 23 | {{/* Labels for templates */}} 24 | {{- define "intel-qat-resource-driver.labels" -}} 25 | helm.sh/chart: {{ include "intel-qat-resource-driver.chart" . }} 26 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 27 | app.kubernetes.io/managed-by: {{ .Release.Service }} 28 | {{- end }} 29 | 30 | {{- define "intel-qat-resource-driver.clusterRoleName" -}} 31 | {{- printf "%s-role" (include "intel-qat-resource-driver.baseName" .) 
| trunc 63 | trimSuffix "-" }} 32 | {{- end }} 33 | 34 | {{- define "intel-qat-resource-driver.clusterRoleBindingName" -}} 35 | {{- printf "%s-rolebinding" (include "intel-qat-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 36 | {{- end }} 37 | 38 | {{- define "intel-qat-resource-driver.serviceAccountName" -}} 39 | {{- if .Values.serviceAccount.create -}} 40 | {{- default "intel-qat-sa" .Values.serviceAccount.name -}} 41 | {{- end -}} 42 | {{- end }} 43 | 44 | {{/* Define full image name */}} 45 | {{- define "intel-qat-resource-driver.fullimage" -}} 46 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}} 47 | {{- end }} 48 | -------------------------------------------------------------------------------- /gaudi.mk: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | GAUDI_VERSION ?= v0.6.1 17 | GAUDI_IMAGE_NAME ?= intel-gaudi-resource-driver 18 | GAUDI_IMAGE_VERSION ?= $(GAUDI_VERSION) 19 | GAUDI_IMAGE_TAG ?= $(REGISTRY)/$(GAUDI_IMAGE_NAME):$(GAUDI_IMAGE_VERSION) 20 | 21 | GAUDI_BINARIES = \ 22 | bin/kubelet-gaudi-plugin 23 | 24 | GAUDI_COMMON_SRC = \ 25 | $(COMMON_SRC) \ 26 | pkg/gaudi/cdihelpers/*.go \ 27 | pkg/gaudi/device/*.go \ 28 | pkg/gaudi/discovery/*.go 29 | 30 | GAUDI_LDFLAGS = ${LDFLAGS} -extldflags ${EXT_LDFLAGS} -X ${PKG}/pkg/version.version=${GAUDI_VERSION} 31 | 32 | .PHONY: gaudi 33 | gaudi: $(GAUDI_BINARIES) 34 | 35 | bin/kubelet-gaudi-plugin: cmd/kubelet-gaudi-plugin/*.go $(GAUDI_COMMON_SRC) 36 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 37 | go build -a -ldflags "${GAUDI_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-gaudi-plugin 38 | 39 | .PHONY: gaudi-container-build 40 | gaudi-container-build: cleanall vendor 41 | @echo "Building Gaudi resource driver container..." 42 | $(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(GAUDI_IMAGE_TAG) \ 43 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) \ 44 | --build-arg http_proxy=$(http_proxy) \ 45 | --build-arg https_proxy=$(https_proxy) \ 46 | --build-arg no_proxy=$(no_proxy) \ 47 | -f Dockerfile.gaudi . 48 | 49 | .PHONY: gaudi-container-push 50 | gaudi-container-push: gaudi-container-build 51 | $(DOCKER) push $(GAUDI_IMAGE_TAG) 52 | -------------------------------------------------------------------------------- /pkg/gpu/drm/drm.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package drm 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "path" 23 | "strconv" 24 | 25 | "k8s.io/klog/v2" 26 | 27 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gpu/device" 28 | ) 29 | 30 | // DeduceCardAndRenderdIndexes arg is device "/bus/pci/drivers/i915//drm/" path. 31 | func DeduceCardAndRenderdIndexes(sysfsDeviceDir string) (uint64, uint64, error) { 32 | var cardIdx uint64 33 | var renderDidx uint64 34 | 35 | // get card and renderD indexes 36 | drmDir := path.Join(sysfsDeviceDir, "drm") 37 | drmFiles, err := os.ReadDir(drmDir) 38 | if err != nil { // ignore this device 39 | return 0, 0, fmt.Errorf("cannot read device folder %v: %v", drmDir, err) 40 | } 41 | 42 | for _, drmFile := range drmFiles { 43 | drmFileName := drmFile.Name() 44 | if device.CardRegexp.MatchString(drmFileName) { 45 | cardIdx, err = strconv.ParseUint(drmFileName[4:], 10, 64) 46 | if err != nil { 47 | return 0, 0, fmt.Errorf("failed to parse index of DRM card device '%v', skipping", drmFileName) 48 | } 49 | } else if device.RenderdRegexp.MatchString(drmFileName) { 50 | renderDidx, err = strconv.ParseUint(drmFileName[7:], 10, 64) 51 | if err != nil { 52 | klog.Errorf("failed to parse renderDN device: %v, skipping", drmFileName) 53 | continue 54 | } 55 | } 56 | } 57 | 58 | return cardIdx, renderDidx, nil 59 | } 60 | -------------------------------------------------------------------------------- /cmd/kubelet-gaudi-plugin/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | 
* Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | 23 | "github.com/urfave/cli/v2" 24 | 25 | gaudi "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gaudi/device" 26 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/helpers" 27 | ) 28 | 29 | type GaudiFlags struct { 30 | GaudiHookPath string 31 | GaudinetPath string 32 | } 33 | 34 | func main() { 35 | gaudiFlags := GaudiFlags{ 36 | GaudiHookPath: gaudi.DefaultHabanaHookPath, 37 | GaudinetPath: gaudi.DefaultGaudinetPath, 38 | } 39 | cliFlags := []cli.Flag{ 40 | &cli.StringFlag{ 41 | Name: "gaudi-hook-path", 42 | Aliases: []string{"p"}, 43 | Usage: "full path to the habana-container-hook", 44 | Value: gaudi.DefaultHabanaHookPath, 45 | Destination: &gaudiFlags.GaudiHookPath, 46 | EnvVars: []string{"GAUDI_HOOK_PATH"}, 47 | }, 48 | &cli.StringFlag{ 49 | Name: "gaudinet-path", 50 | Aliases: []string{"n"}, 51 | Usage: "full path to the network configuration file", 52 | Value: gaudi.DefaultGaudinetPath, 53 | Destination: &gaudiFlags.GaudinetPath, 54 | EnvVars: []string{"GAUDINET_PATH"}, 55 | }, 56 | } 57 | 58 | if err := helpers.NewApp(gaudi.DriverName, newDriver, cliFlags, &gaudiFlags).Run(os.Args); err != nil { 59 | fmt.Fprintf(os.Stderr, "Error: %v\n", err) 60 | os.Exit(1) 61 | } 62 | } 63 | 
-------------------------------------------------------------------------------- /Dockerfile.gaudi-test: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | FROM golang:1.24.2@sha256:b51b7beeabe2e2d8438ba4295c59d584049873a480ba0e7b56d80db74b3e3a3a AS go 15 | 16 | FROM ubuntu:24.04@sha256:80dd3c3b9c6cecb9f1667e9290b3bc61b78c2678c02cbdae5f0fea92cc6734ab AS ubuntu 17 | ARG UID=1000 18 | ARG GID=1000 19 | COPY --from=go --chown=${UID}:${GID} /usr/local/go /home/ubuntu/go 20 | 21 | # add xpu-smi shared library for GPU tests and other dependencies 22 | RUN \ 23 | apt-get update && \ 24 | apt-get install -y make gcc wget software-properties-common python3-launchpadlib git && \ 25 | add-apt-repository -y ppa:kobuk-team/intel-graphics && \ 26 | apt-get update && \ 27 | apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-opencl-icd clinfo intel-gsc && \ 28 | wget -qO /tmp/xpu-smi.deb https://github.com/intel/xpumanager/releases/download/V1.3.1/xpu-smi_1.3.1_20250724.061629.60921e5e_u24.04_amd64.deb && \ 29 | apt-get install -y /tmp/xpu-smi.deb && \ 30 | rm /tmp/xpu-smi.deb && \ 31 | unset http_proxy https_proxy no_proxy && \ 32 | echo 'export PATH=/home/ubuntu/go/bin:$PATH' >> /home/ubuntu/.bashrc 33 | 34 | RUN \ 35 | mkdir /github && \ 36 | chmod 777 /github && \ 37 | 
mkdir /home/ubuntu/src && \ 38 | chown -R ${UID}:${GID} /home/ubuntu 39 | 40 | ENV GOCACHE=/home/ubuntu/.cache/go-build 41 | ENV GOMODCACHE=/home/ubuntu/.cache/go-mod 42 | ENV PATH=/home/ubuntu/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 43 | 44 | USER ubuntu 45 | WORKDIR /home/ubuntu 46 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* Define common helpers */}} 2 | {{- define "intel-gaudi-resource-driver.chart" -}} 3 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 4 | {{- end }} 5 | 6 | {{/* Define the base name for the driver */}} 7 | {{- define "intel-gaudi-resource-driver.baseName" -}} 8 | intel-gaudi-resource-driver 9 | {{- end }} 10 | 11 | {{/* Specific helpers */}} 12 | {{- define "intel-gaudi-resource-driver.name" -}} 13 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 14 | {{- end }} 15 | 16 | {{/* Create a default fully qualified app name */}} 17 | {{- define "intel-gaudi-resource-driver.fullname" -}} 18 | {{- if .Values.fullnameOverride -}} 19 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 20 | {{- else -}} 21 | {{- printf "%s-%s" (include "intel-gaudi-resource-driver.baseName" .) .Release.Name | trunc 63 | trimSuffix "-" -}} 22 | {{- end -}} 23 | {{- end }} 24 | 25 | {{/* Labels for templates */}} 26 | {{- define "intel-gaudi-resource-driver.labels" -}} 27 | helm.sh/chart: {{ include "intel-gaudi-resource-driver.chart" . }} 28 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 29 | app.kubernetes.io/managed-by: {{ .Release.Service }} 30 | {{- end }} 31 | 32 | {{- define "intel-gaudi-resource-driver.clusterRoleName" -}} 33 | {{- printf "%s-role" (include "intel-gaudi-resource-driver.baseName" .) 
| trunc 63 | trimSuffix "-" }} 34 | {{- end }} 35 | 36 | {{- define "intel-gaudi-resource-driver.clusterRoleBindingName" -}} 37 | {{- printf "%s-rolebinding" (include "intel-gaudi-resource-driver.baseName" .) | trunc 63 | trimSuffix "-" }} 38 | {{- end }} 39 | 40 | {{- define "intel-gaudi-resource-driver.serviceAccountName" -}} 41 | {{- if .Values.serviceAccount.create -}} 42 | {{- default "intel-gaudi-sa" .Values.serviceAccount.name -}} 43 | {{- end -}} 44 | {{- end }} 45 | 46 | {{/* Define full image name */}} 47 | {{- define "intel-gaudi-resource-driver.fullimage" -}} 48 | {{- printf "%s/%s:%s" .Values.image.repository .Values.image.name .Values.image.tag -}} 49 | {{- end }} 50 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for intel-qat-resource-driver. 2 | nameOverride: "" 3 | fullnameOverride: "" 4 | selectorLabelsOverride: {} 5 | 6 | # To enable DRA resources allocation as extended resources. K8s v1.33-v1.36 requires FeatureGate. 7 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource . 8 | enableDRAExtendedResources: false 9 | extendedResourceName: intel.com/qat 10 | 11 | imagePullSecrets: [] 12 | image: 13 | repository: ghcr.io/intel/intel-resource-drivers-for-kubernetes 14 | name: intel-qat-resource-driver 15 | pullPolicy: IfNotPresent 16 | tag: "v0.4.1" 17 | 18 | serviceAccount: 19 | create: true 20 | annotations: {} 21 | name: "" 22 | automount: true 23 | 24 | openshift: 25 | enabled: false 26 | sccName: intel-qat-resource-driver 27 | 28 | kubeletPlugin: 29 | podAnnotations: {} 30 | nodeSelector: {} 31 | # Label used when nfd.enabled is true. 32 | # Changes to this are ignored when .Values.nodeFeatureRules.enabled or .Values.nfd.enabled . 
33 | #feature.node.kubernetes.io/qat: "true" 34 | tolerations: 35 | - key: node-role.kubernetes.io/master 36 | operator: Exists 37 | effect: NoSchedule 38 | - key: node-role.kubernetes.io/control-plane 39 | operator: Exists 40 | effect: NoSchedule 41 | # Refer to the official documentation for Node Feature Discovery (NFD) 42 | # regarding node tainting: 43 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting 44 | - key: "node.kubernetes.io/qat" 45 | operator: "Exists" 46 | effect: "NoSchedule" 47 | affinity: {} 48 | 49 | cdi: 50 | staticPath: /etc/cdi 51 | dynamicPath: /var/run/cdi 52 | 53 | nodeFeatureRules: 54 | enabled: false 55 | 56 | nfd: 57 | enabled: false # change to true to install NFD to the cluster 58 | nameOverride: intel-qat-nfd 59 | # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16): 60 | # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters 61 | enableNodeFeatureApi: true 62 | -------------------------------------------------------------------------------- /gpu.mk: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Use a custom version for E2E tests if we are testing in CI 16 | GPU_VERSION ?= v0.9.1 17 | GPU_IMAGE_NAME ?= intel-gpu-resource-driver 18 | GPU_IMAGE_VERSION ?= $(GPU_VERSION) 19 | GPU_IMAGE_TAG ?= $(REGISTRY)/$(GPU_IMAGE_NAME):$(GPU_IMAGE_VERSION) 20 | 21 | GPU_BINARIES = \ 22 | bin/kubelet-gpu-plugin 23 | 24 | GPU_COMMON_SRC = \ 25 | $(COMMON_SRC) \ 26 | pkg/gpu/cdihelpers/*.go \ 27 | pkg/gpu/device/*.go \ 28 | pkg/gpu/discovery/*.go 29 | 30 | GPU_LDFLAGS = ${LDFLAGS} -X ${PKG}/pkg/version.version=${GPU_VERSION} 31 | 32 | .PHONY: gpu 33 | gpu: $(GPU_BINARIES) 34 | 35 | bin/kubelet-gpu-plugin: cmd/kubelet-gpu-plugin/*.go $(GPU_COMMON_SRC) 36 | CGO_ENABLED=1 GOOS=linux GOARCH=${ARCH} \ 37 | go build -a -ldflags "${GPU_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-gpu-plugin 38 | 39 | bin/alert-webhook: cmd/alert-webhook/*.go $(GPU_COMMON_SRC) 40 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 41 | go build -a -ldflags "${GPU_LDFLAGS}" -mod vendor -o $@ ./cmd/alert-webhook 42 | 43 | .PHONY: gpu-container-build 44 | gpu-container-build: cleanall vendor 45 | @echo "Building GPU resource drivers container..." 46 | $(DOCKER) build --pull --platform="linux/$(ARCH)" \ 47 | -t $(GPU_IMAGE_TAG) \ 48 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) \ 49 | --build-arg http_proxy=$(http_proxy) \ 50 | --build-arg https_proxy=$(https_proxy) \ 51 | --build-arg no_proxy=$(no_proxy) \ 52 | -f Dockerfile.gpu . 53 | 54 | .PHONY: gpu-container-push 55 | gpu-container-push: gpu-container-build 56 | $(DOCKER) push $(GPU_IMAGE_TAG) 57 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for intel-gaudi-resource-driver. 2 | nameOverride: "" 3 | fullnameOverride: "" 4 | selectorLabelsOverride: {} 5 | 6 | # To enable DRA resources allocation as extended resources. K8s v1.33-v1.36 requires FeatureGate. 
7 | # See https://github.com/kubernetes/enhancements/tree/master/keps/sig-scheduling/5004-dra-extended-resource . 8 | enableDRAExtendedResources: false 9 | extendedResourceName: intel.com/gaudi 10 | 11 | imagePullSecrets: [] 12 | image: 13 | repository: ghcr.io/intel/intel-resource-drivers-for-kubernetes 14 | name: intel-gaudi-resource-driver 15 | pullPolicy: IfNotPresent 16 | tag: "v0.6.1" 17 | 18 | serviceAccount: 19 | create: true 20 | annotations: {} 21 | name: "" 22 | automount: true 23 | 24 | openshift: 25 | enabled: false 26 | sccName: intel-gaudi-resource-driver 27 | 28 | kubeletPlugin: 29 | podAnnotations: {} 30 | nodeSelector: {} 31 | # Label used when nfd.enabled is true. 32 | # Changes to this are ignored when .Values.nodeFeatureRules.enabled or .Values.nfd.enabled . 33 | #intel.feature.node.kubernetes.io/gaudi: "true" 34 | tolerations: 35 | - key: node-role.kubernetes.io/master 36 | operator: Exists 37 | effect: NoSchedule 38 | - key: node-role.kubernetes.io/control-plane 39 | operator: Exists 40 | effect: NoSchedule 41 | # Refer to the official documentation for Node Feature Discovery (NFD) 42 | # regarding node tainting: 43 | # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting 44 | - key: "intel.feature.node.kubernetes.io/gaudi" 45 | operator: "Exists" 46 | effect: "NoSchedule" 47 | affinity: {} 48 | 49 | cdi: 50 | staticPath: /etc/cdi 51 | dynamicPath: /var/run/cdi 52 | 53 | nodeFeatureRules: 54 | enabled: false 55 | 56 | nfd: 57 | enabled: false # change to true to install NFD to the cluster 58 | nameOverride: intel-gaudi-nfd 59 | # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16): 60 | # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters 61 | enableNodeFeatureApi: true 62 | -------------------------------------------------------------------------------- /cmd/goxpusmi/main.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | 23 | "github.com/spf13/cobra" 24 | 25 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/goxpusmi" 26 | ) 27 | 28 | var ( 29 | version = "v0.1.0" 30 | ) 31 | 32 | func main() { 33 | command := newCommand() 34 | err := command.Execute() 35 | if err != nil { 36 | fmt.Printf("Error: %v\n", err) 37 | os.Exit(1) 38 | } 39 | } 40 | 41 | func cobraRunFunc(cmd *cobra.Command, args []string) error { 42 | if err := goxpusmi.Initialize(); err != nil { 43 | return fmt.Errorf("failed to initialize xpu-smi: %w", err) 44 | } 45 | 46 | // Do a verbose discovery. 
47 | devices, err := goxpusmi.Discover(true) 48 | if err != nil { 49 | return fmt.Errorf("failed to print device number: %w", err) 50 | } 51 | 52 | fmt.Printf("Number of discovered devices: %d\n", len(devices)) 53 | 54 | goxpusmi.HealthCheck(devices) 55 | 56 | if err := goxpusmi.Shutdown(); err != nil { 57 | return fmt.Errorf("failed to shutdown xpu-smi: %w", err) 58 | } 59 | 60 | return nil 61 | } 62 | 63 | func newCommand() *cobra.Command { 64 | cmd := &cobra.Command{ 65 | Use: "goxpusmi", 66 | Short: "Go xpu-smi tester", 67 | Long: "Test tool for xpu-smi Go bindings (goxpusmi)", 68 | RunE: cobraRunFunc, 69 | } 70 | cmd.Version = version 71 | cmd.Flags().BoolP("version", "v", false, "Show the version of the binary") 72 | cmd.SetVersionTemplate("Test tool xpu-smi Go bindings (goxpusmi). Version: {{.Version}}\n") 73 | 74 | return cmd 75 | } 76 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/nfd_labeled_nodes/nfd-intel-gpu-platform-labeling.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: nfd.k8s-sigs.io/v1alpha1 2 | kind: NodeFeatureRule 3 | metadata: 4 | name: intel-gpu-platform-labeling 5 | spec: 6 | rules: 7 | # A_Series (Alchemist) 8 | - labels: 9 | gpu.intel.com/family: "A_Series" 10 | matchFeatures: 11 | - feature: pci.device 12 | matchExpressions: 13 | class: {op: In, value: ["0300"]} 14 | vendor: {op: In, value: ["8086"]} 15 | device: 16 | op: In 17 | value: 18 | - "56a6" 19 | - "56a5" 20 | - "56a1" 21 | - "56a0" 22 | - "5694" 23 | - "5693" 24 | - "5692" 25 | - "5691" 26 | - "5690" 27 | - "56b3" 28 | - "56b2" 29 | - "56a4" 30 | - "56a3" 31 | - "5697" 32 | - "5696" 33 | - "5695" 34 | - "56b1" 35 | - "56b0" 36 | name: intel.gpu.a.series 37 | # Max_Series 38 | - labels: 39 | gpu.intel.com/family: "Max_Series" 40 | matchFeatures: 41 | - feature: pci.device 42 | matchExpressions: 43 | class: {op: In, value: ["0380"]} 44 | vendor: {op: In, value: 
["8086"]} 45 | device: 46 | op: In 47 | value: 48 | - "0bda" 49 | - "0bd5" 50 | - "0bd9" 51 | - "0bdb" 52 | - "0bd7" 53 | - "0bd6" 54 | - "0bd0" 55 | name: intel.gpu.max.series 56 | # Flex_Series 57 | - labels: 58 | gpu.intel.com/family: "Flex_Series" 59 | matchFeatures: 60 | - feature: pci.device 61 | matchExpressions: 62 | class: {op: In, value: ["0300", "0380"]} 63 | vendor: {op: In, value: ["8086"]} 64 | device: 65 | op: In 66 | value: 67 | - "0f00" 68 | - "0f01" 69 | - "0f02" 70 | name: intel.gpu.flex.series 71 | -------------------------------------------------------------------------------- /qat.mk: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | QAT_VERSION ?= v0.4.1 17 | QAT_IMAGE_NAME ?= intel-qat-resource-driver 18 | QAT_IMAGE_VERSION ?= $(QAT_VERSION) 19 | QAT_IMAGE_TAG ?= $(REGISTRY)/$(QAT_IMAGE_NAME):$(QAT_IMAGE_VERSION) 20 | 21 | QAT_BINARIES = \ 22 | bin/qat-showdevice \ 23 | bin/kubelet-qat-plugin 24 | 25 | QAT_COMMON_SRC = \ 26 | $(COMMON_SRC) \ 27 | pkg/qat/device/*.go \ 28 | pkg/qat/cdihelpers/*.go 29 | 30 | QAT_LDFLAGS = ${LDFLAGS} -extldflags $(EXT_LDFLAGS) -X ${PKG}/pkg/version.version=${QAT_VERSION} 31 | 32 | .PHONY: qat 33 | qat: $(QAT_BINARIES) 34 | 35 | bin/qat-showdevice: cmd/qat-showdevice/*.go $(QAT_COMMON_SRC) 36 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 37 | go build -a -ldflags "${QAT_LDFLAGS}" -mod vendor -o $@ ./cmd/qat-showdevice 38 | 39 | bin/kubelet-qat-plugin: cmd/kubelet-qat-plugin/*.go $(QAT_COMMON_SRC) 40 | CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} \ 41 | go build -a -ldflags "${QAT_LDFLAGS}" -mod vendor -o $@ ./cmd/kubelet-qat-plugin 42 | 43 | .PHONY: qat-container-build 44 | qat-container-build: cleanall vendor 45 | @echo "Building QAT resource driver container..." 46 | $(DOCKER) build --pull --platform="linux/$(ARCH)" -t $(QAT_IMAGE_TAG) \ 47 | --build-arg LOCAL_LICENSES=$(LOCAL_LICENSES) -f Dockerfile.qat . 48 | 49 | .PHONY: qat-container-push 50 | qat-container-push: qat-container-build 51 | $(DOCKER) push $(QAT_IMAGE_TAG) 52 | 53 | .PHONY: e2e-qat 54 | e2e-qat: 55 | sed -i 's|\(intel/intel-qat-resource-driver:\)[^ ]*|\1devel|' deployments/qat/base/resource-driver.yaml 56 | go test -v ./test/e2e/... 
--clean-start=true -ginkgo.v -ginkgo.trace -ginkgo.show-node-events 57 | -------------------------------------------------------------------------------- /deployments/qat/tests/resource-claim-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: qat-template-sym 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: qat-request-sym 10 | exactly: 11 | deviceClassName: qat.intel.com 12 | selectors: 13 | - cel: 14 | expression: |- 15 | device.attributes["qat.intel.com"].services == "sym" || 16 | device.attributes["qat.intel.com"].services == "sym;asym" || 17 | device.attributes["qat.intel.com"].services == "sym;dc" || 18 | device.attributes["qat.intel.com"].services == "asym;sym" || 19 | device.attributes["qat.intel.com"].services == "dc;sym" 20 | --- 21 | apiVersion: resource.k8s.io/v1 22 | kind: ResourceClaimTemplate 23 | metadata: 24 | name: qat-template-asym 25 | spec: 26 | spec: 27 | devices: 28 | requests: 29 | - name: qat-request-asym 30 | exactly: 31 | deviceClassName: qat.intel.com 32 | selectors: 33 | - cel: 34 | expression: |- 35 | device.attributes["qat.intel.com"].services == "asym" || 36 | device.attributes["qat.intel.com"].services == "asym;sym" || 37 | device.attributes["qat.intel.com"].services == "asym;dc" || 38 | device.attributes["qat.intel.com"].services == "sym;asym" || 39 | device.attributes["qat.intel.com"].services == "dc;asym" 40 | --- 41 | apiVersion: resource.k8s.io/v1 42 | kind: ResourceClaimTemplate 43 | metadata: 44 | name: qat-template-dc 45 | spec: 46 | spec: 47 | devices: 48 | requests: 49 | - name: qat-request-dc 50 | exactly: 51 | deviceClassName: qat.intel.com 52 | selectors: 53 | - cel: 54 | expression: |- 55 | device.attributes["qat.intel.com"].services == "dc" || 56 | device.attributes["qat.intel.com"].services == "dc;sym" || 57 | device.attributes["qat.intel.com"].services == "dc;asym" || 58 | 
device.attributes["qat.intel.com"].services == "sym;dc" || 59 | device.attributes["qat.intel.com"].services == "asym;dc" || 60 | device.attributes["qat.intel.com"].services == "dcc" 61 | -------------------------------------------------------------------------------- /test/e2e/dra_suite_test.go: -------------------------------------------------------------------------------- 1 | package e2e_test 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "os" 7 | "testing" 8 | 9 | "github.com/onsi/ginkgo/v2" 10 | "github.com/onsi/gomega" 11 | v1 "k8s.io/api/core/v1" 12 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 13 | "k8s.io/component-base/logs" 14 | "k8s.io/klog/v2" 15 | "k8s.io/kubernetes/test/e2e/framework" 16 | "k8s.io/kubernetes/test/e2e/framework/config" 17 | e2epod "k8s.io/kubernetes/test/e2e/framework/pod" 18 | 19 | _ "github.com/intel/intel-resource-drivers-for-kubernetes/test/e2e/gpu" 20 | _ "github.com/intel/intel-resource-drivers-for-kubernetes/test/e2e/qat" 21 | ) 22 | 23 | func init() { 24 | ginkgo.SynchronizedBeforeSuite(setupFirstNode, func(data []byte) {}) 25 | } 26 | 27 | func setupFirstNode(ctx context.Context) []byte { 28 | c, err := framework.LoadClientset() 29 | if err != nil { 30 | framework.Failf("Error loading client: %v", err) 31 | } 32 | 33 | // Delete any namespaces except those created by the system. This ensures no 34 | // lingering resources are left over from a previous test run. 
35 | if framework.TestContext.CleanStart { 36 | deleted, err2 := framework.DeleteNamespaces(ctx, c, nil, /* deleteFilter */ 37 | []string{ 38 | metav1.NamespaceSystem, 39 | metav1.NamespaceDefault, 40 | metav1.NamespacePublic, 41 | v1.NamespaceNodeLease, 42 | "cert-manager", 43 | }) 44 | if err2 != nil { 45 | framework.Failf("Error deleting orphaned namespaces: %v", err2) 46 | } 47 | 48 | framework.Logf("Waiting for deletion of the following namespaces: %v", deleted) 49 | 50 | if err2 = framework.WaitForNamespacesDeleted(ctx, c, deleted, e2epod.DefaultPodDeletionTimeout); err2 != nil { 51 | framework.Failf("Failed to delete orphaned namespaces %v: %v", deleted, err2) 52 | } 53 | } 54 | 55 | return []byte{} 56 | } 57 | func TestDra(t *testing.T) { 58 | gomega.RegisterFailHandler(ginkgo.Fail) 59 | ginkgo.RunSpecs(t, "E2E DRA Drivers Suite") 60 | } 61 | 62 | func TestMain(m *testing.M) { 63 | klog.SetOutput(ginkgo.GinkgoWriter) 64 | 65 | logs.InitLogs() 66 | config.CopyFlags(config.Flags, flag.CommandLine) 67 | framework.RegisterCommonFlags(flag.CommandLine) 68 | framework.RegisterClusterFlags(flag.CommandLine) 69 | flag.Parse() 70 | 71 | // Register framework flags, then handle flags. 72 | framework.AfterReadingAllFlags(&framework.TestContext) 73 | 74 | // Now run the test suite. 75 | os.Exit(m.Run()) 76 | } 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | Intel Resource Drivers for Kubernetes is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. 
The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith <joe.smith@email.com> 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 
58 | -------------------------------------------------------------------------------- /DEV.md: -------------------------------------------------------------------------------- 1 | Contents: 2 | * [Runtime](#runtime) 3 | * [Enable CDI in Containerd](#enable-cdi-in-containerd) 4 | * [Generated source code](#generated-source-code) 5 | * [Required tools](#required-tools) 6 | 7 | 8 | # Runtime 9 | 10 | Runtime needs to have CDI injection support 11 | 12 | - CRI-O: 1.23+, enabled by default. 13 | - Containerd: v1.7+, disabled by default. 14 | 15 | ## Enable CDI in Containerd 16 | 17 | Containerd config file should have `enable_cdi` and `cdi_spec_dirs`. Example `/etc/containerd/config.toml`: 18 | ``` 19 | version = 2 20 | [plugins] 21 | [plugins."io.containerd.grpc.v1.cri"] 22 | enable_cdi = true 23 | cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"] 24 | ``` 25 | 26 | ### Determine your go binaries location from `go install --help`, quote: 27 | > Executables are installed in the directory named by the GOBIN environment 28 | > variable, which defaults to $GOPATH/bin or $HOME/go/bin if the GOPATH 29 | > environment variable is not set. Executables in $GOROOT 30 | > are installed in $GOROOT/bin or $GOTOOLDIR instead of $GOBIN. 
31 | 32 | ### Way 1 : install tools with Go: 33 | 34 | #### Add Go binaries directory to PATH 35 | Add this to the end of your `$HOME/.bashrc`: 36 | ```bash 37 | export PATH="$HOME/go/bin:$PATH"  # adjust if your Go binaries directory differs 38 | ``` 39 | 40 | #### Install tools 41 | ```bash 42 | GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest 43 | GO111MODULE=on go install k8s.io/code-generator/cmd/client-gen@latest 44 | ``` 45 | 46 | ### Way 2 : clone and build it: 47 | ```bash 48 | git clone https://github.com/kubernetes-sigs/controller-tools.git 49 | cd controller-tools 50 | go build ./cmd/controller-gen 51 | cd - 52 | git clone https://github.com/kubernetes/code-generator.git 53 | cd code-generator 54 | go build ./cmd/client-gen 55 | cd - 56 | ``` 57 | 58 | Make them available in PATH, for instance $HOME/go/bin: 59 | ```bash 60 | cp controller-tools/controller-gen code-generator/client-gen $HOME/go/bin 61 | # ensure it's in the path. You may want to add export to $HOME/.bashrc 62 | echo $PATH | grep -q $HOME/go/bin || export PATH=$HOME/go/bin:$PATH 63 | ``` 64 | 65 | ## Running tests in container 66 | 67 | To have your own user ID inside container image without access / permission issues, build a fresh 68 | container image, then run tests. The CI uses its own user ID. 69 | 70 | ```shell 71 | $ make test-image 72 | $ make test-containerized 73 | ``` 74 | 75 | Tests provide coverage data. If you need to see the coverage report, just run the Make target for the needed 76 | coverage report, e.g. 
77 | 78 | ``` 79 | make gaudi-coverage 80 | ``` 81 | -------------------------------------------------------------------------------- /deployments/gpu/overlays/device-faker/device-faker.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: intel-gpu-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | initContainers: 10 | - name: device-faker 11 | # 'Always' policy makes it a sideCar container with longer lifecycle, 12 | # allowing it to be terminated last 13 | # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#termination-with-sidecars 14 | # which allows proper fake root cleanup. 15 | restartPolicy: Always 16 | # TODO: switch to CI image when public CI is in place 17 | image: registry.local/intel-device-faker:v0.5.0 18 | imagePullPolicy: Always 19 | command: ["/device-faker", "gpu", "-t", "/opt/templates/gpu-template.json", "-r", "-d", "/tmp/gpu-fake-root", "-c", "-p"] 20 | volumeMounts: 21 | - name: gpu-fake-root 22 | mountPath: /tmp/gpu-fake-root 23 | securityContext: 24 | readOnlyRootFilesystem: false 25 | allowPrivilegeEscalation: false 26 | capabilities: 27 | drop: [ "ALL" ] 28 | add: [ "MKNOD" ] 29 | containers: 30 | - name: kubelet-plugin 31 | command: ["/kubelet-gpu-plugin", "-v", "5"] 32 | # TODO: change to :devel when public CI is in place 33 | image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:devel 34 | # TODO: pull policy is needed when :devel is used instead of :latest 35 | imagePullPolicy: Always 36 | env: 37 | - name: SYSFS_ROOT 38 | value: "/tmp/gpu-fake-root/sysfs" 39 | - name: DEVFS_ROOT 40 | value: "/tmp/gpu-fake-root/dev" 41 | # Host dir for system's dynamic CDI dir. Containerd & CRI-O default value. 
42 | - name: CDI_ROOT 43 | value: "/var/run/cdi" 44 | volumeMounts: 45 | - name: gpu-fake-root 46 | mountPath: /tmp/gpu-fake-root/sysfs 47 | subPath: sysfs 48 | - name: gpu-fake-root 49 | mountPath: /tmp/gpu-fake-root/dev 50 | subPath: dev 51 | # Host dir for system's dynamic CDI dir. Containerd & CRI-O default value. 52 | - name: dynamic-cdi 53 | mountPath: /var/run/cdi 54 | volumes: 55 | - name: gpu-fake-root 56 | hostPath: 57 | path: /tmp/gpu-fake-root 58 | # Host dir for system's dynamic CDI dir. Containerd & CRI-O default value. 59 | - name: dynamic-cdi 60 | hostPath: 61 | path: /var/run/cdi 62 | 63 | -------------------------------------------------------------------------------- /doc/gpu/README.md: -------------------------------------------------------------------------------- 1 | # Intel GPU resource driver for Kubernetes 2 | 3 | CAUTION: This is a beta / non-production software, do not use on production clusters. 4 | 5 | ## About resource driver 6 | 7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates 8 | the resources and DRA driver kubelet-plugin ensures that the allocated devices are prepared 9 | and available for Pods. 10 | 11 | DRA API graduated to GA with v1 API in K8s v1.34, backwards compatibility may vary 12 | depending on features enabled. 13 | 14 | ## Supported GPU devices 15 | 16 | Intel GPU DRA driver relies on the host Linux kernel [Intel GPU driver(s)](https://dgpu-docs.intel.com/driver/kernel-driver-types.html) to detect the devices. 17 | See the [supported hardware](https://dgpu-docs.intel.com/devices/hardware-table.html) 18 | section in the Intel GPU driver support documentation. 19 | 20 | (To _use_ the devices, workload containers need to include a suitable Intel GPU user space driver. See that documentation site on how to install them.) 
21 | 22 | ## Supported Kubernetes Versions 23 | 24 | Supported Kubernetes versions are listed below: 25 | 26 | | Branch | Kubernetes branch/version | Status | DRA | 27 | |:------------------|:---------------------------------|:------------|:-------------------------------| 28 | | v0.1.0-beta | Kubernetes v1.26 branch v1.26.x | unsupported | Classic | 29 | | v0.1.1-beta | Kubernetes v1.27 branch v1.27.x | unsupported | Classic | 30 | | v0.2.0 | Kubernetes v1.28 branch v1.28.x | unsupported | Classic | 31 | | v0.3.0 | Kubernetes v1.28+ | unsupported | Classic | 32 | | v0.4.0 | Kubernetes v1.28+ | unsupported | Classic | 33 | | v0.5.0 | Kubernetes v1.27 - v1.30 | unsupported | Classic, Structured Parameters | 34 | | v0.6.0 | Kubernetes v1.31 | unsupported | Structured Parameters | 35 | | v0.7.0 | Kubernetes v1.32+ | unsupported | Structured Parameters | 36 | | v0.8.0 | Kubernetes v1.33-v1.34 | unsupported | Structured Parameters | 37 | | v0.9.0 | Kubernetes v1.32+ | supported | Structured Parameters | 38 | 39 | ## Documentation 40 | 41 | - [How to setup a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md) 42 | - [How to deploy and use Intel GPU resource driver](USAGE.md) 43 | - Optional: [How to build Intel GPU resource driver container image](BUILD.md) 44 | -------------------------------------------------------------------------------- /pkg/gaudi/device/device_test.go: -------------------------------------------------------------------------------- 1 | package device 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestCDIName(t *testing.T) { 8 | tests := []struct { 9 | name string 10 | device DeviceInfo 11 | expected string 12 | }{ 13 | { 14 | name: "Valid device UID", 15 | device: DeviceInfo{ 16 | UID: "0000-01-02-0-0x1234", 17 | }, 18 | expected: "intel.com/gaudi=0000-01-02-0-0x1234", 19 | }, 20 | } 21 | 22 | for _, tt := range tests { 23 | t.Run(tt.name, func(t *testing.T) { 24 | result := tt.device.CDIName() 25 | if result != tt.expected { 26 | 
t.Errorf("expected %v, got %v", tt.expected, result) 27 | } 28 | }) 29 | } 30 | } 31 | 32 | func TestDevicesInfoDeepCopy(t *testing.T) { 33 | original := DevicesInfo{ 34 | "0000-01-02-0-0x1234": { 35 | UID: "0000-01-02-0-0x1234", 36 | PCIAddress: "0000:01:02.0", 37 | Model: "0x1020", 38 | ModelName: "Gaudi2", 39 | DeviceIdx: 1, 40 | ModuleIdx: 2, 41 | PCIRoot: "0000:00", 42 | }, 43 | } 44 | 45 | copy := original.DeepCopy() 46 | 47 | if © == &original { 48 | t.Error("DeepCopy() returned the same pointer, expected different pointers") 49 | } 50 | 51 | for key, originalDevice := range original { 52 | copyDevice, exists := copy[key] 53 | if !exists { 54 | t.Errorf("DeepCopy() missing device with key %v", key) 55 | continue 56 | } 57 | 58 | if copyDevice == originalDevice { 59 | t.Errorf("DeepCopy() returned the same pointer for device with key %v, expected different pointers", key) 60 | } 61 | 62 | if *copyDevice != *originalDevice { 63 | t.Errorf("DeepCopy() returned different values for device with key %v, expected identical values", key) 64 | } 65 | } 66 | } 67 | 68 | func TestSetModelName(t *testing.T) { 69 | tests := []struct { 70 | name string 71 | deviceInfo DeviceInfo 72 | expected string 73 | }{ 74 | { 75 | name: "Known model 0x1000", 76 | deviceInfo: DeviceInfo{ 77 | Model: "0x1000", 78 | }, 79 | expected: "Gaudi", 80 | }, 81 | { 82 | name: "Known model 0x1020", 83 | deviceInfo: DeviceInfo{ 84 | Model: "0x1020", 85 | }, 86 | expected: "Gaudi2", 87 | }, 88 | { 89 | name: "Unknown model", 90 | deviceInfo: DeviceInfo{ 91 | Model: "0x9999", 92 | }, 93 | expected: "Unknown", 94 | }, 95 | } 96 | 97 | for _, tt := range tests { 98 | t.Run(tt.name, func(t *testing.T) { 99 | tt.deviceInfo.SetModelName() 100 | if tt.deviceInfo.ModelName != tt.expected { 101 | t.Errorf("expected %v, got %v", tt.expected, tt.deviceInfo.ModelName) 102 | } 103 | }) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- 
/charts/intel-gpu-resource-driver/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Resource Allocation (DRA) Intel GPU Driver Helm Chart 2 | 3 | ## The chart installs GPU resource driver: 4 | 5 | - [GPU](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/gpu/README.md) 6 | 7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main) 8 | 9 | 10 | ## Installing the chart 11 | 12 | ```console 13 | helm install \ 14 | --namespace "intel-gpu-resource-driver" \ 15 | --create-namespace \ 16 | intel-gpu-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver-chart 17 | ``` 18 | 19 | > [!NOTE] 20 | > For Kubernetes clusters using [Pod Security Standards](https://kubernetes.io/docs/concepts/security/pod-security-standards/), 21 | > pre-create the namespace with the respective label allowing to use HostPath Volumes. 22 | 23 | ```console 24 | kubectl create namespace intel-gpu-resource-driver 25 | kubectl label --overwrite namespace intel-gpu-resource-driver pod-security.kubernetes.io/enforce=privileged 26 | helm install \ 27 | --namespace intel-gpu-resource-driver \ 28 | intel-gpu-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver-chart 29 | ``` 30 | 31 | ## Uninstalling the chart 32 | ```console 33 | helm uninstall intel-gpu-resource-driver --namespace intel-gpu-resource-driver 34 | ``` 35 | (Optional) Delete the namespace: 36 | ```console 37 | kubectl delete ns intel-gpu-resource-driver 38 | ``` 39 | 40 | ## Configuration 41 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). 
To see all configurable options with detailed comments: 42 | 43 | ```console 44 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver-chart 45 | ``` 46 | 47 | You may also run `helm show values` on this chart's dependencies for additional options. 48 | 49 | | Key | Type | Default | 50 | |-----|------|---------| 51 | | image.repository | string | `intel` | 52 | | image.name | string | `"intel-gpu-resource-driver"` | 53 | | image.pullPolicy | string | `"IfNotPresent"` | 54 | | image.tag | string | `"v0.9.1"` | 55 | 56 | ## Deploying to RedHat OpenShift Container Platform 57 | 58 | ```console 59 | helm install \ 60 | --set openshift.enabled=true \ 61 | --namespace "intel-gpu-resource-driver" \ 62 | --create-namespace \ 63 | intel-gpu-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver-chart 64 | ``` 65 | 66 | > [!NOTE] 67 | > Chart contains SecurityContextConstraints, which requires cluster admin privileges. Ensure the chart is installed by the cluster admin. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel resource drivers for Kubernetes 2 | 3 | CAUTION: This is a beta / non-production software, do not use on production clusters. 
4 | 5 | ## This repository contains the following resource drivers: 6 | 7 | - [GPU](doc/gpu/README.md) 8 | - [Gaudi](doc/gaudi/README.md) 9 | - [QAT](doc/qat/README.md) 10 | 11 | ## Glossary 12 | 13 | - DRA https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/3063-dynamic-resource-allocation 14 | - CDI https://github.com/cncf-tags/container-device-interface/ 15 | - K8s https://github.com/kubernetes/kubernetes.git 16 | 17 | ## About resource drivers 18 | 19 | Intel resource drivers for Kubernetes are an alternative to 20 | [Intel device plugins](https://github.com/intel/intel-device-plugins-for-kubernetes/), 21 | facilitating workload offloading by providing accelerator access on Kubernetes cluster worker nodes. 22 | 23 | Resource drivers are not Linux kernel mode drivers (KMD), and do not help the operating system on 24 | the worker nodes detect and operate the accelerators. 25 | 26 | The resource drivers are based on the Dynamic Resource Allocation (DRA) framework in Kubernetes. 27 | 28 | ### About Dynamic Resource Allocation 29 | 30 | Dynamic Resource Allocation (DRA) is a resource management framework in Kubernetes (1.26+) that 31 | allows management of special resources in a cluster (typically HW accelerators) by vendor-provided 32 | resource drivers (typically a controller and a node-agent / kubelet-plugin) in a common way. 33 | 34 | Resource drivers are meant to handle discovery, allocation, accounting of specific resources as well 35 | as their preparation for Pod before Pod startup, and cleanup after the Pod has completed successfully 36 | and the resource is no longer needed. More info is 37 | [in the KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/3063-dynamic-resource-allocation). 38 | 39 | 40 | ## Release process 41 | 42 | Every resource driver in this repository has its own releases, release branches and version tags. 43 | 44 | Typical release cadence is quarterly. 
During the release creation the project's documentation, 45 | deployment files etc. will be changed to point to the newly created version. 46 | 47 | Once the content is available in the main branch and validation PASSes, release branch will be 48 | created (e.g. gpu-release-v0.2.0). The HEAD of release branch will also be tagged with the corresponding 49 | tag (e.g. gpu-v0.2.0). 50 | 51 | During the release creation, the project's documentation, deployment files etc. will be changed to 52 | point to the newly created version. 53 | 54 | Patch releases (e.g. gaudi-v0.1.1) are done on a need basis if there are security issues or minor fixes 55 | for specific supported version. Fixes are always cherry-picked from the main branch to the release 56 | branches. 57 | -------------------------------------------------------------------------------- /deployments/gaudi/overlays/device-faker/device-faker.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: intel-gaudi-resource-driver 6 | spec: 7 | template: 8 | spec: 9 | initContainers: 10 | - name: device-faker 11 | # 'Always' policy makes it a sideCar container with longer lifecycle, 12 | # allowing it to be terminated last 13 | # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#termination-with-sidecars 14 | # which allows proper fake root cleanup. 
15 | restartPolicy: Always 16 | # TODO: switch to CI image when public CI is in place 17 | image: registry.local/intel-device-faker:v0.5.0 18 | imagePullPolicy: Always 19 | command: ["/device-faker", "gaudi", "-t", "/opt/templates/gaudi-template.json", "-r", "-d", "/tmp/gaudi-fake-root", "-c", "-p"] 20 | volumeMounts: 21 | - name: gaudi-fake-root 22 | mountPath: /tmp/gaudi-fake-root 23 | securityContext: 24 | readOnlyRootFilesystem: false 25 | allowPrivilegeEscalation: false 26 | capabilities: 27 | drop: [ "ALL" ] 28 | add: [ "MKNOD" ] 29 | containers: 30 | - name: kubelet-plugin 31 | command: ["/kubelet-gaudi-plugin", "-v", "5", "-n", "/tmp/gaudi-fake-root/gaudinet", "-p", "/tmp/gaudi-fake-root/hookbin"] 32 | # TODO: change to :devel when public CI is in place 33 | image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver:latest 34 | # TODO: pull policy is needed when :devel is used instead of :latest 35 | #imagePullPolicy: Always 36 | env: 37 | - name: SYSFS_ROOT 38 | value: "/tmp/gaudi-fake-root/sysfs" 39 | - name: DEVFS_ROOT 40 | value: "/tmp/gaudi-fake-root/dev" 41 | # Host dir for system's dynamic CDI dir. Containerd & CRI-O default value. 42 | - name: CDI_ROOT 43 | value: "/var/run/cdi" 44 | volumeMounts: 45 | - name: gaudi-fake-root 46 | mountPath: /tmp/gaudi-fake-root/sysfs 47 | subPath: sysfs 48 | - name: gaudi-fake-root 49 | mountPath: /tmp/gaudi-fake-root/dev 50 | subPath: dev 51 | # Host dir for system's dynamic CDI dir. Containerd & CRI-O default value. 52 | - name: dynamic-cdi 53 | mountPath: /var/run/cdi 54 | securityContext: 55 | privileged: false 56 | volumes: 57 | - name: gaudi-fake-root 58 | hostPath: 59 | path: /tmp/gaudi-fake-root 60 | # Host dir for system's dynamic CDI dir. Containerd & CRI-O default value. 
61 | - name: dynamic-cdi 62 | hostPath: 63 | path: /var/run/cdi 64 | 65 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/templates/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-qat-resource-driver-kubelet-plugin 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-qat-resource-driver.labels" . | nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: intel-qat-resource-driver 12 | template: 13 | metadata: 14 | labels: 15 | app: intel-qat-resource-driver 16 | spec: 17 | serviceAccountName: {{ include "intel-qat-resource-driver.serviceAccountName" . }} 18 | containers: 19 | - name: kubelet-plugin 20 | image: {{ include "intel-qat-resource-driver.fullimage" . }} 21 | imagePullPolicy: {{ .Values.image.pullPolicy }} 22 | command: ["/kubelet-qat-plugin"] 23 | env: 24 | - name: NODE_NAME 25 | valueFrom: 26 | fieldRef: 27 | fieldPath: spec.nodeName 28 | - name: POD_NAMESPACE 29 | valueFrom: 30 | fieldRef: 31 | fieldPath: metadata.namespace 32 | - name: SYSFS_ROOT 33 | value: "/sysfs" 34 | volumeMounts: 35 | - name: plugins-registry 36 | mountPath: /var/lib/kubelet/plugins_registry 37 | - name: plugins 38 | mountPath: /var/lib/kubelet/plugins 39 | - name: cdi 40 | mountPath: /etc/cdi 41 | - name: varruncdi 42 | mountPath: /var/run/cdi 43 | - name: sysfs 44 | mountPath: /sysfs 45 | - name: qatconfiguration 46 | mountPath: /defaults 47 | securityContext: 48 | privileged: true 49 | readOnlyRootFilesystem: true 50 | seccompProfile: 51 | type: RuntimeDefault 52 | volumes: 53 | - name: plugins-registry 54 | hostPath: 55 | path: /var/lib/kubelet/plugins_registry 56 | - name: plugins 57 | hostPath: 58 | path: /var/lib/kubelet/plugins 59 | - name: cdi 60 | hostPath: 61 | path: {{ .Values.cdi.staticPath }} 62 | - name: varruncdi 63 | hostPath: 64 | path: {{ 
.Values.cdi.dynamicPath}} 65 | - name: sysfs 66 | hostPath: 67 | path: /sys 68 | - name: qatconfiguration 69 | configMap: 70 | name: intel-qat-resource-driver-configuration 71 | optional: true 72 | {{- with .Values.kubeletPlugin.tolerations }} 73 | tolerations: 74 | {{- toYaml . | nindent 8 }} 75 | {{- end }} 76 | {{- with .Values.kubeletPlugin.nodeSelector }} 77 | nodeSelector: 78 | {{- toYaml . | nindent 8 }} 79 | {{- end }} 80 | {{- with .Values.kubeletPlugin.affinity }} 81 | affinity: 82 | {{- toYaml . | nindent 8 }} 83 | {{- end }} 84 | -------------------------------------------------------------------------------- /doc/qat/README.md: -------------------------------------------------------------------------------- 1 | # Intel® QAT resource driver for Kubernetes 2 | 3 | CAUTION: This is a beta / non-production software, do not use on production clusters. 4 | 5 | ## About resource driver 6 | 7 | With structured parameters (K8s v1.31+), the DRA driver publishes ResourceSlice, scheduler allocates 8 | the resources and resource driver's kubelet-plugin ensures that the allocated devices are prepared 9 | and available for Pods. 10 | 11 | DRA API graduated to GA with v1 API in K8s v1.34, backwards compatibility may vary 12 | depending on features enabled. 13 | 14 | ## Host OS requirements 15 | 16 | In order to guarantee proper operation, ensure Linux kernel module `vfio_pci` has been loaded. 17 | 18 | The QAT Kubernetes resource driver is intended to be used on upstream Linux kernels, 19 | see [the in-tree kernel documentation](https://intel.github.io/quickassist/RN/In-Tree/in_tree_firmware_RN.html) 20 | for details. Note though, that the QAT resource driver itself does not depend on 21 | any QAT user space libraries mentioned in that document. 22 | 23 | ## Supported QAT devices 24 | 25 | All 4th Gen Intel® Xeon® Scalable Processor QAT devices handled by the Linux kernel 26 | driver module `qat_4xxx` are supported. 
27 | 28 | ## Supported Kubernetes Versions 29 | 30 | Supported Kubernetes versions are listed below: 31 | 32 | | Branch | Kubernetes branch/version | Status | DRA | 33 | |:------------------|:--------------------------------|:------------|:-------------------------------| 34 | | v0.1.0 | Kubernetes v1.31 | unsupported | Structured Parameters | 35 | | v0.2.0 | Kubernetes v1.32 | unsupported | Structured Parameters | 36 | | v0.3.0 | Kubernetes v1.33-v1.34 | unsupported | Structured Parameters | 37 | | v0.4.0 | Kubernetes v1.32+ | supported | Structured Parameters | 38 | 39 | ## QAT service configuration 40 | 41 | In version 0.1.0 static configuration of QAT services uses a ConfigMap; 42 | please have a look at 43 | [the example ConfigMap yaml](../../deployments/qat/examples/intel-qat-resource-driver-configuration.yaml). 44 | 45 | The ConfigMap and Resource Claims use the same string notation as the QAT kernel 46 | driver when specifying what services are to be configured for the device and Resource 47 | Claim. When two services are requested, the service strings are to be separated by 48 | a semicolon (';'). Supported services are: 49 | * Symmetric cryptography: `sym` 50 | * Asymmetric cryptography: `asym` 51 | * Compression: `dc` 52 | 53 | ## Documentation 54 | 55 | - [How to set up a Kubernetes cluster with DRA enabled](../CLUSTER_SETUP.md) 56 | - [How to deploy and use Intel® QAT resource driver](USAGE.md) 57 | - Optional: [How to build Intel® QAT resource driver container image](BUILD.md) 58 | -------------------------------------------------------------------------------- /pkg/fakesysfs/fakesysfs.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
// newPCIAddress finds the next available free PCI address in the given
// directory, starting the search at currentAddress (inclusive).
//
// currentAddress must start with "dddd:bb:dd" (domain, bus and device as
// hexadecimal digits, e.g. "0000:0a:1f.0"); anything after the device part is
// ignored. An address counts as free when no "<address>0" entry (function 0)
// can be stat-ed inside driverDir.
//
// Returns the partial PCI address without function, "0000:00:00.", used in a
// loop when fake VFs are generated. An error is returned when currentAddress
// cannot be parsed or when every address up to ffff:ff:ff is taken.
func newPCIAddress(driverDir string, currentAddress string) (string, error) {
	// Guard the fixed-offset slicing below against short input.
	const minAddressLen = len("0000:00:00")
	if len(currentAddress) < minAddressLen {
		return "", fmt.Errorf("could not parse current PCI address %v", currentAddress)
	}

	// PCI addresses are hexadecimal, and are printed with %x below, so they
	// have to be parsed with base 16 as well.
	domain, err1 := strconv.ParseUint(currentAddress[:4], 16, 64)
	bus, err2 := strconv.ParseUint(currentAddress[5:7], 16, 64)
	device, err3 := strconv.ParseUint(currentAddress[8:10], 16, 64)

	if err1 != nil || err2 != nil || err3 != nil {
		return "", fmt.Errorf("could not parse current PCI address %v", currentAddress)
	}

	for ; domain <= 0xffff; domain++ {
		for ; bus <= 0xff; bus++ {
			for ; device <= 0xff; device++ {
				// partial PCI address without function
				newAddress := fmt.Sprintf("%04x:%02x:%02x.", domain, bus, device)
				// add zero for PCI function part of the address
				newSysfsDeviceDir := path.Join(driverDir, newAddress+"0")
				if _, err := os.Stat(newSysfsDeviceDir); err != nil {
					return newAddress, nil
				}
			}
			// The next bus has to be scanned from its first device again.
			device = 0
		}
		// The next domain has to be scanned from its first bus again.
		bus = 0
	}

	return "", fmt.Errorf("no addresses left")
}
ensuring the /tmp location of fake sysfs. 67 | func sanitizeFakeSysFsDir(sysfsRootUntrusted string) error { 68 | // fake sysfsroot should be deletable. 69 | // To prevent disaster mistakes, it is enforced to be in /tmp. 70 | sysfsRoot := path.Join(sysfsRootUntrusted) 71 | if !strings.HasPrefix(sysfsRoot, "/tmp") { 72 | return fmt.Errorf("fake sysfsroot can only be in /tmp, got: %v", sysfsRoot) 73 | } 74 | 75 | return nil 76 | } 77 | 78 | func createDevice(filepath string, real bool) error { 79 | if !real { 80 | return helpers.WriteFile(filepath, "") 81 | } 82 | 83 | mode := uint32(0644 | devNullType) 84 | devid := int(unix.Mkdev(uint32(devNullMajor), uint32(devNullMinor))) 85 | 86 | return unix.Mknod(filepath, mode, devid) 87 | } 88 | -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Resource Allocation (DRA) Intel Gaudi Driver Helm Chart 2 | 3 | ## The chart installs Gaudi resource driver: 4 | 5 | - [Gaudi](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/gaudi/README.md) 6 | 7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main) 8 | 9 | 10 | ## Installing the chart 11 | 12 | ```console 13 | helm install \ 14 | --namespace intel-gaudi-resource-driver \ 15 | --create-namespace \ 16 | intel-gaudi-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver-chart 17 | ``` 18 | 19 | > [!NOTE] 20 | > For Kubernetes clusters using [Pod Security Standards](https://kubernetes.io/docs/concepts/security/pod-security-standards/), 21 | > pre-create the namespace with the respective label allowing to use HostPath Volumes. 
22 | 23 | ```console 24 | kubectl create namespace intel-gaudi-resource-driver 25 | kubectl label --overwrite namespace intel-gaudi-resource-driver pod-security.kubernetes.io/enforce=privileged 26 | helm install \ 27 | --namespace intel-gaudi-resource-driver \ 28 | intel-gaudi-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver-chart 29 | ``` 30 | 31 | ## Uninstalling the chart 32 | ```console 33 | helm uninstall intel-gaudi-resource-driver --namespace intel-gaudi-resource-driver 34 | ``` 35 | (Optional) Delete the namespace: 36 | ```console 37 | kubectl delete ns intel-gaudi-resource-driver 38 | ``` 39 | 40 | ## Configuration 41 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments: 42 | 43 | ```console 44 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver-chart 45 | ``` 46 | 47 | You may also run `helm show values` on this chart's dependencies for additional options. 48 | 49 | | Key | Type | Default | 50 | |-----|------|---------| 51 | | image.repository | string | `intel` | 52 | | image.name | string | `"intel-gaudi-resource-driver"` | 53 | | image.pullPolicy | string | `"IfNotPresent"` | 54 | | image.tag | string | `"v0.6.1"` | 55 | 56 | > [!Note] 57 | > If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with deployment YAMLs - they might change between releases. 
58 | 59 | ## Deploying to RedHat OpenShift Container Platform 60 | 61 | ```console 62 | helm install \ 63 | --set openshift.enabled=true \ 64 | --namespace intel-gaudi-resource-driver \ 65 | --create-namespace \ 66 | intel-gaudi-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gaudi-resource-driver-chart 67 | ``` 68 | 69 | > [!NOTE] 70 | > Chart contains SecurityContextConstraints, which requires cluster admin privileges. Ensure the chart is installed by the cluster admin. -------------------------------------------------------------------------------- /charts/intel-gaudi-resource-driver/templates/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gaudi-resource-driver-kubelet-plugin 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-gaudi-resource-driver.labels" . | nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: intel-gaudi-resource-driver-kubelet-plugin 12 | template: 13 | metadata: 14 | labels: 15 | app: intel-gaudi-resource-driver-kubelet-plugin 16 | spec: 17 | serviceAccountName: {{ include "intel-gaudi-resource-driver.serviceAccountName" . }} 18 | containers: 19 | - name: kubelet-plugin 20 | image: {{ include "intel-gaudi-resource-driver.fullimage" . 
}} 21 | imagePullPolicy: {{ .Values.image.pullPolicy }} 22 | command: ["/kubelet-gaudi-plugin"] 23 | env: 24 | - name: NODE_NAME 25 | valueFrom: 26 | fieldRef: 27 | fieldPath: spec.nodeName 28 | - name: POD_NAMESPACE 29 | valueFrom: 30 | fieldRef: 31 | fieldPath: metadata.namespace 32 | - name: SYSFS_ROOT 33 | value: "/sysfs" 34 | volumeMounts: 35 | - name: plugins-registry 36 | mountPath: /var/lib/kubelet/plugins_registry 37 | - name: plugins 38 | mountPath: /var/lib/kubelet/plugins 39 | - name: cdi 40 | mountPath: /etc/cdi 41 | - name: varruncdi 42 | mountPath: /var/run/cdi 43 | # when using fake sysfs - mount at the same place as on host 44 | - name: sysfs 45 | mountPath: "/sysfs" 46 | securityContext: 47 | privileged: true 48 | capabilities: 49 | drop: ["ALL"] 50 | readOnlyRootFilesystem: true 51 | runAsUser: 0 52 | seccompProfile: 53 | type: RuntimeDefault 54 | volumes: 55 | - name: plugins-registry 56 | hostPath: 57 | path: /var/lib/kubelet/plugins_registry 58 | - name: plugins 59 | hostPath: 60 | path: /var/lib/kubelet/plugins 61 | - name: cdi 62 | hostPath: 63 | path: {{ .Values.cdi.staticPath }} 64 | - name: varruncdi 65 | hostPath: 66 | path: {{ .Values.cdi.dynamicPath}} 67 | - name: sysfs 68 | hostPath: 69 | path: /sys 70 | {{- with .Values.kubeletPlugin.tolerations }} 71 | tolerations: 72 | {{- toYaml . | nindent 8 }} 73 | {{- end }} 74 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }} 75 | nodeSelector: 76 | intel.feature.node.kubernetes.io/gaudi: "true" 77 | {{- else }} 78 | {{- with .Values.kubeletPlugin.nodeSelector }} 79 | nodeSelector: 80 | {{- toYaml . | nindent 8 }} 81 | {{- end }} 82 | {{- end }} 83 | {{- with .Values.kubeletPlugin.affinity }} 84 | affinity: 85 | {{- toYaml . 
| nindent 8 }} 86 | {{- end }} 87 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/resource-driver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: intel-gpu-resource-driver-kubelet-plugin 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "intel-gpu-resource-driver.labels" . | nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: intel-gpu-resource-driver 12 | template: 13 | metadata: 14 | labels: 15 | app: intel-gpu-resource-driver 16 | spec: 17 | serviceAccountName: {{ include "intel-gpu-resource-driver.serviceAccountName" . }} 18 | containers: 19 | - name: kubelet-plugin 20 | image: {{ include "intel-gpu-resource-driver.fullimage" . }} 21 | imagePullPolicy: {{ .Values.image.pullPolicy }} 22 | command: ["/kubelet-gpu-plugin"] 23 | env: 24 | - name: NODE_NAME 25 | valueFrom: 26 | fieldRef: 27 | fieldPath: spec.nodeName 28 | - name: POD_NAMESPACE 29 | valueFrom: 30 | fieldRef: 31 | fieldPath: metadata.namespace 32 | - name: SYSFS_ROOT 33 | value: "/sysfs" 34 | - name: ZES_ENABLE_SYSMAN 35 | value: "1" 36 | volumeMounts: 37 | - name: plugins-registry 38 | mountPath: /var/lib/kubelet/plugins_registry 39 | - name: plugins 40 | mountPath: /var/lib/kubelet/plugins 41 | - name: cdi 42 | mountPath: /etc/cdi 43 | - name: varruncdi 44 | mountPath: /var/run/cdi 45 | # when using fake sysfs - mount at the same place as on host 46 | - name: sysfs 47 | mountPath: "/sysfs" 48 | securityContext: 49 | privileged: true 50 | capabilities: 51 | drop: ["ALL"] 52 | readOnlyRootFilesystem: true 53 | runAsUser: 0 54 | seccompProfile: 55 | type: RuntimeDefault 56 | volumes: 57 | - name: plugins-registry 58 | hostPath: 59 | path: /var/lib/kubelet/plugins_registry 60 | - name: plugins 61 | hostPath: 62 | path: /var/lib/kubelet/plugins 63 | - name: cdi 64 | hostPath: 65 | path: {{ 
.Values.cdi.staticPath }} 66 | - name: varruncdi 67 | hostPath: 68 | path: {{ .Values.cdi.dynamicPath}} 69 | - name: sysfs 70 | hostPath: 71 | path: /sys 72 | {{- with .Values.kubeletPlugin.tolerations }} 73 | tolerations: 74 | {{- toYaml . | nindent 8 }} 75 | {{- end }} 76 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }} 77 | nodeSelector: 78 | intel.feature.node.kubernetes.io/gpu: "true" 79 | {{- else }} 80 | {{- with .Values.kubeletPlugin.nodeSelector }} 81 | nodeSelector: 82 | {{- toYaml . | nindent 8 }} 83 | {{- end }} 84 | {{- end }} 85 | {{- with .Values.kubeletPlugin.affinity }} 86 | affinity: 87 | {{- toYaml . | nindent 8 }} 88 | {{- end }} 89 | -------------------------------------------------------------------------------- /charts/intel-gpu-resource-driver/templates/node-feature-rules.yaml: -------------------------------------------------------------------------------- 1 | {{- if or .Values.nodeFeatureRules.enabled .Values.nfd.enabled }} 2 | apiVersion: nfd.k8s-sigs.io/v1alpha1 3 | kind: NodeFeatureRule 4 | metadata: 5 | name: intel-gpu-device-rule 6 | spec: 7 | rules: 8 | - name: "intel.gpu" 9 | labels: 10 | "intel.feature.node.kubernetes.io/gpu": "true" 11 | matchFeatures: 12 | - feature: pci.device 13 | matchExpressions: 14 | vendor: {op: In, value: ["8086"]} 15 | class: {op: In, value: ["0300", "0380"]} 16 | matchAny: 17 | - matchFeatures: 18 | - feature: kernel.loadedmodule 19 | matchExpressions: 20 | i915: {op: Exists} 21 | - matchFeatures: 22 | - feature: kernel.enabledmodule 23 | matchExpressions: 24 | i915: {op: Exists} 25 | --- 26 | apiVersion: nfd.k8s-sigs.io/v1alpha1 27 | kind: NodeFeatureRule 28 | metadata: 29 | name: intel-gpu-platform-labeling 30 | spec: 31 | rules: 32 | # A_Series (Alchemist) 33 | - labels: 34 | gpu.intel.com/family: "A_Series" 35 | matchFeatures: 36 | - feature: pci.device 37 | matchExpressions: 38 | class: {op: In, value: ["0300"]} 39 | vendor: {op: In, value: ["8086"]} 40 | device: 41 | op: In 42 | 
value: 43 | - "56a6" 44 | - "56a5" 45 | - "56a1" 46 | - "56a0" 47 | - "5694" 48 | - "5693" 49 | - "5692" 50 | - "5691" 51 | - "5690" 52 | - "56b3" 53 | - "56b2" 54 | - "56a4" 55 | - "56a3" 56 | - "5697" 57 | - "5696" 58 | - "5695" 59 | - "56b1" 60 | - "56b0" 61 | name: intel.gpu.a.series 62 | # Max_Series 63 | - labels: 64 | gpu.intel.com/family: "Max_Series" 65 | matchFeatures: 66 | - feature: pci.device 67 | matchExpressions: 68 | class: {op: In, value: ["0380"]} 69 | vendor: {op: In, value: ["8086"]} 70 | device: 71 | op: In 72 | value: 73 | - "0bda" 74 | - "0bd5" 75 | - "0bd9" 76 | - "0bdb" 77 | - "0bd7" 78 | - "0bd6" 79 | - "0bd0" 80 | name: intel.gpu.max.series 81 | # Flex_Series 82 | - labels: 83 | gpu.intel.com/family: "Flex_Series" 84 | matchFeatures: 85 | - feature: pci.device 86 | matchExpressions: 87 | class: {op: In, value: ["0300", "0380"]} 88 | vendor: {op: In, value: ["8086"]} 89 | device: 90 | op: In 91 | value: 92 | - "0f00" 93 | - "0f01" 94 | - "0f02" 95 | name: intel.gpu.flex.series 96 | {{- end }} 97 | -------------------------------------------------------------------------------- /deployments/qat/examples/deployment-inline.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1 2 | kind: ResourceClaimTemplate 3 | metadata: 4 | name: qat-template-sym 5 | spec: 6 | spec: 7 | devices: 8 | requests: 9 | - name: qat-request-sym 10 | exactly: 11 | deviceClassName: qat.intel.com 12 | selectors: 13 | - cel: 14 | expression: |- 15 | device.attributes["qat.intel.com"].services == "sym" || 16 | device.attributes["qat.intel.com"].services == "sym;asym" || 17 | device.attributes["qat.intel.com"].services == "sym;dc" || 18 | device.attributes["qat.intel.com"].services == "asym;sym" || 19 | device.attributes["qat.intel.com"].services == "dc;sym" 20 | --- 21 | apiVersion: resource.k8s.io/v1 22 | kind: ResourceClaimTemplate 23 | metadata: 24 | name: qat-template-asym 25 | spec: 26 | spec: 27 | 
devices: 28 | requests: 29 | - name: qat-request-asym 30 | exactly: 31 | deviceClassName: qat.intel.com 32 | selectors: 33 | - cel: 34 | expression: |- 35 | device.attributes["qat.intel.com"].services == "asym" || 36 | device.attributes["qat.intel.com"].services == "asym;sym" || 37 | device.attributes["qat.intel.com"].services == "asym;dc" || 38 | device.attributes["qat.intel.com"].services == "sym;asym" || 39 | device.attributes["qat.intel.com"].services == "dc;asym" 40 | --- 41 | apiVersion: resource.k8s.io/v1 42 | kind: ResourceClaimTemplate 43 | metadata: 44 | name: qat-template-dc 45 | spec: 46 | spec: 47 | devices: 48 | requests: 49 | - name: qat-request-dc 50 | exactly: 51 | deviceClassName: qat.intel.com 52 | selectors: 53 | - cel: 54 | expression: |- 55 | device.attributes["qat.intel.com"].services == "dc" || 56 | device.attributes["qat.intel.com"].services == "dc;sym" || 57 | device.attributes["qat.intel.com"].services == "dc;asym" || 58 | device.attributes["qat.intel.com"].services == "sym;dc" || 59 | device.attributes["qat.intel.com"].services == "asym;dc" || 60 | device.attributes["qat.intel.com"].services == "dcc" 61 | 62 | --- 63 | apiVersion: apps/v1 64 | kind: Deployment 65 | metadata: 66 | name: qat-sample-sym 67 | labels: 68 | app: inline-qat-deployment 69 | spec: 70 | replicas: 1 71 | selector: 72 | matchLabels: 73 | app: inline-qat-deployment 74 | template: 75 | metadata: 76 | labels: 77 | app: inline-qat-deployment 78 | spec: 79 | containers: 80 | - name: with-resource 81 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 82 | command: ["sh", "-c", "ls -la /dev/vfio/ && sleep 300"] 83 | securityContext: 84 | capabilities: 85 | add: 86 | ["IPC_LOCK"] 87 | resources: 88 | claims: 89 | - name: resource-sym 90 | - name: without-resource 91 | image: registry.k8s.io/e2e-test-images/busybox:1.29-2 92 | command: ["sh", "-c", "ls -la /dev/ && sleep 300"] 93 | resourceClaims: 94 | - name: resource-sym 95 | resourceClaimTemplateName: 
qat-template-sym 96 | -------------------------------------------------------------------------------- /charts/intel-qat-resource-driver/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Resource Allocation (DRA) Intel QAT Driver Helm Chart 2 | 3 | ## The chart installs QAT resource driver: 4 | 5 | - [QAT](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main/doc/qat/README.md) 6 | 7 | More info: [Intel Resource Drivers for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes/tree/main) 8 | 9 | 10 | ## Installing the chart 11 | 12 | ```console 13 | helm install \ 14 | --namespace intel-qat-resource-driver \ 15 | --create-namespace \ 16 | intel-qat-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver 17 | ``` 18 | 19 | > [!NOTE] 20 | > For Kubernetes clusters using [Pod Security Standards](https://kubernetes.io/docs/concepts/security/pod-security-standards/), 21 | > pre-create the namespace with the respective label allowing to use HostPath Volumes. 22 | 23 | ```console 24 | kubectl create namespace intel-qat-resource-driver 25 | kubectl label --overwrite namespace intel-qat-resource-driver pod-security.kubernetes.io/enforce=privileged 26 | helm install \ 27 | --namespace intel-qat-resource-driver \ 28 | intel-qat-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver 29 | ``` 30 | 31 | ## Uninstalling the chart 32 | ```console 33 | helm uninstall intel-qat-resource-driver --namespace intel-qat-resource-driver 34 | ``` 35 | (Optional) Delete the namespace: 36 | ```console 37 | kubectl delete ns intel-qat-resource-driver 38 | ``` 39 | 40 | ## Configuration 41 | See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). 
To see all configurable options with detailed comments: 42 | 43 | ```console 44 | helm show values oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver 45 | ``` 46 | 47 | You may also run `helm show values` on this chart's dependencies for additional options. 48 | 49 | | Key | Type | Default | 50 | |-----|------|---------| 51 | | image.repository | string | `intel` | 52 | | image.name | string | `"intel-qat-resource-driver"` | 53 | | image.pullPolicy | string | `"IfNotPresent"` | 54 | | image.tag | string | `"v0.4.1"` | 55 | 56 | If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with deployment YAMLs - they might change between releases. 57 | 58 | 59 | ## Read-only file system error for QAT 60 | 61 | When the following error appears in the logs of the QAT Kubelet plugin: 62 | ```console 63 | kubectl logs -n intel-qat-resource-driver intel-qat-resource-driver-kubelet-plugin-ttcs6 64 | DRA kubelet plugin 65 | In-cluster config 66 | Setting up CDI 67 | failed to create kubelet plugin driver: cannot enable PF device '0000:6b:00.0': open /sysfs/bus/pci/devices/0000:6b:00.0/sriov_numvfs: read-only file system 68 | ``` 69 | 70 | Try resetting QAT by reloading its kernel driver: 71 | ```console 72 | rmmod qat_4xxx 73 | modprobe qat_4xxx 74 | ``` 75 | 76 | ## Deploying to RedHat OpenShift Container Platform 77 | 78 | ```console 79 | helm install \ 80 | --set openshift.enabled=true \ 81 | --namespace "intel-qat-resource-driver" \ 82 | --create-namespace \ 83 | intel-qat-resource-driver oci://ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-qat-resource-driver-chart 84 | ``` 85 | 86 | > [!NOTE] 87 | > Chart contains SecurityContextConstraints, which requires cluster admin privileges. Ensure the chart is installed by the cluster admin.
// init registers the GPU DRA driver specs with ginkgo under the
// "GPU DRA Driver" description.
func init() {
	ginkgo.Describe("GPU DRA Driver", describeGpuDraDriver)
}

// describeGpuDraDriver declares the e2e specs for the GPU DRA driver.
//
// It resolves the repository-relative manifest paths once, deploys the driver
// before every spec, removes it after every spec, and defines one spec that
// runs the GPU sample application to completion.
func describeGpuDraDriver() {
	f := framework.NewDefaultFramework("gpudra")
	// The test namespace created by the framework is set to the privileged
	// pod security level.
	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged

	// Repository-relative manifest path -> package variable receiving the
	// located path. The entries are independent of each other, so the
	// unspecified map iteration order does not matter.
	filePaths := map[string]*string{
		gpuDeviceClassYaml:           &gpuDeviceClassYamlPath,
		gpuNamespaceYaml:             &gpuNamespaceYamlPath,
		gpuDriverYaml:                &gpuDriverYamlPath,
		gpuResourceClaimTemplateYaml: &gpuResourceClaimTemplateYamlPath,
	}
	for file, pathVar := range filePaths {
		locatedPath, err := utils.LocateRepoFile(file)
		if err != nil {
			framework.Failf("unable to locate %q: %v", file, err)
		}
		*pathVar = locatedPath
	}

	ginkgo.BeforeEach(func(ctx context.Context) {
		ginkgo.By("deploying GPU plugin")
		// Order matters: namespace first, then the driver, then the
		// DeviceClass and the ResourceClaimTemplate.
		e2ekubectl.RunKubectlOrDie(gpuNamespace, "apply", "-f", gpuNamespaceYamlPath)
		e2ekubectl.RunKubectlOrDie(gpuNamespace, "apply", "-f", gpuDriverYamlPath)
		// Wait up to 100 seconds for one driver pod with the given label.
		// NOTE(review): both results, including the error, are discarded, so
		// a driver pod that never becomes ready does not fail the setup here -
		// confirm that this is intended.
		_, _ = e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, gpuNamespace,
			labels.Set{"app": "intel-gpu-resource-driver-kubelet-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
		e2ekubectl.RunKubectlOrDie(gpuNamespace, "apply", "-f", gpuDeviceClassYamlPath)
		e2ekubectl.RunKubectlOrDie(gpuNamespace, "apply", "-f", gpuResourceClaimTemplateYamlPath)
		// Fixed pause before the spec body runs.
		// NOTE(review): presumably gives the driver time to publish its
		// resources - confirm.
		time.Sleep(10 * time.Second)
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		ginkgo.By("undeploying all in the GPU namespace")
		// Only the namespace manifest is deleted here.
		e2ekubectl.RunKubectlOrDie(gpuNamespace, "delete", "-f", gpuNamespaceYamlPath)
	})

	ginkgo.Context("When GPU DRA driver is running", func() {
		ginkgo.It("deploys a GPU sample application pod", func(ctx context.Context) {
			// LocateRepoFile returns the path of kustomization.yaml; kubectl
			// apply -k needs the directory that contains it.
			gpuSampleAppKustomizeDir, err := utils.LocateRepoFile(gpuSampleAppKustomizationYaml)
			if err != nil {
				framework.Failf("unable to locate %q: %v", gpuSampleAppKustomizationYaml, err)
			}
			e2ekubectl.RunKubectlOrDie(gpuNamespace, "apply", "-k", filepath.Dir(gpuSampleAppKustomizeDir))

			ginkgo.By("waiting the GPU sample app pod to finish successfully")
			// Wait up to 300 seconds for the pod to succeed; the pod logs are
			// passed as the optional failure description of the assertion.
			err = e2epod.WaitForPodSuccessInNamespaceTimeout(ctx, f.ClientSet, "gpu-sample-app", gpuNamespace, 300*time.Second)
			gomega.Expect(err).To(gomega.BeNil(), utils.GetPodLogs(ctx, f, "gpu-sample-app", "gpu-sample-app"))
		})
	})
}
"github.com/intel/intel-resource-drivers-for-kubernetes/pkg/goxpusmi" 10 | "github.com/intel/intel-resource-drivers-for-kubernetes/pkg/gpu/device" 11 | ) 12 | 13 | // HealthStatusUpdates is a type alias for map[deviceUID]map[healthType]status. 14 | type HealthStatusUpdates map[string]map[string]string 15 | 16 | func (d *driver) startHealthMonitor(ctx context.Context, gpuFlags *GPUFlags) { 17 | // Channel carries per-interval health status deltas keyed by device UID. 18 | healthStatusUpdatesCh := make(chan HealthStatusUpdates) 19 | goxpusmiCtx, stopMonitor := context.WithCancel(ctx) 20 | go d.watchGPUHealthStatuses(goxpusmiCtx, gpuFlags, healthStatusUpdatesCh) 21 | 22 | for { 23 | select { 24 | // Listen to original ctx, when driver is shutting down, stop health watcher. 25 | case <-goxpusmiCtx.Done(): 26 | stopMonitor() 27 | return 28 | case healthDeltas := <-healthStatusUpdatesCh: 29 | d.updateHealth(goxpusmiCtx, healthDeltas) 30 | } 31 | } 32 | } 33 | 34 | func (d *driver) updateHealth(ctx context.Context, healthStatusUpdates HealthStatusUpdates) { 35 | d.state.Lock() 36 | defer d.state.Unlock() 37 | //nolint:forcetypeassert // We want the code to panic if our assumption turns out to be wrong. 38 | allocatable := d.state.Allocatable.(map[string]*device.DeviceInfo) 39 | for deviceUID, healthStatus := range healthStatusUpdates { 40 | klog.Infof("Updating info for device %v to status=%v", deviceUID, healthStatus) 41 | foundDevice, found := allocatable[deviceUID] 42 | if !found { 43 | klog.Errorf("could not find allocatable device with UID %v", deviceUID) 44 | return 45 | } 46 | 47 | // Determine overall health: healthy unless any status is CRITICAL. 
48 | isHealthy := true 49 | if foundDevice.HealthStatus == nil { 50 | foundDevice.HealthStatus = make(map[string]string) 51 | } 52 | for healthType, status := range healthStatusUpdates[deviceUID] { 53 | foundDevice.HealthStatus[healthType] = status 54 | health := d.state.StatusHealth(status) 55 | isHealthy = isHealthy && health 56 | } 57 | foundDevice.Healthy = isHealthy 58 | } 59 | // Health is updated from a go routine, nothing we can do when publishing 60 | // resource slice fails, so error is only logged. 61 | if err := d.PublishResourceSlice(ctx); err != nil { 62 | klog.Errorf("could not publish updated resource slice: %v", err) 63 | } 64 | } 65 | 66 | // watchGPUHealthStatuses polls XPUM metric health info and sends per-interval 67 | // health status deltas to healthStatusUpdatesCh only when there are updates. 68 | func (d *driver) watchGPUHealthStatuses(ctx context.Context, gpuFlags *GPUFlags, healthStatusUpdatesCh chan<- HealthStatusUpdates) { 69 | nonVerboseDiscovery := false 70 | devices, err := goxpusmi.Discover(nonVerboseDiscovery) 71 | if err != nil { 72 | klog.Errorf("could not discover devices for health monitoring: %v", err) 73 | return 74 | } 75 | 76 | if gpuFlags.CoreThermalLimit != HealthCoreThermalLimitUnset { 77 | goxpusmi.SetHealthConfig(devices, "CoreThermalLimit", gpuFlags.CoreThermalLimit) 78 | } 79 | if gpuFlags.MemoryThermalLimit != HealthMemoryThermalLimitUnset { 80 | goxpusmi.SetHealthConfig(devices, "MemoryThermalLimit", gpuFlags.MemoryThermalLimit) 81 | } 82 | if gpuFlags.PowerLimit != HealthPowerLimitUnset { 83 | goxpusmi.SetHealthConfig(devices, "PowerLimit", gpuFlags.PowerLimit) 84 | } 85 | 86 | HealthcareInterval := time.NewTicker(time.Duration(int(gpuFlags.HealthcareInterval)) * time.Second) 87 | for { 88 | select { 89 | case <-ctx.Done(): 90 | if err = goxpusmi.Shutdown(); err != nil { 91 | klog.Errorf("failed to shutdown xpu-smi: %v", err) 92 | } 93 | return 94 | case <-HealthcareInterval.C: 95 | if updates := 
goxpusmi.HealthCheck(devices); len(updates) > 0 { 96 | healthStatusUpdatesCh <- updates 97 | } 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /pkg/helpers/device.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package helpers 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "path" 23 | "path/filepath" 24 | "strings" 25 | 26 | "k8s.io/klog/v2" 27 | ) 28 | 29 | const ( 30 | SysfsEnvVarName = "SYSFS_ROOT" 31 | sysfsDefaultRoot = "/sys" 32 | 33 | DevfsEnvVarName = "DEVFS_ROOT" 34 | devfsDefaultRoot = "/dev" 35 | 36 | PCIAddressLength = len("0000:00:00.0") 37 | ) 38 | 39 | // GetSysfsRoot tries to get path where sysfs is mounted from the env var, 40 | // or fallback to hardcoded path. 
41 | func GetSysfsRoot(sysfsPath string) string { 42 | sysfsRoot, found := os.LookupEnv(SysfsEnvVarName) 43 | 44 | if found { 45 | if _, err := os.Stat(path.Join(sysfsRoot, sysfsPath)); err == nil { 46 | klog.V(5).Infof("using custom sysfs location: %v\n", sysfsRoot) 47 | return sysfsRoot 48 | } else { 49 | klog.V(5).Infof("could not find sysfs at '%v' from %v env var: %v\n", sysfsPath, SysfsEnvVarName, err) 50 | } 51 | } 52 | 53 | klog.V(5).Infof("using default sysfs location: %v\n", sysfsDefaultRoot) 54 | // If /sys is not available, devices discovery will fail gracefully. 55 | return sysfsDefaultRoot 56 | } 57 | 58 | func GetDevRoot(devfsRootEnvVarName string, devPath string) string { 59 | devfsRoot, found := os.LookupEnv(devfsRootEnvVarName) 60 | 61 | if found { 62 | if _, err := os.Stat(path.Join(devfsRoot, devPath)); err == nil { 63 | klog.V(5).Infof("using custom devfs location: %v\n", devfsRoot) 64 | return devfsRoot 65 | } else { 66 | klog.V(5).Infof("could not find devfs at '%v' from %v env var: %v\n", devPath, devfsRootEnvVarName, err) 67 | } 68 | } 69 | 70 | klog.V(5).Infof("using default devfs root: %v\n", devfsDefaultRoot) 71 | return devfsDefaultRoot 72 | } 73 | 74 | func PciInfoFromDeviceUID(deviceUID string) (string, string) { 75 | // 0000-00-01-0-0x0000 -> 0000:00:01.0, 0x0000 76 | rfc1123PCIaddress := deviceUID[:PCIAddressLength] 77 | pciAddress := strings.Replace(strings.Replace(rfc1123PCIaddress, "-", ":", 2), "-", ".", 1) 78 | deviceId := deviceUID[PCIAddressLength+1:] 79 | 80 | return pciAddress, deviceId 81 | } 82 | 83 | func DeviceUIDFromPCIinfo(pciAddress string, pciid string) string { 84 | // 0000:00:01.0, 0x0000 -> 0000-00-01-0-0x0000 85 | // Replace colons and the dot in PCI address with hyphens. 
86 | rfc1123PCIaddress := strings.ReplaceAll(strings.ReplaceAll(pciAddress, ":", "-"), ".", "-") 87 | newUID := fmt.Sprintf("%v-%v", rfc1123PCIaddress, pciid) 88 | 89 | return newUID 90 | } 91 | 92 | func DeterminePCIRoot(link string) string { 93 | // e.g. /sys/devices/pci0000:16/0000:16:02.0/0000:17:00.0/0000:18:00.0/0000:19:00.0 94 | linkTarget, err := filepath.EvalSymlinks(link) 95 | if err != nil { 96 | klog.Errorf("Could not determine PCI root complex ID from '%v': %v", link, err) 97 | return "" 98 | } 99 | klog.V(5).Infof("PCI device location: %v", linkTarget) 100 | parts := strings.Split(linkTarget, "/") 101 | 102 | // To support arbitrary sysfs location, discard leading path elements 103 | // before devices minus one. 104 | trueSysfsRootIdx := 0 105 | for idx, pathElement := range parts { 106 | if pathElement == "devices" && idx > 0 { 107 | trueSysfsRootIdx = idx - 1 108 | break 109 | } 110 | } 111 | if trueSysfsRootIdx != 0 { 112 | parts = parts[trueSysfsRootIdx:] 113 | } 114 | 115 | if len(parts) > 2 && parts[1] == "devices" { 116 | return strings.Replace(parts[2], "pci0000:", "", 1) 117 | } 118 | klog.Warningf("could not parse sysfs link target %v: %v", linkTarget, parts) 119 | 120 | return "" 121 | } 122 | -------------------------------------------------------------------------------- /pkg/helpers/node_state.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, Intel Corporation. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package helpers 18 | 19 | import ( 20 | "context" 21 | "encoding/json" 22 | "fmt" 23 | "os" 24 | "sync" 25 | 26 | "k8s.io/dynamic-resource-allocation/kubeletplugin" 27 | "k8s.io/klog/v2" 28 | cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" 29 | ) 30 | 31 | type ClaimPreparations map[string]kubeletplugin.PrepareResult 32 | 33 | type NodeState struct { 34 | sync.Mutex 35 | CdiCache *cdiapi.Cache 36 | Allocatable interface{} 37 | Prepared ClaimPreparations 38 | PreparedClaimsFilePath string 39 | NodeName string 40 | SysfsRoot string 41 | } 42 | 43 | func (s *NodeState) Unprepare(ctx context.Context, claimUID string) error { 44 | s.Lock() 45 | defer s.Unlock() 46 | 47 | if _, found := s.Prepared[claimUID]; !found { 48 | return nil 49 | } 50 | 51 | klog.V(5).Infof("Freeing devices from claim %v", claimUID) 52 | delete(s.Prepared, claimUID) 53 | 54 | // write prepared claims to file 55 | if err := WritePreparedClaimsToFile(s.PreparedClaimsFilePath, s.Prepared); err != nil { 56 | return fmt.Errorf("failed to write prepared claims to file: %v", err) 57 | } 58 | 59 | return nil 60 | } 61 | 62 | // GetOrCreatePreparedClaims reads a PreparedClaim from a file and deserializes it or creates the file. 63 | func GetOrCreatePreparedClaims(preparedClaimFilePath string) (ClaimPreparations, error) { 64 | if _, err := os.Stat(preparedClaimFilePath); os.IsNotExist(err) { 65 | klog.V(5).Infof("could not find file %v. Creating file", preparedClaimFilePath) 66 | f, err := os.OpenFile(preparedClaimFilePath, os.O_CREATE|os.O_WRONLY, 0600) 67 | if err != nil { 68 | return nil, fmt.Errorf("failed creating file %v. Err: %v", preparedClaimFilePath, err) 69 | } 70 | defer f.Close() 71 | 72 | if _, err := f.WriteString("{}"); err != nil { 73 | return nil, fmt.Errorf("failed writing to file %v. 
Err: %v", preparedClaimFilePath, err) 74 | } 75 | 76 | klog.V(5).Infof("empty prepared claims file created %v", preparedClaimFilePath) 77 | 78 | return make(ClaimPreparations), nil 79 | } 80 | 81 | return ReadPreparedClaimsFromFile(preparedClaimFilePath) 82 | } 83 | 84 | // ReadPreparedClaimToFile returns unmarshaled content for given prepared claims JSON file. 85 | func ReadPreparedClaimsFromFile(preparedClaimFilePath string) (ClaimPreparations, error) { 86 | 87 | preparedClaims := make(ClaimPreparations) 88 | 89 | preparedClaimsBytes, err := os.ReadFile(preparedClaimFilePath) 90 | if err != nil { 91 | klog.V(5).Infof("could not read prepared claims configuration from file %v. Err: %v", preparedClaimFilePath, err) 92 | return nil, fmt.Errorf("failed reading file %v. Err: %v", preparedClaimFilePath, err) 93 | } 94 | 95 | if err := json.Unmarshal(preparedClaimsBytes, &preparedClaims); err != nil { 96 | klog.V(5).Infof("Could not parse default prepared claims configuration from file %v. Err: %v", preparedClaimFilePath, err) 97 | return nil, fmt.Errorf("failed parsing file %v. Err: %v", preparedClaimFilePath, err) 98 | } 99 | 100 | return preparedClaims, nil 101 | } 102 | 103 | // WritePreparedClaimsToFile serializes PreparedClaims and writes it to a file. 104 | func WritePreparedClaimsToFile(preparedClaimFilePath string, preparedClaims ClaimPreparations) error { 105 | if preparedClaims == nil { 106 | preparedClaims = ClaimPreparations{} 107 | } 108 | encodedPreparedClaims, err := json.MarshalIndent(preparedClaims, "", " ") 109 | if err != nil { 110 | return fmt.Errorf("prepared claims JSON encoding failed. Err: %v", err) 111 | } 112 | return os.WriteFile(preparedClaimFilePath, encodedPreparedClaims, 0600) 113 | } 114 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Intel Corporation. All Rights Reserved. 
# Optional build argument: when non-empty, the licenses directory from the
# build context is used as-is instead of being generated with `make licenses`.
ARG LOCAL_LICENSES

FROM golang:1.24.2@sha256:b51b7beeabe2e2d8438ba4295c59d584049873a480ba0e7b56d80db74b3e3a3a AS build
# An ARG declared before the first FROM is outside of any build stage and is
# not visible to RUN instructions; redeclare it so the check below sees it.
ARG LOCAL_LICENSES
WORKDIR /build
COPY . .

# Add xpu-smi shared library
RUN \
    wget -qO /tmp/xpu-smi.deb https://github.com/intel/xpumanager/releases/download/V1.3.1/xpu-smi_1.3.1_20250724.061629.60921e5e_u24.04_amd64.deb && \
    dpkg -i --ignore-depends=level-zero,intel-gsc,libze-intel-gpu1 /tmp/xpu-smi.deb

# Build GPU DRA driver
RUN make gpu && \
    mkdir -p /install_root && \
    if [ -z "$LOCAL_LICENSES" ]; then \
        make licenses; \
    fi && \
    cp -r licenses /install_root/ && \
    cp bin/kubelet-gpu-plugin /install_root/

# Prepare dependencies
FROM ubuntu:24.04@sha256:80dd3c3b9c6cecb9f1667e9290b3bc61b78c2678c02cbdae5f0fea92cc6734ab AS ubuntu
COPY --from=build /tmp/xpu-smi.deb /tmp/xpu-smi.deb

# Enable source packages so that `apt-get source` works further below.
RUN \
    sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources && \
    apt-get update && \
    apt-get install -y software-properties-common python3-launchpadlib

RUN \
    add-apt-repository -s -y ppa:kobuk-team/intel-graphics && \
    apt-get update && \
    apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-gsc libmetee5 && \
    apt-get install -y /tmp/xpu-smi.deb

# Collect package sources and copyright files for the final image.
RUN \
    mkdir /tmp/src && \
    cd /tmp/src && \
    apt-get source --download-only dash glibc libcap2 libudev1 libstdc++6 libmetee5 && \
    mkdir /licenses && \
    for pkg in dash libc6 intel-gsc libcap2 libudev1 libze1 libigdgmm12 libstdc++6 libze-intel-gpu1 libmetee5; do \
        mkdir -p /licenses/$pkg; \
        cp /usr/share/doc/$pkg/copyright /licenses/$pkg/; \
    done && \
    if grep -q /common-licenses/ /licenses/*/copyright; then \
        cp -r /usr/share/common-licenses/ /licenses/; \
    fi

FROM scratch
WORKDIR /
LABEL description="Intel GPU resource driver for Kubernetes"

COPY --from=build /install_root /
COPY --from=build /usr/lib/x86_64-linux-gnu/libxpum.so.1 /usr/lib/x86_64-linux-gnu/libxpum.so.1
COPY --from=ubuntu /lib/x86_64-linux-gnu/libc.so.6 /lib/x86_64-linux-gnu/libc.so.6
COPY --from=ubuntu /lib64/ld-linux-x86-64.so.2 /lib64/ld-linux-x86-64.so.2
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libm.so.6
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libdl.so.2 /usr/lib/x86_64-linux-gnu/libdl.so.2
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libz.so.1 /usr/lib/x86_64-linux-gnu/libz.so.1
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libze_loader.so.1 /usr/lib/x86_64-linux-gnu/libze_loader.so.1
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libigsc.so.0 /usr/lib/x86_64-linux-gnu/libigsc.so.0
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libudev.so.1 /usr/lib/x86_64-linux-gnu/libudev.so.1
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libcap.so.2 /usr/lib/x86_64-linux-gnu/libcap.so.2
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libstdc++.so.6
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libze_intel_gpu.so.1 /usr/lib/x86_64-linux-gnu/libze_intel_gpu.so.1
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libmetee.so.5.0.0 /usr/lib/x86_64-linux-gnu/libmetee.so.5.0.0

COPY --from=ubuntu /lib/x86_64-linux-gnu/libpciaccess.so.0 /lib/x86_64-linux-gnu/libpciaccess.so.0
COPY --from=ubuntu /lib/x86_64-linux-gnu/libigdgmm.so.12 /lib/x86_64-linux-gnu/libigdgmm.so.12
COPY --from=ubuntu /usr/lib/x86_64-linux-gnu/libigc.so.2 /usr/lib/x86_64-linux-gnu/libigc.so.2

# /bin/sh is used by xpu-smi library.
COPY --from=ubuntu /bin/sh /bin/sh
COPY --from=ubuntu /licenses /licenses
COPY --from=ubuntu /tmp/src /src
CMD ["/kubelet-gpu-plugin"]