├── .gitignore
├── _static
│   └── custom.css
├── docs
│   ├── index.html
│   ├── images
│   │   ├── Ecosystem.png
│   │   ├── gaudinet_image.png
│   │   ├── model-server-status.png
│   │   ├── rhoai-deploy-model.png
│   │   ├── tgi-serving-runtime.png
│   │   ├── rhods-dashboard-route.png
│   │   ├── Ovms-Gpu-resource-limit.png
│   │   ├── Ovms-Gpu-resource-request.png
│   │   ├── openvino-accelerator-field.png
│   │   ├── Operator-Architecture-Options.png
│   │   ├── accelerator-profile-dropdown.png
│   │   ├── congestion_test_single_leaf_switch.png
│   │   ├── congestion_test_single_leaf_spine_switch.png
│   │   ├── bisection_bandwidth_testing_on_leaf_switch.png
│   │   ├── bisection_bandwidth_testing_on_leaf_spine_switchs.png
│   │   ├── Intel-Technology-Enabling-for-OpenShift-Architecture.png
│   │   ├── bisection_bandwidth_testing_all_Gaudis_leaf_spine_switchs.png
│   │   ├── bisection_bandwidth_testing_all_nodes_leaf_spine_switchs.png
│   │   └── Intel-Enterprise-AI-Foundation-for-OpenShift-Training-Solution.png
│   ├── releases.rst
│   └── supported_platforms.md
├── requirements.txt
├── security
│   ├── dsa_serviceAccount.yaml
│   ├── dsa_role.yaml
│   ├── dsa_roleBinding.yaml
│   ├── qatlib_rbac.yaml
│   ├── dsa_scc.yaml
│   └── qatlib_scc.yaml
├── tests
│   ├── l2
│   │   ├── dsa
│   │   │   ├── dsa_imagestream.yaml
│   │   │   ├── rh_auth.yaml
│   │   │   ├── dsa_job.yaml
│   │   │   ├── dsa_build.yaml
│   │   │   └── README.md
│   │   ├── sgx
│   │   │   ├── sgx_imagestream.yaml
│   │   │   ├── sgx_job.yaml
│   │   │   ├── README.md
│   │   │   └── sgx_build.yaml
│   │   ├── dgpu
│   │   │   ├── clinfo_job.yaml
│   │   │   ├── intelvpl_job.yaml
│   │   │   ├── hwinfo_job.yaml
│   │   │   ├── vainfo_job.yaml
│   │   │   ├── hwinfo_build.yaml
│   │   │   ├── clinfo_build.yaml
│   │   │   ├── vainfo_build.yaml
│   │   │   ├── intelvpl_build.yaml
│   │   │   └── README.md
│   │   ├── qat
│   │   │   ├── qatlib_job.yaml
│   │   │   ├── qatlib_build.yaml
│   │   │   └── README.md
│   │   └── README.md
│   ├── gaudi
│   │   └── l2
│   │       ├── vllm_hf_secret.yaml
│   │       ├── test-pod.yaml
│   │       ├── hl-smi_job.yaml
│   │       ├── hl-smi-firmware_job.yaml
│   │       ├── hccl_build.yaml
│   │       ├── hccl_job.yaml
│   │       ├── vllm_deployment.yaml
│   │       ├── vllm_buildconfig.yaml
│   │       └── README.md
│   ├── l3
│   │   └── README.md
│   └── README.md
├── e2e
│   └── inference
│       ├── accelerator_profile_gaudi.yaml
│       ├── accelerator_profile_flex140.yaml
│       └── README.md
├── security.md
├── nfd
│   ├── node-feature-discovery-openshift.yaml
│   ├── node-feature-rules-openshift.yaml
│   └── README.md
├── device_plugins
│   ├── sgx_device_plugin.yaml
│   ├── gpu_device_plugin.yaml
│   ├── dsa_device_plugin.yaml
│   ├── qat_device_plugin.yaml
│   ├── install_operator.yaml
│   ├── deploy_sgx.md
│   ├── deploy_gpu.md
│   ├── deploy_dsa.md
│   ├── README.md
│   └── deploy_qat.md
├── machine_configuration
│   ├── 100-intel-iommu-on.yaml
│   └── README.md
├── gaudi
│   ├── gaudi_spi_fw_upgrade_job.yaml
│   ├── gaudi_install_operator.yaml
│   ├── gaudi_spi_fw_upgrade_build.yaml
│   ├── Gaudi-SPI-Firmware-Upgrade.md
│   ├── gaudi_cluster_policy.yaml
│   └── README.md
├── workloads
│   └── opea
│       └── chatqna
│           ├── create_megaservice_container.sh
│           ├── persistent_volumes.yaml
│           ├── tgi_gaudi_servingruntime.yaml
│           ├── chatqna_megaservice_buildconfig.yaml
│           ├── redis_deployment_service.yaml
│           ├── chatqna_megaservice_deployment.yaml
│           └── README.md
├── _templates
│   └── versions.html
├── set-version.sh
├── kmmo
│   ├── intel-dgpu.yaml
│   ├── intel-dgpu-on-premise-build.yaml
│   └── README.md
├── make.bat
├── playbooks
│   ├── install_device_plugins_operator.yaml
│   ├── README.md
│   ├── install_nfd_operator.yaml
│   ├── configure_nfd.yaml
│   └── intel_ocp_provisioning.yaml
├── index.rst
├── Makefile
├── CONTRIBUTING.md
├── one_click
│   ├── README.md
│   ├── gaudi_provisioning_playbook.yaml
│   └── gpu_provisioning_playbook.yaml
├── .github
│   └── workflows
│       └── publish.yml
├── conf.py
└── CODE_OF_CONDUCT.md
/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 |
--------------------------------------------------------------------------------
/_static/custom.css:
--------------------------------------------------------------------------------
1 | .wy-nav-content {
2 |   max-width: 90% !important;
3 | }
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/images/Ecosystem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Ecosystem.png
--------------------------------------------------------------------------------
/docs/images/gaudinet_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/gaudinet_image.png
--------------------------------------------------------------------------------
/docs/images/model-server-status.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/model-server-status.png
--------------------------------------------------------------------------------
/docs/images/rhoai-deploy-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/rhoai-deploy-model.png
--------------------------------------------------------------------------------
/docs/images/tgi-serving-runtime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/tgi-serving-runtime.png
--------------------------------------------------------------------------------
/docs/images/rhods-dashboard-route.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/rhods-dashboard-route.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | docutils<0.18
2 | sphinx
3 | sphinx_rtd_theme
4 | recommonmark
5 | sphinx-markdown-tables
6 | sphinx-md
7 | myst_parser
8 | GitPython
--------------------------------------------------------------------------------
/docs/images/Ovms-Gpu-resource-limit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Ovms-Gpu-resource-limit.png
--------------------------------------------------------------------------------
/docs/images/Ovms-Gpu-resource-request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Ovms-Gpu-resource-request.png
--------------------------------------------------------------------------------
/docs/images/openvino-accelerator-field.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/openvino-accelerator-field.png
--------------------------------------------------------------------------------
/docs/images/Operator-Architecture-Options.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Operator-Architecture-Options.png
--------------------------------------------------------------------------------
/docs/images/accelerator-profile-dropdown.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/accelerator-profile-dropdown.png
--------------------------------------------------------------------------------
/docs/images/congestion_test_single_leaf_switch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/congestion_test_single_leaf_switch.png
--------------------------------------------------------------------------------
/docs/images/congestion_test_single_leaf_spine_switch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/congestion_test_single_leaf_spine_switch.png
--------------------------------------------------------------------------------
/docs/images/bisection_bandwidth_testing_on_leaf_switch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_on_leaf_switch.png
--------------------------------------------------------------------------------
/docs/images/bisection_bandwidth_testing_on_leaf_spine_switchs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_on_leaf_spine_switchs.png
--------------------------------------------------------------------------------
/docs/images/Intel-Technology-Enabling-for-OpenShift-Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Intel-Technology-Enabling-for-OpenShift-Architecture.png
--------------------------------------------------------------------------------
/docs/images/bisection_bandwidth_testing_all_Gaudis_leaf_spine_switchs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_all_Gaudis_leaf_spine_switchs.png
--------------------------------------------------------------------------------
/docs/images/bisection_bandwidth_testing_all_nodes_leaf_spine_switchs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_all_nodes_leaf_spine_switchs.png
--------------------------------------------------------------------------------
/security/dsa_serviceAccount.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: v1
5 | kind: ServiceAccount
6 | metadata:
7 |   name: intel-dsa
8 |   namespace: intel-dsa
--------------------------------------------------------------------------------
/docs/images/Intel-Enterprise-AI-Foundation-for-OpenShift-Training-Solution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Intel-Enterprise-AI-Foundation-for-OpenShift-Training-Solution.png
--------------------------------------------------------------------------------
/tests/l2/dsa/dsa_imagestream.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 |   name: intel-dsa-workload
8 |   namespace: intel-dsa
--------------------------------------------------------------------------------
/tests/l2/sgx/sgx_imagestream.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 |   name: intel-sgx-workload
8 |   namespace: intel-sgx
--------------------------------------------------------------------------------
/tests/gaudi/l2/vllm_hf_secret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | apiVersion: v1
4 | kind: Secret
5 | metadata:
6 |   name: hf-token
7 |   namespace: gaudi-validation
8 | type: Opaque
9 | data:
10 |   hf-token: # Add your token
--------------------------------------------------------------------------------
/tests/l2/dsa/rh_auth.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: v1
5 | kind: Secret
6 | metadata:
7 |   name: rh-auth
8 |   namespace: intel-dsa
9 | type: Opaque
10 | data:
11 |   username: # Add username
12 |   password: # Add password
13 |
--------------------------------------------------------------------------------
/e2e/inference/accelerator_profile_gaudi.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | apiVersion: dashboard.opendatahub.io/v1
4 | kind: AcceleratorProfile
5 | metadata:
6 |   name: intel-gaudi-ai-accelerator
7 | spec:
8 |   displayName: Intel Gaudi AI Accelerator
9 |   description: Intel Gaudi AI Accelerator
10 |   enabled: true
11 |   identifier: habana.ai/gaudi
--------------------------------------------------------------------------------
/security/dsa_role.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: rbac.authorization.k8s.io/v1
5 | kind: Role
6 | metadata:
7 |   name: intel-dsa
8 |   namespace: intel-dsa
9 | rules:
10 |   - apiGroups:
11 |       - security.openshift.io
12 |     resources:
13 |       - securitycontextconstraints
14 |     resourceNames:
15 |       - intel-dsa-scc
16 |     verbs:
17 |       - use
--------------------------------------------------------------------------------
/security.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation.
3 |
4 | ## Reporting a Vulnerability
5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html).
6 |
--------------------------------------------------------------------------------
/security/dsa_roleBinding.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: rbac.authorization.k8s.io/v1
5 | kind: RoleBinding
6 | metadata:
7 |   name: intel-dsa
8 |   namespace: intel-dsa
9 | roleRef:
10 |   apiGroup: rbac.authorization.k8s.io
11 |   kind: Role
12 |   name: intel-dsa
13 | subjects:
14 |   - kind: ServiceAccount
15 |     name: intel-dsa
16 |     namespace: intel-dsa
--------------------------------------------------------------------------------
/tests/gaudi/l2/test-pod.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | apiVersion: v1
4 | kind: Pod
5 | metadata:
6 |   name: test
7 |   labels:
8 |     app: test
9 |   namespace: gaudi-validation
10 | spec:
11 |   containers:
12 |     - name: test
13 |       command: [ "/bin/bash", "-c", "--" ]
14 |       args: [ "while true; do sleep 30; done;" ]
15 |       image: registry.access.redhat.com/ubi9-minimal:latest
16 |
--------------------------------------------------------------------------------
/nfd/node-feature-discovery-openshift.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 - 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: nfd.openshift.io/v1
5 | kind: NodeFeatureDiscovery
6 | metadata:
7 |   name: nfd-instance
8 |   namespace: openshift-nfd
9 | spec:
10 |   operand:
11 |     image: registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.18
12 |     imagePullPolicy: Always
13 |     servicePort: 12000
14 |   workerConfig:
15 |     configData: |
--------------------------------------------------------------------------------
/device_plugins/sgx_device_plugin.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: deviceplugin.intel.com/v1
5 | kind: SgxDevicePlugin
6 | metadata:
7 |   name: sgxdeviceplugin-sample
8 | spec:
9 |   image: registry.connect.redhat.com/intel/intel-sgx-plugin@sha256:f2c77521c6dae6b4db1896a5784ba8b06a5ebb2a01684184fc90143cfcca7bf4
10 |   enclaveLimit: 110
11 |   provisionLimit: 110
12 |   logLevel: 4
13 |   nodeSelector:
14 |     intel.feature.node.kubernetes.io/sgx: "true"
--------------------------------------------------------------------------------
/device_plugins/gpu_device_plugin.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: deviceplugin.intel.com/v1
5 | kind: GpuDevicePlugin
6 | metadata:
7 |   name: gpudeviceplugin-sample
8 | spec:
9 |   image: registry.connect.redhat.com/intel/intel-gpu-plugin@sha256:e2c2ce658e78c35c425f16a4f8e85c5f32ce31848d9b53a644a05e7f8b7f71b0
10 |   preferredAllocationPolicy: none
11 |   sharedDevNum: 1
12 |   logLevel: 4
13 |   nodeSelector:
14 |     intel.feature.node.kubernetes.io/gpu: "true"
--------------------------------------------------------------------------------
/e2e/inference/accelerator_profile_flex140.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | apiVersion: dashboard.opendatahub.io/v1
4 | kind: AcceleratorProfile
5 | metadata:
6 |   name: intel-gpu-flex-series-140
7 | spec:
8 |   displayName: Intel® Data Center GPU Flex Series 140
9 |   description: Intel Data Center GPU for inference
10 |   enabled: true
11 |   identifier: gpu.intel.com/i915
12 |   tolerations:
13 |     - effect: NoSchedule
14 |       key: gpu.intel.com/flex-140
15 |       operator: Exists
--------------------------------------------------------------------------------
/tests/l3/README.md:
--------------------------------------------------------------------------------
1 | # L3 Test
2 |
3 | ```{admonition} Disclaimer
4 | Please note that this module is currently under development and may contain partially implemented features, therefore it is not supported in the current release.
5 | ```
6 |
7 | ## L3 Overview
8 | Layer 3 consists of the software stack used to provision the end-to-end (e2e) tests for specific Intel features, such as Intel® Data Center GPU Flex Series and Intel® SGX. The reference workloads (layer 4) rely directly on this layer.
9 |
10 | ### GPU
11 | Please follow the [README](/e2e/inference/README.md) for Intel Data Center GPU Flex Series specific prerequisites.
--------------------------------------------------------------------------------
/machine_configuration/100-intel-iommu-on.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 - 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: machineconfiguration.openshift.io/v1
5 | kind: MachineConfig
6 | metadata:
7 |   labels:
8 |     machineconfiguration.openshift.io/role: worker
9 |   name: 100-intel-iommu-on
10 | spec:
11 |   config:
12 |     ignition:
13 |       version: 3.2.0
14 |   kernelArguments:
15 |     - intel_iommu=on,sm_on modules_load=vfio-pci vfio-pci.ids=8086:4941,8086:4943
16 |   selector:
17 |     intel.feature.node.kubernetes.io/qat: 'true'
18 |     intel.feature.node.kubernetes.io/dsa: 'true'
19 |
--------------------------------------------------------------------------------
/tests/l2/dgpu/clinfo_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 - 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: intel-dgpu-clinfo
8 |   namespace: intel-dgpu
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       containers:
15 |         - name: clinfo-pod
16 |           image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-clinfo:latest
17 |           command: ["clinfo"]
18 |           resources:
19 |             limits:
20 |               gpu.intel.com/i915: 1
21 |           imagePullPolicy: IfNotPresent
--------------------------------------------------------------------------------
/tests/l2/dgpu/intelvpl_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: intel-dgpu-intelvpl
8 |   namespace: intel-dgpu
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       containers:
15 |         - name: intelvpl-pod
16 |           image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-intelvpl:latest
17 |           command: ["vpl-inspect"]
18 |           resources:
19 |             limits:
20 |               gpu.intel.com/i915: 1
21 |           imagePullPolicy: IfNotPresent
22 |
--------------------------------------------------------------------------------
/device_plugins/dsa_device_plugin.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | kind: DsaDevicePlugin
5 | apiVersion: deviceplugin.intel.com/v1
6 | metadata:
7 |   name: dsadeviceplugin-sample
8 | spec:
9 |   image: registry.connect.redhat.com/intel/intel-dsa-plugin@sha256:2742a13279cc3f301daa09b6389517024530f658d4e1dd13db495cc94d9ba57c
10 |   initImage: registry.connect.redhat.com/intel/intel-idxd-config-initcontainer@sha256:b74dc43fa81ce14ea97f20ff6b2f726039f6309fdd868d5f45d751d0a8662cc1
11 |   logLevel: 4
12 |   nodeSelector:
13 |     intel.feature.node.kubernetes.io/dsa: 'true'
14 |   sharedDevNum: 10
15 |
--------------------------------------------------------------------------------
/tests/gaudi/l2/hl-smi_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: hl-smi-workload
8 |   namespace: gaudi-validation
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       containers:
15 |         - name: hl-smi-workload
16 |           image: vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26
17 |           command: ["hl-smi"]
18 |           resources:
19 |             limits:
20 |               habana.ai/gaudi: 8
21 |           imagePullPolicy: IfNotPresent
--------------------------------------------------------------------------------
/tests/l2/dgpu/hwinfo_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: intel-dgpu-hwinfo
8 |   namespace: intel-dgpu
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       containers:
15 |         - name: hwinfo-pod
16 |           image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-hwinfo:latest
17 |           command: ["hwinfo"]
18 |           args: ["--display"]
19 |           resources:
20 |             limits:
21 |               gpu.intel.com/i915: 1
22 |           imagePullPolicy: IfNotPresent
--------------------------------------------------------------------------------
/tests/l2/sgx/sgx_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: intel-sgx-job
8 |   namespace: intel-sgx
9 | spec:
10 |   template:
11 |     spec:
12 |       restartPolicy: Never
13 |       containers:
14 |         - name: intel-sgx-job
15 |           image: image-registry.openshift-image-registry.svc:5000/intel-sgx/intel-sgx-workload:latest
16 |           imagePullPolicy: Always
17 |           workingDir: "/opt/intel/"
18 |           command: ["/opt/intel/app"]
19 |           resources:
20 |             limits:
21 |               sgx.intel.com/epc: "5Mi"
--------------------------------------------------------------------------------
/tests/gaudi/l2/hl-smi-firmware_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: hl-smi-firmware
8 |   namespace: gaudi-validation
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       containers:
15 |         - name: hl-smi-firmware
16 |           image: vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26
17 |           command: ["/bin/bash", "-c", "hl-smi -L | grep SPI"]
18 |           resources:
19 |             limits:
20 |               habana.ai/gaudi: 1
21 |           imagePullPolicy: IfNotPresent
22 |
--------------------------------------------------------------------------------
/device_plugins/qat_device_plugin.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: deviceplugin.intel.com/v1
5 | kind: QatDevicePlugin
6 | metadata:
7 |   name: qatdeviceplugin-sample
8 | spec:
9 |   image: registry.connect.redhat.com/intel/intel-qat-plugin@sha256:8d79dba051b83ec770a4b0fdc3da6ac92264cb19cac8d455b707ed92a6a95d02
10 |   initImage: registry.connect.redhat.com/intel/intel-qat-initcontainer@sha256:34f0b993ca654ea0b386217cba1a44d5ef3da841b3befc780508f5323e95fa90
11 |   dpdkDriver: vfio-pci
12 |   kernelVfDrivers:
13 |     - 4xxxvf
14 |   maxNumDevices: 128
15 |   logLevel: 4
16 |   nodeSelector:
17 |     intel.feature.node.kubernetes.io/qat: "true"
--------------------------------------------------------------------------------
/tests/l2/dgpu/vainfo_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: intel-dgpu-vainfo
8 |   namespace: intel-dgpu
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       containers:
15 |         - name: vainfo-pod
16 |           image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-vainfo:latest
17 |           command: ["vainfo"]
18 |           args: ["--display", "drm", "--device", "/dev/dri/renderD128"]
19 |           resources:
20 |             limits:
21 |               gpu.intel.com/i915: 1
22 |           imagePullPolicy: IfNotPresent
23 |
24 |
--------------------------------------------------------------------------------
/device_plugins/install_operator.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: operators.coreos.com/v1alpha2
5 | kind: OperatorGroup
6 | metadata:
7 |   name: global-operators
8 |   namespace: openshift-operators
9 | ---
10 | apiVersion: operators.coreos.com/v1alpha1
11 | kind: Subscription
12 | metadata:
13 |   labels:
14 |     operators.coreos.com/intel-device-plugins-operator.openshiftoperators: ""
15 |   name: intel-device-plugins-operator
16 |   namespace: openshift-operators
17 | spec:
18 |   channel: alpha
19 |   installPlanApproval: Automatic
20 |   name: intel-device-plugins-operator
21 |   source: certified-operators
22 |   sourceNamespace: openshift-marketplace
--------------------------------------------------------------------------------
/gaudi/gaudi_spi_fw_upgrade_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: gaudi-spi-firmware-upgrade
8 |   namespace: gaudi-spi-fw-upgrade
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       serviceAccountName: gaudi-fw-upgrade-sa
15 |       containers:
16 |         - name: gaudi-spi-firmware-upgrade
17 |           securityContext:
18 |             privileged: true
19 |           image: image-registry.openshift-image-registry.svc:5000/gaudi-spi-fw-upgrade/gaudi-spi-fw-upgrade:1.19.1-26
20 |           command: [ "hl-fw-loader", "-y" ]
21 |           imagePullPolicy: Always
--------------------------------------------------------------------------------
/workloads/opea/chatqna/create_megaservice_container.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Copyright (c) 2024 Intel Corporation
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | tag="v0.8"
6 | namespace="opea-chatqna"
7 | repo="https://github.com/opea-project/GenAIExamples.git"
8 | yaml_url="https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/chatqna_megaservice_buildconfig.yaml"
9 |
10 | oc project $namespace &&
11 | git clone --depth 1 --branch $tag $repo &&
12 | cd GenAIExamples/ChatQnA/deprecated/langchain/docker &&
13 | oc extract secret/knative-serving-cert -n istio-system --to=. --keys=tls.crt &&
14 | oc apply -f $yaml_url &&
15 | oc start-build chatqna-megaservice --from-dir=./ --follow
--------------------------------------------------------------------------------
/_templates/versions.html:
--------------------------------------------------------------------------------
1 | {%- if versions %}
2 | <div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
3 |   <span class="rst-current-version" data-toggle="rst-current-version">
4 |     Versions
5 |     {{ version }}
6 |   </span>
7 |
8 |   <div class="rst-other-versions">
9 |     {% if versions|length >= 1 %}
10 |     <dl>
11 |       <dt>{{ _('Versions') }}</dt>
12 |       {% for slug, url in versions %}
13 |       {% if slug == version %}<strong>{% endif %}
14 |       <dd><a href="{{ url }}">{{ slug }}</a></dd>
15 |       {% if slug == version %}</strong>{% endif %}
16 |       {% endfor %}
17 |     </dl>
18 |     {% endif %}
19 |   </div>
20 | </div>
21 | {%- endif %}
--------------------------------------------------------------------------------
/security/qatlib_rbac.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: v1
5 | kind: ServiceAccount
6 | metadata:
7 |   name: intel-qat
8 |   namespace: intel-qat
9 | ---
10 | apiVersion: rbac.authorization.k8s.io/v1
11 | kind: Role
12 | metadata:
13 |   name: intel-qat
14 |   namespace: intel-qat
15 | rules:
16 |   - apiGroups:
17 |       - security.openshift.io
18 |     resources:
19 |       - securitycontextconstraints
20 |     resourceNames:
21 |       - intel-qat-scc
22 |     verbs:
23 |       - use
24 | ---
25 | apiVersion: rbac.authorization.k8s.io/v1
26 | kind: RoleBinding
27 | metadata:
28 |   name: intel-qat
29 | roleRef:
30 |   apiGroup: rbac.authorization.k8s.io
31 |   kind: Role
32 |   name: intel-qat
33 | subjects:
34 |   - kind: ServiceAccount
35 |     name: intel-qat
36 |     namespace: intel-qat
--------------------------------------------------------------------------------
/set-version.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh -eu
2 | #
3 | # Copyright 2024 Intel Corporation.
4 | #
5 | # SPDX-License-Identifier: Apache-2.0
6 | #
7 | # Invoke this script with a version as parameter
8 | # and it will update all hard-coded devel versions
9 | # to the tag versions in the source code.
10 | #
11 | # Adapted from https://github.com/intel/intel-device-plugins-for-kubernetes/
12 |
13 | if [ $# != 1 ] || [ "$1" = "?" ] || [ "$1" = "--help" ]; then
14 |     echo "Please provide TAG version as an argument. Usage: $0 <TAG>" >&2
15 |     exit 1
16 | fi
17 |
18 | devel_link="intel/intel-technology-enabling-for-openshift/main/"
19 | tag_link="intel/intel-technology-enabling-for-openshift/$1/"
20 |
21 | files=$(git grep -lF $devel_link -- '*.md')
22 |
23 | for file in $files; do
24 |     sed -i -e "s|$devel_link|$tag_link|g" "$file";
25 | done
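26 |
27 | # Example invocation (tag name is illustrative):
28 | #   ./set-version.sh v1.0.0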
--------------------------------------------------------------------------------
/kmmo/intel-dgpu.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 - 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: kmm.sigs.x-k8s.io/v1beta1
5 | kind: Module
6 | metadata:
7 |   name: intel-dgpu
8 |   namespace: openshift-kmm
9 | spec:
10 |   moduleLoader:
11 |     container:
12 |       modprobe:
13 |         moduleName: i915
14 |         firmwarePath: /firmware
15 |         modulesLoadingOrder:
16 |           - i915
17 |           - mei_gsc
18 |       inTreeModulesToRemove: [i915, intel_vsec, mei_gsc, mei_me]
19 |       kernelMappings:
20 |         - regexp: '^.*\.x86_64$'
21 |           containerImage: registry.connect.redhat.com/intel/intel-data-center-gpu-driver-container:3.0.0-$KERNEL_FULL_VERSION
22 |   selector:
23 |     intel.feature.node.kubernetes.io/gpu: 'true'
24 |     intel.feature.node.kubernetes.io/dgpu-canary: 'true'
25 |
--------------------------------------------------------------------------------
/gaudi/gaudi_install_operator.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Adapted from https://docs.habana.ai/en/latest/Orchestration/HabanaAI_Operator/Deploying_HabanaAI_Operator.html#using-cli
4 | #
5 | ---
6 | apiVersion: v1
7 | kind: Namespace
8 | metadata:
9 |   name: habana-ai-operator
10 | ---
11 | apiVersion: operators.coreos.com/v1
12 | kind: OperatorGroup
13 | metadata:
14 |   name: habana-ai-operator
15 |   namespace: habana-ai-operator
16 | spec:
17 |   targetNamespaces:
18 |     - habana-ai-operator
19 | ---
20 | apiVersion: operators.coreos.com/v1alpha1
21 | kind: Subscription
22 | metadata:
23 |   name: habana-ai-operator
24 |   namespace: habana-ai-operator
25 | spec:
26 |   channel: stable
27 |   installPlanApproval: Automatic
28 |   name: habana-ai-operator
29 |   source: certified-operators
30 |   sourceNamespace: openshift-marketplace
--------------------------------------------------------------------------------
/tests/l2/qat/qatlib_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: intel-qat-workload
8 |   namespace: intel-qat
9 | spec:
10 |   template:
11 |     spec:
12 |       restartPolicy: Never
13 |       containers:
14 |         - name: intel-qat-job
15 |           image: image-registry.openshift-image-registry.svc:5000/intel-qat/intel-qat-workload:latest
16 |           imagePullPolicy: IfNotPresent
17 |           command: ["cpa_sample_code"]
18 |           securityContext:
19 |             capabilities:
20 |               add:
21 |                 [IPC_LOCK]
22 |           resources:
23 |             requests:
24 |               qat.intel.com/dc: '1'
25 |               qat.intel.com/cy: '1'
26 |             limits:
27 |               qat.intel.com/dc: '1'
28 |               qat.intel.com/cy: '1'
29 |       serviceAccount: intel-qat
--------------------------------------------------------------------------------
/tests/l2/dsa/dsa_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: intel-dsa-workload
8 |   namespace: intel-dsa
9 | spec:
10 |   template:
11 |     spec:
12 |       restartPolicy: Never
13 |       containers:
14 |         - name: intel-dsa-job
15 |           image: image-registry.openshift-image-registry.svc:5000/intel-dsa/intel-dsa-workload:latest
16 |           imagePullPolicy: IfNotPresent
17 |           workingDir: "/usr/libexec/accel-config/test/"
18 |           command:
19 |             - "./dsa_user_test_runner.sh"
20 |           args:
21 |             - "--skip-config"
22 |           securityContext:
23 |             capabilities:
24 |               add:
25 |                 [SYS_RAWIO]
26 |           resources:
27 |             limits:
28 |               dsa.intel.com/wq-user-dedicated: 1
29 |       serviceAccountName: intel-dsa
--------------------------------------------------------------------------------
/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/playbooks/install_device_plugins_operator.yaml:
--------------------------------------------------------------------------------
1 | - name: Create global OperatorGroup in openshift-operators namespace
2 |   k8s:
3 |     state: present
4 |     definition:
5 |       apiVersion: operators.coreos.com/v1alpha2
6 |       kind: OperatorGroup
7 |       metadata:
8 |         name: global-operators
9 |         namespace: openshift-operators
10 |
11 | - name: Create Intel Device Plugins Operator Subscription
12 |   k8s:
13 |     state: present
14 |     definition:
15 |       apiVersion: operators.coreos.com/v1alpha1
16 |       kind: Subscription
17 |       metadata:
18 |         labels:
19 |           operators.coreos.com/intel-device-plugins-operator.openshiftoperators: ""
20 |         name: intel-device-plugins-operator
21 |         namespace: openshift-operators
22 |       spec:
23 |         channel: alpha
24 |         installPlanApproval: Automatic
25 |         name: intel-device-plugins-operator
26 |         source: certified-operators
27 |         sourceNamespace: openshift-marketplace
--------------------------------------------------------------------------------
/tests/gaudi/l2/hccl_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 |   name: hccl-demo-workload
8 |   namespace: gaudi-validation
9 | ---
10 | kind: BuildConfig
11 | apiVersion: build.openshift.io/v1
12 | metadata:
13 |   name: hccl-demo-workload
14 |   namespace: gaudi-validation
15 | spec:
16 |   output:
17 |     to:
18 |       kind: ImageStreamTag
19 |       name: 'hccl-demo-workload:1.19.1-26'
20 |   strategy:
21 |     type: Docker
22 |   source:
23 |     type: Dockerfile
24 |     dockerfile: |
25 |       ARG BUILDER=vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26
26 |       FROM ${BUILDER} AS builder
27 |
28 |       WORKDIR /
29 |       RUN git clone https://github.com/HabanaAI/hccl_demo.git \
30 |         && cd hccl_demo \
31 |         && make
32 |
33 |       WORKDIR /hccl_demo
34 |   triggers:
35 |     - type: ConfigChange
36 |   runPolicy: Serial
--------------------------------------------------------------------------------
/security/dsa_scc.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: security.openshift.io/v1
5 | allowHostDirVolumePlugin: false
6 | allowHostIPC: false
7 | allowHostNetwork: false
8 | allowHostPID: false
9 | allowHostPorts: false
10 | allowPrivilegeEscalation: false
11 | allowPrivilegedContainer: false
12 | allowedCapabilities:
13 |   - SYS_RAWIO
14 | defaultAddCapabilities: null
15 | fsGroup:
16 |   type: MustRunAs
17 | groups: []
18 | kind: SecurityContextConstraints
19 | metadata:
20 |   annotations:
21 |     kubernetes.io/description: 'SCC for Intel DSA based workload'
22 |   name: intel-dsa-scc
23 | priority: null
24 | readOnlyRootFilesystem: false
25 | requiredDropCapabilities:
26 |   - ALL
27 | runAsUser:
28 |   type: RunAsAny
29 | seLinuxContext:
30 |   type: MustRunAs
31 | supplementalGroups:
32 |   type: RunAsAny
33 | seccompProfiles:
34 |   - runtime/default
35 | volumes:
36 |   - configMap
37 |   - downwardAPI
38 |   - emptyDir
39 |   - ephemeral
40 |   - persistentVolumeClaim
41 |   - projected
42 |   - secret
--------------------------------------------------------------------------------
/security/qatlib_scc.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: security.openshift.io/v1
5 | allowHostDirVolumePlugin: false
6 | allowHostIPC: false
7 | allowHostNetwork: false
8 | allowHostPID: false
9 | allowHostPorts: false
10 | allowPrivilegeEscalation: false
11 | allowPrivilegedContainer: false
12 | allowedCapabilities:
13 |   - IPC_LOCK
14 | defaultAddCapabilities: null
15 | fsGroup:
16 |   type: MustRunAs
17 | groups: []
18 | kind: SecurityContextConstraints
19 | metadata:
20 |   annotations:
21 |     kubernetes.io/description: 'SCC for Intel QAT based workload'
22 |   name: intel-qat-scc
23 | priority: null
24 | readOnlyRootFilesystem: false
25 | requiredDropCapabilities:
26 |   - ALL
27 | runAsUser:
28 |   type: RunAsAny
29 | seLinuxContext:
30 |   type: MustRunAs
31 | supplementalGroups:
32 |   type: RunAsAny
33 | seccompProfiles:
34 |   - runtime/default
35 | volumes:
36 |   - configMap
37 |   - downwardAPI
38 |   - emptyDir
39 |   - ephemeral
40 |   - persistentVolumeClaim
41 |   - projected
42 |   - secret
--------------------------------------------------------------------------------
/index.rst:
--------------------------------------------------------------------------------
1 | .. Intel Technology Enabling for OpenShift documentation master file, created by
2 |    sphinx-quickstart on Wed Apr 17 23:49:16 2024.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | Intel® Technology Enabling for OpenShift*
7 | ===================================================================
8 |
9 | .. toctree::
10 |    :maxdepth: 2
11 |    :caption: Contents:
12 |
13 |    README.md
14 |    nfd/README.md
15 |    machine_configuration/README.md
16 |    kmmo/README.md
17 |    device_plugins/README.md
18 |    gaudi/README.md
19 |
20 | .. toctree::
21 |    :maxdepth: 2
22 |    :caption: One-Click Deployment:
23 |
24 |    one_click/README.md
25 |
26 | .. toctree::
27 |    :maxdepth: 2
28 |    :caption: End-to-end Solutions:
29 |
30 |    e2e/inference/README.md
31 |
32 | .. toctree::
33 |    :maxdepth: 2
34 |    :caption: Releases:
35 |
36 |    docs/releases.rst
37 |
38 | .. toctree::
39 |    :maxdepth: 2
40 |    :caption: Supported Platforms:
41 |
42 |    docs/supported_platforms
--------------------------------------------------------------------------------
/workloads/opea/chatqna/persistent_volumes.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: v1
5 | kind: PersistentVolume
6 | metadata:
7 |   name: chatqna-megaservice-pv-0
8 | spec:
9 |   capacity:
10 |     storage: 100Mi
11 |   accessModes:
12 |     - ReadWriteOnce
13 |   persistentVolumeReclaimPolicy: Retain
14 |   nfs:
15 |     server: x.x.x.x # nfs server
16 |     path: /nfs # nfs path
17 | ---
18 | apiVersion: v1
19 | kind: PersistentVolume
20 | metadata:
21 |   name: chatqna-megaservice-pv-1
22 | spec:
23 |   capacity:
24 |     storage: 100Mi
25 |   accessModes:
26 |     - ReadWriteOnce
27 |   persistentVolumeReclaimPolicy: Retain
28 |   nfs:
29 |     server: x.x.x.x # nfs server
30 |     path: /nfs # nfs path
31 | ---
32 | apiVersion: v1
33 | kind: PersistentVolume
34 | metadata:
35 |   name: chatqna-megaservice-pv-2
36 | spec:
37 |   capacity:
38 |     storage: 100Mi
39 |   accessModes:
40 |     - ReadWriteOnce
41 |   persistentVolumeReclaimPolicy: Retain
42 |   nfs:
43 |     server: x.x.x.x # nfs server
44 |     path: /nfs # nfs path
--------------------------------------------------------------------------------
/playbooks/README.md:
--------------------------------------------------------------------------------
1 | # Intel Technology Enabling Ansible Playbooks
2 |
3 | ## Overview
4 | This directory contains Ansible playbooks designed to automate the deployment and configuration of Intel technologies on Red Hat OpenShift clusters. These playbooks streamline the Intel feature provisioning and validation process on OpenShift environments.
5 |
6 | ## Prerequisites
7 | Before running the playbook, ensure the following prerequisites are met:
8 | - Provisioned RHOCP Cluster
9 | - Red Hat Enterprise Linux (RHEL) system with [Ansible](https://docs.ansible.com/ansible/2.9/installation_guide/intro_installation.html#installing-ansible-on-rhel-centos-or-fedora) installed and configured with a `kubeconfig` to connect to your RHOCP cluster.
10 |
11 | ## Run the Playbook
12 |
13 | To run the Ansible playbooks, clone this repository to your RHEL system and navigate to the repository root.
14 |
15 | ```bash
16 | git clone https://github.com/intel/intel-technology-enabling-for-openshift.git
17 |
18 | cd intel-technology-enabling-for-openshift/
19 |
20 | ansible-playbook playbooks/intel_ocp_provisioning.yaml
21 | ```
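22 |
23 | The playbooks use the Ansible `k8s` module, which resolves the cluster connection from your kubeconfig. To target a specific RHOCP cluster, export `KUBECONFIG` before invoking a playbook (a minimal sketch; the kubeconfig path below is a placeholder):
24 |
25 | ```bash
26 | # Point this shell at the target cluster's kubeconfig (placeholder path)
27 | export KUBECONFIG=$HOME/.kube/config
28 |
29 | ansible-playbook playbooks/intel_ocp_provisioning.yaml
30 | ```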
--------------------------------------------------------------------------------
/playbooks/install_nfd_operator.yaml:
--------------------------------------------------------------------------------
1 | - name: Create namespace for Node Feature Discovery
2 |   k8s:
3 |     state: present
4 |     definition:
5 |       apiVersion: v1
6 |       kind: Namespace
7 |       metadata:
8 |         name: openshift-nfd
9 |
10 | - name: Create operator group for Node Feature Discovery
11 |   k8s:
12 |     state: present
13 |     definition:
14 |       apiVersion: operators.coreos.com/v1
15 |       kind: OperatorGroup
16 |       metadata:
17 |         generateName: openshift-nfd-
18 |         name: openshift-nfd
19 |         namespace: openshift-nfd
20 |       spec:
21 |         targetNamespaces:
22 |           - openshift-nfd
23 |
24 | - name: Subscribe to Node Feature Discovery operator
25 |   k8s:
26 |     state: present
27 |     definition:
28 |       apiVersion: operators.coreos.com/v1alpha1
29 |       kind: Subscription
30 |       metadata:
31 |         name: nfd
32 |         namespace: openshift-nfd
33 |       spec:
34 |         channel: "stable"
35 |         installPlanApproval: Automatic
36 |         name: nfd
37 |         source: redhat-operators
38 |         sourceNamespace: openshift-marketplace
--------------------------------------------------------------------------------
/tests/gaudi/l2/hccl_job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: batch/v1
5 | kind: Job
6 | metadata:
7 |   name: hccl-demo-workload
8 |   namespace: gaudi-validation
9 | spec:
10 |   template:
11 |     metadata:
12 |     spec:
13 |       restartPolicy: Never
14 |       serviceAccountName: hccl-demo-anyuid-sa
15 |       containers:
16 |         - name: hccl-demo-workload
17 |           image: image-registry.openshift-image-registry.svc:5000/gaudi-validation/hccl-demo-workload:1.19.1-26
18 |           workingDir: "/hccl_demo"
19 |           command: ["/bin/bash", "-c", "--"]
20 |           ## sleep for 20 seconds to avoid race condition
21 |           args:
22 |             - |
23 |               sleep 20
24 |               python3 run_hccl_demo.py --nranks 8 --node_id 0 --size 32m --test all_reduce --loop 1000 --ranks_per_node 8
25 |               sleep 20
26 |           env:
27 |             - name: HCCL_COMM_ID
28 |               value: '127.0.0.1:5555'
29 |           resources:
30 |             limits:
31 |               habana.ai/gaudi: 8
32 |           imagePullPolicy: IfNotPresent
33 |
--------------------------------------------------------------------------------
/kmmo/intel-dgpu-on-premise-build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 - 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 |   labels:
8 |     app: intel-dgpu-driver-container-kmmo
9 |   name: intel-dgpu-driver-container-kmmo
10 |   namespace: openshift-kmm
11 | spec: {}
12 | ---
13 | apiVersion: kmm.sigs.x-k8s.io/v1beta1
14 | kind: Module
15 | metadata:
16 |   name: intel-dgpu-on-premise-build
17 |   namespace: openshift-kmm
18 | spec:
19 |   moduleLoader:
20 |     container:
21 |       imagePullPolicy: Always
22 |       modprobe:
23 |         moduleName: i915
24 |         firmwarePath: /firmware
25 |       inTreeModuleToRemove: intel_vsec
26 |       kernelMappings:
27 |         - regexp: '^.*\.x86_64$'
28 |           containerImage: image-registry.openshift-image-registry.svc:5000/openshift-kmm/intel-dgpu-driver-container-kmmo:$KERNEL_FULL_VERSION
29 |           build:
30 |             dockerfileConfigMap:
31 |               name: intel-dgpu-dockerfile-configmap
32 |   selector:
33 |     intel.feature.node.kubernetes.io/gpu: 'true'
34 |     intel.feature.node.kubernetes.io/dgpu-canary: 'true'
35 |
--------------------------------------------------------------------------------
/playbooks/configure_nfd.yaml:
--------------------------------------------------------------------------------
1 | - name: NFD - Create NFD discovery CR
2 |   k8s:
3 |     state: present
4 |     definition: '{{ item }}'
5 |     wait: yes
6 |     wait_condition:
7 |       type: Available
8 |       status: 'False'
9 |   with_items: '{{ lookup("url", "https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-discovery-openshift.yaml", split_lines=False) | from_yaml_all | list }}'
10 |   when: (item is not none)
11 | - name: NFD - Create NFD rules instance CR
12 |   k8s:
13 |     state: present
14 |     definition: '{{ item }}'
15 |     wait: yes
16 |   with_items: '{{ lookup("url", "https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-rules-openshift.yaml", split_lines=False) | from_yaml_all | list }}'
17 |   when: (item is not none)
--------------------------------------------------------------------------------
/tests/l2/dgpu/hwinfo_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 |   name: intel-dgpu-hwinfo
8 |   namespace: intel-dgpu
9 | spec: {}
10 | ---
11 | apiVersion: build.openshift.io/v1
12 | kind: BuildConfig
13 | metadata:
14 |   name: intel-dgpu-hwinfo
15 |   namespace: intel-dgpu
16 | spec:
17 |   triggers:
18 |     - type: "ConfigChange"
19 |     - type: "ImageChange"
20 |   runPolicy: "Serial"
21 |   source:
22 |     type: Dockerfile
23 |     dockerfile: |
24 |       ARG BUILDER=registry.access.redhat.com/ubi9-minimal:latest
25 |       FROM ${BUILDER}
26 |       RUN microdnf -y update && \
27 |           rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
28 |           microdnf install -y hwinfo
29 |       ENTRYPOINT ["hwinfo"]
30 |   strategy:
31 |     type: Docker
32 |     noCache: true
33 |     dockerStrategy:
34 |       buildArgs:
35 |         - name: "BUILDER"
36 |           value: "registry.access.redhat.com/ubi9-minimal:latest"
37 |   output:
38 |     to:
39 |       kind: ImageStreamTag
40 |       name: intel-dgpu-hwinfo:latest
--------------------------------------------------------------------------------
/device_plugins/deploy_sgx.md:
--------------------------------------------------------------------------------
1 | # Create Intel SGX Device Plugin CR
2 |
3 | ## Create a CR via web console
4 | 1. Go to **Operators** -> **Installed Operators**.
5 | 2. Open **Intel Device Plugins Operator**.
6 | 3. Navigate to the **Intel Software Guard Extensions Device Plugin** tab.
7 | 4. Click **Create SgxDevicePlugin**, set the correct parameters, and click **Create**.
8 | 5. Optional: To make any customizations, select the YAML view and edit the details. When you are done, click **Create**.
9 |
10 | ## Verify via web console
11 | 1. Verify the CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-sgx-plugin**.
12 | 2. The `SgxDevicePlugin` is now created.
13 |
14 | ## Create CR via CLI
15 | Apply the CR yaml file:
16 | ```
17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/sgx_device_plugin.yaml
18 | ```
19 |
20 | ## Verify via CLI
21 | Verify that the device plugin CR is ready:
22 | ```
23 | $ oc get SgxDevicePlugin
24 | ```
25 | Output:
26 | ```
27 | NAME DESIRED READY NODE SELECTOR AGE
28 | sgxdeviceplugin-sample 1 1 {"intel.feature.node.kubernetes.io/sgx":"true"} 2m
29 | ```
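30 |
31 | ## Request Intel SGX resources in a workload
32 | Once the plugin reports ready, workloads can consume the advertised EPC resource. Below is a minimal sketch (the pod name and image are illustrative placeholders, not part of this repository) of a pod that requests `sgx.intel.com/epc` from the device plugin:
33 | ```
34 | $ oc apply -f - <<EOF
35 | apiVersion: v1
36 | kind: Pod
37 | metadata:
38 |   name: sgx-epc-demo # illustrative name
39 | spec:
40 |   containers:
41 |     - name: sgx-epc-demo
42 |       image: registry.access.redhat.com/ubi9-minimal:latest # placeholder image
43 |       command: ["sleep", "infinity"]
44 |       resources:
45 |         limits:
46 |           sgx.intel.com/epc: "512Ki" # EPC resource advertised by the SGX device plugin
47 | EOF
48 | ```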
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | # %: Makefile
20 | # 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
22 | # Generate doc site under _build/html with Sphinx.
23 | vhtml: _work/venv/.stamp
24 | 	. _work/venv/bin/activate && \
25 | 	$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
26 | 	cp docs/index.html $(BUILDDIR)/html/index.html
27 |
28 | html:
29 | 	$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
30 | 	cp docs/index.html $(BUILDDIR)/html/index.html
31 |
32 | clean-html:
33 | 	rm -rf $(BUILDDIR)/html
34 |
35 | # Set up a Python3 environment with the necessary tools for document creation.
36 | _work/venv/.stamp: ./requirements.txt
37 | 	rm -rf ${@D}
38 | 	python3 -m venv ${@D}
39 | 	. ${@D}/bin/activate && pip install wheel && pip install -r $<
40 | 	touch $@
--------------------------------------------------------------------------------
/tests/l2/sgx/README.md:
--------------------------------------------------------------------------------
1 | ### Verify Intel® Software Guard Extensions (Intel® SGX) Provisioning
2 | This [SampleEnclave](https://github.com/intel/linux-sgx/tree/master/SampleCode/SampleEnclave) application workload from the Intel SGX SDK runs an Intel SGX enclave utilizing the EPC resource from the Intel SGX provisioning.
3 |
4 | * Build the container image.
5 | ```
6 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/sgx/sgx_imagestream.yaml
7 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/sgx/sgx_build.yaml
8 | ```
9 |
10 | * Deploy and run the workload.
11 | ```
12 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/sgx/sgx_job.yaml
13 | ```
14 |
15 | * Check the results.
16 | ```
17 | $ oc get pods -n intel-sgx
18 | intel-sgx-job-4tnh5 0/1 Completed 0 2m10s
19 | intel-sgx-workload-1-build 0/1 Completed 0 30s
20 | ```
21 | ```
22 | $ oc logs intel-sgx-job-4tnh5 -n intel-sgx
23 | Checksum(0x0x7fffac6f41e0, 100) = 0xfffd4143
24 | Info: executing thread synchronization, please wait...
25 | Info: SampleEnclave successfully returned.
26 | Enter a character before exit ...
27 | ```
28 | ## See Also
29 | For Intel SGX demos on vanilla Kubernetes, refer to the [Intel Device Plugins for Kubernetes SGX SDK demo](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/sgx-sdk-demo).
30 |
--------------------------------------------------------------------------------
/gaudi/gaudi_spi_fw_upgrade_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 |   name: gaudi-spi-fw-upgrade
8 |   namespace: gaudi-spi-fw-upgrade
9 | ---
10 | kind: BuildConfig
11 | apiVersion: build.openshift.io/v1
12 | metadata:
13 |   name: gaudi-spi-fw-upgrade
14 |   namespace: gaudi-spi-fw-upgrade
15 | spec:
16 |   output:
17 |     to:
18 |       kind: ImageStreamTag
19 |       name: gaudi-spi-fw-upgrade:1.19.1-26
20 |   strategy:
21 |     type: Docker
22 |   source:
23 |     type: Dockerfile
24 |     dockerfile: >+
25 |       ARG
26 |       BUILDER=vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26
27 |
28 |       FROM ${BUILDER} AS builder
29 |
30 |       RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
31 |       echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
32 |       echo "baseurl=https://vault.habana.ai/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \
33 |       echo "gpgkey=https://vault.habana.ai/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \
34 |       echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo
35 |
36 |       RUN yum makecache && dnf install -y habanalabs-firmware-odm
37 |   triggers:
38 |     - type: ConfigChange
39 |   runPolicy: Serial
--------------------------------------------------------------------------------
/tests/l2/README.md:
--------------------------------------------------------------------------------
1 | # Verifying Intel Hardware Feature Provisioning
2 | ## Introduction
3 | After provisioning Intel hardware features on RHOCP, the respective hardware resources are exposed to the RHOCP cluster, and workload containers can request them. The following sample workloads help verify that these resources can be used as expected. The container images for these sample workloads are built and packaged on-premises through [RHOCP BuildConfig](https://docs.openshift.com/container-platform/4.14/cicd/builds/understanding-buildconfigs.html) and pushed to the cluster's internal registry through [RHOCP ImageStream](https://docs.openshift.com/container-platform/4.14/openshift_images/image-streams-manage.html).
4 |
5 | ## Prerequisites
6 | • Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster).
7 |
8 | • Provisioned Intel HW features on RHOCP. Follow steps [here](/README.md#provisioning-intel-hardware-features-on-rhocp).
9 |
10 | ### Verify Intel® Data Center GPU provisioning
11 | Please refer to the Intel Data Center GPU provisioning validation tests [here](dgpu/README.md).
12 |
13 | ### Verify Intel® Software Guard Extensions (Intel® SGX) Provisioning
14 | Please refer to the Intel SGX provisioning validation tests [here](sgx/README.md).
15 |
16 | ### Verify Intel® QuickAssist Technology provisioning
17 | Please refer to the Intel QAT provisioning validation tests [here](qat/README.md).
18 |
19 | ### Verify Intel® Data Streaming Accelerator provisioning
20 | Please refer to the Intel DSA provisioning validation tests [here](dsa/README.md).
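21 | 
22 | As a quick sanity check of the BuildConfig/ImageStream flow described above, the on-cluster build and the resulting image can be inspected with standard `oc` commands (`intel-qat` below is just an example namespace; substitute the namespace used by the chosen test):
23 | ```
24 | $ oc get builds -n intel-qat
25 | $ oc get imagestreams -n intel-qat
26 | ```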
--------------------------------------------------------------------------------
/workloads/opea/chatqna/tgi_gaudi_servingruntime.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: serving.kserve.io/v1alpha1
5 | kind: ServingRuntime
6 | metadata:
7 | name: tgi-gaudi-serving-runtime
8 | spec:
9 | containers:
10 | - name: kserve-container
11 | image: ghcr.io/huggingface/tgi-gaudi:1.2.1
12 | args:
13 | - --model-id
14 | - /mnt/models/
15 | - --port=8080
16 |     - --num-shard=3 # number of Gaudi cards to shard across
17 | - --sharded=true
18 | - --json-output
19 | env: #Add variables according to the chosen model
20 | - name: HF_HOME
21 | value: /tmp/hf_home
22 | - name: HF_OFFLINE
23 | value: "1"
24 | - name: TRANSFORMERS_OFFLINE
25 | value: "1"
26 | - name: HF_HUB_CACHE
27 | value: /mnt/models
28 | - name: HUGGING_FACE_HUB_TOKEN
29 | valueFrom:
30 | secretKeyRef:
31 | key: HUGGING_FACE_HUB_TOKEN
32 | name: hf-token
33 | resources:
34 | limits:
35 | cpu: "16"
36 | memory: 128Gi
37 | requests:
38 | cpu: "16"
39 | memory: 128Gi
40 | readinessProbe:
41 | exec:
42 | command:
43 | - curl
44 | - localhost:8080/health
45 | initialDelaySeconds: 500
46 | livenessProbe:
47 | exec:
48 | command:
49 | - curl
50 | - localhost:8080/health
51 | initialDelaySeconds: 500
52 | ports:
53 | - containerPort: 8080
54 | protocol: TCP
55 | multiModel: false
56 | supportedModelFormats:
57 | - autoSelect: true
58 | name: llm
--------------------------------------------------------------------------------
/tests/l2/qat/qatlib_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 | name: intel-qat-workload
8 | namespace: intel-qat
9 | spec: {}
10 | ---
11 | apiVersion: build.openshift.io/v1
12 | kind: BuildConfig
13 | metadata:
14 | name: intel-qat-workload
15 | namespace: intel-qat
16 | spec:
17 | triggers:
18 | - type: "ConfigChange"
19 | - type: "ImageChange"
20 | runPolicy: "Serial"
21 | source:
22 | type: Dockerfile
23 | dockerfile: |
24 |
25 | ARG BUILDER=registry.access.redhat.com/ubi9:latest
26 | FROM ${BUILDER}
27 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \
28 | subscription-manager attach --auto && \
29 | dnf repolist --disablerepo=* && \
30 | subscription-manager repos --enable codeready-builder-for-rhel-9-x86_64-rpms && \
31 | dnf -y update && \
32 | dnf install -y qatlib qatlib-tests
33 | strategy:
34 | type: Docker
35 | noCache: true
36 | dockerStrategy:
37 | buildArgs:
38 | - name: "BUILDER"
39 | value: "registry.access.redhat.com/ubi9:latest"
40 | env:
41 | - name: "USERNAME"
42 | valueFrom:
43 | secretKeyRef:
44 | key: username
45 | name: rh-auth
46 | - name: "PASSWORD"
47 | valueFrom:
48 | secretKeyRef:
49 | key: password
50 | name: rh-auth
51 |
52 | output:
53 | to:
54 | kind: ImageStreamTag
55 | name: intel-qat-workload:latest
--------------------------------------------------------------------------------
/workloads/opea/chatqna/chatqna_megaservice_buildconfig.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 | name: chatqna-megaservice
8 | namespace: opea-chatqna
9 | spec: {}
10 | ---
11 | apiVersion: build.openshift.io/v1
12 | kind: BuildConfig
13 | metadata:
14 | name: chatqna-megaservice
15 | namespace: opea-chatqna
16 | spec:
17 | triggers:
18 | - type: "ConfigChange"
19 | - type: "ImageChange"
20 | runPolicy: "Serial"
21 | source:
22 | dockerfile: |
23 | FROM langchain/langchain:latest
24 |
25 | RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
26 | libgl1-mesa-glx \
27 | libjemalloc-dev
28 |
29 | RUN useradd -m -s /bin/bash user && \
30 | mkdir -p /home/user && \
31 | chown -R user /home/user/
32 |
33 | USER user
34 | COPY requirements.txt /tmp/requirements.txt
35 |
36 | USER root
37 | COPY tls.crt /rhoai-ca/tls.crt
38 | RUN cat /rhoai-ca/tls.crt | tee -a '/usr/lib/ssl/cert.pem'
39 |
40 | USER user
41 | RUN pip install --no-cache-dir --upgrade pip && \
42 | pip install --no-cache-dir -r /tmp/requirements.txt
43 |
44 | ENV PYTHONPATH=$PYTHONPATH:/ws:/home/user:/home/user/qna-app/app
45 |
46 | WORKDIR /home/user/qna-app
47 | COPY qna-app /home/user/qna-app
48 |
49 | ENTRYPOINT ["/usr/bin/sleep", "infinity"]
53 | strategy:
54 | type: Docker
55 | dockerStrategy: {}
56 | postCommit: {}
57 | output:
58 | to:
59 | kind: ImageStreamTag
60 | name: chatqna-megaservice:latest
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # Test Plan
2 |
3 | ```{admonition} Disclaimer
4 | Please note that this module is currently under development and may contain partially implemented features, therefore it is not supported in the current release.
5 | ```
6 |
7 | ## Overview
8 | To ensure the whole stack works as expected and to track bugs, a layer-based test architecture is needed on OCP. This plan consists of four layers. The first and second layers described below will be part of the future automation testing framework for each OCP (x.y.z) release.
9 |
10 | ### L1 First Layer: Host OS and Driver Interfaces
11 | Layer 1 test cases should be executed before deploying the [Intel Device Plugins Operator](/device_plugins/README.md) and after deploying OOT drivers like [Intel Data Center GPU Driver on OpenShift](/kmmo/README.md). It includes test cases:
12 | * to check the existence of in-tree and out-of-tree (OOT) drivers
13 | * for SELinux and host OS security
14 | * to check for devices on all nodes
15 |
16 | ### L2 Second Layer: Device Plugin Resource Provisioning
17 | L2 test cases are executed after deploying the [Intel Device Plugins Operator](/device_plugins/README.md). Refer to [readme](l2/README.md). It includes:
18 | * Pod's resource allocation and scheduling
19 | * Simple workloads
20 | * Boundary testing for the resources
21 | * Future plan for any failure analysis needed during automation.
22 |
23 | ### L3 Third Layer: E2E solution
24 | L3 test cases are executed after the specific device plugin related [e2e solution](/e2e) has been deployed. Please refer to [L3 test cases](l3/README.md) for detail.
25 |
26 | ### L4 Fourth Layer: Reference Workloads
27 | This layer includes the reference final application/use case for the user. It integrates the whole stack and is custom for each Intel hardware feature and device plugin. This layer will be added in upcoming releases.
--------------------------------------------------------------------------------
/workloads/opea/chatqna/redis_deployment_service.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: v1
5 | kind: ServiceAccount
6 | metadata:
7 | name: opea-chatqna
8 | namespace: opea-chatqna
9 | ---
10 | apiVersion: rbac.authorization.k8s.io/v1
11 | kind: Role
12 | metadata:
13 | name: opea-chatqna
14 | namespace: opea-chatqna
15 | rules:
16 | - apiGroups:
17 | - security.openshift.io
18 | resources:
19 | - securitycontextconstraints
20 | resourceNames:
21 | - anyuid
22 | verbs:
23 | - use
24 | ---
25 | apiVersion: rbac.authorization.k8s.io/v1
26 | kind: RoleBinding
27 | metadata:
28 |   name: opea-chatqna
29 |   namespace: opea-chatqna
29 | roleRef:
30 | apiGroup: rbac.authorization.k8s.io
31 | kind: Role
32 | name: opea-chatqna
33 | subjects:
34 | - kind: ServiceAccount
35 | name: opea-chatqna
36 | namespace: opea-chatqna
37 | ---
38 | # Redis Vector DB deployment
39 | apiVersion: apps/v1
40 | kind: Deployment
41 | metadata:
42 | name: redis-vector-db
43 | namespace: opea-chatqna
44 | spec:
45 | replicas: 1
46 | selector:
47 | matchLabels:
48 | app: redis-vector-db
49 | template:
50 | metadata:
51 | labels:
52 | app: redis-vector-db
53 | spec:
54 | serviceAccount: opea-chatqna
55 | containers:
56 | - name: redis-vector-db
57 | image: redis/redis-stack:7.2.0-v9
58 | ports:
59 | - containerPort: 6379
60 | - containerPort: 8001
61 | ---
62 | # Redis Vector DB Service
63 | apiVersion: v1
64 | kind: Service
65 | metadata:
66 | name: redis-vector-db
67 | namespace: opea-chatqna
68 | spec:
69 | type: ClusterIP
70 | selector:
71 | app: redis-vector-db
72 | ports:
73 | - name: redis-service
74 | port: 6379
75 | targetPort: 6379
76 | - name: redis-insight
77 | port: 8001
78 | targetPort: 8001
--------------------------------------------------------------------------------
/gaudi/Gaudi-SPI-Firmware-Upgrade.md:
--------------------------------------------------------------------------------
1 | # Upgrade Intel Gaudi SPI Firmware
2 | To upgrade the Intel Gaudi SPI firmware, follow the steps below:
3 |
4 | **NOTE:** Currently this is only supported on a Single Node OpenShift cluster. Multi-node cluster support will be added in the future.
5 |
6 | ## Prerequisites
7 | - Make sure the Gaudi drivers are unloaded.
8 | - On Red Hat OpenShift, delete the existing `ClusterPolicy` Custom Resource to unload the drivers. Verify on the node using the command below; it should return no output:
9 | ```
10 | lsmod | grep habana
11 | ```
12 | - Check the firmware version following the [firmware version check](https://github.com/intel/intel-technology-enabling-for-openshift/tree/main/tests/gaudi/l2#check-firmware-version-with-hl-smi).
13 |
14 | ## SPI Firmware Upgrade
15 | Build the container image with `habanalabs-firmware-odm` tool:
16 | ```
17 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_spi_fw_upgrade_build.yaml
18 | ```
19 | Create service account with required permissions:
20 | ```
21 | oc create sa gaudi-fw-upgrade-sa -n gaudi-spi-fw-upgrade
22 | oc adm policy add-scc-to-user privileged -z gaudi-fw-upgrade-sa -n gaudi-spi-fw-upgrade
23 | ```
24 | Deploy and execute the SPI firmware upgrade tool:
25 | ```
26 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_spi_fw_upgrade_job.yaml
27 | ```
28 |
29 | Verify the output:
30 | ```
31 | oc get pods
32 |
33 | NAME READY STATUS RESTARTS AGE
34 | gaudi-spi-firmware-upgrade-ndmjp 0/1 Completed 0 10m
35 | ```
36 | ```
37 | oc logs gaudi-spi-firmware-upgrade-ndmjp
38 | .
39 | .
40 | ####
41 | #### Finished sending firmware: OK
42 | ```
43 | Verify the upgraded version by following the [firmware version check](https://github.com/intel/intel-technology-enabling-for-openshift/tree/main/tests/gaudi/l2#check-firmware-version-with-hl-smi).
--------------------------------------------------------------------------------
/tests/l2/dsa/dsa_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: build.openshift.io/v1
5 | kind: BuildConfig
6 | metadata:
7 | name: intel-dsa-workload
8 | namespace: intel-dsa
9 | spec:
10 | triggers:
11 | - type: "ConfigChange"
12 | - type: "ImageChange"
13 | runPolicy: "Serial"
14 | source:
15 | type: Dockerfile
16 | dockerfile: |
17 |
18 | ARG BUILDER=registry.access.redhat.com/ubi9:latest
19 | FROM ${BUILDER}
20 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \
21 | subscription-manager attach --auto && \
22 | dnf repolist --disablerepo=* && \
23 | subscription-manager repos --enable rhel-9-for-x86_64-baseos-rpms --enable codeready-builder-for-rhel-9-x86_64-rpms && \
24 | dnf -y update && \
25 |       dnf install -y gcc gcc-c++ make cmake autoconf automake libtool pkg-config \
26 | git asciidoc xmlto libuuid-devel json-c-devel zlib-devel openssl-devel \
27 | pciutils accel-config
28 | RUN git clone -b accel-config-v4.1.8 https://github.com/intel/idxd-config && \
29 | cd idxd-config && ./autogen.sh && ./configure CFLAGS='-g -O2' --prefix=/usr \
30 | --sysconfdir=/etc --libdir=/usr/lib64 --enable-test=yes && make && make install
31 | strategy:
32 | type: Docker
33 | noCache: true
34 | dockerStrategy:
35 | buildArgs:
36 | - name: "BUILDER"
37 | value: "registry.access.redhat.com/ubi9:latest"
38 | env:
39 | - name: "USERNAME"
40 | valueFrom:
41 | secretKeyRef:
42 | key: username
43 | name: rh-auth
44 | - name: "PASSWORD"
45 | valueFrom:
46 | secretKeyRef:
47 | key: password
48 | name: rh-auth
49 | output:
50 | to:
51 | kind: ImageStreamTag
52 | name: intel-dsa-workload:latest
--------------------------------------------------------------------------------
/tests/l2/dgpu/clinfo_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 - 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 | name: intel-dgpu-clinfo
8 | namespace: intel-dgpu
9 | spec: {}
10 | ---
11 | apiVersion: build.openshift.io/v1
12 | kind: BuildConfig
13 | metadata:
14 | name: intel-dgpu-clinfo
15 | namespace: intel-dgpu
16 | spec:
17 | triggers:
18 | - type: "ConfigChange"
19 | - type: "ImageChange"
20 | runPolicy: "Serial"
21 | source:
22 | type: Dockerfile
23 | dockerfile: |
24 | ARG BUILDER=registry.access.redhat.com/ubi9-minimal:latest
25 | FROM ${BUILDER}
26 |
27 | ARG OCL_ICD_VERSION=ocl-icd-2.2.13-4.el9.x86_64
28 | ARG CLINFO_VERSION=clinfo-3.0.21.02.21-4.el9.x86_64
29 |
30 | RUN microdnf install -y \
31 | glibc \
32 | yum-utils
33 |
34 | # install intel-opencl, ocl-icd and clinfo
35 | RUN dnf install -y 'dnf-command(config-manager)' && \
36 | dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/9.0/lts/2350/unified/intel-gpu-9.0.repo && \
37 | dnf install -y intel-opencl \
38 | https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os/Packages/$OCL_ICD_VERSION.rpm \
39 | https://dl.fedoraproject.org/pub/epel/9/Everything/x86_64/Packages/c/$CLINFO_VERSION.rpm && \
40 | dnf clean all && dnf autoremove && rm -rf /var/lib/dnf/lists/* && \
41 | rm -rf /etc/yum.repos.d/intel-graphics.repo
42 | strategy:
43 | type: Docker
44 | noCache: true
45 | dockerStrategy:
46 | buildArgs:
47 | - name: "BUILDER"
48 | value: "registry.access.redhat.com/ubi9-minimal:latest"
49 | - name: "OCL_ICD_VERSION"
50 | value: "ocl-icd-2.2.13-4.el9.x86_64"
51 | - name: "CLINFO_VERSION"
52 | value: "clinfo-3.0.21.02.21-4.el9.x86_64"
53 | output:
54 | to:
55 | kind: ImageStreamTag
56 | name: intel-dgpu-clinfo:latest
--------------------------------------------------------------------------------
/nfd/node-feature-rules-openshift.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 - 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: nfd.openshift.io/v1alpha1
5 | kind: NodeFeatureRule
6 | metadata:
7 | name: intel-dp-devices
8 | namespace: openshift-nfd
9 | spec:
10 | rules:
11 | - name: "intel.gpu"
12 | labels:
13 | "intel.feature.node.kubernetes.io/gpu": "true"
14 | matchFeatures:
15 | - feature: pci.device
16 | matchExpressions:
17 | vendor: {op: In, value: ["8086"]}
18 | class: {op: In, value: ["0300", "0380"]}
19 |
20 | - name: "intel.qat"
21 | labels:
22 | "intel.feature.node.kubernetes.io/qat": "true"
23 | matchFeatures:
24 | - feature: pci.device
25 | matchExpressions:
26 | vendor: {op: In, value: ["8086"]}
27 | device: {op: In, value: ["4940", "4942", "4944"]}
28 | class: {op: In, value: ["0b40"]}
29 | - feature: kernel.loadedmodule
30 | matchExpressions:
31 | intel_qat: {op: Exists}
32 |
33 | - name: "intel.sgx"
34 | labels:
35 | "intel.feature.node.kubernetes.io/sgx": "true"
36 | extendedResources:
37 | sgx.intel.com/epc: "@cpu.security.sgx.epc"
38 | matchFeatures:
39 | - feature: cpu.cpuid
40 | matchExpressions:
41 | SGX: {op: Exists}
42 | SGXLC: {op: Exists}
43 | - feature: cpu.security
44 | matchExpressions:
45 | sgx.enabled: {op: IsTrue}
46 | - feature: kernel.config
47 | matchExpressions:
48 | X86_SGX: {op: Exists}
49 | - name: "intel.dsa"
50 | labels:
51 | "intel.feature.node.kubernetes.io/dsa": "true"
52 | matchFeatures:
53 | - feature: pci.device
54 | matchExpressions:
55 | vendor: {op: In, value: ["8086"]}
56 | device: {op: In, value: ["0b25"]}
57 | class: {op: In, value: ["0880"]}
58 | - feature: kernel.loadedmodule
59 | matchExpressions:
60 | idxd: {op: Exists}
61 |
--------------------------------------------------------------------------------
/device_plugins/deploy_gpu.md:
--------------------------------------------------------------------------------
1 | # Create Intel GPU Device Plugin CR
2 |
3 | ## Create CR via web console
4 | 1. Go to **Operator** -> **Installed Operators**.
5 | 2. Open **Intel Device Plugins Operator**.
6 | 3. Navigate to tab **Intel GPU Device Plugin**.
7 | 4. Click **Create GpuDevicePlugin** -> set correct parameters -> Click **Create**.
8 | 5. Optional: If you want to make any customizations, select YAML view and edit the details. Once you are done, click **Create**.
9 |
10 | ## Verify via web console
11 | 1. Verify CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-gpu-plugin**.
12 | 2. The `GpuDevicePlugin` CR is now created.
13 |
14 | ## Create CR via CLI
15 | Apply the CR yaml file:
16 | ```
17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/gpu_device_plugin.yaml
18 | ```
19 |
20 | ## Verify via CLI
21 | Verify that the device plugin CR is ready:
22 | ```
23 | $ oc get GpuDevicePlugin
24 | ```
25 | Output:
26 | ```
27 | NAME DESIRED READY NODE SELECTOR AGE
28 | gpudeviceplugin-sample 1 1 {"intel.feature.node.kubernetes.io/gpu":"true"} 3m12s
29 | ```
30 |
31 | # Using Intel Data Center GPU resource exclusively
32 | In this release, we have verified and support only a single Intel GPU `i915` resource dedicated to a single workload pod. To achieve this, we set `sharedDevNum: 1` and `preferredAllocationPolicy: none` as the default options.
33 | As the cluster administrator, use the [gpu_device_plugin.yaml](/device_plugins/gpu_device_plugin.yaml) provided in the section Create CR via CLI above, or use the default options in Create CR via web console.
34 | As the application owner, when claiming the `i915` resource, make sure the resource limits and requests are set as shown below:
35 | ```
36 | spec:
37 | containers:
38 | - name: gpu-pod
39 | resources:
40 | limits:
41 | gpu.intel.com/i915: 1
42 | requests:
43 | gpu.intel.com/i915: 1
44 | ```
45 | For more details, please refer to this [issue](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/1408).
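46 | 
47 | For illustration only, a complete pod claiming the resource could look like the sketch below; the image and command are placeholders, not part of this project:
48 | ```
49 | apiVersion: v1
50 | kind: Pod
51 | metadata:
52 |   name: gpu-smoke-test
53 | spec:
54 |   restartPolicy: Never
55 |   containers:
56 |     - name: gpu-pod
57 |       image: registry.access.redhat.com/ubi9:latest
58 |       command: ["ls", "/dev/dri"]
59 |       resources:
60 |         limits:
61 |           gpu.intel.com/i915: 1
62 |         requests:
63 |           gpu.intel.com/i915: 1
64 | ```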
--------------------------------------------------------------------------------
/tests/l2/dgpu/vainfo_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 | name: intel-dgpu-vainfo
8 | namespace: intel-dgpu
9 | spec: {}
10 | ---
11 | apiVersion: build.openshift.io/v1
12 | kind: BuildConfig
13 | metadata:
14 | name: intel-dgpu-vainfo
15 | namespace: intel-dgpu
16 | spec:
17 | triggers:
18 | - type: "ConfigChange"
19 | - type: "ImageChange"
20 | runPolicy: "Serial"
21 | source:
22 | type: Dockerfile
23 | dockerfile: |
24 | ARG BUILDER=registry.access.redhat.com/ubi9:latest
25 | FROM ${BUILDER}
26 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \
27 | subscription-manager attach --auto && \
28 | subscription-manager repos --enable rhel-9-for-x86_64-appstream-rpms && \
29 | dnf -y update && \
30 | dnf install -y flex bison gcc gcc-c++ make autoconf libtool cmake git gdb \
31 | libva libva-devel libdrm-devel
32 | RUN dnf install -y 'dnf-command(config-manager)' && \
33 | dnf config-manager --add-repo \
34 | https://repositories.intel.com/gpu/rhel/9.2/lts/2350/unified/intel-gpu-9.2.repo
35 | RUN dnf -y update && \
36 | dnf install -y libva-utils intel-gmmlib-devel
37 | RUN git clone -b intel-media-23.4.3 --single-branch https://github.com/intel/media-driver.git && \
38 |       cd media-driver && mkdir build && cd build && \
39 | cmake -D ENABLE_PRODUCTION_KMD=ON ../ && make -j $(nproc) && make install
40 | ENTRYPOINT ["/bin/sh"]
41 | strategy:
42 | type: Docker
43 | noCache: true
44 | dockerStrategy:
45 | buildArgs:
46 | - name: "BUILDER"
47 | value: "registry.access.redhat.com/ubi9:latest"
48 | env:
49 | - name: "USERNAME"
50 | valueFrom:
51 | secretKeyRef:
52 | key: username
53 | name: rh-auth
54 | - name: "PASSWORD"
55 | valueFrom:
56 | secretKeyRef:
57 | key: password
58 | name: rh-auth
59 | output:
60 | to:
61 | kind: ImageStreamTag
62 | name: intel-dgpu-vainfo:latest
--------------------------------------------------------------------------------
/tests/gaudi/l2/vllm_deployment.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | kind: PersistentVolumeClaim
5 | apiVersion: v1
6 | metadata:
7 | name: vllm-workload-pvc
8 | namespace: gaudi-validation
9 | spec:
10 | accessModes:
11 | - ReadWriteOnce
12 | resources:
13 | requests:
14 | storage: 60Gi
15 | storageClassName: "" # Add your storage class
16 | volumeMode: Filesystem
17 | ---
18 | apiVersion: apps/v1
19 | kind: Deployment
20 | metadata:
21 | name: vllm-workload
22 | namespace: gaudi-validation
23 | labels:
24 | app: vllm-workload
25 | spec:
26 | replicas: 1
27 | selector:
28 | matchLabels:
29 | app: vllm-workload
30 | template:
31 | metadata:
32 | labels:
33 | app: vllm-workload
34 | spec:
35 | containers:
36 | - name: vllm-container
37 | image: image-registry.openshift-image-registry.svc:5000/gaudi-validation/vllm-workload:latest
38 | command: [ "/bin/bash", "-c", "--" ]
39 | args: ["vllm serve meta-llama/Llama-3.1-8B"] # Add the model
40 | ports:
41 | - containerPort: 8000
42 | resources:
43 | limits:
44 | habana.ai/gaudi: 1
45 | env:
46 | - name: HF_TOKEN
47 | valueFrom:
48 | secretKeyRef:
49 | name: hf-token
50 | key: hf-token
51 | - name: HF_HOME
52 | value: /home/vllm/.cache/huggingface
53 | - name: HF_HUB_OFFLINE
54 | value: "0"
55 | imagePullPolicy: Always
56 | volumeMounts:
57 | - name: hf-cache
58 | mountPath: /home/vllm/.cache
59 | - name: shm
60 | mountPath: /dev/shm
61 |           livenessProbe:
62 |             httpGet:
63 |               path: /health
64 |               port: 8000
65 |             initialDelaySeconds: 60
66 |             periodSeconds: 10
67 |           readinessProbe:
68 |             httpGet:
69 |               path: /health
70 |               port: 8000
71 |             initialDelaySeconds: 60
72 |             periodSeconds: 5
73 |       volumes:
74 |         - name: hf-cache
75 |           persistentVolumeClaim:
76 |             claimName: vllm-workload-pvc
77 |         - name: shm
78 |           emptyDir:
79 |             medium: Memory
80 |             sizeLimit: "2Gi"
81 |
--------------------------------------------------------------------------------
/tests/l2/dgpu/intelvpl_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 | name: intel-dgpu-intelvpl
8 | namespace: intel-dgpu
9 | spec: {}
10 | ---
11 | apiVersion: build.openshift.io/v1
12 | kind: BuildConfig
13 | metadata:
14 | name: intel-dgpu-intelvpl
15 | namespace: intel-dgpu
16 | spec:
17 | triggers:
18 | - type: "ConfigChange"
19 | - type: "ImageChange"
20 | runPolicy: "Serial"
21 | source:
22 | type: Dockerfile
23 | dockerfile: |
24 | ARG BUILDER=registry.access.redhat.com/ubi9:latest
25 | FROM ${BUILDER}
26 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \
27 | subscription-manager attach --auto && \
28 | subscription-manager repos --enable rhel-9-for-x86_64-appstream-rpms && \
29 | dnf -y update && \
30 | dnf install -y flex bison gcc gcc-c++ make autoconf libtool cmake git gdb \
31 | libva libva-devel libdrm libdrm-devel
32 | RUN dnf install -y 'dnf-command(config-manager)' && \
33 | dnf config-manager --add-repo \
34 | https://repositories.intel.com/gpu/rhel/9.2/lts/2350/unified/intel-gpu-9.2.repo
35 | RUN dnf -y update && \
36 | dnf install -y libva-utils intel-gmmlib-devel libvpl2 libvpl-devel libvpl-tools \
37 | libmfx
38 | RUN git clone -b intel-media-23.4.3 --single-branch https://github.com/intel/media-driver.git && \
39 |       cd media-driver && mkdir build && cd build && \
40 | cmake -D ENABLE_PRODUCTION_KMD=ON ../ && make -j $(nproc) && make install
41 | # we need this for testing samples
42 | RUN git clone https://github.com/intel/libvpl.git
43 | ENTRYPOINT ["/bin/sh"]
44 | strategy:
45 | type: Docker
46 | noCache: true
47 | dockerStrategy:
48 | buildArgs:
49 | - name: "BUILDER"
50 | value: "registry.access.redhat.com/ubi9:latest"
51 | env:
52 | - name: "USERNAME"
53 | valueFrom:
54 | secretKeyRef:
55 | key: username
56 | name: rh-auth
57 | - name: "PASSWORD"
58 | valueFrom:
59 | secretKeyRef:
60 | key: password
61 | name: rh-auth
62 | output:
63 | to:
64 | kind: ImageStreamTag
65 | name: intel-dgpu-intelvpl:latest
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | ### License
4 |
5 | Intel Technology Enabling For OpenShift project is licensed under the terms in [Apache-2.0](LICENSE.txt). By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms.
6 |
7 | ### Sign your work
8 |
9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify
10 | the below (from [developercertificate.org](http://developercertificate.org/)):
11 |
12 | ```
13 | Developer Certificate of Origin
14 | Version 1.1
15 |
16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
17 | 660 York Street, Suite 102,
18 | San Francisco, CA 94110 USA
19 |
20 | Everyone is permitted to copy and distribute verbatim copies of this
21 | license document, but changing it is not allowed.
22 |
23 | Developer's Certificate of Origin 1.1
24 |
25 | By making a contribution to this project, I certify that:
26 |
27 | (a) The contribution was created in whole or in part by me and I
28 | have the right to submit it under the open source license
29 | indicated in the file; or
30 |
31 | (b) The contribution is based upon previous work that, to the best
32 | of my knowledge, is covered under an appropriate open source
33 | license and I have the right under that license to submit that
34 | work with modifications, whether created in whole or in part
35 | by me, under the same open source license (unless I am
36 | permitted to submit under a different license), as indicated
37 | in the file; or
38 |
39 | (c) The contribution was provided directly to me by some other
40 | person who certified (a), (b) or (c) and I have not modified
41 | it.
42 |
43 | (d) I understand and agree that this project and the contribution
44 | are public and that a record of the contribution (including all
45 | personal information I submit with it, including my sign-off) is
46 | maintained indefinitely and may be redistributed consistent with
47 | this project or the open source license(s) involved.
48 | ```
49 |
50 | Then you just add a line to every git commit message:
51 |
52 | Signed-off-by: Joe Smith
53 |
54 | Use your real name (sorry, no pseudonyms or anonymous contributions).
55 |
56 | If you set your `user.name` and `user.email` git configs, you can sign your
57 | commit automatically with `git commit -s`.
58 |
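59 | For example (the name and email are placeholders):
60 | ```
61 | $ git config user.name "Joe Smith"
62 | $ git config user.email "joe.smith@example.com"
63 | $ git commit -s
64 | ```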
--------------------------------------------------------------------------------
/nfd/README.md:
--------------------------------------------------------------------------------
1 | # Setting up Node Feature Discovery
2 | [Node Feature Discovery (NFD) Operator](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/psap-node-feature-discovery-operator) manages the deployment and lifecycle of the NFD add-on to detect hardware features and system configuration, such as PCI cards, kernel, operating system version, etc.
3 |
4 | ## Prerequisites
5 | - Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster).
6 |
7 | ## Install NFD Operator
8 | Follow the guide below to install the NFD operator using CLI or web console.
9 | - [Install from the CLI](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/psap-node-feature-discovery-operator#install-operator-cli_psap-node-feature-discovery-operator)
10 | - [Install from the web console](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/psap-node-feature-discovery-operator#install-operator-web-console_psap-node-feature-discovery-operator)
11 |
12 | ## Configure NFD Operator
13 | Note: As RHOCP cluster administrator, you might need to merge the NFD operator config from the following Custom Resources (CRs) with other NFD operator configs that are already applied on your cluster.
14 |
15 | 1. Create `NodeFeatureDiscovery` CR instance.
16 | ```
17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-discovery-openshift.yaml
18 | ```
19 |
20 | 2. Create `NodeFeatureRule` CR instance.
21 | ```
22 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-rules-openshift.yaml
23 | ```
24 |
25 | ## Verification
26 | Use the following command to get the node names:
27 | ```
28 | $ oc get nodes
29 | ```
30 | Use the command shown below to verify whether the nodes are labeled properly by NFD:
31 | ```
32 | $ oc describe node | grep intel.feature.node.kubernetes.io
33 | ```
34 | Example output:
35 | ```
36 | intel.feature.node.kubernetes.io/dgpu-canary=true
37 | intel.feature.node.kubernetes.io/gpu=true
38 | ```
39 |
40 | ## Labels Table
41 | | Label | Intel hardware feature |
42 | | ----- | ---------------------- |
43 | | `intel.feature.node.kubernetes.io/gpu=true` | Intel® Data Center GPU Flex Series or Intel® Data Center GPU Max Series |
44 | | `intel.feature.node.kubernetes.io/sgx=true` | Intel® SGX |
45 | | `intel.feature.node.kubernetes.io/qat=true` | Intel® QAT |
46 | | `intel.feature.node.kubernetes.io/dsa=true` | Intel® DSA |
47 |
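48 | In addition to the labels above, the `NodeFeatureRule` applied earlier advertises the SGX EPC size as an extended resource (`sgx.intel.com/epc`) on SGX-capable nodes; it can be checked with:
49 | ```
50 | $ oc describe node | grep sgx.intel.com/epc
51 | ```
52 | 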
48 | ## See Also
49 |
--------------------------------------------------------------------------------
/docs/releases.rst:
--------------------------------------------------------------------------------
1 | Release Information
2 | ===================
3 | .. list-table::
4 | :align: left
5 | :widths: 15 10 10 10 10 10 10 10 10 10 10 10 10 10
6 |
7 | * - **Release**
8 | - `1.6.1 `_
9 | - `1.6.0 `_
10 | - `1.5.2 `_
11 | - `1.5.1 `_
12 | - `1.5.0 `_
13 | - `1.4.0 `_
14 | - `1.3.1 `_
15 | - `1.3.0 `_
16 | - `1.2.1 `_
17 | - `1.2.0 `_
18 | - `1.1.0 `_
19 | - `1.0.1 `_
20 | - `1.0.0 `_
21 | * - **Red Hat OpenShift Version**
22 | - 4.18
23 | - 4.18
24 | - 4.17
25 | - 4.17
26 | - 4.17
27 | - 4.16
28 | - 4.14
29 | - 4.15
30 | - 4.14
31 | - 4.14
32 | - 4.13
33 | - 4.12
34 | - 4.12
35 | * - **Documentation**
36 | - `1.6.1 `_
37 | - `1.6.0 `_
38 | - `1.5.2 `_
39 | - `1.5.1 `_
40 | - `1.5.0 `_
41 | - `1.4.0 `_
42 | - `1.3.1 `_
43 | - `1.3.0 `_
44 | - NA
45 | - NA
46 | - NA
47 | - NA
48 | - NA
49 |
50 | **NOTE:** Release 1.3.1 supports OPEA and Gaudi
--------------------------------------------------------------------------------
/machine_configuration/README.md:
--------------------------------------------------------------------------------
1 | # Setting up Machine Configuration
2 |
3 | ## Introduction
4 | Machine configuration operations are used to configure [Red Hat Enterprise Linux CoreOS (RHCOS)](https://docs.openshift.com/container-platform/4.14/architecture/architecture-rhcos.html) on each node in a RHOCP cluster.
5 |
6 | [Machine config operator](https://github.com/openshift/machine-config-operator) (MCO) is provided by Red Hat to manage the operating system and machine configuration. In this project through the MCO, cluster administrators can configure and update the kernel to provision Intel Hardware features on the worker nodes.
7 |
8 | MCO is one of the technologies used in this project to manage the machine configuration. In the current OCP release, MCO might reboot the node to apply the machine configuration. Since rebooting the node is undesirable, alternative machine configuration technologies are under investigation. For more details, see this [issue](https://github.com/intel/intel-technology-enabling-for-openshift/issues/34).
9 |
10 | The best approach is to work with the RHCOS team to push the RHCOS configuration as the default configuration for a RHOCP cluster on [Day 0](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/).
11 |
12 | For some general configuration, we recommend you set it up while provisioning the cluster on [Day 1](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/).
13 |
14 | If the configuration cannot be set as the default setting, we recommend using some operator to set the configuration on the fly without rebooting the node on [Day 2](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/).
15 |
16 | Any contribution in this area is welcome.
17 |
18 | ## Prerequisites
19 | - Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster).
20 | - Setup node feature discovery (NFD). Follow steps [here](/nfd/README.md).
21 |
22 | ## Machine Configuration for Provisioning Intel® QAT and Intel® DSA
23 |
24 | * Turn on the `intel_iommu=on,sm_on` kernel parameter and load `vfio_pci` at boot for QAT and DSA provisioning:
25 |
26 | ```
27 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/machine_configuration/100-intel-iommu-on.yaml
28 | ```
29 |
30 | Note: This will reboot the worker nodes when changing the kernel parameter through MCO.
31 |
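32 | For reference, the applied manifest is essentially a `MachineConfig` that injects the kernel arguments; a simplified sketch is shown below (the yaml linked above is authoritative and additionally arranges for `vfio_pci` to load at boot):
33 | ```
34 | apiVersion: machineconfiguration.openshift.io/v1
35 | kind: MachineConfig
36 | metadata:
37 |   name: 100-intel-iommu-on
38 |   labels:
39 |     machineconfiguration.openshift.io/role: worker
40 | spec:
41 |   kernelArguments:
42 |     - intel_iommu=on,sm_on
43 | ```
44 | 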
32 | ## Verification
33 | Navigate to the node terminal on the web console (Compute -> Nodes -> Select a node -> Terminal). Run the following commands in the terminal.
34 | ```
35 | $ cat /proc/cmdline
36 | ```
37 | Ensure that `intel_iommu=on,sm_on` is present.
38 |
39 | ```
40 | $ chroot /host
41 | $ lsmod | grep vfio_pci
42 | ```
43 | Ensure that `vfio_pci` driver is present.
44 |
45 | ## See Also
46 | - [Red Hat OpenShift Container Platform Day-2 operations](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/)
47 |
--------------------------------------------------------------------------------
/playbooks/intel_ocp_provisioning.yaml:
--------------------------------------------------------------------------------
1 | - hosts: localhost
2 | gather_facts: no
3 | vars:
4 | kubeconfig_path: "~/.kube/mojave-config"
5 | environment:
6 | KUBECONFIG: "{{ kubeconfig_path }}"
7 | vars_prompt:
8 | - name: "install_operators"
9 | prompt: "Do you want to install operators? 'Yes' to install NFD Operator and Intel Device Plugins Operator, or 'No' to skip"
10 | private: no
11 | - name: "validation_feature"
12 | prompt: "Which Intel feature do you want to validate? Enter 1 for Intel SGX, 2 for Intel QAT, 3 for Intel DSA, 4 for Intel GPU"
13 | private: no
14 |
15 | tasks:
16 | - name: Validate Inputs
17 | block:
18 | - name: Invalid Install Operators Input
19 | fail:
20 | msg: "Invalid input for Install Operators. Please enter a valid option for Install Operators (Yes/No)."
21 | when: install_operators not in ["Yes", "No"]
22 | - name: Invalid Validation Feature Input
23 | fail:
24 | msg: "Invalid input for validation feature. Please enter a valid option (1-4)."
25 | when: validation_feature not in ["1", "2", "3", "4"]
26 |
27 | - name: Install Operators
28 | block:
29 | - name: NFD - Install NFD Operator
30 | include_tasks: install_nfd_operator.yaml
31 | - name: IDPO - Install Intel Device Plugins Operator
32 | include_tasks: install_device_plugins_operator.yaml
33 |     - name: NFD - Wait until the nfd-controller-manager Deployment is available
34 | k8s_info:
35 | kind: Deployment
36 | wait: yes
37 | name: nfd-controller-manager
38 | label_selectors:
39 | - operators.coreos.com/nfd.openshift-nfd
40 | - control-plane=controller-manager
41 | namespace: openshift-nfd
42 | wait_condition:
43 | type: Available
44 | status: 'True'
45 | - name: NFD - Configure NFD Operator
46 | include_tasks: configure_nfd.yaml
47 | - name: IDPO - Wait until the inteldeviceplugins-controller-manager Deployment is available
48 | k8s_info:
49 | kind: Deployment
50 | name: inteldeviceplugins-controller-manager
51 | namespace: openshift-operators
52 | wait: yes
53 | wait_condition:
54 | type: Available
55 | status: 'True'
56 | reason: MinimumReplicasAvailable
57 | when: install_operators == "Yes"
58 |
59 | - name: Skip Operator Installation
60 | debug:
61 | msg: "Skipping operator installation as per user input."
62 | when: install_operators == "No"
63 |
64 | - name: Validate Intel SGX
65 | include_tasks: validate_sgx.yaml
66 | when: validation_feature == "1"
67 |
68 |     - name: Validate Intel QAT
69 |       include_tasks: validate_qat.yaml
70 |       when: validation_feature == "2"
71 | 
72 |     - name: Validate Intel DSA
73 |       include_tasks: validate_dsa.yaml
74 |       when: validation_feature == "3"
75 | 
76 |     - name: Validate Intel GPU
77 |       include_tasks: validate_gpu.yaml
78 |       when: validation_feature == "4"
--------------------------------------------------------------------------------
/tests/l2/sgx/sgx_build.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: build.openshift.io/v1
5 | kind: BuildConfig
6 | metadata:
7 | name: intel-sgx-workload
8 | namespace: intel-sgx
9 | spec:
10 | triggers:
11 | - type: "ConfigChange"
12 | - type: "ImageChange"
13 | runPolicy: "Serial"
14 | source:
15 | type: Dockerfile
16 | dockerfile: |
17 | ARG BUILDER=registry.access.redhat.com/ubi9:latest
18 | ARG BASE=registry.access.redhat.com/ubi9-minimal:latest
19 | ARG LINUX_SGX_VERSION=2.26
20 | FROM ${BUILDER} AS builder
21 |
22 | ARG SGX_SDK=sgx_linux_x64_sdk_2.26.100.0.bin
23 | ARG LINUX_SGX_VERSION
24 |
25 | RUN dnf -y update && \
26 | dnf -y install \
27 | wget \
28 | openssl \
29 | gcc-c++ \
30 | make \
31 | protobuf-c && \
32 | dnf clean all
33 |
34 | # SGX SDK installed in /opt/intel directory
35 | WORKDIR /opt/intel
36 | RUN wget https://download.01.org/intel-sgx/sgx-linux/$LINUX_SGX_VERSION/distro/rhel9.4-server/$SGX_SDK \
37 | && chmod +x $SGX_SDK \
38 | && echo "yes" | ./$SGX_SDK \
39 | && rm $SGX_SDK
40 |
41 | RUN cd sgxsdk/SampleCode/SampleEnclave \
42 | && . /opt/intel/sgxsdk/environment \
43 | && make
44 | FROM ${BASE}
45 | ARG LINUX_SGX_VERSION
46 | RUN microdnf -y update && \
47 | microdnf -y install \
48 | wget \
49 | tar \
50 | gzip && \
51 | microdnf clean all && rm -rf /var/cache/dnf
52 |
53 | # Download SGX PSW and install SGX runtime components to create SGX enclave
54 | WORKDIR /opt/intel
55 | RUN wget https://download.01.org/intel-sgx/sgx-linux/$LINUX_SGX_VERSION/distro/rhel9.4-server/sgx_rpm_local_repo.tgz \
56 | && sha256sum sgx_rpm_local_repo.tgz \
57 | && tar xvf sgx_rpm_local_repo.tgz \
58 | && rm -rf sgx_rpm_local_repo.tgz
59 |
60 | RUN cd sgx_rpm_local_repo && rpm -i \
61 | libsgx-headers-$LINUX_SGX_VERSION* \
62 | libsgx-enclave-common-$LINUX_SGX_VERSION* \
63 | libsgx-urts-$LINUX_SGX_VERSION* && \
64 | rm -r /opt/intel/sgx_rpm_local_repo
65 |
66 | COPY --from=builder /opt/intel/sgxsdk/SampleCode/SampleEnclave/app app
67 | COPY --from=builder /opt/intel/sgxsdk/SampleCode/SampleEnclave/enclave.signed.so enclave.signed.so
68 |
69 | ENTRYPOINT /opt/intel/app
70 |
71 | strategy:
72 | type: Docker
73 | noCache: true
74 | dockerStrategy:
75 | buildArgs:
76 | - name: "BUILDER"
77 | value: "registry.access.redhat.com/ubi9:9.4"
78 | - name: "BASE"
79 | value: "registry.access.redhat.com/ubi9-minimal:9.4"
80 | - name: "SGX_SDK"
81 | value: "sgx_linux_x64_sdk_2.26.100.0.bin"
82 | - name: "LINUX_SGX_VERSION"
83 | value: "2.26"
84 | output:
85 | to:
86 | kind: ImageStreamTag
87 | name: intel-sgx-workload:latest
--------------------------------------------------------------------------------
/device_plugins/deploy_dsa.md:
--------------------------------------------------------------------------------
1 | # Create Intel DSA Device Plugin CR
2 |
3 | ## Create a CR via web console
4 | 1. Go to **Operator** -> **Installed Operators**.
5 | 2. Open **Intel Device Plugins Operator**.
6 | 3. Navigate to tab **Intel DSA Device Plugin**.
7 | 4. Click **Create DSADevicePlugin** -> set correct parameters -> Click **Create**
8 | 5. Optional: If you want to make any customizations, select YAML view and edit the details. When you are done, click **Create**.
9 |
10 | ## Verify via web console
11 | 1. Verify CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-dsa-plugin**.
12 | 2. The `DsaDevicePlugin` CR is now created.
13 |
14 | ## Create CR via CLI
15 | Apply the CR yaml file:
16 | ```
17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/dsa_device_plugin.yaml
18 | ```
19 |
20 | ## Verify via CLI
21 | Verify that the device plugin CR is ready:
22 | ```
23 | $ oc get DsaDevicePlugin
24 | ```
25 | Output:
26 | ```
27 | NAME DESIRED READY NODE SELECTOR AGE
28 | dsadeviceplugin-sample 3 3 {"intel.feature.node.kubernetes.io/dsa":"true"} 98m
29 | ```
30 |
31 | # Verify DSA Device Plugin
32 | After the plugin is deployed, use the command below to verify the DSA resources:
33 | ```
34 | $ oc describe node srf-2 | grep dsa.intel.com
35 | dsa.intel.com/wq-user-dedicated: 0
36 | dsa.intel.com/wq-user-shared: 160
37 | dsa.intel.com/wq-user-dedicated: 0
38 | dsa.intel.com/wq-user-shared: 160
39 | ```
40 |
41 | ## DSA Resource Configuration
42 | By default the DSA plugin uses [this configuration file](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/demo/dsa.conf).
43 | The DSA init container ships with a utility called `accel-config`, which takes a configuration file as input and configures the DSA hardware based on it.
44 | The default configuration creates dedicated WQs on each DSA device: four groups per device, where each group has 1 WQ linked to 1 engine.
45 | Users can customize the config, or pick a pre-customized config for their specific use case from [here](https://github.com/intel/idxd-config/tree/stable/contrib/configs).
46 | A node-specific configuration can also be supplied by passing a node-specific profile via a configMap volume mount.
47 | Users can use the steps below to customize the DSA resource configuration:
48 | 1. Create the ConfigMap for the DSA resource configuration:
49 | ```
50 | $ oc create configmap --namespace=openshift-operators intel-dsa-config --from-file=dsa[-$NODE_NAME].conf
51 | ```
52 | 2. Create the DSA device plugin CR with `-provisioning-config` set to the name of the ConfigMap created in step 1 in the dsa_device_plugin.yaml file, or set the ConfigMap name in the provisioning-config option from the web console.
53 | 
53 | # Run Intel DSA based workloads on RHOCP
54 | To run Intel DSA based workloads as unprivileged pods, you need a customized Security Context Constraint (SCC). The provided `dsa-scc` SCC binds to the service account used to run the DSA based workload.
55 |
56 | See [Verify Intel DSA Provisioning](/tests/l2/dsa/README.md) for the detailed steps.
57 |
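58 | For illustration only, a workload container claims the exposed work queues like any other extended resource; a minimal sketch using the resource name reported by the node above:
59 | ```
60 | spec:
61 |   containers:
62 |     - name: dsa-pod
63 |       resources:
64 |         limits:
65 |           dsa.intel.com/wq-user-shared: 1
66 |         requests:
67 |           dsa.intel.com/wq-user-shared: 1
68 | ```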
--------------------------------------------------------------------------------
/tests/l2/qat/README.md:
--------------------------------------------------------------------------------
1 | ### Verify Intel® QuickAssist Technology provisioning
2 | This workload runs [qatlib](https://github.com/intel/qatlib) sample tests using the Red Hat built and distributed qatlib RPM packages from the `codeready-builder-for-rhel-9-x86_64-rpms` repo. Refer to the [qatlib readme](https://github.com/intel/qatlib/blob/main/INSTALL) for more details.
3 |
4 | * Create and use the `intel-qat` namespace for the workload
5 |
6 | ```
7 | $ oc new-project intel-qat
8 | ```
9 |
10 | * Build the workload container image
11 |
12 | Please replace the credentials in the BuildConfig yaml with your Red Hat account login credentials.
13 |
14 | ```
15 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/qat/qatlib_build.yaml
16 | ```
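17 | 
18 | To follow the build while it runs (the BuildConfig name comes from `qatlib_build.yaml`):
19 | ```
20 | $ oc logs -f bc/intel-qat-workload -n intel-qat
21 | ```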
17 |
18 | * Create the `intel-qat-scc` SCC for the Intel QAT based workload, if it is not already created
19 |
20 | ```
21 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/qatlib_scc.yaml
22 | ```
23 |
24 | * Create the intel-qat service account to use intel-qat-scc
25 |
26 | ```
27 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/qatlib_rbac.yaml
28 | ```
29 |
30 | * Deploy the qatlib workload job with the `intel-qat` service account
31 |
32 | ```
33 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/qat/qatlib_job.yaml
34 | ```
35 |
36 | * Check the results.
37 | ```
38 | $ oc get pods -n intel-qat
39 | intel-qat-workload-c6g9v 0/1 Completed 0 4m13s
40 | ```
41 |
42 |
43 | * Sample log output from the full `cpa_sample_code` test run:
44 |
45 | ```
46 | $ oc logs intel-qat-workload-c6g9v -n intel-qat
47 | qaeMemInit started
48 | icp_sal_userStartMultiProcess("SSL") started
49 | There are no crypto instances
50 | *** QA version information ***
51 | device ID = 0
52 | software = 23.2.0
53 | *** END QA version information ***
54 | Inst 0, Affin: 0, Dev: 0, Accel 0, EE 0, BDF ED:00:01
55 | Inst 1, Affin: 1, Dev: 0, Accel 0, EE 0, BDF ED:00:01
56 | Inst 2, Affin: 2, Dev: 0, Accel 0, EE 0, BDF ED:00:01
57 | Inst 3, Affin: 3, Dev: 0, Accel 0, EE 0, BDF ED:00:01
58 | ---------------------------------------
59 | API Traditional
60 | Session State STATELESS
61 | Algorithm DEFLATE
62 | Huffman Type STATIC
63 | Mode ASYNCHRONOUS
64 | CNV Enabled YES
65 | Direction COMPRESS
66 | Packet Size 8192
67 | Compression Level 1
68 | Corpus CALGARY_CORPUS
69 | Corpus Filename calgary
70 | CNV Recovery Enabled YES
71 | Number of threads 4
72 | Total Responses 158400
73 | Total Retries 2242671
74 | Clock Cycles Start 126150916653843
75 | Clock Cycles End 126151409143747
76 | Total Cycles 492489904
77 | CPU Frequency(kHz) 1700160
78 | Throughput(Mbps) 35920
79 | Compression Ratio 0.4897
80 | ---------------------------------------
81 |
82 | Inst 0, Affin: 0, Dev: 0, Accel 0, EE 0, BDF ED:00:01
83 | Inst 1, Affin: 1, Dev: 0, Accel 0, EE 0, BDF ED:00:01
84 | Inst 2, Affin: 2, Dev: 0, Accel 0, EE 0, BDF ED:00:01
85 | Inst 3, Affin: 3, Dev: 0, Accel 0, EE 0, BDF ED:00:01
86 | ---------------------------------------
87 | ```
88 |
--------------------------------------------------------------------------------
/one_click/README.md:
--------------------------------------------------------------------------------
1 | # Deploy Intel Technology Enabling Solutions with Red Hat OpenShift using “One-Click”
2 |
3 | ## Overview
4 | Red Hat [Ansible](https://www.ansible.com/) and Operator technologies are used for “One-Click Deployment” of Intel technology enabling solutions with Red Hat OpenShift Container Platform (RHOCP). Ansible technology automates the operator installation and configuration steps using a playbook, making deployment as simple as a single click.
5 |
6 | The reference Ansible playbooks here can be used by cluster administrators as a basis for customizing their own playbooks.
7 |
8 | **Note:** It is recommended to start from [Get started](/README.md#getting-started) to get familiar with the installation and configuration of the general operator before composing the first playbook.
9 |
10 | ## Reference Playbook – Intel Data Center GPU Provisioning
11 |
12 | This playbook demonstrates the one-click provisioning of Intel Data Center GPU on an RHOCP cluster. The steps involved are installation and configuration of general Operators including Node Feature Discovery (NFD) operator, Kernel Module Management (KMM) operator, and the Intel Device Plugins Operator.
13 |
14 | ### Prerequisite
15 | Before running the playbook, ensure the following prerequisites are met:
16 | - Provisioned RHOCP Cluster
17 | - Red Hat Enterprise Linux (RHEL) system with [Ansible](https://docs.ansible.com/ansible/2.9/installation_guide/intro_installation.html#installing-ansible-on-rhel-centos-or-fedora) installed and configured with a `kubeconfig` to connect to your RHOCP cluster.
18 |
19 | ### Run the Playbook
20 | To run the ansible playbook, clone this repository to your RHEL system. Navigate to the directory containing the playbook.
21 | ```
22 | $ git clone https://github.com/intel/intel-technology-enabling-for-openshift.git
23 | $ cd intel-technology-enabling-for-openshift/one_click
24 | ```
25 | Execute the single command below to provision the Intel Data Center GPU:
26 | ```
27 | $ ansible-playbook gpu_provisioning_playbook.yaml
28 | ```
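29 | If your kubeconfig is not in the default location, point the playbook at it explicitly; this assumes the playbook relies on the standard kubeconfig discovery of the Ansible Kubernetes modules:
30 | ```
31 | $ KUBECONFIG=~/.kube/config ansible-playbook gpu_provisioning_playbook.yaml
32 | ```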
29 |
30 | ## Reference Playbook – Intel Gaudi Provisioning
31 | This playbook demonstrates the one-click provisioning of Intel Gaudi AI Accelerator on an RHOCP cluster. The steps involved are installation and configuration of general Operators including Node Feature Discovery (NFD) operator, Kernel Module Management (KMM) operator, and the Intel Gaudi Base Operator. The playbook also creates the Gaudi `DeviceConfig` CR which deploys the Gaudi Out-of-Tree drivers, Gaudi device plugins, Habana container runtime and Habana node metrics.
32 |
33 | ### Prerequisite
34 | Before running the playbook, ensure the following prerequisites are met:
35 | - Provisioned RHOCP Cluster
36 | - Red Hat Enterprise Linux (RHEL) system with [Ansible](https://docs.ansible.com/ansible/2.9/installation_guide/intro_installation.html#installing-ansible-on-rhel-centos-or-fedora) installed and configured with a `kubeconfig` to connect to your RHOCP cluster.
37 | - Set Firmware search path using MCO, follow [Update Kernel Firmware Search Path with MCO](/gaudi/README.md#update-kernel-firmware-search-path-with-mco).
38 |
39 | ### Run the Playbook
40 | To run the ansible playbook, clone this repository to your RHEL system. Navigate to the directory containing the playbook.
41 | ```
42 | $ git clone https://github.com/intel/intel-technology-enabling-for-openshift.git
43 | $ cd intel-technology-enabling-for-openshift/one_click
44 | ```
45 | Execute the single command below to provision the Intel Gaudi accelerator:
46 | ```
47 | $ ansible-playbook gaudi_provisioning_playbook.yaml
48 | ```
--------------------------------------------------------------------------------
/workloads/opea/chatqna/chatqna_megaservice_deployment.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | ---
4 | apiVersion: v1
5 | kind: PersistentVolumeClaim
6 | metadata:
7 | name: chatqna-megaservice-pvc-0
8 | namespace: opea-chatqna
9 | spec:
10 | accessModes:
11 | - ReadWriteOnce
12 | resources:
13 | requests:
14 | storage: 100Mi
15 | ---
16 | apiVersion: v1
17 | kind: PersistentVolumeClaim
18 | metadata:
19 | name: chatqna-megaservice-pvc-1
20 | namespace: opea-chatqna
21 | spec:
22 | accessModes:
23 | - ReadWriteOnce
24 | resources:
25 | requests:
26 | storage: 100Mi
27 | ---
28 | apiVersion: v1
29 | kind: PersistentVolumeClaim
30 | metadata:
31 | name: chatqna-megaservice-pvc-2
32 | namespace: opea-chatqna
33 | spec:
34 | accessModes:
35 | - ReadWriteOnce
36 | resources:
37 | requests:
38 | storage: 100Mi
39 | ---
40 | apiVersion: apps/v1
41 | kind: Deployment
42 | metadata:
43 | name: chatqna-megaservice
44 | namespace: opea-chatqna
45 | spec:
46 | selector:
47 | matchLabels:
48 | app: chatqna-megaservice
49 | replicas: 1
50 | template:
51 | metadata:
52 | labels:
53 | app: chatqna-megaservice
54 | spec:
55 | serviceAccount: opea-chatqna
56 | containers:
57 | - name: chatqna-megaservice
58 | image: 'image-registry.openshift-image-registry.svc:5000/opea-chatqna/chatqna-megaservice:latest'
59 | env:
60 | - name: EMBED_MODEL
61 | value: BAAI/bge-base-en-v1.5
62 | - name: HUGGINGFACEHUB_API_TOKEN
63 | valueFrom:
64 | secretKeyRef:
65 | key: HUGGINGFACEHUB_API_TOKEN
66 | name: hf-token
67 | - name: MODEL_SIZE
68 | value: 70b
69 | - name: PYTHONPATH
70 | value: $PYTHONPATH:/ws:/home/user:/home/user/qna-app/app
71 | - name: RAG_UPLOAD_DIR
72 | value: /upload_dir
73 | - name: REDIS_PORT
74 | value: "6379"
75 | - name: REDIS_HOST
76 | value: "redis-vector-db"
77 | - name: REDIS_SCHEMA
78 | value: schema_dim_768.yml
79 | - name: TGI_ENDPOINT
80 | value: http://xxx.xxx.xxx.xxx:xxx
81 | ports:
82 | - containerPort: 8000
83 | command:
84 | - /bin/bash
85 | - '-c'
86 | - |
87 | cd /ws && \
88 | python ingest.py /ws/data_intel/ && \
89 | cd /home/user/qna-app && \
90 | python app/server.py
91 | volumeMounts:
92 | - mountPath: /ws
93 | name: chatqna-megaservice-pvc-0
94 | - mountPath: /test
95 | name: chatqna-megaservice-pvc-1
96 | - mountPath: /upload_dir
97 | name: chatqna-megaservice-pvc-2
98 | volumes:
99 | - name: chatqna-megaservice-pvc-0
100 | persistentVolumeClaim:
101 | claimName: chatqna-megaservice-pvc-0
102 | - name: chatqna-megaservice-pvc-1
103 | persistentVolumeClaim:
104 | claimName: chatqna-megaservice-pvc-1
105 | - name: chatqna-megaservice-pvc-2
106 | persistentVolumeClaim:
107 | claimName: chatqna-megaservice-pvc-2
108 | ---
109 | # Chatqna megaservice Service
110 | apiVersion: v1
111 | kind: Service
112 | metadata:
113 | name: chatqna-megaservice
114 | namespace: opea-chatqna
115 | spec:
116 | type: ClusterIP
117 | selector:
118 | app: chatqna-megaservice
119 | ports:
120 | - port: 8000
121 | targetPort: 8000
--------------------------------------------------------------------------------
/tests/l2/dsa/README.md:
--------------------------------------------------------------------------------
1 | ### Verify Intel® Data Streaming Accelerator (DSA) Technology provisioning
2 | This workload runs [accel-config](https://github.com/intel/idxd-config) sample tests using the accel-config RPM packages built and distributed by Red Hat from the rhel-9-for-x86_64-baseos-rpms repository. Refer to the [accel-config README](https://github.com/intel/idxd-config/blob/stable/README.md) for more details.
3 |
4 | * Create the Red Hat auth secret. Replace the credentials in the secret yaml with your Red Hat account login credentials.
5 |
6 | ```
7 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/rh_auth.yaml
8 | ```
9 |
10 | * Build the workload container image
11 |
12 | ```
13 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/dsa_imagestream.yaml
14 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/dsa_build.yaml
15 | ```
16 |
17 | * Create the SCC intel-dsa-scc for Intel DSA based workloads, if it has not already been created
18 |
19 | ```
20 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_scc.yaml
21 | ```
22 |
23 | * Create the intel-dsa serviceAccount, role and roleBinding to use intel-dsa-scc
24 |
25 | ```
26 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_serviceAccount.yaml
27 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_role.yaml
28 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_roleBinding.yaml
29 | ```
30 |
31 | * Deploy the accel-config workload job with the intel-dsa service account
32 |
33 | ```
34 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/dsa_job.yaml
35 | ```
36 |
37 | * Check the results.
38 | ```
39 | $ oc get pods -n intel-dsa
40 | intel-dsa-workload-244xm 0/1 Completed 0 3m12s
41 | ```
42 |
43 | * Sample test logs:
44 | ```
45 | $ oc logs intel-dsa-workload-244xm -n intel-dsa
46 | dsa0/wq0.1
47 | dsa0
48 | Testing with 'block on fault' flag ON
49 | Performing dedicated WQ NOOP testing
50 | Testing 1 bytes
51 | [ info] alloc wq 1 dedicated size 16 addr 0x7f0cde00b000 batch sz 0x400 xfer sz 0x80000000
52 | [ info] testnoop: tflags 0x1 num_desc 1
53 | [ info] preparing descriptor for noop
54 | [ info] Submitted all noop jobs
55 | [ info] verifying task result for 0x2041620
56 | [ info] test with op 0 passed
57 | Testing 4096 bytes
58 | [ info] alloc wq 1 dedicated size 16 addr 0x7fd4881da000 batch sz 0x400 xfer sz 0x80000000
59 | [ info] testnoop: tflags 0x1 num_desc 1
60 | [ info] preparing descriptor for noop
61 | [ info] Submitted all noop jobs
62 | [ info] verifying task result for 0x82f620
63 | [ info] test with op 0 passed
64 | Testing 65536 bytes
65 | [ info] alloc wq 1 dedicated size 16 addr 0x7f462bbed000 batch sz 0x400 xfer sz 0x80000000
66 | [ info] testnoop: tflags 0x1 num_desc 1
67 | [ info] preparing descriptor for noop
68 | [ info] Submitted all noop jobs
69 | [ info] verifying task result for 0xe4e620
70 | [ info] test with op 0 passed
71 | Testing 1048576 bytes
72 | [ info] alloc wq 1 dedicated size 16 addr 0x7fac2ac0c000 batch sz 0x400 xfer sz 0x80000000
73 | [ info] testnoop: tflags 0x1 num_desc 1
74 | [ info] preparing descriptor for noop
75 | [ info] Submitted all noop jobs
76 | [ info] verifying task result for 0xf21620
77 | [ info] test with op 0 passed
78 | Testing 2097152 bytes
79 | [ info] alloc wq 1 dedicated size 16 addr 0x7f7426a5c000 batch sz 0x400 xfer sz 0x80000000
80 | [ info] testnoop: tflags 0x1 num_desc 1
81 | [ info] preparing descriptor for noop
82 | [ info] Submitted all noop jobs
83 | [ info] verifying task result for 0xeec620
84 | [ info] test with op 0 passed
85 | Performing shared WQ NOOP testing
86 | ```
87 |
--------------------------------------------------------------------------------
/gaudi/gaudi_cluster_policy.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Adapted from https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Kubernetes_Installation/Kubernetes_Operator.html#id2
4 | #
5 | apiVersion: habanalabs.habana.ai/v1
6 | kind: ClusterPolicy
7 | metadata:
8 | name: habana-ai
9 | spec:
10 | image_registry: vault.habana.ai
11 | driver:
12 | driver_loader:
13 | images:
14 | ubuntu_22.04:
15 | repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer
16 | tag: 1.19.1-26
17 | rhel_8.6:
18 | repository: vault.habana.ai/habana-ai-operator/driver/rhel8.6/driver-installer
19 | tag: 1.19.1-26
20 | rhel_9.2:
21 | repository: vault.habana.ai/habana-ai-operator/driver/rhel9.2/driver-installer
22 | tag: 1.19.1-26
23 | rhel_9.4:
24 | repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer
25 | tag: 1.19.1-26
26 | tencentos_3.1:
27 | repository: vault.habana.ai/habana-ai-operator/driver/tencentos3.1/driver-installer
28 | tag: 1.19.1-26
29 | resources:
30 | limits:
31 | cpu: cpu_str_or_int_optional
32 | memory: memory_str_optional
33 | requests:
34 | cpu: cpu_str_or_int_optional
35 | memory: memory_str_optional
36 | repo_server: vault.habana.ai
37 | repo_path: artifactory/gaudi-installer/repos
38 | mlnx_ofed_repo_path: artifactory/gaudi-installer/deps
39 | mlnx_ofed_version: mlnx-ofed-5.8-2.0.3.0-rhel8.4-x86_64.tar.gz
40 | hugepages: hugepages_number_int_optional
41 | external_ports: turn_on_external_port_bool_optional
42 | firmware_flush: flush_firmware_on_the_gaudi_cards_bool_optional
43 | driver_runner:
44 | image:
45 | repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer
46 | tag: 1.19.1-26
47 | resources:
48 | limits:
49 | cpu: cpu_str_or_int_optional
50 | memory: memory_str_optional
51 | requests:
52 | cpu: cpu_str_or_int_optional
53 | memory: memory_str_optional
54 | device_plugin:
55 | image:
56 | repository: vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin
57 | tag: 1.19.1
58 | resources:
59 | limits:
60 | cpu: cpu_str_or_int_optional
61 | memory: memory_str_optional
62 | requests:
63 | cpu: cpu_str_or_int_optional
64 | memory: memory_str_optional
65 | runtime:
66 | runner:
67 | image:
68 | repository: vault.habana.ai/habana-ai-operator/habana-container-runtime
69 | tag: 1.19.1-26
70 | resources:
71 | limits:
72 | cpu: cpu_str_or_int_optional
73 | memory: memory_str_optional
74 | requests:
75 | cpu: cpu_str_or_int_optional
76 | memory: memory_str_optional
77 | configuration:
78 | container_engine: one_of_containerd_docker_crio
79 | engine_container_runtime_configuration: container_engine_configuration_optional
80 | habana_container_runtime_configuration: container_runtime_configuration_optional
81 | metric_exporter:
82 | runner:
83 | image:
84 | repository: vault.habana.ai/gaudi-metric-exporter/metric-exporter
85 | tag: 1.19.1-26
86 | resources:
87 | limits:
88 | cpu: cpu_str_or_int_optional
89 | memory: memory_str_optional
90 | requests:
91 | cpu: cpu_str_or_int_optional
92 | memory: memory_str_optional
93 | port: 41611
94 | interval: 20
95 | feature_discovery:
96 | runner:
97 | image:
98 | repository: vault.habana.ai/habana-ai-operator/habanalabs-feature-discovery
99 | tag: 1.19.1-26
100 | resources:
101 | limits:
102 | cpu: cpu_str_or_int_optional
103 | memory: memory_str_optional
104 | requests:
105 | cpu: cpu_str_or_int_optional
106 | memory: memory_str_optional
107 | nfd_plugin: boolean_nfd_installed
108 | bmc_monitoring:
109 | image:
110 | repository: vault.habana.ai/habana-bmc-exporter/bmc-exporter
111 | tag: 1.19.1-26
112 | resources:
113 | limits:
114 | cpu: cpu_str_or_int_optional
115 | memory: memory_str_optional
116 | requests:
117 | cpu: cpu_str_or_int_optional
118 | memory: memory_str_optional
119 | node_selector:
120 | key_optional: value_optional
--------------------------------------------------------------------------------
/device_plugins/README.md:
--------------------------------------------------------------------------------
1 | # Setting up Intel Device Plugins Operator
2 |
3 | ## Overview
4 | Intel Device Plugins are utilized to advertise Intel hardware features (resources) to a Red Hat OpenShift Container Platform (RHOCP) Cluster. This allows workloads running on pods deployed within the clusters to leverage these features. To handle the deployment and lifecycle of these device plugins, the [Intel Device Plugins Operator](https://catalog.redhat.com/software/container-stacks/detail/61e9f2d7b9cdd99018fc5736) is used. The Intel Device Plugins container images and operator have been officially certified and published on the [Red Hat Ecosystem Catalog](https://catalog.redhat.com/software/container-stacks/detail/61e9f2d7b9cdd99018fc5736). For more details on the upstream project, please refer to [Intel Device Plugins for Kubernetes](https://github.com/intel/intel-device-plugins-for-kubernetes).
5 |
6 | ## Prerequisites
7 | - Provisioned RHOCP cluster. Follow steps [here](/README.md).
8 | - Set up Node Feature Discovery (NFD). Follow steps [here](/nfd/README.md).
9 | - Follow the additional prerequisites for provisioning Intel® Data Center GPU:
10 | - Set up out-of-tree drivers for Intel Data Center GPU provisioning. Follow the steps listed [here](/kmmo/README.md).
11 | - Follow the additional prerequisites for provisioning Intel® QuickAssist Technology:
12 | - Configure MCO for provisioning Intel QAT. Follow steps [here](/machine_configuration/README.md#machine-configuration-for-provisioning-intel-qat).
13 |
14 | ## Install Intel Device Plugins Operator on Red Hat OpenShift
15 | ### Installation via web console
16 | Follow the steps below to install the Intel Device Plugins Operator using the OpenShift web console:
17 | 1. In the OpenShift web console, navigate to **Operators** -> **OperatorHub**.
18 | 2. Search for **Intel Device Plugins Operator** in the **All Items** field -> Click **Install**.
19 | ### Verify Installation via web console
20 | 1. Go to **Operators** -> **Installed Operators**.
21 | 2. Verify that the status of the operator is **Succeeded**.
22 |
23 | ### Installation via command line interface (CLI)
24 | Apply the [install_operator.yaml](/device_plugins/install_operator.yaml) file:
25 | ```
26 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/install_operator.yaml
27 | ```
28 |
29 | ### Verify Installation via CLI
30 | Verify that the operator controller manager pod is up and running:
31 | ```
32 | $ oc get pod | grep inteldeviceplugins-controller-manager
33 |
34 | inteldeviceplugins-controller-manager-6b8c76c867-hftqm 2/2 Running 0 17m
35 | ```
36 |
37 | ## Resources Provided by Intel Device Plugins
38 | These resources are the interface through which user pods claim and consume the hardware features provided by the Intel Device Plugins. See the table below for details:
39 |
40 | | Feature | Resources | Description | Usage |
41 | | ------- | --------- | ----------- | ----- |
42 | | Intel® SGX | `sgx.intel.com/epc` | Intel SGX EPC memory for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/64a6c86f3be25459c14ea988e892f9f5d873a8ca/tests/l2/sgx/sgx_job.yaml#L21) |
43 | | Intel® Data Center GPU Flex Series, Intel® Data Center GPU Max Series | `gpu.intel.com/i915` | Intel Data Center GPU card for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/device_plugins/deploy_gpu.md#using-intel-data-center-gpu-resource-exclusively) |
44 | | Intel® QAT | `qat.intel.com/cy` `qat.intel.com/dc` | `cy`: Intel QAT VFIO Virtual Function device configured for cryptography for user pod to claim; `dc`: Intel QAT VFIO Virtual Function device configured for compression for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/tests/l2/qat/qatlib_job.yaml#L24) [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/tests/l2/qat/qatlib_job.yaml#L28) |
45 | | Intel® DSA | `dsa.intel.com/wq-user-shared` `dsa.intel.com/wq-user-dedicated` | Intel DSA Work Queue for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/tests/l2/dsa/dsa_job.yaml#L27) |
46 |
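For illustration, a user pod claims one of these resources through its container resource limits. The sketch below uses a placeholder name and image (neither is from this repository) and requests a single Intel Data Center GPU:

```
apiVersion: v1
kind: Pod
metadata:
  name: gpu-claim-example                              # hypothetical name
spec:
  containers:
    - name: workload
      image: registry.example.com/gpu-workload:latest  # placeholder image
      resources:
        limits:
          gpu.intel.com/i915: 1                        # claim one Intel Data Center GPU
```

The linked jobs in the table show the exact equivalents for the SGX, QAT, and DSA resources.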
47 |
48 | ## Creating Intel Device Plugin custom resource (CR)
49 | - To create an Intel SGX device plugin CR, follow this [link](/device_plugins/deploy_sgx.md).
50 | - To create an Intel GPU device plugin CR, follow this [link](/device_plugins/deploy_gpu.md).
51 | - To create an Intel QAT device plugin CR, follow this [link](/device_plugins/deploy_qat.md).
52 | - To create an Intel DSA device plugin CR, follow this [link](/device_plugins/deploy_dsa.md).
53 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish Documentation
2 | on:
3 | push:
4 | branches:
5 | - main
6 | - release-v1.3.0
7 | - release-v1.3.1
8 | - release-v1.4.0
9 | - release-v1.5.0
10 | - release-v1.5.1
11 | - release-v1.5.2
12 | - release-v1.6.0
13 | - release-v1.6.1
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | build:
20 | permissions:
21 | contents: write # for Git to git push
22 | runs-on: ubuntu-22.04
23 |
24 | steps:
25 | - name: Install dependencies
26 | run: |
27 | sudo apt-get update
28 | sudo apt-get install -y python3-venv git
29 | - uses: actions/checkout@v4
30 | with:
31 | fetch-depth: 0
32 | ref: main
33 | - name: Set up doc directory
34 | run: |
35 | mkdir $HOME/output
36 | touch $HOME/output/.nojekyll
37 | echo "" >"$HOME/output/index.html"
38 | - name: Build devel
39 | run: |
40 | export GITHUB_SHA=$(git rev-parse HEAD)
41 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
42 | rm -rf _work/venv
43 | make vhtml
44 | mv _build/html $HOME/output/development
45 | - uses: actions/checkout@v4
46 | with:
47 | fetch-depth: 0
48 | ref: release-v1.3.0
49 | - name: Build release-v1.3.0
50 | run: |
51 | export GITHUB_SHA=$(git rev-parse HEAD)
52 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
53 | rm -rf _work/venv
54 | make vhtml
55 | mv _build/html $HOME/output/v1.3.0
56 | - uses: actions/checkout@v4
57 | with:
58 | fetch-depth: 0
59 | ref: release-v1.3.1
60 | - name: Build release-v1.3.1
61 | run: |
62 | export GITHUB_SHA=$(git rev-parse HEAD)
63 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
64 | rm -rf _work/venv
65 | make vhtml
66 | mv _build/html $HOME/output/v1.3.1
67 | - uses: actions/checkout@v4
68 | with:
69 | fetch-depth: 0
70 | ref: release-v1.4.0
71 | - name: Build release-v1.4.0
72 | run: |
73 | export GITHUB_SHA=$(git rev-parse HEAD)
74 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
75 | rm -rf _work/venv
76 | make vhtml
77 | mv _build/html $HOME/output/v1.4.0
78 | - uses: actions/checkout@v4
79 | with:
80 | fetch-depth: 0
81 | ref: release-v1.5.0
82 | - name: Build release-v1.5.0
83 | run: |
84 | export GITHUB_SHA=$(git rev-parse HEAD)
85 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
86 | rm -rf _work/venv
87 | make vhtml
88 | mv _build/html $HOME/output/v1.5.0
89 | - uses: actions/checkout@v4
90 | with:
91 | fetch-depth: 0
92 | ref: release-v1.5.1
93 | - name: Build release-v1.5.1
94 | run: |
95 | export GITHUB_SHA=$(git rev-parse HEAD)
96 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
97 | rm -rf _work/venv
98 | make vhtml
99 | mv _build/html $HOME/output/v1.5.1
100 | - uses: actions/checkout@v4
101 | with:
102 | fetch-depth: 0
103 | ref: release-v1.5.2
104 | - name: Build release-v1.5.2
105 | run: |
106 | export GITHUB_SHA=$(git rev-parse HEAD)
107 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
108 | rm -rf _work/venv
109 | make vhtml
110 | mv _build/html $HOME/output/v1.5.2
111 | - uses: actions/checkout@v4
112 | with:
113 | fetch-depth: 0
114 | ref: release-v1.6.0
115 | - name: Build release-v1.6.0
116 | run: |
117 | export GITHUB_SHA=$(git rev-parse HEAD)
118 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
119 | rm -rf _work/venv
120 | make vhtml
121 | mv _build/html $HOME/output/v1.6.0
122 | - uses: actions/checkout@v4
123 | with:
124 | fetch-depth: 0
125 | ref: release-v1.6.1
126 | - name: Build release-v1.6.1
127 | run: |
128 | export GITHUB_SHA=$(git rev-parse HEAD)
129 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD)
130 | rm -rf _work/venv
131 | make vhtml
132 | mv _build/html $HOME/output/v1.6.1
133 | - name: Deploy the docs
134 | shell: bash
135 | env:
136 | GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
137 | run: |
138 | cd $HOME/output
139 | git init
140 | git config --global user.name "${GITHUB_ACTOR}"
141 | git config --global user.email "${GITHUB_ACTOR}@github.com"
142 | git add .
143 | git commit -m "latest html output"
144 | git push -f https://${GITHUB_ACTOR}:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git HEAD:gh-pages
--------------------------------------------------------------------------------
/tests/gaudi/l2/vllm_buildconfig.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | apiVersion: image.openshift.io/v1
5 | kind: ImageStream
6 | metadata:
7 | name: vllm-workload
8 | namespace: gaudi-validation
9 | spec: {}
10 | ---
11 | apiVersion: build.openshift.io/v1
12 | kind: BuildConfig
13 | metadata:
14 | name: vllm-workload
15 | namespace: gaudi-validation
16 | spec:
17 | triggers:
18 | - type: "ConfigChange"
19 | - type: "ImageChange"
20 | runPolicy: "Serial"
21 | source:
22 | type: Dockerfile
23 | git:
24 | uri: https://github.com/HabanaAI/vllm-fork.git
25 | ref: v1.19.1
26 | dockerfile: |
27 | ARG BASE_IMAGE=vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26
28 | FROM ${BASE_IMAGE} as habana-base
29 |
30 | USER root
31 |
32 | ENV VLLM_TARGET_DEVICE="hpu"
33 | ENV HABANA_SOFTWARE_VERSION="1.19.1"
34 |
35 | RUN dnf -y update --best --allowerasing --skip-broken && dnf clean all
36 |
37 | WORKDIR /workspace
38 |
39 | ## Python Installer #################################################################
40 | FROM habana-base as python-install
41 |
42 | ARG PYTHON_VERSION=3.11
43 |
44 | ENV VIRTUAL_ENV=/opt/vllm
45 | ENV PATH="$VIRTUAL_ENV/bin:$PATH"
46 | RUN dnf install -y --setopt=install_weak_deps=0 --nodocs \
47 | python${PYTHON_VERSION}-wheel && \
48 | python${PYTHON_VERSION} -m venv $VIRTUAL_ENV --system-site-packages && pip install --no-cache -U pip wheel && dnf clean all
49 |
50 | ## Python Habana base #################################################################
51 | FROM python-install as python-habana-base
52 |
53 | ENV VIRTUAL_ENV=/opt/vllm
54 | ENV PATH="$VIRTUAL_ENV/bin:$PATH"
55 |
56 | # install Habana Software and common dependencies
57 | RUN --mount=type=cache,target=/root/.cache/pip \
58 | --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
59 | --mount=type=bind,source=requirements-hpu.txt,target=requirements-hpu.txt \
60 | pip install \
61 | -r requirements-hpu.txt
62 |
63 | ## Builder #####################################################################
64 | FROM python-habana-base AS build
65 |
66 | # install build dependencies
67 |
68 | # copy input files
69 | COPY csrc csrc
70 | COPY setup.py setup.py
71 | COPY cmake cmake
72 | COPY CMakeLists.txt CMakeLists.txt
73 | COPY requirements-common.txt requirements-common.txt
74 | COPY requirements-hpu.txt requirements-hpu.txt
75 | COPY pyproject.toml pyproject.toml
76 |
77 | # max jobs used by Ninja to build extensions
78 | ARG max_jobs=2
79 | ENV MAX_JOBS=${max_jobs}
80 | # # make sure punica kernels are built (for LoRA)
81 | # HPU currently doesn't support LoRA
82 | # ENV VLLM_INSTALL_PUNICA_KERNELS=1
83 |
84 | # Copy the entire directory before building wheel
85 | COPY vllm vllm
86 |
87 | ENV CCACHE_DIR=/root/.cache/ccache
88 | RUN --mount=type=cache,target=/root/.cache/ccache \
89 | --mount=type=cache,target=/root/.cache/pip \
90 | --mount=type=bind,src=.git,target=/workspace/.git \
91 | env CFLAGS="-march=haswell" \
92 | CXXFLAGS="$CFLAGS $CXXFLAGS" \
93 | CMAKE_BUILD_TYPE=Release \
94 | python3 setup.py bdist_wheel --dist-dir=dist
95 |
96 | ## Release #####################################################################
97 | FROM python-install AS vllm-openai
98 |
99 | WORKDIR /workspace
100 |
101 | ENV VIRTUAL_ENV=/opt/vllm
102 | ENV PATH=$VIRTUAL_ENV/bin/:$PATH
103 |
104 | # Triton needs a CC compiler
105 | RUN dnf install -y --setopt=install_weak_deps=0 --nodocs gcc \
106 | && dnf clean all
107 |
108 | # install vllm wheel first, so that torch etc will be installed
109 | RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
110 | --mount=type=cache,target=/root/.cache/pip \
111 | pip install $(echo dist/*.whl)'[tensorizer]' --verbose
112 |
113 | ENV HF_HUB_OFFLINE=1 \
114 | PORT=8000 \
115 | HOME=/home/vllm \
116 | VLLM_USAGE_SOURCE=production-docker-image
117 |
118 | # setup non-root user for OpenShift
119 | # In OpenShift the user ID is randomly assigned, for compatibility we also
120 | # set up a non-root user here.
121 | RUN umask 002 \
122 | && useradd --uid 2000 --gid 0 vllm \
123 | && chmod g+rwx $HOME /usr/src /workspace
124 |
125 | COPY LICENSE /licenses/vllm.md
126 |
127 | USER 2000
128 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
129 | strategy:
130 | type: Docker
131 | noCache: true
132 | dockerStrategy:
133 | buildArgs:
134 | - name: "BASE_IMAGE"
135 | value: "vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26"
136 | output:
137 | to:
138 | kind: ImageStreamTag
139 | name: vllm-workload:latest
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file does only contain a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 |
9 | # -- Path setup --------------------------------------------------------------
10 |
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | # import os
16 | # import sys
17 | # sys.path.insert(0, os.path.abspath('.'))
18 |
19 | # -- Project information -----------------------------------------------------
20 |
21 | project = 'Intel® Technology Enabling for OpenShift*'
22 | copyright = '2024, Intel® Corporation'
23 | author = 'Intel® Corporation'
24 |
25 | # The short X.Y version
26 | # version = 'devel'
27 | # The full version, including alpha/beta/rc tags
28 | # release = 'GA'
29 |
30 |
31 | # ---------------------------------
32 | # Reference for sphinx_md : https://pypi.org/project/sphinx-md/
33 | # ---------------------------------
34 | from os import getenv
35 |
36 | baseBranch = "main"
37 | sphinx_md_useGitHubURL = True
38 | commitSHA = getenv('GITHUB_SHA')
39 | githubBaseURL = 'https://github.com/' + (getenv('GITHUB_REPOSITORY') or 'intel/intel-technology-enabling-for-openshift') + '/'
40 | githubFileURL = githubBaseURL + "blob/"
41 | githubDirURL = githubBaseURL + "tree/"
42 | if commitSHA:
43 | githubFileURL = githubFileURL + commitSHA + "/"
44 | githubDirURL = githubDirURL + commitSHA + "/"
45 | else:
46 | githubFileURL = githubFileURL + baseBranch + "/"
47 | githubDirURL = githubDirURL + baseBranch + "/"
48 | sphinx_md_githubFileURL = githubFileURL
49 | sphinx_md_githubDirURL = githubDirURL
50 |
51 | # Version displayed in the upper left corner
52 | # This value is set in the github workflow environment
53 | commitREF = getenv('GITHUB_SHA_REF', default = "unknown")
54 | if commitREF.startswith("release-"):
55 | version = commitREF[len("release-"):].strip()
56 | else:
57 | version = "development"
58 |
59 |
60 | # Versions list with URLs using tags displayed in the lower left corner
61 | from git import Repo
62 | versions_to_exclude = set(['v1.0.0', 'v1.0.1','v1.1.0', 'v1.2.0', 'v1.2.1'])
63 | repo = Repo( search_parent_directories=True )
64 | github_repo = "/intel-technology-enabling-for-openshift/"
65 | release_versions = [("development", github_repo)]
66 | tags = reversed([tag.name for tag in repo.tags])
67 | release_versions.extend((str(tag), github_repo + tag) for tag in tags if str(tag) not in versions_to_exclude)
68 |
69 | # -- General configuration ---------------------------------------------------
70 |
71 | # If your documentation needs a minimal Sphinx version, state it here.
72 | #
73 | # needs_sphinx = '1.0'
74 |
75 | # Add any Sphinx extension module names here, as strings. They can be
76 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
77 | # ones.
78 | extensions = ['myst_parser', 'sphinx_md', ]
79 | myst_heading_anchors = 5
80 | # myst_enable_extensions = [
81 | # "html_admonition",
82 | # ]
83 | # Add any paths that contain templates here, relative to this directory.
84 | templates_path = ['_templates']
85 |
86 | # The suffix(es) of source filenames.
87 | # You can specify multiple suffix as a list of string:
88 | #
89 | source_suffix = ['.rst', '.md']
90 |
91 | # List of patterns, relative to source directory, that match files and
92 | # directories to ignore when looking for source files.
93 | # This pattern also affects html_static_path and html_extra_path.
94 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
95 |
96 | # -- Options for HTML output -------------------------------------------------
97 |
98 | # The theme to use for HTML and HTML Help pages. See the documentation for
99 | # a list of builtin themes.
100 | #
101 | html_theme = 'sphinx_rtd_theme'
102 | html_title = "Intel® Technology Enabling for OpenShift*"
103 | # Theme options are theme-specific and customize the look and feel of a theme
104 | # further. For a list of options available for each theme, see the
105 | # documentation.
106 | #
107 | html_theme_options = {
108 | "display_version": True,
109 | }
110 |
111 | html_context = {
112 | 'display_github': True,
113 | 'github_host': 'github.com',
114 | 'github_user': 'intel',
115 | 'github_repo': 'intel-technology-enabling-for-openshift',
116 | 'github_version': 'main/',
117 | 'versions_menu': True,
118 | 'version': version,
119 | 'versions': release_versions,
120 | }
121 | html_css_files = [
122 | 'custom.css',
123 | ]
124 |
125 | # Add any paths that contain custom static files (such as style sheets) here,
126 | # relative to this directory. They are copied after the builtin static files,
127 | # so a file named "default.css" will overwrite the builtin "default.css".
128 |
129 | html_static_path = ['_static']
130 |
131 |
132 | # Custom sidebar templates, must be a dictionary that maps document names
133 | # to template names.
134 | #
135 | # The default sidebars (for documents that don't match any pattern) are
136 | # defined by theme itself. Builtin themes are using these templates by
137 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
138 | # 'searchbox.html']``.
139 | #
140 | # html_sidebars = {}
141 |
142 |
143 | # -- Options for HTMLHelp output ---------------------------------------------
144 |
145 | # Output file base name for HTML help builder.
146 | htmlhelp_basename = 'IntelTechnologyEnablingforOpenShiftdoc'
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, caste, color, religion, or sexual
10 | identity and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the overall
26 | community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or advances of
31 | any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email address,
35 | without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | CommunityCodeOfConduct AT intel DOT com.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series of
86 | actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or permanent
93 | ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within the
113 | community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.1, available at
119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
120 |
121 | Community Impact Guidelines were inspired by
122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123 |
124 | For answers to common questions about this code of conduct, see the FAQ at
125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
126 | [https://www.contributor-covenant.org/translations][translations].
127 |
128 | [homepage]: https://www.contributor-covenant.org
129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
130 | [Mozilla CoC]: https://github.com/mozilla/diversity
131 | [FAQ]: https://www.contributor-covenant.org/faq
132 |
--------------------------------------------------------------------------------
/gaudi/README.md:
--------------------------------------------------------------------------------
1 | # Setting up Intel Gaudi AI Accelerator Operator
2 |
3 | ## Overview
4 | [Intel Gaudi AI Accelerator Operator](https://catalog.redhat.com/software/container-stacks/detail/6683b2cce45daa25e36bddcb) is used to provision Intel Gaudi Accelerator with OpenShift. The steps and yaml files mentioned in this document to provision the Gaudi accelerator are based on [Intel Gaudi AI Accelerator Operator for OpenShift](https://docs.habana.ai/en/latest/Orchestration/Intel_Gaudi_Base_Operator/index.html).
5 |
6 | Once you are familiar with the manual provisioning steps here, the Red Hat certified operator and Ansible-based [One-Click](/one_click/README.md#reference-playbook-–-habana-gaudi-provisioning) solution can be used as a reference to provision the accelerator automatically.
7 |
8 | ## Prerequisites
9 | - To provision the RHOCP cluster, follow steps [here](/README.md#provisioning-rhocp-cluster).
10 |
11 | ## Install Intel Gaudi AI Accelerator Operator on Red Hat OpenShift
12 | ### Installation via web console
13 | Follow the steps below to install the Intel Gaudi AI Accelerator Operator using the OpenShift web console:
14 | 1. In the OpenShift web console, navigate to **Operators** -> **OperatorHub**.
15 | 2. Search for **Intel Gaudi AI Accelerator Operator** in the **All Items** field -> Click **Install**.
16 | ### Verify Installation via web console
17 | 1. Go to **Operators** -> **Installed Operators**.
18 | 2. Verify that the status of the operator is **Succeeded**.
19 |
20 | ### Installation via Command Line Interface (CLI)
21 | ```
22 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_install_operator.yaml
23 | ```
24 |
25 | ### Verify Installation via CLI
26 | Verify that the operator controller manager pod is up and running:
27 | ```
28 | oc get pods -n habana-ai-operator
29 |
30 | NAME READY STATUS RESTARTS AGE
31 | controller-manager-6c8459d9cb-fqs8h 2/2 Running 0 25m
32 | ```
33 |
34 | ## Creating Intel Gaudi AI Accelerator Operator ClusterPolicy Instance
35 | To create the Intel Gaudi AI Accelerator Operator `ClusterPolicy` CR, follow the steps below.
36 |
37 | ### Create CR via web console
38 | 1. Go to **Operators** -> **Installed Operators**.
39 | 2. Open **Intel Gaudi AI Accelerator Operator**.
40 | 3. Navigate to tab **Cluster Policy**.
41 | 4. Click **Create ClusterPolicy** -> set correct parameters -> Click **Create**. To set the correct parameters, please refer to [Using RedHat OpenShift Container Platform Console](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Kubernetes_Installation/Kubernetes_Operator.html#id1).
42 |
43 | ### Verify via web console
44 | 1. Verify the CR by checking the status of the DaemonSets under **Workloads** -> **DaemonSets**: **habana-ai-device-plugin-ds**, **habana-ai-driver-rhel-9-4-xxxxx**, **habana-ai-feature-discovery-ds**, **habana-ai-metric-exporter-ds**, **habana-ai-runtime-ds**.
45 | 2. The `ClusterPolicy` is now created.
46 |
47 | ### Create CR via CLI
48 | Apply the CR yaml file:
49 | ```
50 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_cluster_policy.yaml
51 | ```
52 |
53 | ### Verify the ClusterPolicy CR is created
54 | You can use the command below to verify that the `ClusterPolicy` CR has been created:
55 | ```
56 | oc get pod -n habana-ai-operator
57 |
58 | NAME READY STATUS RESTARTS AGE
59 | habana-ai-device-plugin-ds-thj7b 1/1 Running 0 10d
60 | habana-ai-driver-rhel-9-4-416-94-202412170927-0-ds-vqhzb 1/1 Running 2 10d
61 | habana-ai-feature-discovery-ds-ztl2j 1/1 Running 5 10d
62 | habana-ai-metric-exporter-ds-g5qqh 1/1 Running 0 10d
63 | habana-ai-operator-controller-manager-6c995b5646-wl7cp 2/2 Running 0 10d
64 | habana-ai-runtime-ds-x49lf 1/1 Running 0 10d
65 | ```
66 | Alternatively, you can check the status of the `ClusterPolicy` CR as shown below:
67 | ```
68 | oc describe ClusterPolicy habana-ai -n habana-ai-operator
69 |
70 | Name: habana-ai
71 | Namespace: habana-ai-operator
72 | .
73 | .
74 | Status:
75 | Conditions:
76 | Last Transition Time: 2025-01-21T18:50:46Z
77 | Message: All resources have been successfully reconciled
78 | Reason: Reconciled
79 | Status: True
80 | ```
81 | ## Verify Gaudi Provisioning
82 | After the `ClusterPolicy` instance CR is created, it will take some time for the operator to download the Gaudi out-of-tree (OOT) driver source code and build it on premises with the help of the KMM operator. The OOT driver module binaries are loaded into the RHCOS kernel on each node with Gaudi cards labelled by feature discovery. Then, the Gaudi device plugin can advertise the Gaudi resources listed in the table below for the pods on OpenShift to use. Run the command below to check the availability of Gaudi resources:
83 | ```
84 | oc describe node | grep habana.ai/gaudi
85 |
86 | habana.ai/gaudi: 8 -> total number of Gaudi cards on the cluster
87 | habana.ai/gaudi: 8 -> number of Gaudi cards allocatable on the cluster
88 | habana.ai/gaudi 4 4 -> number of Gaudi cards allocated and number of Gaudi cards available
89 | ```
90 |
91 | To view the metrics on a node with a Gaudi card, refer to [Collecting Metrics](https://docs.habana.ai/en/latest/Orchestration/Prometheus_Metric_Exporter.html?highlight=metrics#collecting-metrics).
92 |
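As a quick in-cluster spot check, you can also query the metric exporter directly. This is a sketch only: it assumes the default port 41611 set in the `ClusterPolicy` above, and the pod name/IP are placeholders to adjust for your cluster:
```
$ oc get pods -n habana-ai-operator -o wide | grep metric-exporter
$ curl -s http://<metric-exporter-pod-ip>:41611/metrics | head
```
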
93 | ## Resources Provided by Habana Gaudi Device Plugin
94 | These resources are the interface through which user pods claim and consume the hardware features. See the table below for details:
95 |
96 | | Feature | Resources | Description |
97 | | ------- | --------- | ----------- |
98 | | Habana Gaudi | `habana.ai/gaudi` | Number of Habana Gaudi Card resources ready to claim |
99 |
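For illustration, a minimal sketch of a pod claiming a single Gaudi card (the name and image below are placeholders; [test-pod.yaml](/tests/gaudi/l2/test-pod.yaml) in this repository serves the same purpose):

```
apiVersion: v1
kind: Pod
metadata:
  name: gaudi-claim-example                              # hypothetical name
spec:
  containers:
    - name: workload
      image: registry.example.com/gaudi-workload:latest  # placeholder image
      resources:
        limits:
          habana.ai/gaudi: 1                             # claim one Gaudi card
```
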
100 | ## Upgrade Intel Gaudi SPI Firmware
101 | Refer to [Upgrade Intel Gaudi SPI Firmware](/gaudi/Gaudi-SPI-Firmware-Upgrade.md) to upgrade the SPI firmware on Intel Gaudi.
--------------------------------------------------------------------------------
/workloads/opea/chatqna/README.md:
--------------------------------------------------------------------------------
1 | # Deploy OPEA ChatQnA workload on OCP
2 |
3 | ## Overview
4 | The workload is based on the [OPEA ChatQnA Application](https://github.com/opea-project/GenAIExamples/tree/v0.8/ChatQnA) running on Intel® Gaudi Accelerator with OpenShift and OpenShift AI. Refer to the [OPEA Generative AI Examples](https://github.com/opea-project/GenAIExamples/tree/v0.8) for more details about the OPEA workloads.
5 |
6 | **Note**: This workload is still under heavy development; updates are expected.
7 |
8 | ## Prerequisites
9 | * Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster)
10 | * Persistent storage using NFS is ready. Refer to the [documentation](https://docs.openshift.com/container-platform/4.16/storage/persistent_storage/persistent-storage-nfs.html) for details on setting it up.
11 |
12 | **Note**: Refer to the [documentation](https://docs.openshift.com/container-platform/4.16/storage/index.html) for setting up other types of persistent storage.
13 | * Provisioned Intel Gaudi accelerator on RHOCP cluster. Follow steps [here](/gaudi/README.md)
14 | * RHOAI is installed. Follow steps [here](/e2e/inference/README.md/#install-rhoai)
15 | * The Intel Gaudi AI accelerator is enabled with RHOAI. Follow steps [here](/e2e/inference/README.md/#enable-intel-gaudi-ai-accelerator-with-rhoai)
16 | * MinIO-based S3 service ready for RHOAI. Follow steps [here](https://ai-on-openshift.io/tools-and-applications/minio/minio/#create-a-matching-data-connection-for-minio)
17 |
18 | ## Deploy Model Serving for OPEA ChatQnA Microservices with RHOAI
19 |
20 | ### Create OpenShift AI Data Science Project
21 |
22 | * Click ```Search -> Routes -> rhods-dashboard``` from the OCP web console and launch the RHOAI dashboard.
23 |
24 | * From the dashboard, click ```Data Science Projects``` and create a project, for example ```OPEA-chatqna-modserving```.
25 |
26 | ### Preload the models
27 |
28 | * Refer to [link](https://huggingface.co/docs/hub/en/models-downloading) and download the model [Llama2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf).
29 |
30 | * Refer to [link](https://ai-on-openshift.io/tools-and-applications/minio/minio/#create-a-matching-data-connection-for-minio) and upload the model to minio/s3 storage.
31 |
32 | * Click ```OPEA-chatqna-modserving``` and choose the ```Data Connection``` section. In the fields, add your access and secret keys from MinIO. Follow [link](https://ai-on-openshift.io/tools-and-applications/minio/minio/#create-a-matching-data-connection-for-minio).
33 |
34 | ### Launch the Model Serving with Intel Gaudi AI Accelerator
35 |
36 | * Click on ```Settings``` and choose ```ServingRuntime```. Copy or import the [tgi_gaudi_servingruntime.yaml](tgi_gaudi_servingruntime.yaml); the [tgi-gaudi](https://github.com/huggingface/tgi-gaudi) serving runtime is used. Follow the image below.
37 |
38 | 
39 |
40 | * Go to the ```Models``` section in the project ```OPEA-chatqna-modserving``` and follow the image below.
41 |
42 | 
43 |
44 | * The model server is now in the creation state. Once ready, the status will be updated to green and the inference endpoint can be seen. Refer to the image below.
45 |
46 | 
47 |
48 | ## Deploy ChatQnA Megaservice and Database
49 |
50 | ### Create namespace
51 |
52 | ```
53 | oc create namespace opea-chatqna
54 | ```
55 |
56 | ### Create persistent volumes
57 | The NFS is used to create the Persistent Volumes for ChatQnA MegaService to claim and use.
58 |
59 | Make sure to update NFS server IP and path in ```persistent_volumes.yaml``` before applying command below.
60 | For example:
61 | ```
62 | nfs:
63 | server: 10.20.1.2 # nfs server
64 | path: /my_nfs # nfs path
65 | ```
66 |
67 | ```
68 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/persistent_volumes.yaml
69 |
70 | ```
71 |
72 | * Check that the persistent volumes are created:
73 |
74 | ```
75 | $ oc get pv
76 | NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS
77 | chatqna-megaservice-pv-0 100Mi RWO Retain Available
78 | chatqna-megaservice-pv-1 100Mi RWO Retain Available
79 | chatqna-megaservice-pv-2 100Mi RWO Retain Available
80 |
81 | ```
82 | ### Building OPEA ChatQnA MegaService Container Image
83 | ```
84 | create_megaservice_container.sh
85 | ```
86 |
87 | ### Deploy Redis Vector Database Service
88 | ```
89 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/redis_deployment_service.yaml
90 |
91 | ```
92 |
93 | Check that the pod and service are running:
94 |
95 | ```
96 | $ oc get pods
97 | NAME READY STATUS RESTARTS AGE
98 | redis-vector-db-6b5747bf7-sl8fr 1/1 Running 0 21s
99 | ```
100 |
101 | ```
102 | $ oc get svc
103 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
104 | redis-vector-db ClusterIP 1.2.3.4 6379/TCP,8001/TCP 43s
105 | ```
106 |
107 | ### Deploy ChatQnA MegaService
108 |
109 | Update the inference endpoint (the ```TGI_ENDPOINT``` value) in ```chatqna_megaservice_deployment.yaml``` to point to the inference endpoint created in the model serving step, as sketched below.
110 |
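For reference, this is the environment variable to edit in the deployment (the address shown is a placeholder):

```
- name: TGI_ENDPOINT
  value: http://xxx.xxx.xxx.xxx:xxx   # replace with your TGI inference endpoint
```

Then apply the deployment: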
111 | ```
112 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/chatqna_megaservice_deployment.yaml
113 | ```
114 |
115 | Check that the pod and service are running:
116 |
117 | ```
118 | $ oc get pods
119 | NAME READY STATUS RESTARTS AGE
120 | chatqna-megaservice-54487649b5-sgsh2 1/1 Running 0 95s
121 | ```
122 |
123 | ```
124 | $ oc get svc
125 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
126 | chatqna-megaservice ClusterIP 1.2.3.4 8000/TCP 99s
127 | ```
128 |
129 | ### Verify the Megaservice
130 | Use the command below, replacing ```<service-ip>``` with the ClusterIP of the ```chatqna-megaservice``` service:
131 |
132 | ```
133 | curl http://<service-ip>:8000/v1/rag/chat_stream \
134 | -X POST \
135 | -d '{"query":"What is a constellation?"}' \
136 | -H 'Content-Type: application/json'
137 |
138 | ```
--------------------------------------------------------------------------------
/kmmo/README.md:
--------------------------------------------------------------------------------
1 | # Setting up Out of Tree Drivers
2 |
3 | ## Introduction
4 | [Kernel module management (KMM) operator](https://github.com/rh-ecosystem-edge/kernel-module-management) manages the deployment and lifecycle of out-of-tree kernel modules on RHOCP.
5 |
6 | In this release, KMM operator is used to manage and deploy the Intel® Data Center GPU driver container image on the RHOCP cluster.
7 |
8 | Intel data center GPU driver container images are released from [Intel Data Center GPU Driver for OpenShift Project](https://github.com/intel/intel-data-center-gpu-driver-for-openshift/tree/main/release#intel-data-center-gpu-driver-container-images-for-openshift-release).
9 |
10 | ## KMM operator working mode
11 | - **Pre-build mode** - This is the default and recommended mode. KMM Operator uses [this pre-built and certified Intel Data Center GPU driver container image](https://catalog.redhat.com/software/containers/intel/intel-data-center-gpu-driver-container/6495ee55c8b2461e35fb8264), which is published on the Red Hat Ecosystem Catalog to provision Intel Data Center GPUs on a RHOCP cluster.
12 | - **On-premises build mode** - Users can optionally build and deploy their own driver container images on-premises through the KMM operator.
13 |
14 | ## Prerequisites
15 | - Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster).
16 | - Setup node feature discovery. Follow steps [here](/nfd/README.md).
17 |
18 | ## Install KMM operator
19 | Follow the installation guide below to install the KMM operator via CLI or web console.
20 | - [Install from CLI](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/kernel-module-management-operator#kmm-install-using-cli_kernel-module-management-operator)
21 | - [Install from web console](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/kernel-module-management-operator#kmm-install-using-web-console_kernel-module-management-operator)
22 |
23 | ## Canary deployment with KMM
24 | Canary deployment is enabled by default to deploy the driver container image only on specific node(s) to ensure the initial deployment succeeds prior to rollout to all the eligible nodes in the cluster. This safety mechanism can reduce risk and prevent a deployment from adversely affecting the entire cluster.
25 |
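For context, the canary scoping comes from the node selector in [`intel-dgpu.yaml`](/kmmo/intel-dgpu.yaml). A sketch of the relevant stanza (abbreviated here; see the file for the full resource):

```
selector:
  intel.feature.node.kubernetes.io/gpu: 'true'
  intel.feature.node.kubernetes.io/dgpu-canary: 'true'  # remove once the canary deployment is verified
```
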
26 | ## Set alternative firmware path at runtime with KMM
27 | **NOTE**: This update is required only when using KMM version v2.1.1 or lower. Starting with v2.2.0, it is not required.
28 |
29 | Follow the steps below to set the alternative firmware path at runtime.
30 |
31 | 1. Update KMM operator `ConfigMap` to set `worker.firmwareHostPath` to `/var/lib/firmware`
32 |
33 | ```
34 | $ oc patch configmap kmm-operator-manager-config -n openshift-kmm --type='json' -p='[{"op": "add", "path": "/data/controller_config.yaml", "value": "healthProbeBindAddress: :8081\nmetricsBindAddress: 127.0.0.1:8080\nleaderElection:\n enabled: true\n resourceID: kmm.sigs.x-k8s.io\nwebhook:\n disableHTTP2: true\n port: 9443\nworker:\n runAsUser: 0\n seLinuxType: spc_t\n firmwareHostPath: /var/lib/firmware"}]'
35 | ```
36 |
37 | 2. Delete the KMM operator controller pod for `ConfigMap` changes to take effect.
38 | ```
39 | $ oc get pods -n openshift-kmm | grep -i "kmm-operator-controller-" | awk '{print $1}' | xargs oc delete pod -n openshift-kmm
40 | ```
41 |
42 | For more details, see [link.](https://openshift-kmm.netlify.app/documentation/firmwares/#setting-the-kernels-firmware-search-path)
43 |
44 | ## Deploy Intel Data Center GPU Driver with pre-build mode
45 | Follow the steps below to deploy the driver container image with pre-build mode.
46 | 1. Find all nodes with an Intel Data Center GPU card using the following command:
47 | ```
48 | $ oc get nodes -l intel.feature.node.kubernetes.io/gpu=true
49 | ```
50 | Example output:
51 | ```
52 | NAME STATUS ROLES AGE VERSION
53 | icx-dgpu-1 Ready worker 30d v1.25.4+18eadca
54 | ```
55 |
56 | 2. Label the node(s) in the cluster using the command shown below for the initial canary deployment.
57 | ```
58 | $ oc label node <node_name> intel.feature.node.kubernetes.io/dgpu-canary=true
59 | ```
60 |
61 | 3. Use pre-build mode to deploy the driver container.
62 | ```
63 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/kmmo/intel-dgpu.yaml
64 | ```
65 |
66 | 4. After the driver is verified on the cluster through the canary deployment, simply remove the line shown below from the [`intel-dgpu.yaml`](/kmmo/intel-dgpu.yaml) file and reapply the yaml file to deploy the driver to the entire cluster. As a cluster administrator, you can also select another deployment policy.
67 | ```
68 | intel.feature.node.kubernetes.io/dgpu-canary: 'true'
69 | ```
70 |
71 | ## Verification
72 | To verify that the drivers have been loaded, follow the steps below:
73 | 1. List the nodes labeled with `kmm.node.kubernetes.io/openshift-kmm.intel-dgpu.ready` using the command shown below:
74 | ```
75 | $ oc get nodes -l kmm.node.kubernetes.io/openshift-kmm.intel-dgpu.ready
76 | ```
77 | Example output:
78 | ```
79 | NAME STATUS ROLES AGE VERSION
80 | icx-dgpu-1 Ready worker 30d v1.25.4+18eadca
81 | ```
82 | The label shown above indicates that the KMM operator has successfully deployed the drivers and firmware on the node.
83 |
84 | 2. If you want to further debug the driver on the node, follow these steps:
85 | a. Navigate to the web console (Compute -> Nodes -> Select a node that has the GPU card -> Terminal).
86 | b. Run the commands shown below in the web console terminal:
87 | ```
88 | $ chroot /host
89 | $ lsmod | grep i915
90 | ```
91 | Ensure `i915` and `intel_vsec` are loaded in the kernel, as shown in the output below:
92 | ```
93 | i915 3633152 0
94 | i915_compat 16384 1 i915
95 | intel_vsec 16384 1 i915
96 | intel_gtt 20480 1 i915
97 | video 49152 1 i915
98 | i2c_algo_bit 16384 1 i915
99 | drm_kms_helper 290816 1 i915
100 | drm 589824 3 drm_kms_helper,i915
101 | dmabuf 77824 4 drm_kms_helper,i915,i915_compat,dr
102 | ```
103 | c. Run `dmesg` to ensure there are no errors in the kernel message log.
104 |
105 | ## See Also
106 |
--------------------------------------------------------------------------------
/e2e/inference/README.md:
--------------------------------------------------------------------------------
1 | # Intel AI Inference End-to-End Solution
2 |
3 | ## Overview
4 | Intel AI inference end-to-end solution with RHOCP is based on the Intel® Data Center GPU Flex Series provisioning, Intel® OpenVINO™, and [Red Hat OpenShift AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai) (RHOAI) on RHOCP. There are two AI inference modes verified with Intel® Xeon® processors and Intel Data Center GPU Flex Series with RHOCP.
5 | * Interactive mode – RHOAI provides OpenVINO based Jupyter Notebooks for users to interactively debug the inference applications or [optimize the models](https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html) on RHOCP using data center GPU cards or Intel Xeon processors.
6 | * Deployment mode – [OpenVINO Model Server](https://github.com/openvinotoolkit/model_server) (OVMS) can be used to deploy the inference workloads in data center and edge computing environments on RHOCP.
7 |
8 | ## Prerequisites
9 | * Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster)
10 | * Provisioned Intel Data Center GPU Flex Series. Follow steps [here](/README.md#provisioning-intel-hardware-features-on-rhocp)
11 | * Set up node feature discovery (NFD). Follow the steps [here](/nfd/README.md)
12 | * Set up out-of-tree drivers for Intel GPU provisioning. Follow the steps [here](/kmmo/README.md)
13 | * Set up the Intel Device Plugins Operator and create the Intel GPU device plugin. Follow the steps [here](/device_plugins/README.md)
14 |
15 | ## Install RHOAI
16 | The Red Hat certified RHOAI operator is published at [Red Hat Ecosystem Catalog](https://catalog.redhat.com/software/container-stacks/detail/63b85b573112fe5a95ee9a3a). You can use the command line interface (CLI) or web console to install it.
17 | ### Install using CLI (To be added)
18 | ### Install using Web Console
19 | 1. On the RHOCP web console, click Operators → OperatorHub.
20 | 2. Search for the Red Hat OpenShift AI Operator and click Install. The operator is installed in the namespace `redhat-ods-operator`.
21 | ### Verification
22 | 1. Navigate to Operators → Installed Operators page.
23 | 2. Ensure that in the `redhat-ods-operator` namespace, the Red Hat OpenShift AI status is **InstallSucceeded**.
24 | 3. Click on `Search` -> `Routes` -> `rhods-dashboard` from the web console and access the RHOAI UI link.
25 | **Note:** When installing the operator, the default `kfdef` Custom Resource (CR) is created. This CR enables the dashboard for users to browse and launch Jupyter Notebooks projects on an RHOCP cluster. Please refer to this [link](https://github.com/red-hat-data-services/odh-deployer) for more details about `kfdef`.
26 | ## Install OpenVINO Operator
27 | The OpenVINO operator is published at [Red Hat Ecosystem Catalog](https://catalog.redhat.com/software/container-stacks/detail/60649a56209af65d24b7ca9e). You can use the CLI or web console to install it.
28 | ### Install using CLI (To be added)
29 | ### Install using Web Console
30 | Follow this [link](https://github.com/openvinotoolkit/operator/blob/v1.1.0/docs/operator_installation.md#operator-instalation) to install the operator via the web console.
31 |
32 | ## Work with Interactive Mode
33 | To enable the interactive mode, the OpenVINO notebook CR needs to be created and integrated with RHOAI.
34 | 1. Click on the `create Notebook` option from the web console and follow these [steps](https://github.com/openvinotoolkit/operator/blob/main/docs/notebook_in_rhods.md#integration-with-openshift-data-science-and-open-data-hub) to create the notebook CR.
35 | 2. Enable Intel Data Center GPU on the RHOAI Dashboard - **Technical Preview feature**
36 |
37 | Create `AcceleratorProfile` in the `redhat-ods-applications` namespace (a sketch of such a profile appears at the end of this section)
38 | ```
39 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/e2e/inference/accelerator_profile_flex140.yaml
40 | ```
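41 |
42 | For reference, the applied manifest defines an `AcceleratorProfile` similar to the sketch below; the field values here are illustrative, and the linked YAML file is the authoritative definition:
43 |
44 | ```
45 | apiVersion: dashboard.opendatahub.io/v1
46 | kind: AcceleratorProfile
47 | metadata:
48 |   name: accelerator-profile-flex140   # illustrative name
49 |   namespace: redhat-ods-applications
50 | spec:
51 |   displayName: Intel® Data Center GPU Flex Series 140
52 |   enabled: true
53 |   identifier: gpu.intel.com/i915   # resource name exposed by the Intel GPU device plugin
54 | ```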
41 |
42 | 3. Navigate to the `openvino-notebooks` ImageStream and add the previously created `AcceleratorProfile` name to the annotation field, as shown in the image below (a sketch of the resulting annotation follows the image):
43 |
44 | 
45 |
46 | 4. Navigate to `Search` -> `Networking` -> `Routes` from the web console and access the `rhods-dashboard` route in the `redhat-ods-applications` namespace, as shown in the image below. Click on the location link to launch the RHOAI dashboard.
47 |
48 | 
49 |
50 | 5. If step 2 is successful, `Intel® Data Center GPU Flex Series 140` is shown in the accelerator dropdown menu in `rhods-dashboard`. Users can then run the OpenVINO notebook image with an Intel® Data Center GPU Flex Series 140 card.
51 |
52 | 
53 |
54 | Follow the [link](https://github.com/openvinotoolkit/operator/blob/main/docs/notebook_in_rhods.md#integration-with-openshift-data-science-and-open-data-hub) for more details on the available Jupyter Notebooks.
55 |
56 | ## Work with Deployment Mode
57 | 1. From the web console, click the ModelServer option described in this [link](https://github.com/openvinotoolkit/operator/blob/v1.1.0/docs/operator_installation.md#operator-instalation) and follow the [steps](https://github.com/openvinotoolkit/operator/blob/v1.1.0/docs/modelserver.md#managing-model-servers-via-operator) to start the OVMS instance.
58 | 2. To enable the Intel Data Center GPU, modify the OVMS instance options as shown in the screenshots below.
59 |
60 | * The images below show the `gpu.intel.com/i915` resource requests and limits for OVMS; a YAML sketch of the same settings follows the images.
61 |
62 | 
63 |
64 | 
65 |
66 | # Enable Intel Gaudi AI Accelerator with RHOAI
67 |
68 | * From the web console:
69 |
70 | To enable and use the Intel Gaudi accelerator from the RHOAI web console, follow the [documentation](https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.11/html/working_with_accelerators/intel-gaudi-ai-accelerator-integration_accelerators#enabling-intel-gaudi-ai-accelerators_accelerators).
71 | * From the CLI:
72 |
73 | Deploy the `accelerator_profile_gaudi.yaml` in the `redhat-ods-applications` namespace.
74 |
75 | ```
76 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/e2e/inference/accelerator_profile_gaudi.yaml
77 | ```
78 | ## Intel Gaudi AI Accelerator with Intel® Gaudi AI Software Tools Containers on OpenShift AI
79 | To use Intel Gaudi AI Accelerator with Intel® Gaudi AI Software Tools Containers on OpenShift AI, follow the [documentation](https://github.com/intel/ai-containers/blob/main/enterprise/redhat/openshift-ai/gaudi/README.md).
80 |
81 | ## See Also
82 | [GPU accelerated demo with OpenVINO](https://www.youtube.com/watch?v=3fTz_k4JT2A)
--------------------------------------------------------------------------------
/one_click/gaudi_provisioning_playbook.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 | # Installs the NFD and KMM operators, deploys the Intel Gaudi Base Operator
4 | # with its NFD and DeviceConfig CRs, and verifies that habana.ai/gaudi
5 | # resources are exposed on the nodes. Requires the kubernetes.core collection
6 | # and a valid kubeconfig.
7 | - hosts: localhost
4 | vars:
5 | kubeconfig_path: "~/.kube/config"
6 | environment:
7 | KUBECONFIG: "{{ kubeconfig_path }}"
8 | tasks:
9 | - name: Install Dependencies
10 | tags:
11 | - install_dependencies
12 | block:
13 | - name: NFD - Install Node Feature Discovery Operator
14 | tags:
15 | - nfd
16 | block:
17 | - name: NFD - Create openshift-nfd namespace
18 | k8s:
19 | name: openshift-nfd
20 | api_version: v1
21 | kind: Namespace
22 | state: present
23 | wait: yes
24 | - name: NFD - Create an nfd-operator group v1
25 | k8s:
26 | definition:
27 | apiVersion: operators.coreos.com/v1
28 | kind: OperatorGroup
29 | metadata:
30 | generateName: openshift-nfd-
31 | name: openshift-nfd
32 | namespace: openshift-nfd
33 | spec:
34 | targetNamespaces:
35 | - openshift-nfd
36 | wait: yes
37 | - name: NFD - Create subscription for RH NFD operator
38 | k8s:
39 | definition:
40 | apiVersion: operators.coreos.com/v1alpha1
41 | kind: Subscription
42 | metadata:
43 | name: nfd
44 | namespace: openshift-nfd
45 | spec:
46 | channel: "stable"
47 | installPlanApproval: Automatic
48 | name: nfd
49 | source: redhat-operators
50 | sourceNamespace: openshift-marketplace
51 | wait: yes
52 | wait_condition:
53 | reason: AllCatalogSourcesHealthy
54 | type: CatalogSourcesUnhealthy
55 | status: 'False'
56 | - name: NFD - Wait until the nfd-controller-manager Deployment is available
57 | k8s_info:
58 | kind: Deployment
59 | wait: yes
60 | name: nfd-controller-manager
61 | namespace: openshift-nfd
62 | wait_condition:
63 | type: Available
64 | status: 'True'
65 | reason: MinimumReplicasAvailable
66 | - name: KMM - Install Kernel Module Management Operator
67 | tags:
68 | - kmm
69 | block:
70 | - name: KMM - Create openshift-kmm namespace
71 | k8s:
72 | name: openshift-kmm
73 | api_version: v1
74 | kind: Namespace
75 | state: present
76 | wait: yes
77 | - name: KMM - Create OperatorGroup v1 in openshift-kmm namespace
78 | k8s:
79 | definition:
80 | apiVersion: operators.coreos.com/v1
81 | kind: OperatorGroup
82 | metadata:
83 | name: kernel-module-management
84 | namespace: openshift-kmm
85 | wait: yes
86 | - name: KMM - Create Subscription for KMM Operator
87 | k8s:
88 | definition:
89 | apiVersion: operators.coreos.com/v1alpha1
90 | kind: Subscription
91 | metadata:
92 | name: kernel-module-management
93 | namespace: openshift-kmm
94 | spec:
95 | channel: stable
96 | installPlanApproval: Automatic
97 | name: kernel-module-management
98 | source: redhat-operators
99 | sourceNamespace: openshift-marketplace
100 | wait: yes
101 | wait_condition:
102 | reason: AllCatalogSourcesHealthy
103 | type: CatalogSourcesUnhealthy
104 | status: 'False'
105 | - name: KMM - Wait until the kmm-operator-controller Deployment is available
106 | k8s_info:
107 | kind: Deployment
108 | wait: yes
109 | name: kmm-operator-controller
110 | namespace: openshift-kmm
111 | wait_condition:
112 | type: Available
113 | status: 'True'
114 | reason: MinimumReplicasAvailable
115 | - name: Install Intel Gaudi Base Operator
116 | tags:
117 | - intel-gaudi
118 | block:
119 | - name: Install Intel Gaudi Base Operator
120 | k8s:
121 | state: present
122 | src: "../gaudi/gaudi_install_operator.yaml"
123 | wait: yes
124 | - name: Wait until the Intel Gaudi controller-manager Deployment is available
125 | k8s_info:
126 | kind: Deployment
127 | wait: yes
128 | name: controller-manager
129 | namespace: habana-ai-operator
130 | wait_condition:
131 | type: Available
132 | status: 'True'
133 | reason: MinimumReplicasAvailable
134 | - name: NFD - Install NFD CRs
135 | block:
136 | - name: NFD - Create NFD discovery instance for Intel Gaudi
137 | k8s:
138 | state: present
139 | src: "../gaudi/gaudi_nfd_instance_openshift.yaml"
140 | wait: yes
141 | - name: Install Intel Gaudi DeviceConfig CR
142 | block:
143 | - name: Create Intel Gaudi DeviceConfig
144 | k8s:
145 | state: present
146 | src: "../gaudi/gaudi_device_config.yaml"
147 | wait: yes
148 | - name: Verify Intel Gaudi Resources
149 | tags:
150 | - gaudi_resource_test
151 | block:
152 | - name: Get Gaudi Node Resource Information
153 | kubernetes.core.k8s_info:
154 | api: v1
155 | kind: Node
156 | label_selectors:
157 | - "kmm.node.kubernetes.io/habana-ai-operator.intel-gaudi-module.device-plugin-ready="
158 | - "kmm.node.kubernetes.io/habana-ai-operator.intel-gaudi-module.ready="
159 | wait: yes
160 | wait_timeout: 120
161 | register: cluster_nodes_info
162 | until:
163 | - cluster_nodes_info.resources is defined
164 | - name: Print cluster resources
165 | debug:
166 | msg:
167 | - "Please verify the Capacity and Allocatable Habana Gaudi resources on the node:"
168 | - "Capacity: habana.ai/gaudi: {{ cluster_nodes_info.resources[0].status.capacity['habana.ai/gaudi'] }}"
169 | - "Allocatable: habana.ai/gaudi: {{ cluster_nodes_info.resources[0].status.allocatable['habana.ai/gaudi'] }}"
--------------------------------------------------------------------------------
/device_plugins/deploy_qat.md:
--------------------------------------------------------------------------------
1 | # Create Intel QAT Device Plugin CR
2 |
3 | ## Create a CR via web console
4 | 1. Go to **Operators** -> **Installed Operators**.
5 | 2. Open **Intel Device Plugins Operator**.
6 | 3. Navigate to tab **Intel QuickAssist Technology Device Plugin**.
7 | 4. Click **Create QatDevicePlugin**, set the correct parameters, and click **Create**.
8 | 5. Optional: If you want to make any customizations, select YAML view and edit the details. When you are done, click **Create**.
9 |
10 | ## Verify via web console
11 | 1. Verify the CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-qat-plugin**.
12 | 2. The `QatDevicePlugin` CR is now created.
13 |
14 | ## Create CR via CLI
15 | Apply the CR YAML file:
16 | ```
17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/qat_device_plugin.yaml
18 | ```
19 |
20 | ## Verify via CLI
21 | Verify that the device plugin CR is ready:
22 | ```
23 | $ oc get QatDevicePlugin
24 | ```
25 | Output:
26 | ```
27 | NAME DESIRED READY NODE SELECTOR AGE
28 | qatdeviceplugin-sample 1 1 {"intel.feature.node.kubernetes.io/qat":"true"} 3h27m
29 | ```
30 |
31 | # Verify QAT Device Plugin
32 | After the plugin is deployed, use the command below to verify the QAT resources:
33 | ```
34 | $ oc describe node | grep qat.intel.com
35 | qat.intel.com/cy: 32
36 | qat.intel.com/cy: 32
37 | qat.intel.com/dc: 32
38 | qat.intel.com/dc: 32
39 | ```
40 | **Note**: By default, the device plugin registers half of the resources as `qat.intel.com/cy` and half as `qat.intel.com/dc`. For more details about configuring the QAT resources, refer to the QAT Device Plugin Configuration section below.
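41 |
42 | As an illustration of how a workload consumes these resources, a pod requests a QAT VF through the extended resource name. A minimal sketch, assuming a qatlib-based workload image (the pod name and image are placeholders, and `IPC_LOCK` is typically required by qatlib user-space workloads):
43 |
44 | ```
45 | apiVersion: v1
46 | kind: Pod
47 | metadata:
48 |   name: qat-sample   # hypothetical name
49 | spec:
50 |   containers:
51 |   - name: qat-sample
52 |     image: <qatlib-workload-image>   # placeholder: supply your own image
53 |     securityContext:
54 |       capabilities:
55 |         add: ["IPC_LOCK"]   # assumption: needed by qatlib user-space workloads
56 |     resources:
57 |       requests:
58 |         qat.intel.com/cy: '1'   # request one crypto VF
59 |       limits:
60 |         qat.intel.com/cy: '1'
61 | ```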
41 |
42 | # QAT Device Plugin Configuration
43 | > **Note**: The QAT device plugin can be configured with command-line flags. In this release, only the configurations in the table below are verified and supported on RHOCP.
44 |
45 | For more details about the QAT device plugin configuration flags, see [Modes and Configurations Options](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/qat_plugin/README.md#modes-and-configuration-options).
46 |
47 | | Flag | Configuration | Description |
48 | | ---- | ---- | ---- |
49 | | `-dpdk-driver` | vfio-pci | Using vfio-pci driver to manage QAT VFIO device. See details [here](https://doc.dpdk.org/guides/linux_gsg/linux_drivers.html) |
50 | | `-kernel-vf-drivers` | 4xxxvf | Supporting 4xxx QAT device **Note**: Verified on 4th Gen Intel® Xeon® Scalable processors. See details [here](https://github.com/intel/qatlib/blob/main/INSTALL#L72) |
51 | | `-max-num-devices` | 128 | The maximum number of QAT VF devices exposed per 4xxx QAT device. If the value exceeds what the device supports, the device's maximum is used. |
52 | | `-provisioning-config` | Name of the ConfigMap | See section [QAT resource configuration](/device_plugins/deploy_qat.md#qat-resource-configuration-experimental) |
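53 |
54 | In the `QatDevicePlugin` CR, these flags correspond to fields under `spec`. A minimal sketch using the verified values above (the name and node selector are illustrative and mirror the sample output earlier):
55 |
56 | ```
57 | apiVersion: deviceplugin.intel.com/v1
58 | kind: QatDevicePlugin
59 | metadata:
60 |   name: qatdeviceplugin-sample
61 | spec:
62 |   dpdkDriver: vfio-pci
63 |   kernelVfDrivers:
64 |     - 4xxxvf
65 |   maxNumDevices: 128
66 |   nodeSelector:
67 |     intel.feature.node.kubernetes.io/qat: 'true'
68 | ```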
53 |
54 | ## QAT Resource Configuration (experimental)
55 |
56 | **NOTE**: In this release, this is an experimental feature. Efforts to [enhance this feature](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/1529) and [make it more stable](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/1542) are ongoing.
57 |
58 | Users can follow the steps below to customize the QAT resource configuration:
59 | 1. Create the ConfigMap for the QAT resource configuration
60 | ```
61 | $ oc create configmap --namespace=openshift-operators --from-literal "qat.conf=ServicesEnabled=