├── .gitignore
├── _static
│   └── custom.css
├── docs
│   ├── index.html
│   ├── images
│   │   ├── Ecosystem.png
│   │   ├── gaudinet_image.png
│   │   ├── model-server-status.png
│   │   ├── rhoai-deploy-model.png
│   │   ├── tgi-serving-runtime.png
│   │   ├── rhods-dashboard-route.png
│   │   ├── Ovms-Gpu-resource-limit.png
│   │   ├── Ovms-Gpu-resource-request.png
│   │   ├── openvino-accelerator-field.png
│   │   ├── Operator-Architecture-Options.png
│   │   ├── accelerator-profile-dropdown.png
│   │   ├── congestion_test_single_leaf_switch.png
│   │   ├── congestion_test_single_leaf_spine_switch.png
│   │   ├── bisection_bandwidth_testing_on_leaf_switch.png
│   │   ├── bisection_bandwidth_testing_on_leaf_spine_switchs.png
│   │   ├── Intel-Technology-Enabling-for-OpenShift-Architecture.png
│   │   ├── bisection_bandwidth_testing_all_Gaudis_leaf_spine_switchs.png
│   │   ├── bisection_bandwidth_testing_all_nodes_leaf_spine_switchs.png
│   │   └── Intel-Enterprise-AI-Foundation-for-OpenShift-Training-Solution.png
│   ├── releases.rst
│   └── supported_platforms.md
├── requirements.txt
├── security
│   ├── dsa_serviceAccount.yaml
│   ├── dsa_role.yaml
│   ├── dsa_roleBinding.yaml
│   ├── qatlib_rbac.yaml
│   ├── dsa_scc.yaml
│   └── qatlib_scc.yaml
├── tests
│   ├── l2
│   │   ├── dsa
│   │   │   ├── dsa_imagestream.yaml
│   │   │   ├── rh_auth.yaml
│   │   │   ├── dsa_job.yaml
│   │   │   ├── dsa_build.yaml
│   │   │   └── README.md
│   │   ├── sgx
│   │   │   ├── sgx_imagestream.yaml
│   │   │   ├── sgx_job.yaml
│   │   │   ├── README.md
│   │   │   └── sgx_build.yaml
│   │   ├── dgpu
│   │   │   ├── clinfo_job.yaml
│   │   │   ├── intelvpl_job.yaml
│   │   │   ├── hwinfo_job.yaml
│   │   │   ├── vainfo_job.yaml
│   │   │   ├── hwinfo_build.yaml
│   │   │   ├── clinfo_build.yaml
│   │   │   ├── vainfo_build.yaml
│   │   │   ├── intelvpl_build.yaml
│   │   │   └── README.md
│   │   ├── qat
│   │   │   ├── qatlib_job.yaml
│   │   │   ├── qatlib_build.yaml
│   │   │   └── README.md
│   │   └── README.md
│   ├── gaudi
│   │   └── l2
│   │       ├── vllm_hf_secret.yaml
│   │       ├── test-pod.yaml
│   │       ├── hl-smi_job.yaml
│   │       ├── hl-smi-firmware_job.yaml
│   │       ├── hccl_build.yaml
│   │       ├── hccl_job.yaml
│   │       ├── vllm_deployment.yaml
│   │       ├── vllm_buildconfig.yaml
│   │       └── README.md
│   ├── l3
│   │   └── README.md
│   └── README.md
├── e2e
│   └── inference
│       ├── accelerator_profile_gaudi.yaml
│       ├── accelerator_profile_flex140.yaml
│       └── README.md
├── security.md
├── nfd
│   ├── node-feature-discovery-openshift.yaml
│   ├── node-feature-rules-openshift.yaml
│   └── README.md
├── device_plugins
│   ├── sgx_device_plugin.yaml
│   ├── gpu_device_plugin.yaml
│   ├── dsa_device_plugin.yaml
│   ├── qat_device_plugin.yaml
│   ├── install_operator.yaml
│   ├── deploy_sgx.md
│   ├── deploy_gpu.md
│   ├── deploy_dsa.md
│   ├── README.md
│   └── deploy_qat.md
├── machine_configuration
│   ├── 100-intel-iommu-on.yaml
│   └── README.md
├── gaudi
│   ├── gaudi_spi_fw_upgrade_job.yaml
│   ├── gaudi_install_operator.yaml
│   ├── gaudi_spi_fw_upgrade_build.yaml
│   ├── Gaudi-SPI-Firmware-Upgrade.md
│   ├── gaudi_cluster_policy.yaml
│   └── README.md
├── workloads
│   └── opea
│       └── chatqna
│           ├── create_megaservice_container.sh
│           ├── persistent_volumes.yaml
│           ├── tgi_gaudi_servingruntime.yaml
│           ├── chatqna_megaservice_buildconfig.yaml
│           ├── redis_deployment_service.yaml
│           ├── chatqna_megaservice_deployment.yaml
│           └── README.md
├── _templates
│   └── versions.html
├── set-version.sh
├── kmmo
│   ├── intel-dgpu.yaml
│   ├── intel-dgpu-on-premise-build.yaml
│   └── README.md
├── make.bat
├── playbooks
│   ├── install_device_plugins_operator.yaml
│   ├── README.md
│   ├── install_nfd_operator.yaml
│   ├── configure_nfd.yaml
│   └── intel_ocp_provisioning.yaml
├── index.rst
├── Makefile
├── CONTRIBUTING.md
├── one_click
│   ├── README.md
│   ├── gaudi_provisioning_playbook.yaml
│   └── gpu_provisioning_playbook.yaml
├── .github
│   └── workflows
│       └── publish.yml
├── conf.py
└── CODE_OF_CONDUCT.md

/.gitignore:
-------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /_static/custom.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: 90% !important; 3 | } -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/images/Ecosystem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Ecosystem.png -------------------------------------------------------------------------------- /docs/images/gaudinet_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/gaudinet_image.png -------------------------------------------------------------------------------- /docs/images/model-server-status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/model-server-status.png -------------------------------------------------------------------------------- /docs/images/rhoai-deploy-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/rhoai-deploy-model.png -------------------------------------------------------------------------------- /docs/images/tgi-serving-runtime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/tgi-serving-runtime.png -------------------------------------------------------------------------------- /docs/images/rhods-dashboard-route.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/rhods-dashboard-route.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docutils<0.18 2 | sphinx 3 | sphinx_rtd_theme 4 | recommonmark 5 | sphinx-markdown-tables 6 | sphinx-md 7 | myst_parser 8 | GitPython -------------------------------------------------------------------------------- /docs/images/Ovms-Gpu-resource-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Ovms-Gpu-resource-limit.png -------------------------------------------------------------------------------- /docs/images/Ovms-Gpu-resource-request.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Ovms-Gpu-resource-request.png -------------------------------------------------------------------------------- /docs/images/openvino-accelerator-field.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/openvino-accelerator-field.png -------------------------------------------------------------------------------- /docs/images/Operator-Architecture-Options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Operator-Architecture-Options.png -------------------------------------------------------------------------------- /docs/images/accelerator-profile-dropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/accelerator-profile-dropdown.png -------------------------------------------------------------------------------- /docs/images/congestion_test_single_leaf_switch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/congestion_test_single_leaf_switch.png -------------------------------------------------------------------------------- /docs/images/congestion_test_single_leaf_spine_switch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/congestion_test_single_leaf_spine_switch.png -------------------------------------------------------------------------------- /docs/images/bisection_bandwidth_testing_on_leaf_switch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_on_leaf_switch.png -------------------------------------------------------------------------------- /docs/images/bisection_bandwidth_testing_on_leaf_spine_switchs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_on_leaf_spine_switchs.png -------------------------------------------------------------------------------- /docs/images/Intel-Technology-Enabling-for-OpenShift-Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Intel-Technology-Enabling-for-OpenShift-Architecture.png -------------------------------------------------------------------------------- /docs/images/bisection_bandwidth_testing_all_Gaudis_leaf_spine_switchs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_all_Gaudis_leaf_spine_switchs.png -------------------------------------------------------------------------------- /docs/images/bisection_bandwidth_testing_all_nodes_leaf_spine_switchs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/bisection_bandwidth_testing_all_nodes_leaf_spine_switchs.png 
-------------------------------------------------------------------------------- /security/dsa_serviceAccount.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: v1 5 | kind: ServiceAccount 6 | metadata: 7 | name: intel-dsa 8 | namespace: intel-dsa -------------------------------------------------------------------------------- /docs/images/Intel-Enterprise-AI-Foundation-for-OpenShift-Training-Solution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/HEAD/docs/images/Intel-Enterprise-AI-Foundation-for-OpenShift-Training-Solution.png -------------------------------------------------------------------------------- /tests/l2/dsa/dsa_imagestream.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: intel-dsa-workload 8 | namespace: intel-dsa -------------------------------------------------------------------------------- /tests/l2/sgx/sgx_imagestream.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: intel-sgx-workload 8 | namespace: intel-sgx -------------------------------------------------------------------------------- /tests/gaudi/l2/vllm_hf_secret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | apiVersion: v1 4 | kind: Secret 5 | metadata: 6 | name: hf-token 7 | namespace: gaudi-validation 8 | type: Opaque 9 | data: 10 | hf-token: # Add your token -------------------------------------------------------------------------------- /tests/l2/dsa/rh_auth.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: v1 5 | kind: Secret 6 | metadata: 7 | name: rh-auth 8 | namespace: intel-dsa 9 | type: Opaque 10 | data: 11 | username: # Add username 12 | password: # Add password 13 | -------------------------------------------------------------------------------- /e2e/inference/accelerator_profile_gaudi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | apiVersion: dashboard.opendatahub.io/v1 4 | kind: AcceleratorProfile 5 | metadata: 6 | name: intel-gaudi-ai-accelerator 7 | spec: 8 | displayName: Intel Gaudi AI Accelerator 9 | description: Intel Gaudi AI Accelerator 10 | enabled: true 11 | identifier: habana.ai/gaudi -------------------------------------------------------------------------------- /security/dsa_role.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: rbac.authorization.k8s.io/v1 5 | kind: Role 6 | metadata: 7 | name: intel-dsa 8 | namespace: intel-dsa 9 | rules: 10 | - apiGroups: 11 | - 
security.openshift.io 12 | resources: 13 | - securitycontextconstraints 14 | resourceNames: 15 | - intel-dsa-scc 16 | verbs: 17 | - use -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /security/dsa_roleBinding.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: rbac.authorization.k8s.io/v1 5 | kind: RoleBinding 6 | metadata: 7 | name: intel-dsa 8 | namespace: intel-dsa 9 | roleRef: 10 | apiGroup: rbac.authorization.k8s.io 11 | kind: Role 12 | name: intel-dsa 13 | subjects: 14 | - kind: ServiceAccount 15 | name: intel-dsa 16 | namespace: intel-dsa -------------------------------------------------------------------------------- /tests/gaudi/l2/test-pod.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | apiVersion: v1 4 | kind: Pod 5 | metadata: 6 | name: test 7 | labels: 8 | app: test 9 | namespace: gaudi-validation 10 | spec: 11 | containers: 12 | - name: test 13 | command: [ "/bin/bash", "-c", "--" ] 14 | args: [ "while true; do sleep 30; done;"] 15 | image: registry.access.redhat.com/ubi9-minimal:latest 16 | -------------------------------------------------------------------------------- /nfd/node-feature-discovery-openshift.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 - 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: nfd.openshift.io/v1 5 | kind: NodeFeatureDiscovery 6 | metadata: 7 | name: nfd-instance 8 | namespace: openshift-nfd 9 | spec: 10 | operand: 11 | image: registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.18 12 | imagePullPolicy: Always 13 | servicePort: 12000 14 | workerConfig: 15 | configData: | -------------------------------------------------------------------------------- /device_plugins/sgx_device_plugin.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: deviceplugin.intel.com/v1 5 | kind: SgxDevicePlugin 6 | metadata: 7 | name: sgxdeviceplugin-sample 8 | spec: 9 | image: registry.connect.redhat.com/intel/intel-sgx-plugin@sha256:f2c77521c6dae6b4db1896a5784ba8b06a5ebb2a01684184fc90143cfcca7bf4 10 | enclaveLimit: 110 11 | provisionLimit: 110 12 | logLevel: 4 13 | nodeSelector: 14 | intel.feature.node.kubernetes.io/sgx: "true" -------------------------------------------------------------------------------- /device_plugins/gpu_device_plugin.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: 
deviceplugin.intel.com/v1 4 | kind: GpuDevicePlugin 5 | metadata: 6 | name: gpudeviceplugin-sample 7 | spec: 8 | image: registry.connect.redhat.com/intel/intel-gpu-plugin@sha256:e2c2ce658e78c35c425f16a4f8e85c5f32ce31848d9b53a644a05e7f8b7f71b0 9 | preferredAllocationPolicy: none 10 | sharedDevNum: 1 11 | logLevel: 4 12 | nodeSelector: 13 | intel.feature.node.kubernetes.io/gpu: "true" -------------------------------------------------------------------------------- /e2e/inference/accelerator_profile_flex140.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | apiVersion: dashboard.opendatahub.io/v1 4 | kind: AcceleratorProfile 5 | metadata: 6 | name: intel-gpu-flex-series-140 7 | spec: 8 | displayName: Intel® Data Center GPU Flex Series 140 9 | description: Intel Data Center GPU for inference 10 | enabled: true 11 | identifier: gpu.intel.com/i915 12 | tolerations: 13 | - effect: NoSchedule 14 | key: gpu.intel.com/flex-140 15 | operator: Exists -------------------------------------------------------------------------------- /tests/l3/README.md: -------------------------------------------------------------------------------- 1 | # L3 Test 2 | 3 | ```{admonition} Disclaimer 4 | Please note that this module is currently under development and may contain partially implemented features; therefore, it is not supported in the current release. 5 | ``` 6 | 7 | ## L3 Overview 8 | Layer 3 consists of the software stack used to provision the e2e tests for specific Intel features such as Intel® Data Center GPU Flex Series and Intel® SGX. The reference workloads (layer 4) rely directly on this layer. 9 | 10 | ### GPU 11 | Please follow the [README](/e2e/inference/README.md) for Intel Data Center GPU Flex Series-specific prerequisites.
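For a quick smoke test of the provisioned GPU stack, a pod can request the `gpu.intel.com/i915` resource exposed by the GPU device plugin. A minimal sketch, mirroring this repo's own `tests/l2/dgpu/clinfo_job.yaml` (the pod name below is illustrative, not part of the test suite):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test           # illustrative name
  namespace: intel-dgpu
spec:
  restartPolicy: Never
  containers:
  - name: clinfo
    # image built in-cluster by tests/l2/dgpu/clinfo_build.yaml
    image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-clinfo:latest
    command: ["clinfo"]
    resources:
      limits:
        gpu.intel.com/i915: 1    # one GPU resource from the Intel GPU device plugin
```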
-------------------------------------------------------------------------------- /machine_configuration/100-intel-iommu-on.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 - 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: machineconfiguration.openshift.io/v1 5 | kind: MachineConfig 6 | metadata: 7 | labels: 8 | machineconfiguration.openshift.io/role: worker 9 | name: 100-intel-iommu-on 10 | spec: 11 | config: 12 | ignition: 13 | version: 3.2.0 14 | kernelArguments: 15 | - intel_iommu=on,sm_on modules_load=vfio-pci vfio-pci.ids=8086:4941,8086:4943 16 | selector: 17 | intel.feature.node.kubernetes.io/qat: 'true' 18 | intel.feature.node.kubernetes.io/dsa: 'true' 19 | -------------------------------------------------------------------------------- /tests/l2/dgpu/clinfo_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 - 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: intel-dgpu-clinfo 8 | namespace: intel-dgpu 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | - name: clinfo-pod 16 | image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-clinfo:latest 17 | command: ["clinfo"] 18 | resources: 19 | limits: 20 | gpu.intel.com/i915: 1 21 | imagePullPolicy: IfNotPresent -------------------------------------------------------------------------------- /tests/l2/dgpu/intelvpl_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: intel-dgpu-intelvpl 8 | namespace: intel-dgpu 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | - name: intelvpl-pod 16 | image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-intelvpl:latest 17 | command: ["vpl-inspect"] 18 | resources: 19 | limits: 20 | gpu.intel.com/i915: 1 21 | imagePullPolicy: IfNotPresent 22 | -------------------------------------------------------------------------------- /device_plugins/dsa_device_plugin.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | kind: DsaDevicePlugin 5 | apiVersion: deviceplugin.intel.com/v1 6 | metadata: 7 | name: dsadeviceplugin-sample 8 | spec: 9 | image: registry.connect.redhat.com/intel/intel-dsa-plugin@sha256:2742a13279cc3f301daa09b6389517024530f658d4e1dd13db495cc94d9ba57c 10 | initImage: registry.connect.redhat.com/intel/intel-idxd-config-initcontainer@sha256:b74dc43fa81ce14ea97f20ff6b2f726039f6309fdd868d5f45d751d0a8662cc1 11 | logLevel: 4 12 | nodeSelector: 13 | intel.feature.node.kubernetes.io/dsa: 'true' 14 | sharedDevNum: 10 15 | -------------------------------------------------------------------------------- /tests/gaudi/l2/hl-smi_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: hl-smi-workload 8 | namespace: gaudi-validation 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | 
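# The single container below runs `hl-smi` once (restartPolicy: Never) and
# requests all eight Gaudi accelerators via the habana.ai/gaudi limit, so the
# job only schedules on a node where every card is allocatable.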
- name: hl-smi-workload 16 | image: vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26 17 | command: ["hl-smi"] 18 | resources: 19 | limits: 20 | habana.ai/gaudi: 8 21 | imagePullPolicy: IfNotPresent -------------------------------------------------------------------------------- /tests/l2/dgpu/hwinfo_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: intel-dgpu-hwinfo 8 | namespace: intel-dgpu 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | - name: hwinfo-pod 16 | image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-hwinfo:latest 17 | command: ["hwinfo"] 18 | args: ["--display"] 19 | resources: 20 | limits: 21 | gpu.intel.com/i915: 1 22 | imagePullPolicy: IfNotPresent -------------------------------------------------------------------------------- /tests/l2/sgx/sgx_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: intel-sgx-job 8 | namespace: intel-sgx 9 | spec: 10 | template: 11 | spec: 12 | restartPolicy: Never 13 | containers: 14 | - name: intel-sgx-job 15 | image: image-registry.openshift-image-registry.svc:5000/intel-sgx/intel-sgx-workload:latest 16 | imagePullPolicy: Always 17 | workingDir: "/opt/intel/" 18 | command: ["/opt/intel/app"] 19 | resources: 20 | limits: 21 | sgx.intel.com/epc: "5Mi" -------------------------------------------------------------------------------- /tests/gaudi/l2/hl-smi-firmware_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: hl-smi-firmware 8 | namespace: gaudi-validation 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | - name: hl-smi-firmware 16 | image: vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26 17 | command: ["/bin/bash", "-c", "hl-smi -L | grep SPI"] 18 | resources: 19 | limits: 20 | habana.ai/gaudi: 1 21 | imagePullPolicy: IfNotPresent 22 | -------------------------------------------------------------------------------- /device_plugins/qat_device_plugin.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: deviceplugin.intel.com/v1 5 | kind: QatDevicePlugin 6 | metadata: 7 | name: qatdeviceplugin-sample 8 | spec: 9 | image: registry.connect.redhat.com/intel/intel-qat-plugin@sha256:8d79dba051b83ec770a4b0fdc3da6ac92264cb19cac8d455b707ed92a6a95d02 10 | initImage: registry.connect.redhat.com/intel/intel-qat-initcontainer@sha256:34f0b993ca654ea0b386217cba1a44d5ef3da841b3befc780508f5323e95fa90 11 | dpdkDriver: vfio-pci 12 | kernelVfDrivers: 13 | - 4xxxvf 14 | maxNumDevices: 128 15 | logLevel: 4 16 | nodeSelector: 17 | intel.feature.node.kubernetes.io/qat: "true" -------------------------------------------------------------------------------- /tests/l2/dgpu/vainfo_job.yaml: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: intel-dgpu-vainfo 8 | namespace: intel-dgpu 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | containers: 15 | - name: vainfo-pod 16 | image: image-registry.openshift-image-registry.svc:5000/intel-dgpu/intel-dgpu-vainfo:latest 17 | command: ["vainfo"] 18 | args: ["--display", "drm", "--device", "/dev/dri/renderD128"] 19 | resources: 20 | limits: 21 | gpu.intel.com/i915: 1 22 | imagePullPolicy: IfNotPresent 23 | 24 | -------------------------------------------------------------------------------- /device_plugins/install_operator.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: operators.coreos.com/v1alpha2 5 | kind: OperatorGroup 6 | metadata: 7 | name: global-operators 8 | namespace: openshift-operators 9 | --- 10 | apiVersion: operators.coreos.com/v1alpha1 11 | kind: Subscription 12 | metadata: 13 | labels: 14 | operators.coreos.com/intel-device-plugins-operator.openshiftoperators: "" 15 | name: intel-device-plugins-operator 16 | namespace: openshift-operators 17 | spec: 18 | channel: alpha 19 | installPlanApproval: Automatic 20 | name: intel-device-plugins-operator 21 | source: certified-operators 22 | sourceNamespace: openshift-marketplace -------------------------------------------------------------------------------- /gaudi/gaudi_spi_fw_upgrade_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: gaudi-spi-firmware-upgrade 8 | namespace: gaudi-spi-fw-upgrade 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | serviceAccountName: gaudi-fw-upgrade-sa 15 | containers: 16 | - name: gaudi-spi-firmware-upgrade 17 | securityContext: 18 | privileged: true 19 | image: image-registry.openshift-image-registry.svc:5000/gaudi-spi-fw-upgrade/gaudi-spi-fw-upgrade:1.19.1-26 20 | command: [ "hl-fw-loader", "-y" ] 21 | imagePullPolicy: Always -------------------------------------------------------------------------------- /workloads/opea/chatqna/create_megaservice_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright (c) 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | tag="v0.8" 6 | namespace="opea-chatqna" 7 | repo="https://github.com/opea-project/GenAIExamples.git" 8 | yaml_url="https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/chatqna_megaservice_buildconfig.yaml" 9 | 10 | oc project $namespace && 11 | git clone --depth 1 --branch $tag $repo && 12 | cd GenAIExamples/ChatQnA/deprecated/langchain/docker && 13 | oc extract secret/knative-serving-cert -n istio-system --to=. --keys=tls.crt && 14 | oc apply -f $yaml_url && 15 | oc start-build chatqna-megaservice --from-dir=./ --follow -------------------------------------------------------------------------------- /_templates/versions.html: -------------------------------------------------------------------------------- 1 | {%- if versions %} 2 | <div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
3 |   <span class="rst-current-version" data-toggle="rst-current-version">
4 |     Versions
5 |     {{ version }}
6 |   </span>
7 |   <hr/>
8 |   <div class="rst-other-versions">
9 |     {% if versions|length >= 1 %}
10 |     <dl>
11 |       <dt>{{ _('Versions') }}</dt>
12 |       {% for slug, url in versions %}
13 |       {% if slug == version %}<strong>{% endif %}
14 |       <dd><a href="{{ url }}">{{ slug }}</a></dd>
15 |       {% if slug == version %}</strong>{% endif %}
16 |       {% endfor %}
17 |     </dl>
18 |     {% endif %}
19 |   </div>
20 | </div>
21 | {%- endif %} -------------------------------------------------------------------------------- /security/qatlib_rbac.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: v1 5 | kind: ServiceAccount 6 | metadata: 7 | name: intel-qat 8 | namespace: intel-qat 9 | --- 10 | apiVersion: rbac.authorization.k8s.io/v1 11 | kind: Role 12 | metadata: 13 | name: intel-qat 14 | namespace: intel-qat 15 | rules: 16 | - apiGroups: 17 | - security.openshift.io 18 | resources: 19 | - securitycontextconstraints 20 | resourceNames: 21 | - intel-qat-scc 22 | verbs: 23 | - use 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: RoleBinding 27 | metadata: 28 | name: intel-qat 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: Role 32 | name: intel-qat 33 | subjects: 34 | - kind: ServiceAccount 35 | name: intel-qat 36 | namespace: intel-qat -------------------------------------------------------------------------------- /set-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -eu 2 | # 3 | # Copyright 2024 Intel Corporation. 4 | # 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Invoke this script with a version as parameter 8 | # and it will update all hard-coded devel versions 9 | # to the tag versions in the source code. 10 | # 11 | # Adapted from https://github.com/intel/intel-device-plugins-for-kubernetes/ 12 | 13 | if [ $# != 1 ] || [ "$1" = "?" ] || [ "$1" = "--help" ]; then 14 | echo "Please provide TAG version as an argument. Usage: $0 " >&2 15 | exit 1 16 | fi 17 | 18 | devel_link="intel/intel-technology-enabling-for-openshift/main/" 19 | tag_link="intel/intel-technology-enabling-for-openshift/$1/" 20 | 21 | files=$(git grep -lF $devel_link -- '*.md') 22 | 23 | for file in $files; do 24 | sed -i -e "s|$devel_link|$tag_link|g" "$file"; 25 | done -------------------------------------------------------------------------------- /kmmo/intel-dgpu.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 - 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: kmm.sigs.x-k8s.io/v1beta1 5 | kind: Module 6 | metadata: 7 | name: intel-dgpu 8 | namespace: openshift-kmm 9 | spec: 10 | moduleLoader: 11 | container: 12 | modprobe: 13 | moduleName: i915 14 | firmwarePath: /firmware 15 | modulesLoadingOrder: 16 | - i915 17 | - mei_gsc 18 | inTreeModulesToRemove: [i915, intel_vsec, mei_gsc, mei_me] 19 | kernelMappings: 20 | - regexp: '^.*\.x86_64$' 21 | containerImage: registry.connect.redhat.com/intel/intel-data-center-gpu-driver-container:3.0.0-$KERNEL_FULL_VERSION 22 | selector: 23 | intel.feature.node.kubernetes.io/gpu: 'true' 24 | intel.feature.node.kubernetes.io/dgpu-canary: 'true' 25 | -------------------------------------------------------------------------------- /gaudi/gaudi_install_operator.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Adapted from https://docs.habana.ai/en/latest/Orchestration/HabanaAI_Operator/Deploying_HabanaAI_Operator.html#using-cli 4 | # 5 | --- 6 | apiVersion: v1 7 | kind: Namespace 8 | metadata: 9 | name: habana-ai-operator 10 | --- 11 | apiVersion: operators.coreos.com/v1 12 | kind: OperatorGroup 13 | metadata: 14 | name: habana-ai-operator 
15 | namespace: habana-ai-operator 16 | spec: 17 | targetNamespaces: 18 | - habana-ai-operator 19 | --- 20 | apiVersion: operators.coreos.com/v1alpha1 21 | kind: Subscription 22 | metadata: 23 | name: habana-ai-operator 24 | namespace: habana-ai-operator 25 | spec: 26 | channel: stable 27 | installPlanApproval: Automatic 28 | name: habana-ai-operator 29 | source: certified-operators 30 | sourceNamespace: openshift-marketplace -------------------------------------------------------------------------------- /tests/l2/qat/qatlib_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: intel-qat-workload 8 | namespace: intel-qat 9 | spec: 10 | template: 11 | spec: 12 | restartPolicy: Never 13 | containers: 14 | - name: intel-qat-job 15 | image: image-registry.openshift-image-registry.svc:5000/intel-qat/intel-qat-workload:latest 16 | imagePullPolicy: IfNotPresent 17 | command: ["cpa_sample_code"] 18 | securityContext: 19 | capabilities: 20 | add: 21 | [IPC_LOCK] 22 | resources: 23 | requests: 24 | qat.intel.com/dc: '1' 25 | qat.intel.com/cy: '1' 26 | limits: 27 | qat.intel.com/dc: '1' 28 | qat.intel.com/cy: '1' 29 | serviceAccount: intel-qat -------------------------------------------------------------------------------- /tests/l2/dsa/dsa_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: intel-dsa-workload 8 | namespace: intel-dsa 9 | spec: 10 | template: 11 | spec: 12 | restartPolicy: Never 13 | containers: 14 | - name: intel-dsa-job 15 | image: image-registry.openshift-image-registry.svc:5000/intel-dsa/intel-dsa-workload:latest 16 | imagePullPolicy: IfNotPresent 17 | workingDir: "/usr/libexec/accel-config/test/" 18 | command: 19 | - "./dsa_user_test_runner.sh" 20 | args: 21 | - "--skip-config" 22 | securityContext: 23 | capabilities: 24 | add: 25 | [SYS_RAWIO] 26 | resources: 27 | limits: 28 | dsa.intel.com/wq-user-dedicated: 1 29 | serviceAccountName: intel-dsa -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /playbooks/install_device_plugins_operator.yaml: -------------------------------------------------------------------------------- 1 | - name: Create global OperatorGroup in openshift-operators namespace 2 | k8s: 3 | state: present 4 | definition: 5 | apiVersion: operators.coreos.com/v1alpha2 6 | kind: OperatorGroup 7 | metadata: 8 | name: global-operators 9 | namespace: openshift-operators 10 | 11 | - name: Create Intel Device Plugins Operator Subscription 12 | k8s: 13 | state: present 14 | definition: 15 | apiVersion: operators.coreos.com/v1alpha1 16 | kind: Subscription 17 | metadata: 18 | labels: 19 | operators.coreos.com/intel-device-plugins-operator.openshiftoperators: "" 20 | name: intel-device-plugins-operator 21 | namespace: openshift-operators 22 | spec: 23 | channel: alpha 24 | installPlanApproval: Automatic 25 | name: intel-device-plugins-operator 26 | source: certified-operators 27 | sourceNamespace: openshift-marketplace -------------------------------------------------------------------------------- /tests/gaudi/l2/hccl_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: hccl-demo-workload 8 | namespace: gaudi-validation 9 | --- 10 | kind: BuildConfig 11 | apiVersion: build.openshift.io/v1 12 | metadata: 13 | name: hccl-demo-workload 14 | namespace: gaudi-validation 15 | spec: 16 | output: 17 | to: 18 | kind: ImageStreamTag 19 | name: 'hccl-demo-workload:1.19.1-26' 20 | strategy: 21 | type: Docker 22 | source: 23 | type: Dockerfile 24 | dockerfile: | 25 | ARG BUILDER=vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26 26 | FROM ${BUILDER} AS builder 27 | 28 | WORKDIR / 29 | RUN git clone https://github.com/HabanaAI/hccl_demo.git \ 30 | && cd hccl_demo \ 31 | && make 32 | 33 | WORKDIR /hccl_demo 34 | triggers: 35 | - type: ConfigChange 36 | runPolicy: Serial -------------------------------------------------------------------------------- /security/dsa_scc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: security.openshift.io/v1 5 | allowHostDirVolumePlugin: false 6 | allowHostIPC: false 7 | allowHostNetwork: false 8 | allowHostPID: false 9 | allowHostPorts: false 10 | allowPrivilegeEscalation: false 11 | allowPrivilegedContainer: false 12 | allowedCapabilities: 13 | - SYS_RAWIO 14 | defaultAddCapabilities: null 15 | fsGroup: 16 | type: MustRunAs 17 | groups: [] 18 | kind: SecurityContextConstraints 19 | metadata: 20 | annotations: 21 | kubernetes.io/description: 'SCC for Intel DSA based workload' 22 | name: intel-dsa-scc 23 | priority: null 24 | readOnlyRootFilesystem: false 25 | requiredDropCapabilities: 26 | - ALL 27 | runAsUser: 28 | type: RunAsAny 29 | seLinuxContext: 30 | type: MustRunAs 31 | supplementalGroups: 32 | type: RunAsAny 33 | seccompProfiles: 34 | 
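# Only the container runtime's default seccomp profile is permitted; combined
# with requiredDropCapabilities: ALL above, workloads admitted under this SCC
# drop every capability and may add back only SYS_RAWIO.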
- runtime/default 35 | volumes: 36 | - configMap 37 | - downwardAPI 38 | - emptyDir 39 | - ephemeral 40 | - persistentVolumeClaim 41 | - projected 42 | - secret -------------------------------------------------------------------------------- /security/qatlib_scc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: security.openshift.io/v1 5 | allowHostDirVolumePlugin: false 6 | allowHostIPC: false 7 | allowHostNetwork: false 8 | allowHostPID: false 9 | allowHostPorts: false 10 | allowPrivilegeEscalation: false 11 | allowPrivilegedContainer: false 12 | allowedCapabilities: 13 | - IPC_LOCK 14 | defaultAddCapabilities: null 15 | fsGroup: 16 | type: MustRunAs 17 | groups: [] 18 | kind: SecurityContextConstraints 19 | metadata: 20 | annotations: 21 | kubernetes.io/description: 'SCC for Intel QAT based workload' 22 | name: intel-qat-scc 23 | priority: null 24 | readOnlyRootFilesystem: false 25 | requiredDropCapabilities: 26 | - ALL 27 | runAsUser: 28 | type: RunAsAny 29 | seLinuxContext: 30 | type: MustRunAs 31 | supplementalGroups: 32 | type: RunAsAny 33 | seccompProfiles: 34 | - runtime/default 35 | volumes: 36 | - configMap 37 | - downwardAPI 38 | - emptyDir 39 | - ephemeral 40 | - persistentVolumeClaim 41 | - projected 42 | - secret -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | .. Intel Technology Enabling for OpenShift documentation master file, created by 2 | sphinx-quickstart on Wed Apr 17 23:49:16 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Intel® Technology Enabling for OpenShift* 7 | =================================================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | README.md 14 | nfd/README.md 15 | machine_configuration/README.md 16 | kmmo/README.md 17 | device_plugins/README.md 18 | gaudi/README.md 19 | 20 | .. toctree:: 21 | :maxdepth: 2 22 | :caption: One-Click Deployment: 23 | 24 | one_click/README.md 25 | 26 | .. toctree:: 27 | :maxdepth: 2 28 | :caption: End-to-end Solutions: 29 | 30 | e2e/inference/README.md 31 | 32 | .. toctree:: 33 | :maxdepth: 2 34 | :caption: Releases: 35 | 36 | docs/releases.rst 37 | 38 | .. 
toctree:: 39 | :maxdepth: 2 40 | :caption: Supported Platforms: 41 | 42 | docs/supported_platforms -------------------------------------------------------------------------------- /workloads/opea/chatqna/persistent_volumes.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: v1 5 | kind: PersistentVolume 6 | metadata: 7 | name: chatqna-megaservice-pv-0 8 | spec: 9 | capacity: 10 | storage: 100Mi 11 | accessModes: 12 | - ReadWriteOnce 13 | persistentVolumeReclaimPolicy: Retain 14 | nfs: 15 | server: x.x.x.x # nfs server 16 | path: /nfs # nfs path 17 | --- 18 | apiVersion: v1 19 | kind: PersistentVolume 20 | metadata: 21 | name: chatqna-megaservice-pv-1 22 | spec: 23 | capacity: 24 | storage: 100Mi 25 | accessModes: 26 | - ReadWriteOnce 27 | persistentVolumeReclaimPolicy: Retain 28 | nfs: 29 | server: x.x.x.x # nfs server 30 | path: /nfs # nfs path 31 | --- 32 | apiVersion: v1 33 | kind: PersistentVolume 34 | metadata: 35 | name: chatqna-megaservice-pv-2 36 | spec: 37 | capacity: 38 | storage: 100Mi 39 | accessModes: 40 | - ReadWriteOnce 41 | persistentVolumeReclaimPolicy: Retain 42 | nfs: 43 | server: x.x.x.x # nfs server 44 | path: /nfs # nfs path -------------------------------------------------------------------------------- /playbooks/README.md: -------------------------------------------------------------------------------- 1 | # Intel Technology Enabling Ansible Playbooks 2 | 3 | ## Overview 4 | This directory contains Ansible playbooks designed to automate the deployment and configuration of Intel technologies on Red Hat OpenShift clusters. These playbooks streamline the Intel feature provisioning and validation process on OpenShift environments. 5 | 6 | ## Prerequisites 7 | Before running the playbook, ensure the following prerequisites are met: 8 | - Provisioned RHOCP Cluster 9 | - Red Hat Enterprise Linux (RHEL) system with [Ansible](https://docs.ansible.com/ansible/2.9/installation_guide/intro_installation.html#installing-ansible-on-rhel-centos-or-fedora) installed and configured with a `kubeconfig` to connect to your RHOCP cluster. 10 | 11 | ## Run the Playbook 12 | 13 | To run the ansible playbook, clone this repository to your RHEL system. Navigate to the directory containing the playbook. 
14 | 15 | ```bash 16 | git clone https://github.com/intel/intel-technology-enabling-for-openshift.git 17 | 18 | cd intel-technology-enabling-for-openshift/ 19 | 20 | ansible-playbook playbooks/intel_ocp_provisioning.yaml 21 | ``` -------------------------------------------------------------------------------- /playbooks/install_nfd_operator.yaml: -------------------------------------------------------------------------------- 1 | - name: Create namespace for Node Feature Discovery 2 | k8s: 3 | state: present 4 | definition: 5 | apiVersion: v1 6 | kind: Namespace 7 | metadata: 8 | name: openshift-nfd 9 | 10 | - name: Create operator group for Node Feature Discovery 11 | k8s: 12 | state: present 13 | definition: 14 | apiVersion: operators.coreos.com/v1 15 | kind: OperatorGroup 16 | metadata: 17 | generateName: openshift-nfd- 18 | name: openshift-nfd 19 | namespace: openshift-nfd 20 | spec: 21 | targetNamespaces: 22 | - openshift-nfd 23 | 24 | - name: Subscribe to Node Feature Discovery operator 25 | k8s: 26 | state: present 27 | definition: 28 | apiVersion: operators.coreos.com/v1alpha1 29 | kind: Subscription 30 | metadata: 31 | name: nfd 32 | namespace: openshift-nfd 33 | spec: 34 | channel: "stable" 35 | installPlanApproval: Automatic 36 | name: nfd 37 | source: redhat-operators 38 | sourceNamespace: openshift-marketplace -------------------------------------------------------------------------------- /tests/gaudi/l2/hccl_job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: batch/v1 5 | kind: Job 6 | metadata: 7 | name: hccl-demo-workload 8 | namespace: gaudi-validation 9 | spec: 10 | template: 11 | metadata: 12 | spec: 13 | restartPolicy: Never 14 | serviceAccountName: hccl-demo-anyuid-sa 15 | containers: 16 | - name: hccl-demo-workload 17 | image: image-registry.openshift-image-registry.svc:5000/gaudi-validation/hccl-demo-workload:1.19.1-26 18 | workingDir: "/hccl_demo" 19 | command: ["/bin/bash", "-c", "--"] 20 | ## sleep for 20 seconds to avoid race condition 21 | args: 22 | - | 23 | sleep 20 24 | python3 run_hccl_demo.py --nranks 8 --node_id 0 --size 32m --test all_reduce --loop 1000 --ranks_per_node 8 25 | sleep 20 26 | env: 27 | - name: HCCL_COMM_ID 28 | value: '127.0.0.1:5555' 29 | resources: 30 | limits: 31 | habana.ai/gaudi: 8 32 | imagePullPolicy: IfNotPresent 33 | -------------------------------------------------------------------------------- /kmmo/intel-dgpu-on-premise-build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 - 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | labels: 8 | app: intel-dgpu-driver-container-kmmo 9 | name: intel-dgpu-driver-container-kmmo 10 | namespace: openshift-kmm 11 | spec: {} 12 | --- 13 | apiVersion: kmm.sigs.x-k8s.io/v1beta1 14 | kind: Module 15 | metadata: 16 | name: intel-dgpu-on-premise-build 17 | namespace: openshift-kmm 18 | spec: 19 | moduleLoader: 20 | container: 21 | imagePullPolicy: Always 22 | modprobe: 23 | moduleName: i915 24 | firmwarePath: /firmware 25 | inTreeModuleToRemove: intel_vsec 26 | kernelMappings: 27 | - regexp: '^.*\.x86_64$' 28 | containerImage: image-registry.openshift-image-registry.svc:5000/openshift-kmm/intel-dgpu-driver-container-kmmo:$KERNEL_FULL_VERSION 29 | build: 30 | dockerfileConfigMap: 31 | name: 
intel-dgpu-dockerfile-configmap 32 | selector: 33 | intel.feature.node.kubernetes.io/gpu: 'true' 34 | intel.feature.node.kubernetes.io/dgpu-canary: 'true' 35 | -------------------------------------------------------------------------------- /playbooks/configure_nfd.yaml: -------------------------------------------------------------------------------- 1 | - name: NFD - Create NFD discovery CR 2 | k8s: 3 | state: present 4 | definition: '{{ item }}' 5 | wait: yes 6 | wait_condition: 7 | type: Available 8 | status: 'False' 9 | with_items: '{{ lookup("url", "https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-discovery-openshift.yaml", split_lines=False) | from_yaml_all | list }}' 10 | when: (item is not none) 11 | - name: NFD - Create NFD rules instance CR 12 | k8s: 13 | state: present 14 | definition: '{{ item }}' 15 | wait: yes 16 | with_items: '{{ lookup("url", "https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-rules-openshift.yaml", split_lines=False) | from_yaml_all | list }}' 17 | when: (item is not none) -------------------------------------------------------------------------------- /tests/l2/dgpu/hwinfo_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: intel-dgpu-hwinfo 8 | namespace: intel-dgpu 9 | spec: {} 10 | --- 11 | apiVersion: build.openshift.io/v1 12 | kind: BuildConfig 13 | metadata: 14 | name: intel-dgpu-hwinfo 15 | namespace: intel-dgpu 16 | spec: 17 | triggers: 18 | - type: "ConfigChange" 19 | - type: "ImageChange" 20 | runPolicy: "Serial" 21 | source: 22 | type: Dockerfile 23 | dockerfile: | 24 | ARG BUILDER=registry.access.redhat.com/ubi9-minimal:latest 25 | FROM ${BUILDER} 26 | RUN microdnf -y update && \ 27 | rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 28 | microdnf install -y hwinfo 29 | ENTRYPOINT ["hwinfo"] 30 | strategy: 31 | type: Docker 32 | noCache: true 33 | dockerStrategy: 34 | buildArgs: 35 | - name: "BUILDER" 36 | value: "registry.access.redhat.com/ubi9-minimal:latest" 37 | output: 38 | to: 39 | kind: ImageStreamTag 40 | name: intel-dgpu-hwinfo:latest -------------------------------------------------------------------------------- /device_plugins/deploy_sgx.md: -------------------------------------------------------------------------------- 1 | # Create Intel SGX Device Plugin CR 2 | 3 | ## Create a CR via web console 4 | 1. Go to **Operators** -> **Installed Operators**. 5 | 2. Open **Intel Device Plugins Operator**. 6 | 3. Navigate to the **Intel Software Guard Extensions Device Plugin** tab. 7 | 4. Click **Create SgxDevicePlugin**, set the correct parameters, and click **Create**. 8 | 5. Optional: If you want to make any customizations, select YAML view and edit the details. When you are done, click **Create**. 9 | 10 | ## Verify via web console 11 | 1. Verify the CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-sgx-plugin**. 12 | 2. The `SgxDevicePlugin` is now created.
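Once the plugin reports ready, workloads consume enclave memory by requesting the `sgx.intel.com/epc` resource. A minimal sketch, modeled on this repo's `tests/l2/sgx/sgx_job.yaml` (the pod name and EPC size below are illustrative):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: sgx-sample                 # illustrative name
  namespace: intel-sgx
spec:
  restartPolicy: Never
  containers:
  - name: sgx-sample
    # image built in-cluster by tests/l2/sgx/sgx_build.yaml
    image: image-registry.openshift-image-registry.svc:5000/intel-sgx/intel-sgx-workload:latest
    command: ["/opt/intel/app"]
    resources:
      limits:
        sgx.intel.com/epc: "5Mi"   # enclave page cache exposed by the SGX device plugin
```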
13 | 14 | ## Create CR via CLI 15 | Apply the CR yaml file: 16 | ``` 17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/sgx_device_plugin.yaml 18 | ``` 19 | 20 | ## Verify via CLI 21 | Verify that the device plugin CR is ready: 22 | ``` 23 | $ oc get SgxDevicePlugin 24 | ``` 25 | Output: 26 | ``` 27 | NAME DESIRED READY NODE SELECTOR AGE 28 | sgxdeviceplugin-sample 1 1 {"intel.feature.node.kubernetes.io/sgx":"true"} 2m 29 | ``` -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | # %: Makefile 20 | # @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | # Generate doc site under _build/html with Sphinx. 23 | vhtml: _work/venv/.stamp 24 | . _work/venv/bin/activate && \ 25 | $(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 26 | cp docs/index.html $(BUILDDIR)/html/index.html 27 | 28 | html: 29 | $(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 30 | cp docs/index.html $(BUILDDIR)/html/index.html 31 | 32 | clean-html: 33 | rm -rf $(BUILDDIR)/html 34 | 35 | # Set up a Python3 environment with the necessary tools for document creation. 36 | _work/venv/.stamp: ./requirements.txt 37 | rm -rf ${@D} 38 | python3 -m venv ${@D} 39 | . ${@D}/bin/activate && pip install wheel && pip install -r $< 40 | touch $@ -------------------------------------------------------------------------------- /tests/l2/sgx/README.md: -------------------------------------------------------------------------------- 1 | ### Verify Intel® Software Guard Extensions (Intel® SGX) Provisioning 2 | This [SampleEnclave](https://github.com/intel/linux-sgx/tree/master/SampleCode/SampleEnclave) application workload from the Intel SGX SDK runs an Intel SGX enclave utilizing the EPC resource from the Intel SGX provisioning. 3 | 4 | * Build the container image. 5 | ``` 6 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/sgx/sgx_imagestream.yaml 7 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/sgx/sgx_build.yaml 8 | ``` 9 | 10 | * Deploy and run the workload. 11 | ``` 12 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/sgx/sgx_job.yaml 13 | ``` 14 | 15 | * Check the results. 16 | ``` 17 | $ oc get pods -n intel-sgx 18 | intel-sgx-job-4tnh5 0/1 Completed 0 2m10s 19 | intel-sgx-workload-1-build 0/1 Completed 0 30s 20 | ``` 21 | ``` 22 | $ oc logs intel-sgx-job-4tnh5 -n intel-sgx 23 | Checksum(0x0x7fffac6f41e0, 100) = 0xfffd4143 24 | Info: executing thread synchronization, please wait... 25 | Info: SampleEnclave successfully returned. 26 | Enter a character before exit ... 
27 | ``` 28 | ## See Also 29 | For Intel SGX demos on vanilla Kubernetes, refer to [link](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/sgx-sdk-demo) 30 | -------------------------------------------------------------------------------- /gaudi/gaudi_spi_fw_upgrade_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: gaudi-spi-fw-upgrade 8 | namespace: gaudi-spi-fw-upgrade 9 | --- 10 | kind: BuildConfig 11 | apiVersion: build.openshift.io/v1 12 | metadata: 13 | name: gaudi-spi-fw-upgrade 14 | namespace: gaudi-spi-fw-upgrade 15 | spec: 16 | output: 17 | to: 18 | kind: ImageStreamTag 19 | name: gaudi-spi-fw-upgrade:1.19.1-26 20 | strategy: 21 | type: Docker 22 | source: 23 | type: Dockerfile 24 | dockerfile: >+ 25 | ARG 26 | BUILDER=vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26 27 | 28 | FROM ${BUILDER} AS builder 29 | 30 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 31 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 32 | echo "baseurl=https://vault.habana.ai/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ 33 | echo "gpgkey=https://vault.habana.ai/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \ 34 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 35 | 36 | RUN yum makecache && dnf install -y habanalabs-firmware-odm 37 | triggers: 38 | - type: ConfigChange 39 | runPolicy: Serial -------------------------------------------------------------------------------- /tests/l2/README.md: -------------------------------------------------------------------------------- 1 | # Verifying Intel Hardware Feature Provisioning 2 | ## Introduction 3 | After provisioning Intel hardware features on RHOCP, the respective hardware resources are exposed to the RHOCP cluster, and workload containers can request them. The following sample workloads help verify whether these resources can be used as expected. The container images for these sample workloads are built and packaged on-premises through [RHOCP BuildConfig](https://docs.openshift.com/container-platform/4.14/cicd/builds/understanding-buildconfigs.html) and pushed to the embedded repository through [RHOCP ImageStream](https://docs.openshift.com/container-platform/4.14/openshift_images/image-streams-manage.html). 4 | 5 | ## Prerequisites 6 | • Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster). 7 | 8 | • Intel HW features provisioned on RHOCP. 
9 | 10 | ### Verify Intel® Data Center GPU provisioning 11 | Please refer to the Intel DGPU provisioning validation tests [here](dgpu/README.md). 12 | 13 | ### Verify Intel® Software Guard Extensions (Intel® SGX) Provisioning 14 | Please refer to the Intel SGX provisioning validation tests [here](sgx/README.md). 15 | 16 | ### Verify Intel® QuickAssist Technology provisioning 17 | Please refer to the Intel QAT provisioning validation tests [here](qat/README.md). 18 | 19 | ### Verify Intel® Data Streaming Accelerator provisioning 20 | Please refer to the Intel DSA provisioning validation tests [here](dsa/README.md). -------------------------------------------------------------------------------- /workloads/opea/chatqna/tgi_gaudi_servingruntime.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: serving.kserve.io/v1alpha1 5 | kind: ServingRuntime 6 | metadata: 7 | name: tgi-gaudi-serving-runtime 8 | spec: 9 | containers: 10 | - name: kserve-container 11 | image: ghcr.io/huggingface/tgi-gaudi:1.2.1 12 | args: 13 | - --model-id 14 | - /mnt/models/ 15 | - --port=8080 16 | - --num-shard=3 # Number of GPUs 17 | - --sharded=true 18 | - --json-output 19 | env: # Add variables according to the chosen model 20 | - name: HF_HOME 21 | value: /tmp/hf_home 22 | - name: HF_OFFLINE 23 | value: "1" 24 | - name: TRANSFORMERS_OFFLINE 25 | value: "1" 26 | - name: HF_HUB_CACHE 27 | value: /mnt/models 28 | - name: HUGGING_FACE_HUB_TOKEN 29 | valueFrom: 30 | secretKeyRef: 31 | key: HUGGING_FACE_HUB_TOKEN 32 | name: hf-token 33 | resources: 34 | limits: 35 | cpu: "16" 36 | memory: 128Gi 37 | requests: 38 | cpu: "16" 39 | memory: 128Gi 40 | readinessProbe: 41 | exec: 42 | command: 43 | - curl 44 | - localhost:8080/health 45 | initialDelaySeconds: 500 46 | livenessProbe: 47 | exec: 48 | command: 49 | - curl 50 | - localhost:8080/health 51 | initialDelaySeconds: 500 52 | ports: 53 | - containerPort: 8080 54 | protocol: TCP 55 | multiModel: false 56 | supportedModelFormats: 57 | - autoSelect: true 58 | name: llm -------------------------------------------------------------------------------- /tests/l2/qat/qatlib_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: intel-qat-workload 8 | namespace: intel-qat 9 | spec: {} 10 | --- 11 | apiVersion: build.openshift.io/v1 12 | kind: BuildConfig 13 | metadata: 14 | name: intel-qat-workload 15 | namespace: intel-qat 16 | spec: 17 | triggers: 18 | - type: "ConfigChange" 19 | - type: "ImageChange" 20 | runPolicy: "Serial" 21 | source: 22 | type: Dockerfile 23 | dockerfile: | 24 | 25 | ARG BUILDER=registry.access.redhat.com/ubi9:latest 26 | FROM ${BUILDER} 27 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \ 28 | subscription-manager attach --auto && \ 29 | dnf repolist --disablerepo=* && \ 30 | subscription-manager repos --enable codeready-builder-for-rhel-9-x86_64-rpms && \ 31 | dnf -y update && \ 32 | dnf install -y qatlib qatlib-tests 33 | strategy: 34 | type: Docker 35 | noCache: true 36 | dockerStrategy: 37 | buildArgs: 38 | - name: "BUILDER" 39 | value: "registry.access.redhat.com/ubi9:latest" 40 | env: 41 | - name: "USERNAME" 42
| valueFrom: 43 | secretKeyRef: 44 | key: username 45 | name: rh-auth 46 | - name: "PASSWORD" 47 | valueFrom: 48 | secretKeyRef: 49 | key: password 50 | name: rh-auth 51 | 52 | output: 53 | to: 54 | kind: ImageStreamTag 55 | name: intel-qat-workload:latest -------------------------------------------------------------------------------- /workloads/opea/chatqna/chatqna_megaservice_buildconfig.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: chatqna-megaservice 8 | namespace: opea-chatqna 9 | spec: {} 10 | --- 11 | apiVersion: build.openshift.io/v1 12 | kind: BuildConfig 13 | metadata: 14 | name: chatqna-megaservice 15 | namespace: opea-chatqna 16 | spec: 17 | triggers: 18 | - type: "ConfigChange" 19 | - type: "ImageChange" 20 | runPolicy: "Serial" 21 | source: 22 | dockerfile: | 23 | FROM langchain/langchain:latest 24 | 25 | RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ 26 | libgl1-mesa-glx \ 27 | libjemalloc-dev 28 | 29 | RUN useradd -m -s /bin/bash user && \ 30 | mkdir -p /home/user && \ 31 | chown -R user /home/user/ 32 | 33 | USER user 34 | COPY requirements.txt /tmp/requirements.txt 35 | 36 | USER root 37 | COPY tls.crt /rhoai-ca/tls.crt 38 | RUN cat /rhoai-ca/tls.crt | tee -a '/usr/lib/ssl/cert.pem' 39 | 40 | USER user 41 | RUN pip install --no-cache-dir --upgrade pip && \ 42 | pip install --no-cache-dir -r /tmp/requirements.txt 43 | 44 | ENV PYTHONPATH=$PYTHONPATH:/ws:/home/user:/home/user/qna-app/app 45 | 46 | WORKDIR /home/user/qna-app 47 | COPY qna-app /home/user/qna-app 48 | 49 | ENTRYPOINT ["/usr/bin/sleep", "infinity"] 50 | triggers: 51 | - type: ConfigChange 52 | runPolicy: SerialLatestOnly 53 | strategy: 54 | type: Docker 55 | dockerStrategy: {} 56 | postCommit: {} 57 | output: 58 | to: 59 | kind: ImageStreamTag 60 | name: chatqna-megaservice:latest -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Test Plan 2 | 3 | ```{admonition} Disclaimer 4 | Please note that this module is currently under development and may contain partially implemented features; therefore, it is not supported in the current release. 5 | ``` 6 | 7 | ## Overview 8 | To ensure the whole stack works as expected and to track bugs, a layer-based test architecture is needed on OCP. This plan consists of four layers. The first and second layers mentioned below will be part of the future automation testing framework for each OCP (x.y.z) release. 9 | 10 | ### L1 First Layer: Host OS and Driver Interfaces 11 | Layer 1 test cases should be executed before deploying the [Intel Device Plugins Operator](/device_plugins/README.md) and after deploying OOT drivers like [Intel Data Center GPU Driver on OpenShift](/kmmo/README.md). It includes test cases: 12 | * to check the existence of in-tree and out-of-tree (OOT) drivers 13 | * for SELinux and host OS security 14 | * to check for devices on all nodes 15 | 16 | ### L2 Second Layer: Device Plugin Resource Provisioning 17 | L2 test cases are executed after deploying the [Intel Device Plugins Operator](/device_plugins/README.md). Refer to the [readme](l2/README.md).
It includes: 18 | * Pod resource allocation and scheduling 19 | * Simple workloads 20 | * Boundary testing for the resources 21 | * Future plans for any failure analysis needed during automation 22 | 23 | ### L3 Third Layer: E2E solution 24 | L3 test cases are executed after the specific device-plugin-related [e2e solution](/e2e) has been deployed. Please refer to [L3 test cases](l3/README.md) for details. 25 | 26 | ### L4 Fourth Layer: Reference Workloads 27 | This layer includes the final reference application/use case for the user. It integrates the whole stack and is custom for each Intel hardware feature and device plugin. This layer will be added in upcoming releases. -------------------------------------------------------------------------------- /workloads/opea/chatqna/redis_deployment_service.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: v1 5 | kind: ServiceAccount 6 | metadata: 7 | name: opea-chatqna 8 | namespace: opea-chatqna 9 | --- 10 | apiVersion: rbac.authorization.k8s.io/v1 11 | kind: Role 12 | metadata: 13 | name: opea-chatqna 14 | namespace: opea-chatqna 15 | rules: 16 | - apiGroups: 17 | - security.openshift.io 18 | resources: 19 | - securitycontextconstraints 20 | resourceNames: 21 | - anyuid 22 | verbs: 23 | - use 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: RoleBinding 27 | metadata: 28 | name: opea-chatqna 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: Role 32 | name: opea-chatqna 33 | subjects: 34 | - kind: ServiceAccount 35 | name: opea-chatqna 36 | namespace: opea-chatqna 37 | --- 38 | # Redis Vector DB deployment 39 | apiVersion: apps/v1 40 | kind: Deployment 41 | metadata: 42 | name: redis-vector-db 43 | namespace: opea-chatqna 44 | spec: 45 | replicas: 1 46 | selector: 47 | matchLabels: 48 | app: redis-vector-db 49 | template: 50 | metadata: 51 | labels: 52 | app: redis-vector-db 53 | spec: 54 | serviceAccount: opea-chatqna 55 | containers: 56 | - name: redis-vector-db 57 | image: redis/redis-stack:7.2.0-v9 58 | ports: 59 | - containerPort: 6379 60 | - containerPort: 8001 61 | --- 62 | # Redis Vector DB Service 63 | apiVersion: v1 64 | kind: Service 65 | metadata: 66 | name: redis-vector-db 67 | namespace: opea-chatqna 68 | spec: 69 | type: ClusterIP 70 | selector: 71 | app: redis-vector-db 72 | ports: 73 | - name: redis-service 74 | port: 6379 75 | targetPort: 6379 76 | - name: redis-insight 77 | port: 8001 78 | targetPort: 8001 -------------------------------------------------------------------------------- /gaudi/Gaudi-SPI-Firmware-Upgrade.md: -------------------------------------------------------------------------------- 1 | # Upgrade Intel Gaudi SPI Firmware 2 | To upgrade the Intel Gaudi SPI firmware, follow the steps below: 3 | 4 | **NOTE:** Currently, this is only supported on a Single Node OpenShift cluster. Multi-node cluster support will be added in the future. 5 | 6 | ## Prerequisites 7 | - Make sure the Gaudi drivers are unloaded. 8 | - On Red Hat OpenShift, delete the existing ClusterPolicy Custom Resource, then verify on the node using the command below; no output means the drivers are unloaded (see the sketch after these prerequisites): 9 | ``` 10 | lsmod | grep habana 11 | ``` 12 | - Check the firmware version following the [firmware version check](https://github.com/intel/intel-technology-enabling-for-openshift/tree/main/tests/gaudi/l2#check-firmware-version-with-hl-smi).
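A minimal combined sketch of these prerequisite checks, assuming the ClusterPolicy is named `habana-ai` as in [gaudi_cluster_policy.yaml](/gaudi/gaudi_cluster_policy.yaml) and `<node-name>` is your Gaudi node; adjust both to your environment (if multiple ClusterPolicy CRDs exist on the cluster, use the fully qualified resource name `clusterpolicies.habanalabs.habana.ai`):
```
$ oc delete clusterpolicy habana-ai                # remove the existing ClusterPolicy CR
$ oc debug node/<node-name> -- chroot /host sh -c 'lsmod | grep habana'   # expect no output once the drivers are unloaded
```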
13 | 14 | ## SPI Firmware Upgrade 15 | Build the container image with the `habanalabs-firmware-odm` tool: 16 | ``` 17 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_spi_fw_upgrade_build.yaml 18 | ``` 19 | Create a service account with the required permissions: 20 | ``` 21 | oc create sa gaudi-fw-upgrade-sa -n gaudi-spi-fw-upgrade 22 | oc adm policy add-scc-to-user privileged -z gaudi-fw-upgrade-sa -n gaudi-spi-fw-upgrade 23 | ``` 24 | Deploy and execute the SPI firmware upgrade tool: 25 | ``` 26 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_spi_fw_upgrade_job.yaml 27 | ``` 28 | 29 | Verify the output: 30 | ``` 31 | oc get pods 32 | 33 | NAME READY STATUS RESTARTS AGE 34 | gaudi-spi-firmware-upgrade-ndmjp 0/1 Completed 0 10m 35 | ``` 36 | ``` 37 | oc logs gaudi-spi-firmware-upgrade-ndmjp 38 | . 39 | . 40 | #### 41 | #### Finished sending firmware: OK 42 | ``` 43 | Verify by following the [firmware version check](https://github.com/intel/intel-technology-enabling-for-openshift/tree/main/tests/gaudi/l2#check-firmware-version-with-hl-smi). -------------------------------------------------------------------------------- /tests/l2/dsa/dsa_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: build.openshift.io/v1 5 | kind: BuildConfig 6 | metadata: 7 | name: intel-dsa-workload 8 | namespace: intel-dsa 9 | spec: 10 | triggers: 11 | - type: "ConfigChange" 12 | - type: "ImageChange" 13 | runPolicy: "Serial" 14 | source: 15 | type: Dockerfile 16 | dockerfile: | 17 | 18 | ARG BUILDER=registry.access.redhat.com/ubi9:latest 19 | FROM ${BUILDER} 20 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \ 21 | subscription-manager attach --auto && \ 22 | dnf repolist --disablerepo=* && \ 23 | subscription-manager repos --enable rhel-9-for-x86_64-baseos-rpms --enable codeready-builder-for-rhel-9-x86_64-rpms && \ 24 | dnf -y update && \ 25 | dnf install -y gcc g++ make cmake autoconf automake libtool pkg-config \ 26 | git asciidoc xmlto libuuid-devel json-c-devel zlib-devel openssl-devel \ 27 | pciutils accel-config 28 | RUN git clone -b accel-config-v4.1.8 https://github.com/intel/idxd-config && \ 29 | cd idxd-config && ./autogen.sh && ./configure CFLAGS='-g -O2' --prefix=/usr \ 30 | --sysconfdir=/etc --libdir=/usr/lib64 --enable-test=yes && make && make install 31 | strategy: 32 | type: Docker 33 | noCache: true 34 | dockerStrategy: 35 | buildArgs: 36 | - name: "BUILDER" 37 | value: "registry.access.redhat.com/ubi9:latest" 38 | env: 39 | - name: "USERNAME" 40 | valueFrom: 41 | secretKeyRef: 42 | key: username 43 | name: rh-auth 44 | - name: "PASSWORD" 45 | valueFrom: 46 | secretKeyRef: 47 | key: password 48 | name: rh-auth 49 | output: 50 | to: 51 | kind: ImageStreamTag 52 | name: intel-dsa-workload:latest -------------------------------------------------------------------------------- /tests/l2/dgpu/clinfo_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 - 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: intel-dgpu-clinfo 8 | namespace: intel-dgpu 9 | spec: {} 10 | --- 11 | apiVersion: build.openshift.io/v1 12 | kind: BuildConfig 13 | metadata: 14 | name: 
intel-dgpu-clinfo 15 | namespace: intel-dgpu 16 | spec: 17 | triggers: 18 | - type: "ConfigChange" 19 | - type: "ImageChange" 20 | runPolicy: "Serial" 21 | source: 22 | type: Dockerfile 23 | dockerfile: | 24 | ARG BUILDER=registry.access.redhat.com/ubi9-minimal:latest 25 | FROM ${BUILDER} 26 | 27 | ARG OCL_ICD_VERSION=ocl-icd-2.2.13-4.el9.x86_64 28 | ARG CLINFO_VERSION=clinfo-3.0.21.02.21-4.el9.x86_64 29 | 30 | RUN microdnf install -y \ 31 | glibc \ 32 | yum-utils 33 | 34 | # install intel-opencl, ocl-icd and clinfo 35 | RUN dnf install -y 'dnf-command(config-manager)' && \ 36 | dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/9.0/lts/2350/unified/intel-gpu-9.0.repo && \ 37 | dnf install -y intel-opencl \ 38 | https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os/Packages/$OCL_ICD_VERSION.rpm \ 39 | https://dl.fedoraproject.org/pub/epel/9/Everything/x86_64/Packages/c/$CLINFO_VERSION.rpm && \ 40 | dnf clean all && dnf autoremove && rm -rf /var/lib/dnf/lists/* && \ 41 | rm -rf /etc/yum.repos.d/intel-graphics.repo 42 | strategy: 43 | type: Docker 44 | noCache: true 45 | dockerStrategy: 46 | buildArgs: 47 | - name: "BUILDER" 48 | value: "registry.access.redhat.com/ubi9-minimal:latest" 49 | - name: "OCL_ICD_VERSION" 50 | value: "ocl-icd-2.2.13-4.el9.x86_64" 51 | - name: "CLINFO_VERSION" 52 | value: "clinfo-3.0.21.02.21-4.el9.x86_64" 53 | output: 54 | to: 55 | kind: ImageStreamTag 56 | name: intel-dgpu-clinfo:latest -------------------------------------------------------------------------------- /nfd/node-feature-rules-openshift.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 - 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: nfd.openshift.io/v1alpha1 5 | kind: NodeFeatureRule 6 | metadata: 7 | name: intel-dp-devices 8 | namespace: openshift-nfd 9 | spec: 10 | rules: 11 | - name: "intel.gpu" 12 | labels: 13 | "intel.feature.node.kubernetes.io/gpu": "true" 14 | matchFeatures: 15 | - feature: pci.device 16 | matchExpressions: 17 | vendor: {op: In, value: ["8086"]} 18 | class: {op: In, value: ["0300", "0380"]} 19 | 20 | - name: "intel.qat" 21 | labels: 22 | "intel.feature.node.kubernetes.io/qat": "true" 23 | matchFeatures: 24 | - feature: pci.device 25 | matchExpressions: 26 | vendor: {op: In, value: ["8086"]} 27 | device: {op: In, value: ["4940", "4942", "4944"]} 28 | class: {op: In, value: ["0b40"]} 29 | - feature: kernel.loadedmodule 30 | matchExpressions: 31 | intel_qat: {op: Exists} 32 | 33 | - name: "intel.sgx" 34 | labels: 35 | "intel.feature.node.kubernetes.io/sgx": "true" 36 | extendedResources: 37 | sgx.intel.com/epc: "@cpu.security.sgx.epc" 38 | matchFeatures: 39 | - feature: cpu.cpuid 40 | matchExpressions: 41 | SGX: {op: Exists} 42 | SGXLC: {op: Exists} 43 | - feature: cpu.security 44 | matchExpressions: 45 | sgx.enabled: {op: IsTrue} 46 | - feature: kernel.config 47 | matchExpressions: 48 | X86_SGX: {op: Exists} 49 | - name: "intel.dsa" 50 | labels: 51 | "intel.feature.node.kubernetes.io/dsa": "true" 52 | matchFeatures: 53 | - feature: pci.device 54 | matchExpressions: 55 | vendor: {op: In, value: ["8086"]} 56 | device: {op: In, value: ["0b25"]} 57 | class: {op: In, value: ["0880"]} 58 | - feature: kernel.loadedmodule 59 | matchExpressions: 60 | idxd: {op: Exists} 61 | -------------------------------------------------------------------------------- /device_plugins/deploy_gpu.md: -------------------------------------------------------------------------------- 
1 | # Create Intel GPU Device Plugin CR 2 | 3 | ## Create CR via web console 4 | 1. Go to **Operators** -> **Installed Operators**. 5 | 2. Open **Intel Device Plugins Operator**. 6 | 3. Navigate to the tab **Intel GPU Device Plugin**. 7 | 4. Click **Create GpuDevicePlugin** -> set the correct parameters -> Click **Create**. 8 | 5. Optional: If you want to make any customizations, select the YAML view and edit the details. Once you are done, click **Create**. 9 | 10 | ## Verify via web console 11 | 1. Verify the CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-gpu-plugin**. 12 | 2. The `GpuDevicePlugin` is now created. 13 | 14 | ## Create CR via CLI 15 | Apply the CR yaml file: 16 | ``` 17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/gpu_device_plugin.yaml 18 | ``` 19 | 20 | ## Verify via CLI 21 | Verify that the device plugin CR is ready: 22 | ``` 23 | $ oc get GpuDevicePlugin 24 | ``` 25 | Output: 26 | ``` 27 | NAME DESIRED READY NODE SELECTOR AGE 28 | gpudeviceplugin-sample 1 1 {"intel.feature.node.kubernetes.io/gpu":"true"} 3m12s 29 | ``` 30 | 31 | # Using Intel Data Center GPU resource exclusively 32 | In this release, we have verified and support only a single Intel GPU `i915` resource dedicated to a single workload pod. To achieve this, we set `sharedDevNum: 1` and `preferredAllocationPolicy: none` as the default options. 33 | As the cluster administrator, use the [gpu_device_plugin.yaml](/device_plugins/gpu_device_plugin.yaml) provided in the section Create CR via CLI above, or use the default options from Create CR via web console. 34 | As the application owner, when claiming the i915 resource, make sure the resource limits and requests are set as shown below: 35 | ``` 36 | spec: 37 | containers: 38 | - name: gpu-pod 39 | resources: 40 | limits: 41 | gpu.intel.com/i915: 1 42 | requests: 43 | gpu.intel.com/i915: 1 44 | ``` 45 | For more details, please refer to this [issue](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/1408). 
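For illustration, a minimal sketch of a complete pod spec that claims the resource exclusively; the image and command are placeholders, not files or images from this repository:
```
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
spec:
  restartPolicy: Never
  containers:
  - name: gpu-pod
    image: registry.example.com/gpu-workload:latest # placeholder image
    command: ["sleep", "infinity"]
    resources:
      limits:
        gpu.intel.com/i915: 1
      requests:
        gpu.intel.com/i915: 1
```
Apply it with `oc apply -f <file>.yaml` and confirm the pod was scheduled onto a GPU node with `oc describe pod gpu-pod`.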
-------------------------------------------------------------------------------- /tests/l2/dgpu/vainfo_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: intel-dgpu-vainfo 8 | namespace: intel-dgpu 9 | spec: {} 10 | --- 11 | apiVersion: build.openshift.io/v1 12 | kind: BuildConfig 13 | metadata: 14 | name: intel-dgpu-vainfo 15 | namespace: intel-dgpu 16 | spec: 17 | triggers: 18 | - type: "ConfigChange" 19 | - type: "ImageChange" 20 | runPolicy: "Serial" 21 | source: 22 | type: Dockerfile 23 | dockerfile: | 24 | ARG BUILDER=registry.access.redhat.com/ubi9:latest 25 | FROM ${BUILDER} 26 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \ 27 | subscription-manager attach --auto && \ 28 | subscription-manager repos --enable rhel-9-for-x86_64-appstream-rpms && \ 29 | dnf -y update && \ 30 | dnf install -y flex bison gcc gcc-c++ make autoconf libtool cmake git gdb \ 31 | libva libva-devel libdrm-devel 32 | RUN dnf install -y 'dnf-command(config-manager)' && \ 33 | dnf config-manager --add-repo \ 34 | https://repositories.intel.com/gpu/rhel/9.2/lts/2350/unified/intel-gpu-9.2.repo 35 | RUN dnf -y update && \ 36 | dnf install -y libva-utils intel-gmmlib-devel 37 | RUN git clone -b intel-media-23.4.3 --single-branch https://github.com/intel/media-driver.git && \ 38 | cd media-driver && mkdir media-driver build && cd build && \ 39 | cmake -D ENABLE_PRODUCTION_KMD=ON ../ && make -j $(nproc) && make install 40 | ENTRYPOINT ["/bin/sh"] 41 | strategy: 42 | type: Docker 43 | noCache: true 44 | dockerStrategy: 45 | buildArgs: 46 | - name: "BUILDER" 47 | value: "registry.access.redhat.com/ubi9:latest" 48 | env: 49 | - name: "USERNAME" 50 | valueFrom: 51 | secretKeyRef: 52 | key: username 53 | name: rh-auth 54 | - name: "PASSWORD" 55 | valueFrom: 56 | secretKeyRef: 57 | key: password 58 | name: rh-auth 59 | output: 60 | to: 61 | kind: ImageStreamTag 62 | name: intel-dgpu-vainfo:latest -------------------------------------------------------------------------------- /tests/gaudi/l2/vllm_deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | kind: PersistentVolumeClaim 5 | apiVersion: v1 6 | metadata: 7 | name: vllm-workload-pvc 8 | namespace: gaudi-validation 9 | spec: 10 | accessModes: 11 | - ReadWriteOnce 12 | resources: 13 | requests: 14 | storage: 60Gi 15 | storageClassName: "" # Add your storage class 16 | volumeMode: Filesystem 17 | --- 18 | apiVersion: apps/v1 19 | kind: Deployment 20 | metadata: 21 | name: vllm-workload 22 | namespace: gaudi-validation 23 | labels: 24 | app: vllm-workload 25 | spec: 26 | replicas: 1 27 | selector: 28 | matchLabels: 29 | app: vllm-workload 30 | template: 31 | metadata: 32 | labels: 33 | app: vllm-workload 34 | spec: 35 | containers: 36 | - name: vllm-container 37 | image: image-registry.openshift-image-registry.svc:5000/gaudi-validation/vllm-workload:latest 38 | command: [ "/bin/bash", "-c", "--" ] 39 | args: ["vllm serve meta-llama/Llama-3.1-8B"] # Add the model 40 | ports: 41 | - containerPort: 8000 42 | resources: 43 | limits: 44 | habana.ai/gaudi: 1 45 | env: 46 | - name: HF_TOKEN 47 | valueFrom: 48 | secretKeyRef: 49 | name: hf-token 50 | key: hf-token 51 | - name: HF_HOME 52 | 
value: /home/vllm/.cache/huggingface 53 | - name: HF_HUB_OFFLINE 54 | value: "0" 55 | imagePullPolicy: Always 56 | volumeMounts: 57 | - name: hf-cache 58 | mountPath: /home/vllm/.cache 59 | - name: shm 60 | mountPath: /dev/shm 61 | volumes: 62 | - name: hf-cache 63 | persistentVolumeClaim: 64 | claimName: vllm-workload-pvc 65 | - name: shm 66 | emptyDir: 67 | medium: Memory 68 | sizeLimit: "2Gi" 69 | livenessProbe: 70 | httpGet: 71 | path: /health 72 | port: 8000 73 | initialDelaySeconds: 60 74 | periodSeconds: 10 75 | readinessProbe: 76 | httpGet: 77 | path: /health 78 | port: 8000 79 | initialDelaySeconds: 60 80 | periodSeconds: 5 81 | -------------------------------------------------------------------------------- /tests/l2/dgpu/intelvpl_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: intel-dgpu-intelvpl 8 | namespace: intel-dgpu 9 | spec: {} 10 | --- 11 | apiVersion: build.openshift.io/v1 12 | kind: BuildConfig 13 | metadata: 14 | name: intel-dgpu-intelvpl 15 | namespace: intel-dgpu 16 | spec: 17 | triggers: 18 | - type: "ConfigChange" 19 | - type: "ImageChange" 20 | runPolicy: "Serial" 21 | source: 22 | type: Dockerfile 23 | dockerfile: | 24 | ARG BUILDER=registry.access.redhat.com/ubi9:latest 25 | FROM ${BUILDER} 26 | RUN subscription-manager register --username=${USERNAME} --password=${PASSWORD} && \ 27 | subscription-manager attach --auto && \ 28 | subscription-manager repos --enable rhel-9-for-x86_64-appstream-rpms && \ 29 | dnf -y update && \ 30 | dnf install -y flex bison gcc gcc-c++ make autoconf libtool cmake git gdb \ 31 | libva libva-devel libdrm libdrm-devel 32 | RUN dnf install -y 'dnf-command(config-manager)' && \ 33 | dnf config-manager --add-repo \ 34 | https://repositories.intel.com/gpu/rhel/9.2/lts/2350/unified/intel-gpu-9.2.repo 35 | RUN dnf -y update && \ 36 | dnf install -y libva-utils intel-gmmlib-devel libvpl2 libvpl-devel libvpl-tools \ 37 | libmfx 38 | RUN git clone -b intel-media-23.4.3 --single-branch https://github.com/intel/media-driver.git && \ 39 | cd media-driver && mkdir media-driver build && cd build && \ 40 | cmake -D ENABLE_PRODUCTION_KMD=ON ../ && make -j $(nproc) && make install 41 | # we need this for testing samples 42 | RUN git clone https://github.com/intel/libvpl.git 43 | ENTRYPOINT ["/bin/sh"] 44 | strategy: 45 | type: Docker 46 | noCache: true 47 | dockerStrategy: 48 | buildArgs: 49 | - name: "BUILDER" 50 | value: "registry.access.redhat.com/ubi9:latest" 51 | env: 52 | - name: "USERNAME" 53 | valueFrom: 54 | secretKeyRef: 55 | key: username 56 | name: rh-auth 57 | - name: "PASSWORD" 58 | valueFrom: 59 | secretKeyRef: 60 | key: password 61 | name: rh-auth 62 | output: 63 | to: 64 | kind: ImageStreamTag 65 | name: intel-dgpu-intelvpl:latest -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | Intel Technology Enabling For OpenShift project is licensed under the terms in [Apache-2.0](LICENSE.txt). By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. 
Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /nfd/README.md: -------------------------------------------------------------------------------- 1 | # Setting up Node Feature Discovery 2 | [Node Feature Discovery (NFD) Operator](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/psap-node-feature-discovery-operator) manages the deployment and lifecycle of the NFD add-on to detect hardware features and system configuration, such as PCI cards, kernel, operating system version, etc. 3 | 4 | ## Prerequisites 5 | - Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster). 6 | 7 | ## Install NFD Operator 8 | Follow the guide below to install the NFD operator using CLI or web console. 
9 | - [Install from the CLI](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/psap-node-feature-discovery-operator#install-operator-cli_psap-node-feature-discovery-operator) 10 | - [Install from the web console](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/psap-node-feature-discovery-operator#install-operator-web-console_psap-node-feature-discovery-operator) 11 | 12 | ## Configure NFD Operator 13 | Note: As the RHOCP cluster administrator, you might need to merge the NFD operator config from the following Custom Resources (CRs) with other NFD operator configs that are already applied on your cluster. 14 | 15 | 1. Create a `NodeFeatureDiscovery` CR instance. 16 | ``` 17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-discovery-openshift.yaml 18 | ``` 19 | 20 | 2. Create a `NodeFeatureRule` CR instance. 21 | ``` 22 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/nfd/node-feature-rules-openshift.yaml 23 | ``` 24 | 25 | ## Verification 26 | Use the following command to get the node names: 27 | ``` 28 | $ oc get nodes 29 | ``` 30 | Use the command shown below to verify whether the nodes are labeled properly by NFD: 31 | ``` 32 | $ oc describe node | grep intel.feature.node.kubernetes.io 33 | ``` 34 | Example output: 35 | ``` 36 | intel.feature.node.kubernetes.io/dgpu-canary=true 37 | intel.feature.node.kubernetes.io/gpu=true 38 | ``` 39 | 40 | ## Labels Table 41 | | Label | Intel hardware feature | 42 | | ----- | ---------------------- | 43 | | `intel.feature.node.kubernetes.io/gpu=true` | Intel® Data Center GPU Flex Series or Intel® Data Center GPU Max Series | 44 | | `intel.feature.node.kubernetes.io/sgx=true` | Intel® SGX | 45 | | `intel.feature.node.kubernetes.io/qat=true` | Intel® QAT | 46 | | `intel.feature.node.kubernetes.io/dsa=true` | Intel® DSA | 47 | 48 | ## See Also 49 | -------------------------------------------------------------------------------- /docs/releases.rst: -------------------------------------------------------------------------------- 1 | Release Information 2 | =================== 3 | .. 
list-table:: 4 | :align: left 5 | :widths: 15 10 10 10 10 10 10 10 10 10 10 10 10 10 6 | 7 | * - **Release** 8 | - `1.6.1 `_ 9 | - `1.6.0 `_ 10 | - `1.5.2 `_ 11 | - `1.5.1 `_ 12 | - `1.5.0 `_ 13 | - `1.4.0 `_ 14 | - `1.3.1 `_ 15 | - `1.3.0 `_ 16 | - `1.2.1 `_ 17 | - `1.2.0 `_ 18 | - `1.1.0 `_ 19 | - `1.0.1 `_ 20 | - `1.0.0 `_ 21 | * - **Red Hat OpenShift Version** 22 | - 4.18 23 | - 4.18 24 | - 4.17 25 | - 4.17 26 | - 4.17 27 | - 4.16 28 | - 4.14 29 | - 4.15 30 | - 4.14 31 | - 4.14 32 | - 4.13 33 | - 4.12 34 | - 4.12 35 | * - **Documentation** 36 | - `1.6.1 `_ 37 | - `1.6.0 `_ 38 | - `1.5.2 `_ 39 | - `1.5.1 `_ 40 | - `1.5.0 `_ 41 | - `1.4.0 `_ 42 | - `1.3.1 `_ 43 | - `1.3.0 `_ 44 | - NA 45 | - NA 46 | - NA 47 | - NA 48 | - NA 49 | 50 | **NOTE:** Release 1.3.1 supports OPEA and Gaudi. -------------------------------------------------------------------------------- /machine_configuration/README.md: -------------------------------------------------------------------------------- 1 | # Setting up Machine Configuration 2 | 3 | ## Introduction 4 | Machine configuration operations are used to configure [Red Hat Enterprise Linux CoreOS (RHCOS)](https://docs.openshift.com/container-platform/4.14/architecture/architecture-rhcos.html) on each node in a RHOCP cluster. 5 | 6 | The [Machine config operator](https://github.com/openshift/machine-config-operator) (MCO) is provided by Red Hat to manage the operating system and machine configuration. In this project, through the MCO, cluster administrators can configure and update the kernel to provision Intel hardware features on the worker nodes. 7 | 8 | The MCO is one of the technologies used in this project to manage the machine configuration. In the current OCP, the MCO might reboot the node to enable the machine configuration. Since rebooting the node is undesirable, alternative machine configuration technologies are under investigation. For more details, see this [issue](https://github.com/intel/intel-technology-enabling-for-openshift/issues/34). 9 | 10 | The best approach is to work with the RHCOS team to push the RHCOS configuration as the default configuration for a RHOCP cluster on [Day 0](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/). 11 | 12 | For some general configuration, we recommend you set it up while provisioning the cluster on [Day 1](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/). 13 | 14 | If the configuration cannot be set as the default setting, we recommend using an operator to set the configuration on the fly without rebooting the node on [Day 2](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/). 15 | 16 | Any contribution in this area is welcome. 17 | 18 | ## Prerequisites 19 | - Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster). 20 | - Set up node feature discovery (NFD). Follow steps [here](/nfd/README.md). 21 | 22 | ## Machine Configuration for Provisioning Intel® QAT and Intel® DSA 23 | 24 | * Turn on the `intel_iommu=on,sm_on` kernel parameters and load `vfio_pci` at boot for QAT and DSA provisioning 25 | 26 | ``` 27 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/machine_configuration/100-intel-iommu-on.yaml 28 | ``` 29 | 30 | Note: Changing the kernel parameters through the MCO will reboot the worker nodes.
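Because the MCO applies the change with a rolling reboot of the worker nodes, you can watch the rollout complete before verifying; this is a convenience check, not a required step:
```
$ oc get mcp worker -w
```
Wait until the worker MachineConfigPool reports `UPDATED` as `True`, then run the verification below.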
31 | 32 | ## Verification 33 | Navigate to the node terminal on the web console (Compute -> Nodes -> Select a node -> Terminal). Run the following commands in the terminal. 34 | ``` 35 | $ cat /proc/cmdline 36 | ``` 37 | Ensure that `intel_iommu=on,sm_on` is present. 38 | 39 | ``` 40 | $ chroot /host 41 | $ lsmod | grep vfio_pci 42 | ``` 43 | Ensure that `vfio_pci` driver is present. 44 | 45 | ## See Also 46 | - [Red Hat OpenShift Container Platform Day-2 operations](https://www.ibm.com/cloud/architecture/content/course/red-hat-openshift-container-platform-day-2-ops/) 47 | -------------------------------------------------------------------------------- /playbooks/intel_ocp_provisioning.yaml: -------------------------------------------------------------------------------- 1 | - hosts: localhost 2 | gather_facts: no 3 | vars: 4 | kubeconfig_path: "~/.kube/mojave-config" 5 | environment: 6 | KUBECONFIG: "{{ kubeconfig_path }}" 7 | vars_prompt: 8 | - name: "install_operators" 9 | prompt: "Do you want to install operators? 'Yes' to install NFD Operator and Intel Device Plugins Operator, or 'No' to skip" 10 | private: no 11 | - name: "validation_feature" 12 | prompt: "Which Intel feature do you want to validate? Enter 1 for Intel SGX, 2 for Intel QAT, 3 for Intel DSA, 4 for Intel GPU" 13 | private: no 14 | 15 | tasks: 16 | - name: Validate Inputs 17 | block: 18 | - name: Invalid Install Operators Input 19 | fail: 20 | msg: "Invalid input for Install Operators. Please enter a valid option for Install Operators (Yes/No)." 21 | when: install_operators not in ["Yes", "No"] 22 | - name: Invalid Validation Feature Input 23 | fail: 24 | msg: "Invalid input for validation feature. Please enter a valid option (1-4)." 25 | when: validation_feature not in ["1", "2", "3", "4"] 26 | 27 | - name: Install Operators 28 | block: 29 | - name: NFD - Install NFD Operator 30 | include_tasks: install_nfd_operator.yaml 31 | - name: IDPO - Install Intel Device Plugins Operator 32 | include_tasks: install_device_plugins_operator.yaml 33 | - name: NFD - Wait until the nfd-operator-controller Deployment is available 34 | k8s_info: 35 | kind: Deployment 36 | wait: yes 37 | name: nfd-controller-manager 38 | label_selectors: 39 | - operators.coreos.com/nfd.openshift-nfd 40 | - control-plane=controller-manager 41 | namespace: openshift-nfd 42 | wait_condition: 43 | type: Available 44 | status: 'True' 45 | - name: NFD - Configure NFD Operator 46 | include_tasks: configure_nfd.yaml 47 | - name: IDPO - Wait until the inteldeviceplugins-controller-manager Deployment is available 48 | k8s_info: 49 | kind: Deployment 50 | name: inteldeviceplugins-controller-manager 51 | namespace: openshift-operators 52 | wait: yes 53 | wait_condition: 54 | type: Available 55 | status: 'True' 56 | reason: MinimumReplicasAvailable 57 | when: install_operators == "Yes" 58 | 59 | - name: Skip Operator Installation 60 | debug: 61 | msg: "Skipping operator installation as per user input." 
62 | when: install_operators == "No" 63 | 64 | - name: Validate Intel SGX 65 | include_tasks: validate_sgx.yaml 66 | when: validation_feature == "1" 67 | 68 | - name: Validate Intel QAT 69 | include_tasks: validate_qat.yaml 70 | when: validation_feature == "2" 71 | 72 | - name: Validate Intel DSA 73 | include_tasks: validate_dsa.yaml 74 | when: validation_feature == "3" 75 | 76 | - name: Validate Intel GPU 77 | include_tasks: validate_gpu.yaml 78 | when: validation_feature == "4" -------------------------------------------------------------------------------- /tests/l2/sgx/sgx_build.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: build.openshift.io/v1 5 | kind: BuildConfig 6 | metadata: 7 | name: intel-sgx-workload 8 | namespace: intel-sgx 9 | spec: 10 | triggers: 11 | - type: "ConfigChange" 12 | - type: "ImageChange" 13 | runPolicy: "Serial" 14 | source: 15 | type: Dockerfile 16 | dockerfile: | 17 | ARG BUILDER=registry.access.redhat.com/ubi9:latest 18 | ARG BASE=registry.access.redhat.com/ubi9-minimal:latest 19 | ARG LINUX_SGX_VERSION=2.26 20 | FROM ${BUILDER} AS builder 21 | 22 | ARG SGX_SDK=sgx_linux_x64_sdk_2.26.100.0.bin 23 | ARG LINUX_SGX_VERSION 24 | 25 | RUN dnf -y update && \ 26 | dnf -y install \ 27 | wget \ 28 | openssl \ 29 | gcc-c++ \ 30 | make \ 31 | protobuf-c && \ 32 | dnf clean all 33 | 34 | # SGX SDK installed in /opt/intel directory 35 | WORKDIR /opt/intel 36 | RUN wget https://download.01.org/intel-sgx/sgx-linux/$LINUX_SGX_VERSION/distro/rhel9.4-server/$SGX_SDK \ 37 | && chmod +x $SGX_SDK \ 38 | && echo "yes" | ./$SGX_SDK \ 39 | && rm $SGX_SDK 40 | 41 | RUN cd sgxsdk/SampleCode/SampleEnclave \ 42 | && . 
/opt/intel/sgxsdk/environment \ 43 | && make 44 | FROM ${BASE} 45 | ARG LINUX_SGX_VERSION 46 | RUN microdnf -y update && \ 47 | microdnf -y install \ 48 | wget \ 49 | tar \ 50 | gzip && \ 51 | microdnf clean all && rm -rf /var/cache/dnf 52 | 53 | # Download SGX PSW and install SGX runtime components to create SGX enclave 54 | WORKDIR /opt/intel 55 | RUN wget https://download.01.org/intel-sgx/sgx-linux/$LINUX_SGX_VERSION/distro/rhel9.4-server/sgx_rpm_local_repo.tgz \ 56 | && sha256sum sgx_rpm_local_repo.tgz \ 57 | && tar xvf sgx_rpm_local_repo.tgz \ 58 | && rm -rf sgx_rpm_local_repo.tgz 59 | 60 | RUN cd sgx_rpm_local_repo && rpm -i \ 61 | libsgx-headers-$LINUX_SGX_VERSION* \ 62 | libsgx-enclave-common-$LINUX_SGX_VERSION* \ 63 | libsgx-urts-$LINUX_SGX_VERSION* && \ 64 | rm -r /opt/intel/sgx_rpm_local_repo 65 | 66 | COPY --from=builder /opt/intel/sgxsdk/SampleCode/SampleEnclave/app app 67 | COPY --from=builder /opt/intel/sgxsdk/SampleCode/SampleEnclave/enclave.signed.so enclave.signed.so 68 | 69 | ENTRYPOINT /opt/intel/app 70 | 71 | strategy: 72 | type: Docker 73 | noCache: true 74 | dockerStrategy: 75 | buildArgs: 76 | - name: "BUILDER" 77 | value: "registry.access.redhat.com/ubi9:9.4" 78 | - name: "BASE" 79 | value: "registry.access.redhat.com/ubi9-minimal:9.4" 80 | - name: "SGX_SDK" 81 | value: "sgx_linux_x64_sdk_2.26.100.0.bin" 82 | - name: "LINUX_SGX_VERSION" 83 | value: "2.26" 84 | output: 85 | to: 86 | kind: ImageStreamTag 87 | name: intel-sgx-workload:latest -------------------------------------------------------------------------------- /device_plugins/deploy_dsa.md: -------------------------------------------------------------------------------- 1 | # Create Intel DSA Device Plugin CR 2 | 3 | ## Create CR via web console 4 | 1. Go to **Operators** -> **Installed Operators**. 5 | 2. Open **Intel Device Plugins Operator**. 6 | 3. Navigate to the tab **Intel DSA Device Plugin**. 7 | 4. Click **Create DSADevicePlugin** -> set the correct parameters -> Click **Create**. 8 | 5. Optional: If you want to make any customizations, select the YAML view and edit the details. When you are done, click **Create**. 9 | 10 | ## Verify via web console 11 | 1. Verify the CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-dsa-plugin**. 12 | 2. The `DsaDevicePlugin` is now created. 13 | 14 | ## Create CR via CLI 15 | Apply the CR yaml file: 16 | ``` 17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/dsa_device_plugin.yaml 18 | ``` 19 | 20 | ## Verify via CLI 21 | Verify that the device plugin CR is ready: 22 | ``` 23 | $ oc get DsaDevicePlugin 24 | ``` 25 | Output: 26 | ``` 27 | NAME DESIRED READY NODE SELECTOR AGE 28 | dsadeviceplugin-sample 3 3 {"intel.feature.node.kubernetes.io/dsa":"true"} 98m 29 | ``` 30 | 31 | # Verify DSA Device Plugin 32 | After the plugin is deployed, use the command below to verify the DSA resources: 33 | ``` 34 | $ oc describe node srf-2 | grep dsa.intel.com 35 | dsa.intel.com/wq-user-dedicated: 0 36 | dsa.intel.com/wq-user-shared: 160 37 | dsa.intel.com/wq-user-dedicated: 0 38 | dsa.intel.com/wq-user-shared: 160 39 | ``` 40 | 41 | ## DSA Resource Configuration 42 | By default, the DSA plugin uses [this configuration file](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/demo/dsa.conf). 43 | The DSA init container comes with a utility called `accel-config`, which takes this file as input and configures the DSA hardware accordingly.
44 | The default configuration creates dedicated WQs for each DSA device: four groups per device, with each group containing 1 WQ linked to 1 engine. 45 | Users can customize the config, or pick a pre-customized config for their specific use case from [here](https://github.com/intel/idxd-config/tree/stable/contrib/configs). 46 | A node-specific configuration is also possible by passing a node-specific profile via a ConfigMap volume mount. 47 | Users can use the steps below to customize the DSA resource configuration: 48 | 1. Create the ConfigMap for the DSA resource configuration: 49 | ``` 50 | $ oc create configmap --namespace=openshift-operators intel-dsa-config --from-file=dsa[-$NODE_NAME].conf ``` 51 | 2. Create the DSA device plugin CR with `provisioning-config` set to the name of the ConfigMap (created in step 1) in the dsa_device_plugin.yaml file, or set the ConfigMap name in the provisioning-config option from the web console. 52 | 53 | # Run Intel DSA-based workloads on RHOCP 54 | To run Intel DSA-based workloads as an unprivileged pod, you need to use a customized SCC. The customized `dsa-scc` Security Context Constraint (SCC) is provided to bind with the service account and run the DSA-based workload. 55 | 56 | See [Verify Intel DSA Provisioning](/tests/l2/dsa/README.md) for the detailed steps. 57 | -------------------------------------------------------------------------------- /tests/l2/qat/README.md: -------------------------------------------------------------------------------- 1 | ### Verify Intel® QuickAssist Technology provisioning 2 | This workload runs [qatlib](https://github.com/intel/qatlib) sample tests using the qatlib RPM packages built and distributed by Red Hat from the codeready-builder-for-rhel-9-x86_64-rpms repo. Refer to the [qatlib readme](https://github.com/intel/qatlib/blob/main/INSTALL) for more details. 3 | 4 | * Create and use the ```intel-qat``` namespace for the workload 5 | 6 | ``` 7 | $ oc new-project intel-qat 8 | ``` 9 | 10 | * Build the workload container image 11 | 12 | Please replace the credentials in the BuildConfig YAML with your Red Hat account login credentials. 13 | 14 | ``` 15 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/qat/qatlib_build.yaml 16 | ``` 17 | 18 | * Create the SCC intel-qat-scc for the Intel QAT-based workload, if this SCC is not already created 19 | 20 | ``` 21 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/qatlib_scc.yaml 22 | ``` 23 | 24 | * Create the intel-qat service account to use intel-qat-scc 25 | 26 | ``` 27 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/qatlib_rbac.yaml 28 | ``` 29 | 30 | * Deploy the qatlib workload job with the intel-qat service account 31 | 32 | ``` 33 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/qat/qatlib_job.yaml 34 | ``` 35 | 36 | * Check the results. 
37 | ``` 38 | $ oc get pods -n intel-qat 39 | intel-qat-workload-c6g9v 0/1 Completed 0 4m13s 40 | ``` 41 | 42 | 43 | * Full log of the `cpa_sample_code` sample tests 44 | 45 | ``` 46 | $ oc logs intel-qat-workload-c6g9v -n intel-qat 47 | qaeMemInit started 48 | icp_sal_userStartMultiProcess("SSL") started 49 | There are no crypto instances 50 | *** QA version information *** 51 | device ID = 0 52 | software = 23.2.0 53 | *** END QA version information *** 54 | Inst 0, Affin: 0, Dev: 0, Accel 0, EE 0, BDF ED:00:01 55 | Inst 1, Affin: 1, Dev: 0, Accel 0, EE 0, BDF ED:00:01 56 | Inst 2, Affin: 2, Dev: 0, Accel 0, EE 0, BDF ED:00:01 57 | Inst 3, Affin: 3, Dev: 0, Accel 0, EE 0, BDF ED:00:01 58 | --------------------------------------- 59 | API Traditional 60 | Session State STATELESS 61 | Algorithm DEFLATE 62 | Huffman Type STATIC 63 | Mode ASYNCHRONOUS 64 | CNV Enabled YES 65 | Direction COMPRESS 66 | Packet Size 8192 67 | Compression Level 1 68 | Corpus CALGARY_CORPUS 69 | Corpus Filename calgary 70 | CNV Recovery Enabled YES 71 | Number of threads 4 72 | Total Responses 158400 73 | Total Retries 2242671 74 | Clock Cycles Start 126150916653843 75 | Clock Cycles End 126151409143747 76 | Total Cycles 492489904 77 | CPU Frequency(kHz) 1700160 78 | Throughput(Mbps) 35920 79 | Compression Ratio 0.4897 80 | --------------------------------------- 81 | 82 | Inst 0, Affin: 0, Dev: 0, Accel 0, EE 0, BDF ED:00:01 83 | Inst 1, Affin: 1, Dev: 0, Accel 0, EE 0, BDF ED:00:01 84 | Inst 2, Affin: 2, Dev: 0, Accel 0, EE 0, BDF ED:00:01 85 | Inst 3, Affin: 3, Dev: 0, Accel 0, EE 0, BDF ED:00:01 86 | --------------------------------------- 87 | ``` 88 | -------------------------------------------------------------------------------- /one_click/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Intel Technology Enabling Solutions with Red Hat OpenShift using “One-Click” 2 | 3 | ## Overview 4 | Red Hat [Ansible](https://www.ansible.com/) and Operator technologies are used for “One-Click Deployment” of Intel technology enabling solutions with Red Hat OpenShift Container Platform (RHOCP). Ansible technology automates the operator installation and configuration steps using a playbook, making deployment as simple as a single click. 5 | 6 | The Ansible playbooks referenced here can be used by cluster administrators to customize their own playbooks. 7 | 8 | **Note:** It is recommended to start from [Get started](/README.md#getting-started) to get familiar with the installation and configuration of the general operators before composing your first playbook. 9 | 10 | ## Reference Playbook – Intel Data Center GPU Provisioning 11 | 12 | This playbook demonstrates the one-click provisioning of the Intel Data Center GPU on an RHOCP cluster. The steps involved are the installation and configuration of the general operators, including the Node Feature Discovery (NFD) Operator, the Kernel Module Management (KMM) Operator, and the Intel Device Plugins Operator. 13 | 14 | ### Prerequisites 15 | Before running the playbook, ensure the following prerequisites are met: 16 | - Provisioned RHOCP Cluster 17 | - Red Hat Enterprise Linux (RHEL) system with [Ansible](https://docs.ansible.com/ansible/2.9/installation_guide/intro_installation.html#installing-ansible-on-rhel-centos-or-fedora) installed and configured with a `kubeconfig` to connect to your RHOCP cluster. 18 | 19 | ### Run the Playbook 20 | To run the Ansible playbook, clone this repository to your RHEL system.
Navigate to the directory containing the playbook. 21 | ``` 22 | $ git clone https://github.com/intel/intel-technology-enabling-for-openshift.git 23 | $ cd intel-technology-enabling-for-openshift/one_click 24 | ``` 25 | Execute the single command below to provision the Intel Data Center GPU: 26 | ``` 27 | $ ansible-playbook gpu_provisioning_playbook.yaml 28 | ``` 29 | 30 | ## Reference Playbook – Intel Gaudi Provisioning 31 | This playbook demonstrates the one-click provisioning of the Intel Gaudi AI Accelerator on an RHOCP cluster. The steps involved are the installation and configuration of the general operators, including the Node Feature Discovery (NFD) Operator, the Kernel Module Management (KMM) Operator, and the Intel Gaudi Base Operator. The playbook also creates the Gaudi `DeviceConfig` CR, which deploys the Gaudi out-of-tree drivers, the Gaudi device plugins, the Habana container runtime, and the Habana node metrics. 32 | 33 | ### Prerequisites 34 | Before running the playbook, ensure the following prerequisites are met: 35 | - Provisioned RHOCP Cluster 36 | - Red Hat Enterprise Linux (RHEL) system with [Ansible](https://docs.ansible.com/ansible/2.9/installation_guide/intro_installation.html#installing-ansible-on-rhel-centos-or-fedora) installed and configured with a `kubeconfig` to connect to your RHOCP cluster. 37 | - Set the firmware search path using MCO; follow [Update Kernel Firmware Search Path with MCO](/gaudi/README.md#update-kernel-firmware-search-path-with-mco). 38 | 39 | ### Run the Playbook 40 | To run the Ansible playbook, clone this repository to your RHEL system. Navigate to the directory containing the playbook. 41 | ``` 42 | $ git clone https://github.com/intel/intel-technology-enabling-for-openshift.git 43 | $ cd intel-technology-enabling-for-openshift/one_click 44 | ``` 45 | Execute the single command below to provision the Intel Gaudi accelerator: 46 | ``` 47 | $ ansible-playbook gaudi_provisioning_playbook.yaml 48 | ``` -------------------------------------------------------------------------------- /workloads/opea/chatqna/chatqna_megaservice_deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | --- 4 | apiVersion: v1 5 | kind: PersistentVolumeClaim 6 | metadata: 7 | name: chatqna-megaservice-pvc-0 8 | namespace: opea-chatqna 9 | spec: 10 | accessModes: 11 | - ReadWriteOnce 12 | resources: 13 | requests: 14 | storage: 100Mi 15 | --- 16 | apiVersion: v1 17 | kind: PersistentVolumeClaim 18 | metadata: 19 | name: chatqna-megaservice-pvc-1 20 | namespace: opea-chatqna 21 | spec: 22 | accessModes: 23 | - ReadWriteOnce 24 | resources: 25 | requests: 26 | storage: 100Mi 27 | --- 28 | apiVersion: v1 29 | kind: PersistentVolumeClaim 30 | metadata: 31 | name: chatqna-megaservice-pvc-2 32 | namespace: opea-chatqna 33 | spec: 34 | accessModes: 35 | - ReadWriteOnce 36 | resources: 37 | requests: 38 | storage: 100Mi 39 | --- 40 | apiVersion: apps/v1 41 | kind: Deployment 42 | metadata: 43 | name: chatqna-megaservice 44 | namespace: opea-chatqna 45 | spec: 46 | selector: 47 | matchLabels: 48 | app: chatqna-megaservice 49 | replicas: 1 50 | template: 51 | metadata: 52 | labels: 53 | app: chatqna-megaservice 54 | spec: 55 | serviceAccount: opea-chatqna 56 | containers: 57 | - name: chatqna-megaservice 58 | image: 'image-registry.openshift-image-registry.svc:5000/opea-chatqna/chatqna-megaservice:latest' 59 | env: 60 | - name: EMBED_MODEL 61 | value: BAAI/bge-base-en-v1.5 62 | - name: 
HUGGINGFACEHUB_API_TOKEN 63 | valueFrom: 64 | secretKeyRef: 65 | key: HUGGINGFACEHUB_API_TOKEN 66 | name: hf-token 67 | - name: MODEL_SIZE 68 | value: 70b 69 | - name: PYTHONPATH 70 | value: $PYTHONPATH:/ws:/home/user:/home/user/qna-app/app 71 | - name: RAG_UPLOAD_DIR 72 | value: /upload_dir 73 | - name: REDIS_PORT 74 | value: "6379" 75 | - name: REDIS_HOST 76 | value: "redis-vector-db" 77 | - name: REDIS_SCHEMA 78 | value: schema_dim_768.yml 79 | - name: TGI_ENDPOINT 80 | value: http://xxx.xxx.xxx.xxx:xxx 81 | ports: 82 | - containerPort: 8000 83 | command: 84 | - /bin/bash 85 | - '-c' 86 | - | 87 | cd /ws && \ 88 | python ingest.py /ws/data_intel/ && \ 89 | cd /home/user/qna-app && \ 90 | python app/server.py 91 | volumeMounts: 92 | - mountPath: /ws 93 | name: chatqna-megaservice-pvc-0 94 | - mountPath: /test 95 | name: chatqna-megaservice-pvc-1 96 | - mountPath: /upload_dir 97 | name: chatqna-megaservice-pvc-2 98 | volumes: 99 | - name: chatqna-megaservice-pvc-0 100 | persistentVolumeClaim: 101 | claimName: chatqna-megaservice-pvc-0 102 | - name: chatqna-megaservice-pvc-1 103 | persistentVolumeClaim: 104 | claimName: chatqna-megaservice-pvc-1 105 | - name: chatqna-megaservice-pvc-2 106 | persistentVolumeClaim: 107 | claimName: chatqna-megaservice-pvc-2 108 | --- 109 | # Chatqna megaservice Service 110 | apiVersion: v1 111 | kind: Service 112 | metadata: 113 | name: chatqna-megaservice 114 | namespace: opea-chatqna 115 | spec: 116 | type: ClusterIP 117 | selector: 118 | app: chatqna-megaservice 119 | ports: 120 | - port: 8000 121 | targetPort: 8000 -------------------------------------------------------------------------------- /tests/l2/dsa/README.md: -------------------------------------------------------------------------------- 1 | ### Verify Intel® Data Streaming Accelerator (DSA) Technology provisioning 2 | This workload runs [accel-config](https://github.com/intel/idxd-config) sample tests using RedHat built and distributed accel-config RPM packages from the rhel-9-for-x86_64-baseos-rpms repo. Refer to the [accel config readme](https://github.com/intel/idxd-config/blob/stable/README.md) for more details. 3 | 4 | * Create the RedHat auth secret. Please replace the credentials in the secret yaml with your RedHat account login credentials. 
5 | 6 | ``` 7 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/rh_auth.yaml 8 | ``` 9 | 10 | * Build the workload container image 11 | 12 | ``` 13 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/dsa_imagestream.yaml 14 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/dsa_build.yaml 15 | ``` 16 | 17 | * Create SCC intel-dsa-scc for Intel DSA based workload, if this SCC is not created 18 | 19 | ``` 20 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_scc.yaml 21 | ``` 22 | 23 | * Create the intel-dsa serviceAccount, role and roleBinding to use intel-dsa-scc 24 | 25 | ``` 26 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_serviceAccount.yaml 27 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_role.yaml 28 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/security/dsa_roleBinding.yaml 29 | ``` 30 | 31 | * Deploy the accel-config workload job with intel-dsa service account 32 | 33 | ``` 34 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/tests/l2/dsa/dsa_job.yaml 35 | ``` 36 | 37 | * Check the results. 38 | ``` 39 | $ oc get pods -n intel-dsa 40 | intel-dsa-workload-244xm 0/1 Completed 0 3m12s 41 | ``` 42 | 43 | * sample test logs 44 | ``` 45 | $ oc logs intel-dsa-workload-244xm -n intel-dsa 46 | dsa0/wq0.1 47 | dsa0 48 | Testing with 'block on fault' flag ON 49 | Performing dedicated WQ NOOP testing 50 | Testing 1 bytes 51 | [ info] alloc wq 1 dedicated size 16 addr 0x7f0cde00b000 batch sz 0x400 xfer sz 0x80000000 52 | [ info] testnoop: tflags 0x1 num_desc 1 53 | [ info] preparing descriptor for noop 54 | [ info] Submitted all noop jobs 55 | [ info] verifying task result for 0x2041620 56 | [ info] test with op 0 passed 57 | Testing 4096 bytes 58 | [ info] alloc wq 1 dedicated size 16 addr 0x7fd4881da000 batch sz 0x400 xfer sz 0x80000000 59 | [ info] testnoop: tflags 0x1 num_desc 1 60 | [ info] preparing descriptor for noop 61 | [ info] Submitted all noop jobs 62 | [ info] verifying task result for 0x82f620 63 | [ info] test with op 0 passed 64 | Testing 65536 bytes 65 | [ info] alloc wq 1 dedicated size 16 addr 0x7f462bbed000 batch sz 0x400 xfer sz 0x80000000 66 | [ info] testnoop: tflags 0x1 num_desc 1 67 | [ info] preparing descriptor for noop 68 | [ info] Submitted all noop jobs 69 | [ info] verifying task result for 0xe4e620 70 | [ info] test with op 0 passed 71 | Testing 1048576 bytes 72 | [ info] alloc wq 1 dedicated size 16 addr 0x7fac2ac0c000 batch sz 0x400 xfer sz 0x80000000 73 | [ info] testnoop: tflags 0x1 num_desc 1 74 | [ info] preparing descriptor for noop 75 | [ info] Submitted all noop jobs 76 | [ info] verifying task result for 0xf21620 77 | [ info] test with op 0 passed 78 | Testing 2097152 bytes 79 | [ info] alloc wq 1 dedicated size 16 addr 0x7f7426a5c000 batch sz 0x400 xfer sz 0x80000000 80 | [ info] testnoop: tflags 0x1 num_desc 1 81 | [ info] preparing descriptor for noop 82 | [ info] Submitted all noop jobs 83 | [ info] verifying task result for 0xeec620 84 | [ info] test with op 0 passed 85 | Performing shared WQ NOOP testing 86 | ``` 87 | 
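Once provisioning is verified, application pods can claim the DSA work queues exposed by the device plugin. A minimal sketch of a pod spec fragment, assuming the shared work-queue resource `dsa.intel.com/wq-user-shared` shown in [deploy_dsa.md](/device_plugins/deploy_dsa.md) and reusing the `intel-dsa` service account from above; the image is a placeholder:
```
spec:
  serviceAccountName: intel-dsa
  containers:
  - name: dsa-app
    image: registry.example.com/dsa-workload:latest # placeholder image
    resources:
      limits:
        dsa.intel.com/wq-user-shared: 1
      requests:
        dsa.intel.com/wq-user-shared: 1
```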
-------------------------------------------------------------------------------- /gaudi/gaudi_cluster_policy.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Adapted from https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Kubernetes_Installation/Kubernetes_Operator.html#id2 4 | # 5 | apiVersion: habanalabs.habana.ai/v1 6 | kind: ClusterPolicy 7 | metadata: 8 | name: habana-ai 9 | spec: 10 | image_registry: vault.habana.ai 11 | driver: 12 | driver_loader: 13 | images: 14 | ubuntu_22.04: 15 | repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer 16 | tag: 1.19.1-26 17 | rhel_8.6: 18 | repository: vault.habana.ai/habana-ai-operator/driver/rhel8.6/driver-installer 19 | tag: 1.19.1-26 20 | rhel_9.2: 21 | repository: vault.habana.ai/habana-ai-operator/driver/rhel9.2/driver-installer 22 | tag: 1.19.1-26 23 | rhel_9.4: 24 | repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer 25 | tag: 1.19.1-26 26 | tencentos_3.1: 27 | repository: vault.habana.ai/habana-ai-operator/driver/tencentos3.1/driver-installer 28 | tag: 1.19.1-26 29 | resources: 30 | limits: 31 | cpu: cpu_str_or_int_optional 32 | memory: memory_str_optional 33 | requests: 34 | cpu: cpu_str_or_int_optional 35 | memory: memory_str_optional 36 | repo_server: vault.habana.ai 37 | repo_path: artifactory/gaudi-installer/repos 38 | mlnx_ofed_repo_path: artifactory/gaudi-installer/deps 39 | mlnx_ofed_version: mlnx-ofed-5.8-2.0.3.0-rhel8.4-x86_64.tar.gz 40 | hugepages: hugepages_number_int_optional 41 | external_ports: turn_on_external_port_bool_optional 42 | firmware_flush: flush_firmware_on_the_gaudi_cards_bool_optional 43 | driver_runner: 44 | image: 45 | repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer 46 | tag: 1.19.1-26 47 | resources: 48 | limits: 49 | cpu: cpu_str_or_int_optional 50 | memory: memory_str_optional 51 | requests: 52 | cpu: cpu_str_or_int_optional 53 | memory: memory_str_optional 54 | device_plugin: 55 | image: 56 | repository: vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin 57 | tag: 1.19.1 58 | resources: 59 | limits: 60 | cpu: cpu_str_or_int_optional 61 | memory: memory_str_optional 62 | requests: 63 | cpu: cpu_str_or_int_optional 64 | memory: memory_str_optional 65 | runtime: 66 | runner: 67 | image: 68 | repository: vault.habana.ai/habana-ai-operator/habana-container-runtime 69 | tag: 1.19.1-26 70 | resources: 71 | limits: 72 | cpu: cpu_str_or_int_optional 73 | memory: memory_str_optional 74 | requests: 75 | cpu: cpu_str_or_int_optional 76 | memory: memory_str_optional 77 | configuration: 78 | container_engine: one_of_containerd_docker_crio 79 | engine_container_runtime_configuration: container_engine_configuration_optional 80 | habana_container_runtime_configuration: container_runtime_configuration_optional 81 | metric_exporter: 82 | runner: 83 | image: 84 | repository: vault.habana.ai/gaudi-metric-exporter/metric-exporter 85 | tag: 1.19.1-26 86 | resources: 87 | limits: 88 | cpu: cpu_str_or_int_optional 89 | memory: memory_str_optional 90 | requests: 91 | cpu: cpu_str_or_int_optional 92 | memory: memory_str_optional 93 | port: 41611 94 | interval: 20 95 | feature_discovery: 96 | runner: 97 | image: 98 | repository: vault.habana.ai/habana-ai-operator/habanalabs-feature-discovery 99 | tag: 1.19.1-26 100 | resources: 101 | limits: 102 | cpu: cpu_str_or_int_optional 103 | 
memory: memory_str_optional 104 | requests: 105 | cpu: cpu_str_or_int_optional 106 | memory: memory_str_optional 107 | nfd_plugin: boolean_nfd_installed 108 | bmc_monitoring: 109 | image: 110 | repository: vault.habana.ai/habana-bmc-exporter/bmc-exporter 111 | tag: 1.19.1-26 112 | resources: 113 | limits: 114 | cpu: cpu_str_or_int_optional 115 | memory: memory_str_optional 116 | requests: 117 | cpu: cpu_str_or_int_optional 118 | memory: memory_str_optional 119 | node_selector: 120 | key_optional: value_optional -------------------------------------------------------------------------------- /device_plugins/README.md: -------------------------------------------------------------------------------- 1 | # Setting up Intel Device Plugins Operator 2 | 3 | ## Overview 4 | Intel Device Plugins are utilized to advertise Intel hardware features (resources) to a Red Hat OpenShift Container Platform (RHOCP) Cluster. This allows workloads running on pods deployed within the clusters to leverage these features. To handle the deployment and lifecycle of these device plugins, the [Intel Device Plugins Operator](https://catalog.redhat.com/software/container-stacks/detail/61e9f2d7b9cdd99018fc5736) is used. The Intel Device Plugins container images and operator have been officially certified and published on the [Red Hat Ecosystem Catalog](https://catalog.redhat.com/software/container-stacks/detail/61e9f2d7b9cdd99018fc5736). For more details on the upstream project, please refer to [Intel Device Plugins for Kubernetes](https://github.com/intel/intel-device-plugins-for-kubernetes). 5 | 6 | ## Prerequisites 7 | - Provisioned RHOCP cluster. Follow steps [here](/README.md). 8 | - Set up Node Feature Discovery (NFD). Follow steps [here](/nfd/README.md). 9 | - Follow the additional prerequisites for provisioning Intel® Data Center GPU: 10 | - Set up out-of-tree drivers for Intel Data Center GPU provisioning. Follow the steps listed [here](/kmmo/README.md). 11 | - Follow the additional prerequisites for provisioning Intel® QuickAssist Technology: 12 | - Configure MCO for provisioning Intel QAT. Follow steps [here](/machine_configuration/README.md#machine-configuration-for-provisioning-intel-qat). 13 | 14 | ## Install Intel Device Plugins Operator on Red Hat OpenShift 15 | ### Installation via web console 16 | Follow the steps below to install Intel Device Plugins Operator using the OpenShift web console: 17 | 1. In the OpenShift web console, navigate to **Operator** -> **OperatorHub**. 18 | 2. Search for **Intel Device Plugins Operator** in the all items field -> Click **Install**. 19 | ### Verify Installation via web console 20 | 1. Go to **Operator** -> **Installed Operators**. 21 | 2. Verify that the status of the operator is **Succeeded**. 22 | 23 | ### Installation via command line interface (CLI) 24 | Apply the [install_operator.yaml](/device_plugins/install_operator.yaml) file: 25 | ``` 26 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/install_operator.yaml 27 | ``` 28 | 29 | ### Verify Installation via CLI 30 | Verify that the operator controller manager pod is up and running: 31 | ``` 32 | $ oc get pod | grep inteldeviceplugins-controller-manager 33 | 34 | inteldeviceplugins-controller-manager-6b8c76c867-hftqm 2/2 Running 0 17m 35 | ``` 36 | 37 | ## Resources Provided by Intel Device Plugins 38 | The resources are the user interface for customers to claim and consume the hardware features provided by Intel Device Plugins from the user pods.
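For example, a user pod claims a feature by requesting the corresponding resource in its container spec; the sketch below is a minimal, hypothetical example (pod name and image are placeholders):

```
apiVersion: v1
kind: Pod
metadata:
  name: gpu-workload-sample          # illustrative name
spec:
  restartPolicy: Never
  containers:
    - name: workload
      image: <your-workload-image>   # placeholder image
      resources:
        limits:
          gpu.intel.com/i915: 1      # claim one Intel Data Center GPU
```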
See the table below for the details: 39 | 40 | | Feature | Resources | Description | Usage | 41 | | ------- | --------- | ----------- | ----- | 42 | | Intel® SGX | `sgx.intel.com/epc` | Intel SGX EPC memory for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/64a6c86f3be25459c14ea988e892f9f5d873a8ca/tests/l2/sgx/sgx_job.yaml#L21) | 43 | | Intel® Data Center GPU Flex Series <br> Intel® Data Center GPU Max Series | `gpu.intel.com/i915` | Intel Data Center GPU Card for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/device_plugins/deploy_gpu.md#using-intel-data-center-gpu-resource-exclusively) | 44 | | Intel® QAT | `qat.intel.com/cy` <br> `qat.intel.com/dc` | `cy`: Intel QAT VFIO Virtual Function device configured for cryptography for user pod to claim <br> `dc`: Intel QAT VFIO Virtual Function device configured for compression for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/tests/l2/qat/qatlib_job.yaml#L24) <br> [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/tests/l2/qat/qatlib_job.yaml#L28) | 45 | | Intel® DSA | `dsa.intel.com/wq-user-shared` <br>
`dsa.intel.com/wq-user-dedicated` | Intel DSA Work Queue for user pod to claim | [Link](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/tests/l2/dsa/dsa_job.yaml#L27) | 46 | 47 | 48 | ## Creating Intel Device Plugin custom resource (CR) 49 | - To create an Intel SGX device plugin CR, follow this [link](/device_plugins/deploy_sgx.md). 50 | - To create an Intel GPU device plugin CR, follow this [link](/device_plugins/deploy_gpu.md). 51 | - To create an Intel QAT device plugin CR, follow this [link](/device_plugins/deploy_qat.md). 52 | - To create an Intel DSA device plugin CR, follow this [link](/device_plugins/deploy_dsa.md). 53 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Documentation 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - release-v1.3.0 7 | - release-v1.3.1 8 | - release-v1.4.0 9 | - release-v1.5.0 10 | - release-v1.5.1 11 | - release-v1.5.2 12 | - release-v1.6.0 13 | - release-v1.6.1 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | build: 20 | permissions: 21 | contents: write # for Git to git push 22 | runs-on: ubuntu-22.04 23 | 24 | steps: 25 | - name: Install dependencies 26 | run: | 27 | sudo apt-get update 28 | sudo apt-get install -y python3-venv git 29 | - uses: actions/checkout@v4 30 | with: 31 | fetch-depth: 0 32 | ref: main 33 | - name: Set up doc directory 34 | run: | 35 | mkdir $HOME/output 36 | touch $HOME/output/.nojekyll 37 | echo "" >"$HOME/output/index.html" 38 | - name: Build devel 39 | run: | 40 | export GITHUB_SHA=$(git rev-parse HEAD) 41 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 42 | rm -rf _work/venv 43 | make vhtml 44 | mv _build/html $HOME/output/development 45 | - uses: actions/checkout@v4 46 | with: 47 | fetch-depth: 0 48 | ref: release-v1.3.0 49 | - name: Build release-v1.3.0 50 | run: | 51 | export GITHUB_SHA=$(git rev-parse HEAD) 52 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 53 | rm -rf _work/venv 54 | make vhtml 55 | mv _build/html $HOME/output/v1.3.0 56 | - uses: actions/checkout@v4 57 | with: 58 | fetch-depth: 0 59 | ref: release-v1.3.1 60 | - name: Build release-v1.3.1 61 | run: | 62 | export GITHUB_SHA=$(git rev-parse HEAD) 63 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 64 | rm -rf _work/venv 65 | make vhtml 66 | mv _build/html $HOME/output/v1.3.1 67 | - uses: actions/checkout@v4 68 | with: 69 | fetch-depth: 0 70 | ref: release-v1.4.0 71 | - name: Build release-v1.4.0 72 | run: | 73 | export GITHUB_SHA=$(git rev-parse HEAD) 74 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 75 | rm -rf _work/venv 76 | make vhtml 77 | mv _build/html $HOME/output/v1.4.0 78 | - uses: actions/checkout@v4 79 | with: 80 | fetch-depth: 0 81 | ref: release-v1.5.0 82 | - name: Build release-v1.5.0 83 | run: | 84 | export GITHUB_SHA=$(git rev-parse HEAD) 85 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 86 | rm -rf _work/venv 87 | make vhtml 88 | mv _build/html $HOME/output/v1.5.0 89 | - uses: actions/checkout@v4 90 | with: 91 | fetch-depth: 0 92 | ref: release-v1.5.1 93 | - name: Build release-v1.5.1 94 | run: | 95 | export GITHUB_SHA=$(git rev-parse HEAD) 96 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 97 | rm -rf _work/venv 98 | make vhtml 99 | mv _build/html $HOME/output/v1.5.1 100 | - uses: actions/checkout@v4 101 | with: 102 | fetch-depth: 0 103 | ref: release-v1.5.2 104 | 
- name: Build release-v1.5.2 105 | run: | 106 | export GITHUB_SHA=$(git rev-parse HEAD) 107 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 108 | rm -rf _work/venv 109 | make vhtml 110 | mv _build/html $HOME/output/v1.5.2 111 | - uses: actions/checkout@v4 112 | with: 113 | fetch-depth: 0 114 | ref: release-v1.6.0 115 | - name: Build release-v1.6.0 116 | run: | 117 | export GITHUB_SHA=$(git rev-parse HEAD) 118 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 119 | rm -rf _work/venv 120 | make vhtml 121 | mv _build/html $HOME/output/v1.6.0 122 | - uses: actions/checkout@v4 123 | with: 124 | fetch-depth: 0 125 | ref: release-v1.6.1 126 | - name: Build release-v1.6.1 127 | run: | 128 | export GITHUB_SHA=$(git rev-parse HEAD) 129 | export GITHUB_SHA_REF=$(git rev-parse --abbrev-ref HEAD) 130 | rm -rf _work/venv 131 | make vhtml 132 | mv _build/html $HOME/output/v1.6.1 133 | - name: Deploy the docs 134 | shell: bash 135 | env: 136 | GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} 137 | run: | 138 | cd $HOME/output 139 | git init 140 | git config --global user.name "${GITHUB_ACTOR}" 141 | git config --global user.email "${GITHUB_ACTOR}@github.com" 142 | git add . 143 | git commit -m "latest html output" 144 | git push -f https://${GITHUB_ACTOR}:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git HEAD:gh-pages -------------------------------------------------------------------------------- /tests/gaudi/l2/vllm_buildconfig.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | apiVersion: image.openshift.io/v1 5 | kind: ImageStream 6 | metadata: 7 | name: vllm-workload 8 | namespace: gaudi-validation 9 | spec: {} 10 | --- 11 | apiVersion: build.openshift.io/v1 12 | kind: BuildConfig 13 | metadata: 14 | name: vllm-workload 15 | namespace: gaudi-validation 16 | spec: 17 | triggers: 18 | - type: "ConfigChange" 19 | - type: "ImageChange" 20 | runPolicy: "Serial" 21 | source: 22 | type: Dockerfile 23 | git: 24 | uri: https://github.com/HabanaAI/vllm-fork.git 25 | ref: v1.19.1 26 | dockerfile: | 27 | ARG BASE_IMAGE=vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26 28 | FROM ${BASE_IMAGE} as habana-base 29 | 30 | USER root 31 | 32 | ENV VLLM_TARGET_DEVICE="hpu" 33 | ENV HABANA_SOFTWARE_VERSION="1.19.1" 34 | 35 | RUN dnf -y update --best --allowerasing --skip-broken && dnf clean all 36 | 37 | WORKDIR /workspace 38 | 39 | ## Python Installer ################################################################# 40 | FROM habana-base as python-install 41 | 42 | ARG PYTHON_VERSION=3.11 43 | 44 | ENV VIRTUAL_ENV=/opt/vllm 45 | ENV PATH="$VIRTUAL_ENV/bin:$PATH" 46 | RUN dnf install -y --setopt=install_weak_deps=0 --nodocs \ 47 | python${PYTHON_VERSION}-wheel && \ 48 | python${PYTHON_VERSION} -m venv $VIRTUAL_ENV --system-site-packages && pip install --no-cache -U pip wheel && dnf clean all 49 | 50 | ## Python Habana base ################################################################# 51 | FROM python-install as python-habana-base 52 | 53 | ENV VIRTUAL_ENV=/opt/vllm 54 | ENV PATH="$VIRTUAL_ENV/bin:$PATH" 55 | 56 | # install Habana Software and common dependencies 57 | RUN --mount=type=cache,target=/root/.cache/pip \ 58 | --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ 59 | --mount=type=bind,source=requirements-hpu.txt,target=requirements-hpu.txt \ 60 | pip install \ 61 | -r requirements-hpu.txt 62 | 63 | ## 
Builder ##################################################################### 64 | FROM python-habana-base AS build 65 | 66 | # install build dependencies 67 | 68 | # copy input files 69 | COPY csrc csrc 70 | COPY setup.py setup.py 71 | COPY cmake cmake 72 | COPY CMakeLists.txt CMakeLists.txt 73 | COPY requirements-common.txt requirements-common.txt 74 | COPY requirements-hpu.txt requirements-hpu.txt 75 | COPY pyproject.toml pyproject.toml 76 | 77 | # max jobs used by Ninja to build extensions 78 | ARG max_jobs=2 79 | ENV MAX_JOBS=${max_jobs} 80 | # # make sure punica kernels are built (for LoRA) 81 | # HPU currently doesn't support LoRA 82 | # ENV VLLM_INSTALL_PUNICA_KERNELS=1 83 | 84 | # Copy the entire directory before building wheel 85 | COPY vllm vllm 86 | 87 | ENV CCACHE_DIR=/root/.cache/ccache 88 | RUN --mount=type=cache,target=/root/.cache/ccache \ 89 | --mount=type=cache,target=/root/.cache/pip \ 90 | --mount=type=bind,src=.git,target=/workspace/.git \ 91 | env CFLAGS="-march=haswell" \ 92 | CXXFLAGS="$CFLAGS $CXXFLAGS" \ 93 | CMAKE_BUILD_TYPE=Release \ 94 | python3 setup.py bdist_wheel --dist-dir=dist 95 | 96 | ## Release ##################################################################### 97 | FROM python-install AS vllm-openai 98 | 99 | WORKDIR /workspace 100 | 101 | ENV VIRTUAL_ENV=/opt/vllm 102 | ENV PATH=$VIRTUAL_ENV/bin/:$PATH 103 | 104 | # Triton needs a CC compiler 105 | RUN dnf install -y --setopt=install_weak_deps=0 --nodocs gcc \ 106 | && dnf clean all 107 | 108 | # install vllm wheel first, so that torch etc will be installed 109 | RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ 110 | --mount=type=cache,target=/root/.cache/pip \ 111 | pip install $(echo dist/*.whl)'[tensorizer]' --verbose 112 | 113 | ENV HF_HUB_OFFLINE=1 \ 114 | PORT=8000 \ 115 | HOME=/home/vllm \ 116 | VLLM_USAGE_SOURCE=production-docker-image 117 | 118 | # setup non-root user for OpenShift 119 | # In OpenShift the user ID is randomly assigned, for compatibility we also 120 | # set up a non-root user here. 121 | RUN umask 002 \ 122 | && useradd --uid 2000 --gid 0 vllm \ 123 | && chmod g+rwx $HOME /usr/src /workspace 124 | 125 | COPY LICENSE /licenses/vllm.md 126 | 127 | USER 2000 128 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] 129 | strategy: 130 | type: Docker 131 | noCache: true 132 | dockerStrategy: 133 | buildArgs: 134 | - name: "BASE_IMAGE" 135 | value: "vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26" 136 | output: 137 | to: 138 | kind: ImageStreamTag 139 | name: vllm-workload:latest -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'Intel® Technology Enabling for OpenShift*' 22 | copyright = '2024, Intel® Corporation' 23 | author = 'Intel® Corporation' 24 | 25 | # The short X.Y version 26 | # version = 'devel' 27 | # The full version, including alpha/beta/rc tags 28 | # release = 'GA' 29 | 30 | 31 | # --------------------------------- 32 | # Reference for sphinx_md : https://pypi.org/project/sphinx-md/ 33 | # --------------------------------- 34 | from os import getenv 35 | 36 | baseBranch = "main" 37 | sphinx_md_useGitHubURL = True 38 | commitSHA = getenv('GITHUB_SHA') 39 | githubBaseURL = 'https://github.com/' + (getenv('GITHUB_REPOSITORY') or 'intel/intel-technology-enabling-for-openshift') + '/' 40 | githubFileURL = githubBaseURL + "blob/" 41 | githubDirURL = githubBaseURL + "tree/" 42 | if commitSHA: 43 | githubFileURL = githubFileURL + commitSHA + "/" 44 | githubDirURL = githubDirURL + commitSHA + "/" 45 | else: 46 | githubFileURL = githubFileURL + baseBranch + "/" 47 | githubDirURL = githubDirURL + baseBranch + "/" 48 | sphinx_md_githubFileURL = githubFileURL 49 | sphinx_md_githubDirURL = githubDirURL 50 | 51 | # Version displayed in the upper left corner 52 | # This value is set in the github workflow environment 53 | commitREF = getenv('GITHUB_SHA_REF', default = "unknown") 54 | if commitREF.startswith("release-"): 55 | version = commitREF[len("release-"):].strip() 56 | else: 57 | version = "development" 58 | 59 | 60 | # Versions list with URLs using tags displayed in the lower left corner 61 | from git import Repo 62 | versions_to_exclude = set(['v1.0.0', 'v1.0.1','v1.1.0', 'v1.2.0', 'v1.2.1']) 63 | repo = Repo( search_parent_directories=True ) 64 | github_repo = "/intel-technology-enabling-for-openshift/" 65 | release_versions = [("development", github_repo)] 66 | tags = reversed([tag.name for tag in repo.tags]) 67 | release_versions.extend((str(tag), github_repo + tag) for tag in tags if str(tag) not in versions_to_exclude) 68 | 69 | # -- General configuration --------------------------------------------------- 70 | 71 | # If your documentation needs a minimal Sphinx version, state it here. 72 | # 73 | # needs_sphinx = '1.0' 74 | 75 | # Add any Sphinx extension module names here, as strings. They can be 76 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 77 | # ones. 78 | extensions = ['myst_parser', 'sphinx_md', ] 79 | myst_heading_anchors = 5 80 | # myst_enable_extensions = [ 81 | # "html_admonition", 82 | # ] 83 | # Add any paths that contain templates here, relative to this directory. 84 | templates_path = ['_templates'] 85 | 86 | # The suffix(es) of source filenames. 87 | # You can specify multiple suffix as a list of string: 88 | # 89 | source_suffix = ['.rst', '.md'] 90 | 91 | # List of patterns, relative to source directory, that match files and 92 | # directories to ignore when looking for source files. 93 | # This pattern also affects html_static_path and html_extra_path. 94 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 95 | 96 | # -- Options for HTML output ------------------------------------------------- 97 | 98 | # The theme to use for HTML and HTML Help pages. See the documentation for 99 | # a list of builtin themes. 
100 | # 101 | html_theme = 'sphinx_rtd_theme' 102 | html_title = "Intel® Technology Enabling for OpenShift*" 103 | # Theme options are theme-specific and customize the look and feel of a theme 104 | # further. For a list of options available for each theme, see the 105 | # documentation. 106 | # 107 | html_theme_options = { 108 | "display_version": True, 109 | } 110 | 111 | html_context = { 112 | 'display_github': True, 113 | 'github_host': 'github.com', 114 | 'github_user': 'intel', 115 | 'github_repo': 'intel-technology-enabling-for-openshift', 116 | 'github_version': 'main/', 117 | 'versions_menu': True, 118 | 'version': version, 119 | 'versions': release_versions, 120 | } 121 | html_css_files = [ 122 | 'custom.css', 123 | ] 124 | 125 | # Add any paths that contain custom static files (such as style sheets) here, 126 | # relative to this directory. They are copied after the builtin static files, 127 | # so a file named "default.css" will overwrite the builtin "default.css". 128 | 129 | html_static_path = ['_static'] 130 | 131 | 132 | # Custom sidebar templates, must be a dictionary that maps document names 133 | # to template names. 134 | # 135 | # The default sidebars (for documents that don't match any pattern) are 136 | # defined by theme itself. Builtin themes are using these templates by 137 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 138 | # 'searchbox.html']``. 139 | # 140 | # html_sidebars = {} 141 | 142 | 143 | # -- Options for HTMLHelp output --------------------------------------------- 144 | 145 | # Output file base name for HTML help builder. 146 | htmlhelp_basename = 'IntelTechnologyEnablingforOpenShiftdoc' -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CommunityCodeOfConduct AT intel DOT com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | -------------------------------------------------------------------------------- /gaudi/README.md: -------------------------------------------------------------------------------- 1 | # Setting up Intel Gaudi AI Accelerator Operator 2 | 3 | ## Overview 4 | [Intel Gaudi AI Accelerator Operator](https://catalog.redhat.com/software/container-stacks/detail/6683b2cce45daa25e36bddcb) is used to provision the Intel Gaudi Accelerator with OpenShift. The steps and yaml files mentioned in this document to provision the Gaudi accelerator are based on [Intel Gaudi AI Accelerator Operator for OpenShift](https://docs.habana.ai/en/latest/Orchestration/Intel_Gaudi_Base_Operator/index.html). 5 | 6 | If you are familiar with the steps here to manually provision the accelerator, the Red Hat certified Operator and Ansible based [One-Click](/one_click/README.md#reference-playbook-–-habana-gaudi-provisioning) solution can be used as a reference to provision the accelerator automatically. 7 | 8 | ## Prerequisites 9 | - To provision the RHOCP cluster, follow the steps [here](/README.md#provisioning-rhocp-cluster). 10 | 11 | ## Install Intel Gaudi AI Accelerator Operator on Red Hat OpenShift 12 | ### Installation via web console 13 | Follow the steps below to install Intel Gaudi AI Accelerator Operator using the OpenShift web console: 14 | 1. In the OpenShift web console, navigate to **Operator** -> **OperatorHub**. 15 | 2. Search for **Intel Gaudi AI Accelerator Operator** in the all items field -> Click **Install**. 16 | ### Verify Installation via web console 17 | 1.
Go to **Operator** -> **Installed Operators**. 18 | 2. Verify that the status of the operator is **Succeeded**. 19 | 20 | ### Installation via Command Line Interface (CLI) 21 | ``` 22 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_install_operator.yaml 23 | ``` 24 | 25 | ### Verify Installation via CLI 26 | Verify that the operator controller manager pod is up and running: 27 | ``` 28 | oc get pods -n habana-ai-operator 29 | 30 | NAME READY STATUS RESTARTS AGE 31 | controller-manager-6c8459d9cb-fqs8h 2/2 Running 0 25m 32 | ``` 33 | 34 | ## Creating Intel Gaudi AI Accelerator Operator ClusterPolicy Instance 35 | To create the Habana Gaudi `ClusterPolicy` CR, follow the steps below. 36 | 37 | ### Create CR via web console 38 | 1. Go to **Operator** -> **Installed Operators**. 39 | 2. Open **Intel Gaudi AI Accelerator Operator**. 40 | 3. Navigate to tab **Cluster Policy**. 41 | 4. Click **Create ClusterPolicy** -> set correct parameters -> Click **Create**. To set the correct parameters, please refer to [Using RedHat OpenShift Container Platform Console](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Kubernetes_Installation/Kubernetes_Operator.html#id1). 42 | 43 | ### Verify via web console 44 | 1. Verify CR by checking the status of **Workloads** -> **DaemonSet** -> **habana-ai-device-plugin-ds**, **habana-ai-driver-rhel-9-4-xxxxx**, **habana-ai-feature-discovery-ds**, **habana-ai-metric-exporter-ds**, **habana-ai-runtime-ds**. 45 | 2. Now `ClusterPolicy` is created. 46 | 47 | ### Create CR via CLI 48 | Apply the CR yaml file: 49 | ``` 50 | oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/gaudi/gaudi_cluster_policy.yaml 51 | ``` 52 | 53 | ### Verify the ClusterPolicy CR is created 54 | You can use the command below to verify that the `ClusterPolicy` CR has been created: 55 | ``` 56 | oc get pod -n habana-ai-operator 57 | 58 | NAME READY STATUS RESTARTS AGE 59 | habana-ai-device-plugin-ds-thj7b 1/1 Running 0 10d 60 | habana-ai-driver-rhel-9-4-416-94-202412170927-0-ds-vqhzb 1/1 Running 2 10d 61 | habana-ai-feature-discovery-ds-ztl2j 1/1 Running 5 10d 62 | habana-ai-metric-exporter-ds-g5qqh 1/1 Running 0 10d 63 | habana-ai-operator-controller-manager-6c995b5646-wl7cp 2/2 Running 0 10d 64 | habana-ai-runtime-ds-x49lf 1/1 Running 0 10d 65 | ``` 66 | Alternatively, you can check the status of the `ClusterPolicy` CR like below: 67 | ``` 68 | oc describe ClusterPolicy habana-ai -n habana-ai-operator 69 | 70 | Name: habana-ai 71 | Namespace: habana-ai-operator 72 | . 73 | . 74 | Status: 75 | Conditions: 76 | Last Transition Time: 2025-01-21T18:50:46Z 77 | Message: All resources have been successfully reconciled 78 | Reason: Reconciled 79 | Status: True 80 | ``` 81 | ## Verify Gaudi Provisioning 82 | After the `ClusterPolicy` instance CR is created, it will take some time for the operator to download the Gaudi OOT driver source code and build it on premises with the help of the KMM operator. The OOT driver module binaries will be loaded into the RHCOS kernel on each node with Gaudi cards labeled by feature discovery. Then, the Gaudi device plugin can advertise the Gaudi resources listed in the table below for the pods on OpenShift to use.
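A user pod claims a card by requesting the `habana.ai/gaudi` resource; the sketch below is a minimal, hypothetical example (pod name and image are placeholders):

```
apiVersion: v1
kind: Pod
metadata:
  name: gaudi-workload-sample        # illustrative name
spec:
  restartPolicy: Never
  containers:
    - name: workload
      image: <your-gaudi-image>      # placeholder image
      resources:
        limits:
          habana.ai/gaudi: 1         # claim one Gaudi card
```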
Run the command below to check the availability of Gaudi resources: 83 | ``` 84 | oc describe node | grep habana.ai/gaudi 85 | 86 | habana.ai/gaudi: 8 -> number of Gaudi cards on the cluster 87 | habana.ai/gaudi: 8 -> number of Gaudi cards allocatable on the cluster 88 | habana.ai/gaudi 4 4 -> number of Gaudi cards allocated and number of Gaudi cards available 89 | ``` 90 | 91 | To view the metrics on a node with a Gaudi card, refer to [Collecting Metrics](https://docs.habana.ai/en/latest/Orchestration/Prometheus_Metric_Exporter.html?highlight=metrics#collecting-metrics). 92 | 93 | ## Resources Provided by Habana Gaudi Device Plugin 94 | The resources provided are the user interface for customers to claim and consume the hardware features from the user pods. See the table below for the details: 95 | 96 | | Feature | Resources | Description | 97 | | ------- | --------- | ----------- | 98 | | Habana Gaudi | `habana.ai/gaudi` | Number of Habana Gaudi Card resources ready to claim | 99 | 100 | ## Upgrade Intel Gaudi SPI Firmware 101 | Refer to [Upgrade Intel Gaudi SPI Firmware](/gaudi/Gaudi-SPI-Firmware-Upgrade.md) to upgrade the SPI Firmware on Intel Gaudi. -------------------------------------------------------------------------------- /workloads/opea/chatqna/README.md: -------------------------------------------------------------------------------- 1 | # Deploy OPEA ChatQnA workload on OCP 2 | 3 | ## Overview 4 | The workload is based on the [OPEA ChatQnA Application](https://github.com/opea-project/GenAIExamples/tree/v0.8/ChatQnA) running on the Intel® Gaudi Accelerator with OpenShift and OpenShift AI. Refer to the [OPEA Generative AI Examples](https://github.com/opea-project/GenAIExamples/tree/v0.8) for more details about the OPEA workloads. 5 | 6 | **Note**: It is still under heavy development, and updates are expected. 7 | 8 | ## Prerequisites 9 | * Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster) 10 | * Persistent storage using NFS is ready. Refer to the [documentation](https://docs.openshift.com/container-platform/4.16/storage/persistent_storage/persistent-storage-nfs.html) for the details to set it up. 11 | 12 | **Note**: Refer to the [documentation](https://docs.openshift.com/container-platform/4.16/storage/index.html) for setting up other types of persistent storage. 13 | * Provisioned Intel Gaudi accelerator on RHOCP cluster. Follow steps [here](/gaudi/README.md) 14 | * RHOAI is installed. Follow steps [here](/e2e/inference/README.md/#install-rhoai) 15 | * The Intel Gaudi AI accelerator is enabled with RHOAI. Follow steps [here](/e2e/inference/README.md/#enable-intel-gaudi-ai-accelerator-with-rhoai) 16 | * Minio-based S3 service ready for RHOAI. Follow steps [here](https://ai-on-openshift.io/tools-and-applications/minio/minio/#create-a-matching-data-connection-for-minio) 17 | 18 | ## Deploy Model Serving for OPEA ChatQnA Microservices with RHOAI 19 | 20 | ### Create OpenShift AI Data Science Project 21 | 22 | * Click ```Search -> Routes -> rhods-dashboard``` from the OCP web console and launch the RHOAI dashboard. 23 | 24 | * From the dashboard, click ```Data Science Projects``` to create a project. For example, ```OPEA-chatqna-modserving```. 25 | 26 | ### Preload the models 27 | 28 | * Refer to this [link](https://huggingface.co/docs/hub/en/models-downloading) and download the model [Llama2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf).
29 | 30 | * Refer to this [link](https://ai-on-openshift.io/tools-and-applications/minio/minio/#create-a-matching-data-connection-for-minio) and upload the model to the Minio/S3 storage. 31 | 32 | * Click ```OPEA-chatqna-modserving```, and choose the ```Data Connection``` section. In the fields, add your access and secret keys from Minio. Follow this [link](https://ai-on-openshift.io/tools-and-applications/minio/minio/#create-a-matching-data-connection-for-minio). 33 | 34 | ### Launch the Model Serving with Intel Gaudi AI Accelerator 35 | 36 | * Click on Settings and choose ```ServingRuntime```. Copy or import the [tgi_gaudi_servingruntime.yaml](tgi_gaudi_servingruntime.yaml). The [tgi-gaudi](https://github.com/huggingface/tgi-gaudi) serving runtime is used. Refer to the image below. 37 | 38 | ![Alt text](/docs/images/tgi-serving-runtime.png) 39 | 40 | * In the project ```OPEA-chatqna-modserving```, go to the ```Models``` section and follow the image below. 41 | 42 | ![Alt text](/docs/images/rhoai-deploy-model.png) 43 | 44 | * The model server is now in the creation state. Once ready, the status will be updated to green and the inference endpoint can be seen. Refer to the image below. 45 | 46 | ![Alt text](/docs/images/model-server-status.png) 47 | 48 | ## Deploy ChatQnA Megaservice and Database 49 | 50 | ### Create namespace 51 | 52 | ``` 53 | oc create namespace opea-chatqna 54 | ``` 55 | 56 | ### Create persistent volumes 57 | NFS is used to create the Persistent Volumes for the ChatQnA MegaService to claim and use. 58 | 59 | Make sure to update the NFS server IP and path in ```persistent_volumes.yaml``` before applying the command below. 60 | For example: 61 | ``` 62 | nfs: 63 | server: 10.20.1.2 # nfs server 64 | path: /my_nfs # nfs path 65 | ``` 66 | 67 | ``` 68 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/persistent_volumes.yaml 69 | 70 | ``` 71 | 72 | * Check that the persistent volumes are created: 73 | 74 | ``` 75 | $ oc get pv 76 | NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS 77 | chatqna-megaservice-pv-0 100Mi RWO Retain Available 78 | chatqna-megaservice-pv-1 100Mi RWO Retain Available 79 | chatqna-megaservice-pv-2 100Mi RWO Retain Available 80 | 81 | ``` 82 | ### Building OPEA ChatQnA MegaService Container Image 83 | ``` 84 | create_megaservice_container.sh 85 | ``` 86 | 87 | ### Deploy Redis Vector Database Service 88 | ``` 89 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/redis_deployment_service.yaml 90 | 91 | ``` 92 | 93 | Check that the pod and service are running: 94 | 95 | ``` 96 | $ oc get pods 97 | NAME READY STATUS RESTARTS AGE 98 | redis-vector-db-6b5747bf7-sl8fr 1/1 Running 0 21s 99 | ``` 100 | 101 | ``` 102 | $ oc get svc 103 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 104 | redis-vector-db ClusterIP 1.2.3.4 6379/TCP,8001/TCP 43s 105 | ``` 106 | 107 | ### Deploy ChatQnA MegaService 108 | 109 | Update the inference endpoint in the ```chatqna_megaservice_deployment.yaml``` file, as shown in the sketch below.
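The endpoint is consumed through the `TGI_ENDPOINT` environment variable of the megaservice container; replace the placeholder address with the inference endpoint created during model serving:

```
# Excerpt from chatqna_megaservice_deployment.yaml -- the address shown is a placeholder
env:
  - name: TGI_ENDPOINT
    value: http://xxx.xxx.xxx.xxx:xxx   # set to your model server inference endpoint
```

Then apply the deployment: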
110 | 111 | ``` 112 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/workloads/opea/chatqna/chatqna_megaservice_deployment.yaml 113 | ``` 114 | 115 | Check that the pod and service are running: 116 | 117 | ``` 118 | $ oc get pods 119 | NAME READY STATUS RESTARTS AGE 120 | chatqna-megaservice-54487649b5-sgsh2 1/1 Running 0 95s 121 | ``` 122 | 123 | ``` 124 | $ oc get svc 125 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 126 | chatqna-megaservice ClusterIP 1.2.3.4 8000/TCP 99s 127 | ``` 128 | 129 | ### Verify the Megaservice 130 | Use the command below: 131 | 132 | ``` 133 | curl /v1/rag/chat_stream \ 134 | -X POST \ 135 | -d '{"query":"What is a constellation?"}' \ 136 | -H 'Content-Type: application/json' 137 | 138 | ``` -------------------------------------------------------------------------------- /kmmo/README.md: -------------------------------------------------------------------------------- 1 | # Setting up Out of Tree Drivers 2 | 3 | ## Introduction 4 | The [Kernel Module Management (KMM) operator](https://github.com/rh-ecosystem-edge/kernel-module-management) manages the deployment and lifecycle of out-of-tree kernel modules on RHOCP. 5 | 6 | In this release, the KMM operator is used to manage and deploy the Intel® Data Center GPU driver container image on the RHOCP cluster. 7 | 8 | Intel Data Center GPU driver container images are released from the [Intel Data Center GPU Driver for OpenShift Project](https://github.com/intel/intel-data-center-gpu-driver-for-openshift/tree/main/release#intel-data-center-gpu-driver-container-images-for-openshift-release). 9 | 10 | ## KMM operator working mode 11 | - **Pre-build mode** - This is the default and recommended mode. The KMM Operator uses [this pre-built and certified Intel Data Center GPU driver container image](https://catalog.redhat.com/software/containers/intel/intel-data-center-gpu-driver-container/6495ee55c8b2461e35fb8264), which is published on the Red Hat Ecosystem Catalog, to provision Intel Data Center GPUs on a RHOCP cluster. 12 | - **On-premises build mode** - Users can optionally build and deploy their own driver container images on-premises through the KMM operator. 13 | 14 | ## Prerequisites 15 | - Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster). 16 | - Set up node feature discovery. Follow steps [here](/nfd/README.md). 17 | 18 | ## Install KMM operator 19 | Follow the installation guide below to install the KMM operator via CLI or web console. 20 | - [Install from CLI](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/kernel-module-management-operator#kmm-install-using-cli_kernel-module-management-operator) 21 | - [Install from web console](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/specialized_hardware_and_driver_enablement/kernel-module-management-operator#kmm-install-using-web-console_kernel-module-management-operator) 22 | 23 | ## Canary deployment with KMM 24 | Canary deployment is enabled by default to deploy the driver container image only on specific node(s) to ensure the initial deployment succeeds prior to rollout to all the eligible nodes in the cluster. This safety mechanism can reduce risk and prevent a deployment from adversely affecting the entire cluster. 25 | 26 | ## Set alternative firmware path at runtime with KMM 27 | **NOTE**: This update is required only when using KMM version v2.1.1 or lower.
Starting with v2.2.0, it is not required. 28 | 29 | Follow the steps below to set the alternative firmware path at runtime. 30 | 31 | 1. Update the KMM operator `ConfigMap` to set `worker.firmwareHostPath` to `/var/lib/firmware`: 32 | 33 | ``` 34 | $ oc patch configmap kmm-operator-manager-config -n openshift-kmm --type='json' -p='[{"op": "add", "path": "/data/controller_config.yaml", "value": "healthProbeBindAddress: :8081\nmetricsBindAddress: 127.0.0.1:8080\nleaderElection:\n enabled: true\n resourceID: kmm.sigs.x-k8s.io\nwebhook:\n disableHTTP2: true\n port: 9443\nworker:\n runAsUser: 0\n seLinuxType: spc_t\n firmwareHostPath: /var/lib/firmware"}]' 35 | ``` 36 | 37 | 2. Delete the KMM operator controller pod for the `ConfigMap` changes to take effect. 38 | ``` 39 | $ oc get pods -n openshift-kmm | grep -i "kmm-operator-controller-" | awk '{print $1}' | xargs oc delete pod -n openshift-kmm 40 | ``` 41 | 42 | For more details, see this [link](https://openshift-kmm.netlify.app/documentation/firmwares/#setting-the-kernels-firmware-search-path). 43 | 44 | ## Deploy Intel Data Center GPU Driver with pre-build mode 45 | Follow the steps below to deploy the driver container image with pre-build mode. 46 | 1. Find all nodes with an Intel Data Center GPU card using the following command: 47 | ``` 48 | $ oc get nodes -l intel.feature.node.kubernetes.io/gpu=true 49 | ``` 50 | Example output: 51 | ``` 52 | NAME STATUS ROLES AGE VERSION 53 | icx-dgpu-1 Ready worker 30d v1.25.4+18eadca 54 | ``` 55 | 56 | 2. Label the node(s) in the cluster using the command shown below for the initial canary deployment. 57 | ``` 58 | $ oc label node <node_name> intel.feature.node.kubernetes.io/dgpu-canary=true 59 | ``` 60 | 61 | 3. Use pre-build mode to deploy the driver container. 62 | ``` 63 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/kmmo/intel-dgpu.yaml 64 | ``` 65 | 66 | 4. After the driver is verified on the cluster through the canary deployment, simply remove the line shown below from the [`intel-dgpu.yaml`](/kmmo/intel-dgpu.yaml) file and reapply the yaml file to deploy the driver to the entire cluster. As a cluster administrator, you can also select another deployment policy. 67 | ``` 68 | intel.feature.node.kubernetes.io/dgpu-canary: 'true' 69 | ``` 70 | 71 | ## Verification 72 | To verify that the drivers have been loaded, follow the steps below: 73 | 1. List the nodes labeled with `kmm.node.kubernetes.io/openshift-kmm.intel-dgpu.ready` using the command shown below: 74 | ``` 75 | $ oc get nodes -l kmm.node.kubernetes.io/openshift-kmm.intel-dgpu.ready 76 | ``` 77 | Example output: 78 | ``` 79 | NAME STATUS ROLES AGE VERSION 80 | icx-dgpu-1 Ready worker 30d v1.25.4+18eadca 81 | ``` 82 | The label shown above indicates that the KMM operator has successfully deployed the drivers and firmware on the node. 83 | 84 | 2. If you want to further debug the driver on the node, follow these steps: 85 | a. Navigate to the web console (Compute -> Nodes -> Select a node that has the GPU card -> Terminal). 86 | b.
Run the commands shown below in the web console terminal: 87 | ``` 88 | $ chroot /host 89 | $ lsmod | grep i915 90 | ``` 91 | Ensure `i915` and `intel_vsec` are loaded in the kernel, as shown in the output below: 92 | ``` 93 | i915 3633152 0 94 | i915_compat 16384 1 i915 95 | intel_vsec 16384 1 i915 96 | intel_gtt 20480 1 i915 97 | video 49152 1 i915 98 | i2c_algo_bit 16384 1 i915 99 | drm_kms_helper 290816 1 i915 100 | drm 589824 3 drm_kms_helper,i915 101 | dmabuf 77824 4 drm_kms_helper,i915,i915_compat,dr 102 | ``` 103 | c. Run `dmesg` to ensure there are no errors in the kernel message log. 104 | 105 | ## See Also 106 | -------------------------------------------------------------------------------- /e2e/inference/README.md: -------------------------------------------------------------------------------- 1 | # Intel AI Inference End-to-End Solution 2 | 3 | ## Overview 4 | The Intel AI inference end-to-end solution with RHOCP is based on Intel® Data Center GPU Flex Series provisioning, Intel® OpenVINO™, and [Red Hat OpenShift AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai) (RHOAI) on RHOCP. There are two AI inference modes verified with Intel® Xeon® processors and Intel Data Center GPU Flex Series with RHOCP. 5 | * Interactive mode – RHOAI provides OpenVINO-based Jupyter Notebooks for users to interactively debug the inference applications or [optimize the models](https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html) on RHOCP using data center GPU cards or Intel Xeon processors. 6 | * Deployment mode – [OpenVINO Model Server](https://github.com/openvinotoolkit/model_server) (OVMS) can be used to deploy the inference workloads in data center and edge computing environments on RHOCP. 7 | 8 | ## Prerequisites 9 | * Provisioned RHOCP cluster. Follow steps [here](/README.md#provisioning-rhocp-cluster) 10 | * Provisioning Intel Data Center GPU Flex Series. Follow steps [here](/README.md#provisioning-intel-hardware-features-on-rhocp) 11 | * Set up node feature discovery (NFD). Follow the steps [here](/nfd/README.md) 12 | * Set up out-of-tree drivers for Intel GPU provisioning. Follow the steps [here](/kmmo/README.md) 13 | * Set up Intel device plugins operator and create Intel GPU device plugin. Follow the steps [here](/device_plugins/README.md) 14 | 15 | ## Install RHOAI 16 | The Red Hat certified RHOAI operator is published at the [Red Hat Ecosystem Catalog](https://catalog.redhat.com/software/container-stacks/detail/63b85b573112fe5a95ee9a3a). You can use the command line interface (CLI) or web console to install it. 17 | ### Install using CLI (To be added) 18 | ### Install using Web Console 19 | 1. On the RHOCP web console, click Operators → OperatorHub. 20 | 2. Search for the Red Hat OpenShift AI Operator and click Install. The operator is installed in the namespace `redhat-ods-operator`. 21 | ### Verification 22 | 1. Navigate to the Operators → Installed Operators page. 23 | 2. Ensure that in the redhat-ods-operator namespace, the Red Hat OpenShift AI status is InstallSucceeded. 24 | 3. Click on `Search` -> `Routes` -> `rhods-dashboard` from the web console and access the RHOAI UI link. 25 | **Note:** When installing the operator, the default `kfdef` Custom Resource (CR) is created. This CR enables the dashboard for users to browse and launch Jupyter Notebooks projects on an RHOCP cluster. Please refer to this [link](https://github.com/red-hat-data-services/odh-deployer) for more details about `kfdef`.
26 | ## Install OpenVINO Operator 27 | The OpenVINO operator is published at the [Red Hat Ecosystem Catalog](https://catalog.redhat.com/software/container-stacks/detail/60649a56209af65d24b7ca9e). You can use the CLI or web console to install it. 28 | ### Install using CLI (To be added) 29 | ### Install using Web Console 30 | Follow this [link](https://github.com/openvinotoolkit/operator/blob/v1.1.0/docs/operator_installation.md#operator-instalation) to install the operator via the web console. 31 | 32 | ## Work with Interactive Mode 33 | To enable the interactive mode, the OpenVINO notebook CR needs to be created and integrated with RHOAI. 34 | 1. Click on the `create Notebook` option from the web console and follow these [steps](https://github.com/openvinotoolkit/operator/blob/main/docs/notebook_in_rhods.md#integration-with-openshift-data-science-and-open-data-hub) to create the notebook CR. 35 | 2. Enable Intel Data Center GPU on the RHOAI Dashboard - **Technical Preview feature** 36 | 37 | Create an `AcceleratorProfile` in the `redhat-ods-applications` namespace: 38 | ``` 39 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/e2e/inference/accelerator_profile_flex140.yaml 40 | ``` 41 | 42 | 3. Navigate to the `openvino-notebooks` ImageStream and add the above created `AcceleratorProfile` key to the annotation field, as shown in the image below: 43 | 44 | ![Alt text](/docs/images/openvino-accelerator-field.png) 45 | 46 | 4. Navigate to `Search` -> `Networking` -> `Routes` from the web console and access the `rhods-dashboard` route in the `redhat-ods-applications` namespace, as in the image below. Click on the location link to launch the RHOAI dashboard. 47 | 48 | ![Alt text](/docs/images/rhods-dashboard-route.png) 49 | 50 | 5. If step 2 is successful, ```Intel® Data Center GPU Flex Series 140``` is shown in the accelerator dropdown menu in ```rhods-dashboard```. Users can run the OpenVINO notebook image with the Intel® Data Center GPU Flex Series 140 card. 51 | 52 | ![Alt text](/docs/images/accelerator-profile-dropdown.png) 53 | 54 | Follow the [link](https://github.com/openvinotoolkit/operator/blob/main/docs/notebook_in_rhods.md#integration-with-openshift-data-science-and-open-data-hub) for more details on the available Jupyter Notebooks. 55 | 56 | ## Work with Deployment Mode 57 | 1. From the web console, click on the ModelServer option in this [link](https://github.com/openvinotoolkit/operator/blob/v1.1.0/docs/operator_installation.md#operator-instalation) and follow the [steps](https://github.com/openvinotoolkit/operator/blob/v1.1.0/docs/modelserver.md#managing-model-servers-via-operator) to start the OVMS instance. 58 | 2. To enable the Intel Data Center GPU, make sure to modify the OVMS instance options according to the screenshot below. 59 | 60 | * The images below show the `gpu.intel.com/i915` resource requests and limits for OVMS: 61 | 62 | ![Alt text](/docs/images/Ovms-Gpu-resource-limit.png) 63 | 64 | ![Alt text](/docs/images/Ovms-Gpu-resource-request.png) 65 | 66 | # Enable Intel Gaudi AI Accelerator with RHOAI 67 | 68 | * From web console: 69 | 70 | To enable and use the Intel Gaudi accelerator on the RHOAI web console, follow the [documentation](https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.11/html/working_with_accelerators/intel-gaudi-ai-accelerator-integration_accelerators#enabling-intel-gaudi-ai-accelerators_accelerators).
71 | * From CLI: 72 | 73 | Deploy the ```accelerator_profile_gaudi.yaml``` in the redhat-ods-applications namespace. 74 | 75 | ``` 76 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/e2e/inference/accelerator_profile_gaudi.yaml 77 | ``` 78 | ## Intel Gaudi AI Accelerator with Intel® Gaudi AI Software Tools Containers on OpenShift AI 79 | To use the Intel Gaudi AI Accelerator with Intel® Gaudi AI Software Tools Containers on OpenShift AI, follow the [documentation](https://github.com/intel/ai-containers/blob/main/enterprise/redhat/openshift-ai/gaudi/README.md). 80 | 81 | ## See Also 82 | [GPU accelerated demo with OpenVINO](https://www.youtube.com/watch?v=3fTz_k4JT2A) -------------------------------------------------------------------------------- /one_click/gaudi_provisioning_playbook.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | - hosts: localhost 4 | vars: 5 | kubeconfig_path: "~/.kube/config" 6 | environment: 7 | KUBECONFIG: "{{ kubeconfig_path }}" 8 | tasks: 9 | - name: Install Dependencies 10 | tags: 11 | - install_dependencies 12 | block: 13 | - name: NFD - Install Node Feature Discovery Operator 14 | tags: 15 | - nfd 16 | block: 17 | - name: NFD - Create openshift-nfd namespace 18 | k8s: 19 | name: openshift-nfd 20 | api_version: v1 21 | kind: Namespace 22 | state: present 23 | wait: yes 24 | - name: NFD - Create an nfd-operator group v1 25 | k8s: 26 | definition: 27 | apiVersion: operators.coreos.com/v1 28 | kind: OperatorGroup 29 | metadata: 30 | generateName: openshift-nfd- 31 | name: openshift-nfd 32 | namespace: openshift-nfd 33 | spec: 34 | targetNamespaces: 35 | - openshift-nfd 36 | wait: yes 37 | - name: NFD - Create subscription for RH NFD operator 38 | k8s: 39 | definition: 40 | apiVersion: operators.coreos.com/v1alpha1 41 | kind: Subscription 42 | metadata: 43 | name: nfd 44 | namespace: openshift-nfd 45 | spec: 46 | channel: "stable" 47 | installPlanApproval: Automatic 48 | name: nfd 49 | source: redhat-operators 50 | sourceNamespace: openshift-marketplace 51 | wait: yes 52 | wait_condition: 53 | reason: AllCatalogSourcesHealthy 54 | type: CatalogSourcesUnhealthy 55 | status: 'False' 56 | - name: NFD - Wait until the nfd-operator-controller Deployment is available 57 | k8s_info: 58 | kind: Deployment 59 | wait: yes 60 | name: nfd-controller-manager 61 | namespace: openshift-nfd 62 | wait_condition: 63 | type: Available 64 | status: 'True' 65 | reason: MinimumReplicasAvailable 66 | - name: KMM - Install Kernel Module Management Operator 67 | tags: 68 | - kmm 69 | block: 70 | - name: KMM - Create openshift-kmm namespace 71 | k8s: 72 | name: openshift-kmm 73 | api_version: v1 74 | kind: Namespace 75 | state: present 76 | wait: yes 77 | - name: KMM - Create OperatorGroup v1 in openshift-kmm namespace 78 | k8s: 79 | definition: 80 | apiVersion: operators.coreos.com/v1 81 | kind: OperatorGroup 82 | metadata: 83 | name: kernel-module-management 84 | namespace: openshift-kmm 85 | wait: yes 86 | - name: KMM - Create Subscription for KMM Operator 87 | k8s: 88 | definition: 89 | apiVersion: operators.coreos.com/v1alpha1 90 | kind: Subscription 91 | metadata: 92 | name: kernel-module-management 93 | namespace: openshift-kmm 94 | spec: 95 | channel: stable 96 | installPlanApproval: Automatic 97 | name: kernel-module-management 98 | source: redhat-operators 99 | sourceNamespace: openshift-marketplace 100 |
wait: yes 101 | wait_condition: 102 | reason: AllCatalogSourcesHealthy 103 | type: CatalogSourcesUnhealthy 104 | status: 'False' 105 | - name: KMM - Wait until the kmm-operator-controller Deployment is available 106 | k8s_info: 107 | kind: Deployment 108 | wait: yes 109 | name: kmm-operator-controller 110 | namespace: openshift-kmm 111 | wait_condition: 112 | type: Available 113 | status: 'True' 114 | reason: MinimumReplicasAvailable 115 | - name: Install Intel Gaudi Base Operator 116 | tags: 117 | - intel-gaudi 118 | block: 119 | - name: Install Intel Gaudi Base Operator 120 | k8s: 121 | state: present 122 | src: "../gaudi/gaudi_install_operator.yaml" 123 | wait: yes 124 | - name: Wait until the Intel Gaudi controller-manager Deployment is available 125 | k8s_info: 126 | kind: Deployment 127 | wait: yes 128 | name: controller-manager 129 | namespace: habana-ai-operator 130 | wait_condition: 131 | type: Available 132 | status: 'True' 133 | reason: MinimumReplicasAvailable 134 | - name: NFD - Install NFD CRs 135 | block: 136 | - name: NFD - Create NFD discovery instance for Intel Gaudi 137 | k8s: 138 | state: present 139 | src: "../gaudi/gaudi_nfd_instance_openshift.yaml" 140 | wait: yes 141 | - name: Install Intel Gaudi DeviceConfig CR 142 | block: 143 | - name: Create Intel Gaudi DeviceConfig 144 | k8s: 145 | state: present 146 | src: "../gaudi/gaudi_device_config.yaml" 147 | wait: yes 148 | - name: Verify Intel Gaudi Resources 149 | tags: 150 | - gaudi_resource_test 151 | block: 152 | - name: Get Gaudi Node Resource Information 153 | kubernetes.core.k8s_info: 154 | api: v1 155 | kind: Node 156 | label_selectors: 157 | - "kmm.node.kubernetes.io/habana-ai-operator.intel-gaudi-module.device-plugin-ready=" 158 | - "kmm.node.kubernetes.io/habana-ai-operator.intel-gaudi-module.ready=" 159 | wait: yes 160 | wait_timeout: 120 161 | register: cluster_nodes_info 162 | until: 163 | - cluster_nodes_info.resources is defined 164 | - name: Print cluster resources 165 | debug: 166 | msg: 167 | - "Please verify the Capacity and Allocatable Habana Gaudi resources on the node:" 168 | - "Capacity: { 169 | 'habana.ai/gaudi': {{ cluster_nodes_info.resources[0].status.capacity['habana.ai/gaudi'] }} }" 170 | - "Allocatable: { 171 | 'habana.ai/gaudi': {{ cluster_nodes_info.resources[0].status.allocatable['habana.ai/gaudi'] }} }" -------------------------------------------------------------------------------- /device_plugins/deploy_qat.md: -------------------------------------------------------------------------------- 1 | # Create Intel QAT Device Plugin CR 2 | 3 | ## Create a CR via web console 4 | 1. Go to **Operators** -> **Installed Operators**. 5 | 2. Open **Intel Device Plugins Operator**. 6 | 3. Navigate to the **Intel QuickAssist Technology Device Plugin** tab. 7 | 4. Click **Create QatDevicePlugin** -> set the correct parameters -> click **Create**. 8 | 5. Optional: If you want to make any customizations, select the YAML view and edit the details. When you are done, click **Create**. 9 | 10 | ## Verify via web console 11 | 1. Verify the CR by checking the status of **Workloads** -> **DaemonSet** -> **intel-qat-plugin**. 12 | 2. The `QatDevicePlugin` is now created.
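For reference, here is a minimal sketch of what the created CR can look like. This is an illustration only, assuming the `deviceplugin.intel.com/v1` API of the Intel Device Plugins Operator; the manifest actually verified on RHOCP is `device_plugins/qat_device_plugin.yaml` in this repository (applied in the CLI steps below), and the `spec` fields mirror the plugin flags described in the configuration section at the end of this page.
```
# Minimal QatDevicePlugin sketch (assumed deviceplugin.intel.com/v1 API);
# see device_plugins/qat_device_plugin.yaml for the verified manifest.
apiVersion: deviceplugin.intel.com/v1
kind: QatDevicePlugin
metadata:
  name: qatdeviceplugin-sample
spec:
  dpdkDriver: vfio-pci    # mirrors the -dpdk-driver flag
  kernelVfDrivers:
    - 4xxxvf              # mirrors the -kernel-vf-drivers flag
  maxNumDevices: 128      # mirrors the -max-num-devices flag
  nodeSelector:
    intel.feature.node.kubernetes.io/qat: "true"
```
The `nodeSelector` restricts the plugin DaemonSet to QAT-capable nodes labeled by NFD; it is the same selector that appears in the `oc get QatDevicePlugin` output shown below.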
13 | 14 | ## Create CR via CLI 15 | Apply the CR YAML file: 16 | ``` 17 | $ oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/device_plugins/qat_device_plugin.yaml 18 | ``` 19 | 20 | ## Verify via CLI 21 | Verify that the device plugin CR is ready: 22 | ``` 23 | $ oc get QatDevicePlugin 24 | ``` 25 | Output: 26 | ``` 27 | NAME DESIRED READY NODE SELECTOR AGE 28 | qatdeviceplugin-sample 1 1 {"intel.feature.node.kubernetes.io/qat":"true"} 3h27m 29 | ``` 30 | 31 | # Verify QAT Device Plugin 32 | After the plugin is deployed, use the command below to verify the QAT resources: 33 | ``` 34 | $ oc describe node | grep qat.intel.com 35 | qat.intel.com/cy: 32 36 | qat.intel.com/cy: 32 37 | qat.intel.com/dc: 32 38 | qat.intel.com/dc: 32 39 | ``` 40 | **Note**: By default, the device plugin registers half of the resources as `qat.intel.com/cy` and the other half as `qat.intel.com/dc`. For more details about the QAT resource configuration, refer to the QAT Device Plugin Configuration section below. 41 | 42 | # QAT Device Plugin Configuration 43 | > **Note**: The QAT device plugin can be configured with command-line flags. In this release, only the configurations in the table below are verified and supported on RHOCP. 44 | 45 | For more details about the QAT device plugin configuration flags, see [Modes and Configuration Options](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/qat_plugin/README.md#modes-and-configuration-options). 46 | 47 | | Flag | Configuration | Description | 48 | | ---- | ---- | ---- | 49 | | `-dpdk-driver` | vfio-pci | Uses the vfio-pci driver to manage the QAT VFIO devices. See details [here](https://doc.dpdk.org/guides/linux_gsg/linux_drivers.html) | 50 | | `-kernel-vf-drivers` | 4xxxvf | Supports the 4xxx QAT device.
**Note**: Verified on 4th Gen Intel® Xeon® Scalable processors. See details [here](https://github.com/intel/qatlib/blob/main/INSTALL#L72) | 51 | | `-max-num-devices` | 128 | The maximum number of VF devices supported for the 4xxx QAT device. If the configured number exceeds the maximum that the QAT device supports, the device maximum is used. | 52 | | `-provisioning-config` | Name of a ConfigMap | See the [QAT Resource Configuration](/device_plugins/deploy_qat.md#qat-resource-configuration-experimental) section | 53 | 54 | ## QAT Resource Configuration (experimental) 55 | 56 | **NOTE**: In this release, this is an experimental feature. The efforts to [enhance this feature](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/1529) and [make it more stable](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/1542) are ongoing. 57 | 58 | Users can follow the steps below to customize the QAT resource configuration: 59 | 1. Create the ConfigMap for the QAT resource configuration: 60 | ``` 61 | $ oc create configmap --namespace=openshift-operators --from-literal "qat.conf=ServicesEnabled=