├── LICENSE ├── README.md ├── dockerfiles ├── README.md ├── base │ ├── Dockerfile.rhel8.6 │ ├── Dockerfile.rhel9.2 │ ├── Dockerfile.rhel9.4 │ ├── Dockerfile.suse15.5 │ ├── Dockerfile.tencentos3.1 │ ├── Dockerfile.ubuntu22.04 │ ├── Dockerfile.ubuntu24.04 │ ├── LICENSE │ ├── Makefile │ ├── install-python310.sh │ ├── install_efa.sh │ └── tencentos_efa_patch.txt ├── common.mk ├── pytorch │ ├── Dockerfile.rhel8.6 │ ├── Dockerfile.rhel9.2 │ ├── Dockerfile.rhel9.4 │ ├── Dockerfile.suse15.5 │ ├── Dockerfile.tencentos3.1 │ ├── Dockerfile.ubuntu │ ├── Makefile │ └── install_packages.sh ├── triton │ ├── Dockerfile │ └── Makefile └── triton_vllm_backend │ ├── Dockerfile │ ├── Makefile │ ├── model.py │ └── samples │ ├── client.py │ ├── model_repository │ └── vllm_model │ │ ├── 1 │ │ └── model.json │ │ └── config.pbtxt │ ├── prompts.txt │ └── test_models │ ├── llama70b_8x │ ├── 1 │ │ └── model.json │ └── config.pbtxt │ ├── llama7b_1x │ ├── 1 │ │ └── model.json │ └── config.pbtxt │ └── qwen_7b_chat │ ├── 1 │ └── model.json │ └── config.pbtxt ├── legal-disclaimer.md └── utils ├── README.md ├── check_framework_env.py └── intel_gaudi_health_screen ├── .gitignore ├── HealthReport.py ├── IGNodes.py ├── README.md ├── config.yaml ├── hccl_demo_helper.py ├── hostfile ├── run_ighs.sh ├── screen.py ├── system_utils.py ├── template ├── bare-metal │ ├── dockerfile │ ├── intel-gaudi-docker-compose-L1.yaml │ ├── intel-gaudi-docker-compose-L2-launcher.yaml │ ├── intel-gaudi-docker-compose-L2-worker.yaml │ └── run_hccl_demo.sh └── k8s │ ├── intel-gaudi-health-screen-L1.yaml │ └── intel-gaudi-health-screen-L2_hccl-demo.yaml ├── utilities.py └── version.txt /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel® Gaudi® Accelerator Setup and Installation 2 | 3 |
4 | 5 | --- 6 | 7 |
8 | 9 | By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). 10 | 11 |
12 | 13 | --- 14 | 15 |
16 | 17 | ## Overview 18 | 19 | Welcome to the Setup and Installation GitHub repository! 20 | 21 | The full installation documentation has been consolidated into the Installation Guide in the Intel Gaudi documentation. Please refer to the [Intel Gaudi docs](https://docs.habana.ai/en/latest/Installation_Guide/GAUDI_Installation_Guide.html) for the full installation guide. 22 | 23 | This repository contains the following references: 24 | - dockerfiles -- Reference Dockerfiles and build scripts for building Gaudi Docker images 25 | - utils -- Reference utility scripts 26 | -------------------------------------------------------------------------------- /dockerfiles/README.md: -------------------------------------------------------------------------------- 1 | # Gaudi Docker Images Builder 2 | 3 | ## Table of Contents 4 | - [Overview](#overview) 5 | - [Build Docker](#build-docker) 6 | 7 | 8 |
9 | 10 | --- 11 | 12 |
13 | 14 | ## Overview 15 | 16 | This folder contains the Dockerfiles and Makefiles used to build Habana Labs Docker images for Gaudi. 17 | 18 |
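Roughly speaking, `make build` stages the Dockerfile for the selected OS together with its helper scripts into a build directory and then drives a standard `docker build` (the shared rules come from `common.mk`). A rough hand-rolled equivalent for the Ubuntu 22.04 base image is sketched below; the build-arg names come straight from the Dockerfiles in `base/`, while the artifactory host, version, revision, and image tag are placeholder values to adjust for your release:

```bash
# Sketch only: the artifactory host, VERSION/REVISION, and image tag are placeholders for your Habana release.
docker build \
  --build-arg ARTIFACTORY_URL=vault.habana.ai \
  --build-arg VERSION=1.19.0 \
  --build-arg REVISION=561 \
  -f base/Dockerfile.ubuntu22.04 \
  -t base-installer-ubuntu22.04:local \
  base/
```

The Makefile flow described in the next section performs this staging for you and exposes the same knobs as `make` variables.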
19 | 20 | --- 21 | 22 |
23 | 24 | ## Build Docker 25 | 26 | The Makefiles in this folder can be used as a reference for building Docker images for Gaudi. 27 | 28 | ### How to Build Docker Images from Habana Dockerfiles 29 | 30 | 1. Go into the folder of the image type you would like to build: 31 | * base 32 | * pytorch 33 | * triton 34 | 35 | 2. Run the build command to generate the Docker image: 36 | ``` 37 | make build 38 | ``` 39 | Examples: 40 | #### Build pytorch image for rhel9.2: 41 | ``` 42 | cd pytorch 43 | make build BUILD_OS=rhel9.2 44 | ``` 45 | 46 | #### Build triton image (default OS - ubuntu22.04): 47 | ``` 48 | cd triton 49 | make build 50 | ``` 51 | 52 | #### Build triton vllm backend (default OS - ubuntu22.04): 53 | ``` 54 | cd triton_vllm_backend 55 | make build BUILD_OS=ubuntu22.04 56 | ``` 57 | 58 | 3. Build command variables 59 | 60 | #### Optional Parameters 61 | * BUILD_OS - set the OS to build (default ubuntu22.04) 62 | * BUILD_DIR - the folder where the build is executed from (default dockerbuild in the image folder) 63 | * VERBOSE - set to TRUE to echo the commands (default FALSE) 64 | * DOCKER_CACHE - set to TRUE to use the Docker cache when building the image (default FALSE) 65 | 66 | 4. Instructions for the triton_vllm_backend server 67 | 68 | * Run the backend container as described in the [Habana docs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Triton_Inference.html?highlight=triton%20inference#run-the-backend-container) 69 | * Start the Triton server: 70 | ```bash 71 | tritonserver --model-repository samples/model_repository 72 | ``` 73 | The current samples/model_repository/vllm_model contains a llama2 7B 1x configuration. Sample model files for llama2 7B/70B and qwen2 7B are also provided under the samples/test_models folder. To use one of them, copy its model.json and config.pbtxt into the vllm_model folder structure. 74 | * To test with a client, follow the instructions [here](https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#sending-your-first-inference) 75 | 76 | -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel8.6: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 8.6 6 | FROM registry.access.redhat.com/ubi8/ubi:8.6 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | LABEL vendor="Habanalabs Ltd."
12 | LABEL release="${VERSION}-${REVISION}" 13 | 14 | COPY LICENSE /licenses/ 15 | 16 | RUN dnf install -y \ 17 | python3-dnf-plugin-versionlock && \ 18 | dnf versionlock add redhat-release* && \ 19 | dnf clean all 20 | 21 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 22 | dnf clean all 23 | 24 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 25 | echo "name=CentOS Linux 8 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 26 | echo "baseurl=https://vault.centos.org/8-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 27 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 28 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 29 | 30 | RUN dnf update -y && dnf install -y \ 31 | bzip2 \ 32 | bzip2-devel \ 33 | clang \ 34 | cmake3 \ 35 | cpp \ 36 | gcc \ 37 | gcc-c++ \ 38 | git \ 39 | glibc \ 40 | glibc-devel \ 41 | glibc-headers \ 42 | iproute \ 43 | jemalloc \ 44 | libarchive \ 45 | libjpeg-devel \ 46 | libksba \ 47 | llvm \ 48 | lsof \ 49 | mesa-libGL \ 50 | openssh-clients \ 51 | openssh-server \ 52 | python3.11-devel \ 53 | python3.11-pip \ 54 | redhat-lsb-core \ 55 | unzip \ 56 | wget && \ 57 | dnf clean all && \ 58 | rm -f /etc/ssh/ssh_host_*_key* 59 | 60 | # CVE-2023-47038 RHSA-2024:3128 61 | RUN dnf module reset -y perl && \ 62 | dnf module enable -y perl:5.32 && \ 63 | dnf module install -y --allowerasing perl:5.32 && \ 64 | dnf clean all 65 | 66 | RUN echo "[appstream]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 67 | echo "name=CentOS Linux 8 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 68 | echo "baseurl=https://vault.centos.org/8-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 69 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 70 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 71 | 72 | COPY install_efa.sh . 
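# Note (editorial comment, not part of the upstream Dockerfile): install_efa.sh is expected to
# install the AWS EFA user-space stack (libfabric and related libraries) used for scale-out
# networking. The EFA-provided ld.so.conf.d/profile.d hooks are removed right after the install;
# the library and binary paths this image actually needs are set explicitly via the ENV lines below.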
73 | RUN ./install_efa.sh && rm install_efa.sh && rm -f /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 74 | ENV OPENMPI_VERSION=4.1.6 75 | ENV MPI_ROOT=/opt/habanalabs/openmpi 76 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 77 | ENV PATH=${MPI_ROOT}/bin:$PATH 78 | ENV OPAL_PREFIX=${MPI_ROOT} 79 | ENV MPICC=${MPI_ROOT}/bin/mpicc 80 | ENV RDMAV_FORK_SAFE=1 81 | ENV FI_EFA_USE_DEVICE_RDMA=0 82 | ENV OMPI_MCA_btl=^openib 83 | 84 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 85 | echo "name=Habana RH8 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 86 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6" >> /etc/yum.repos.d/habanalabs.repo && \ 87 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo 88 | 89 | RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \ 90 | echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \ 91 | echo "baseurl=https://vault.centos.org/8-stream/PowerTools/x86_64/os/" >> /etc/yum.repos.d/powertools.repo && \ 92 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/powertools.repo && \ 93 | echo "gpgcheck=1" >> /etc/yum.repos.d/powertools.repo 94 | 95 | ENV PYTHON_VERSION=3.11 96 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 97 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 98 | alternatives --set python3 /usr/bin/python3.11 99 | 100 | RUN dnf install -y \ 101 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el8 \ 102 | habanalabs-thunk-"$VERSION"-"$REVISION".el8 \ 103 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el8 \ 104 | habanalabs-graph-"$VERSION"-"$REVISION".el8 && \ 105 | dnf clean all && \ 106 | rm -f /etc/yum.repos.d/habanalabs.repo 107 | 108 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 109 | ENV PIP_NO_CACHE_DIR=on 110 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 111 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 112 | 113 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 114 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 115 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 116 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 117 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 118 | 119 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 120 | 121 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 122 | 123 | # SSH configuration necessary to support mpi-operator v2 124 | RUN mkdir -p /var/run/sshd && \ 125 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 126 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 127 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 128 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 129 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 130 | 131 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 132 | ENV HABANA_LOGS=/var/log/habana_logs/ 133 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 134 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel9.2: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 9.2 6 | FROM registry.access.redhat.com/ubi9/ubi:9.2 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | LABEL vendor="Habanalabs Ltd." 12 | LABEL release="${VERSION}-${REVISION}" 13 | 14 | COPY LICENSE /licenses/ 15 | 16 | RUN dnf install -y \ 17 | python3-dnf-plugin-versionlock && \ 18 | dnf versionlock add redhat-release* && \ 19 | dnf clean all 20 | 21 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 22 | dnf clean all 23 | 24 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 25 | echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 26 | echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 27 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 28 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 29 | 30 | RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 31 | echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 32 | echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 33 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 34 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 35 | 36 | RUN dnf update -y && dnf install -y \ 37 | bzip2 \ 38 | bzip2-devel \ 39 | clang \ 40 | cmake3 \ 41 | cpp \ 42 | gcc \ 43 | gcc-c++ \ 44 | git \ 45 | glibc \ 46 | glibc-devel \ 47 | glibc-headers \ 48 | iproute \ 49 | jemalloc \ 50 | libarchive \ 51 | libffi-devel \ 52 | libjpeg-devel \ 53 | libksba \ 54 | llvm \ 55 | lsb_release \ 56 | lsof \ 57 | mesa-libGL \ 58 | openssh-clients \ 59 | openssh-server \ 60 | openssl \ 61 | openssl-devel \ 62 | python3-devel \ 63 | unzip \ 64 | wget \ 65 | zlib-devel && \ 66 | dnf clean all && \ 67 | rm -f /etc/ssh/ssh_host_*_key* 68 | 69 | ENV PYTHON_VERSION=3.10 70 | COPY install-python310.sh . 71 | RUN ./install-python310.sh rhel9.2 && rm install-python310.sh 72 | RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig 73 | ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 74 | 75 | COPY install_efa.sh . 
76 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 77 | 78 | ENV OPENMPI_VERSION=4.1.6 79 | ENV MPI_ROOT=/opt/habanalabs/openmpi 80 | ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 81 | ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH 82 | ENV OPAL_PREFIX=${MPI_ROOT} 83 | ENV MPICC=${MPI_ROOT}/bin/mpicc 84 | ENV RDMAV_FORK_SAFE=1 85 | ENV FI_EFA_USE_DEVICE_RDMA=0 86 | ENV OMPI_MCA_btl=^openib 87 | 88 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 89 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 90 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \ 91 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 92 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 93 | 94 | # for Habana GPG key with SHA-1 signature 95 | RUN update-crypto-policies --set DEFAULT:SHA1 96 | 97 | RUN dnf install -y \ 98 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 99 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 100 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 101 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 102 | dnf clean all && \ 103 | rm -f /etc/yum.repos.d/habanalabs.repo 104 | 105 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 106 | ENV PIP_NO_CACHE_DIR=on 107 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 108 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 109 | 110 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 111 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 112 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 113 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 114 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 115 | 116 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 117 | 118 | RUN ln -s /usr/bin/python3 /usr/bin/python 119 | 120 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 121 | 122 | # SSH configuration necessary to support mpi-operator v2 123 | RUN mkdir -p /var/run/sshd && \ 124 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 125 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 126 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 127 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 128 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 129 | 130 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 131 | ENV HABANA_LOGS=/var/log/habana_logs/ 132 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 133 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 9.4 6 | FROM registry.access.redhat.com/ubi9/ubi:9.4 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | LABEL vendor="Habanalabs Ltd." 
12 | LABEL release="${VERSION}-${REVISION}" 13 | 14 | COPY LICENSE /licenses/ 15 | 16 | RUN dnf install -y \ 17 | python3-dnf-plugin-versionlock && \ 18 | dnf versionlock add redhat-release* && \ 19 | dnf clean all 20 | 21 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 22 | dnf clean all 23 | 24 | RUN dnf update -y && dnf install -y \ 25 | openssl \ 26 | openssl-devel && \ 27 | dnf versionlock add openssl* openssl-devel* && \ 28 | dnf clean all 29 | 30 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 31 | echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 32 | echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 33 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 34 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 35 | 36 | RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 37 | echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 38 | echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 39 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 40 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 41 | 42 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 43 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 44 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 45 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 46 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 47 | 48 | RUN dnf install -y \ 49 | bzip2 \ 50 | bzip2-devel \ 51 | clang \ 52 | cmake3 \ 53 | cpp \ 54 | ffmpeg-free \ 55 | gcc \ 56 | gcc-c++ \ 57 | git \ 58 | glibc \ 59 | glibc-devel \ 60 | glibc-headers \ 61 | iproute \ 62 | jemalloc \ 63 | libarchive \ 64 | libffi-devel \ 65 | libjpeg-devel \ 66 | libksba \ 67 | llvm \ 68 | lsb_release \ 69 | lsof \ 70 | mesa-libGL \ 71 | openssh-clients \ 72 | openssh-server \ 73 | python3-devel \ 74 | python3.11 \ 75 | python3.11-devel \ 76 | python3.11-pip \ 77 | python3.11-rpm \ 78 | unzip \ 79 | wget \ 80 | zlib-devel && \ 81 | dnf versionlock add \ 82 | python3-rpm \ 83 | rpm* && \ 84 | dnf clean all && \ 85 | rm -f /etc/ssh/ssh_host_*_key* 86 | 87 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 88 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 89 | alternatives --set python3 /usr/bin/python3.11 && \ 90 | alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.11 2 && \ 91 | alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.9 1 && \ 92 | alternatives --set pip3 /usr/bin/pip3.11 93 | 94 | COPY install_efa.sh . 
95 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 96 | 97 | ENV OPENMPI_VERSION=4.1.6 98 | ENV MPI_ROOT=/opt/habanalabs/openmpi 99 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 100 | ENV PATH=${MPI_ROOT}/bin:$PATH 101 | ENV OPAL_PREFIX=${MPI_ROOT} 102 | ENV MPICC=${MPI_ROOT}/bin/mpicc 103 | ENV RDMAV_FORK_SAFE=1 104 | ENV FI_EFA_USE_DEVICE_RDMA=0 105 | ENV OMPI_MCA_btl=^openib 106 | 107 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 108 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 109 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ 110 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 111 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 112 | 113 | # for Habana GPG key with SHA-1 signature 114 | RUN update-crypto-policies --set DEFAULT:SHA1 115 | 116 | RUN dnf install -y \ 117 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 118 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 119 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 120 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 121 | dnf clean all && \ 122 | chmod +t /var/log/habana_logs && \ 123 | rm -f /etc/yum.repos.d/habanalabs.repo 124 | 125 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 126 | ENV PIP_NO_CACHE_DIR=on 127 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 128 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 129 | 130 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 131 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 132 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 133 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 134 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 135 | 136 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 137 | 138 | RUN ln -s /usr/bin/python3 /usr/bin/python 139 | 140 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 141 | 142 | # SSH configuration necessary to support mpi-operator v2 143 | RUN mkdir -p /var/run/sshd && \ 144 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 145 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 146 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 147 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 148 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 149 | 150 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 151 | ENV HABANA_LOGS=/var/log/habana_logs/ 152 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 153 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.suse15.5: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for SUSE 15.5 6 | FROM registry.suse.com/suse/sle15:15.5 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | # for RHEL certification 12 | LABEL vendor="Habanalabs Ltd." 
13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | 17 | RUN zypper addrepo -f http://download.opensuse.org/distribution/leap/15.5/repo/oss/ OpenSUSI && \ 18 | echo "gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSI.repo && \ 19 | echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSI.repo 20 | 21 | RUN zypper addrepo -f http://download.opensuse.org/source/distribution/leap/15.5/repo/oss/ OpenSUSISrc && \ 22 | echo "gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSISrc.repo && \ 23 | echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSISrc.repo 24 | 25 | RUN zypper mr -p 99 SLE_BCI 26 | 27 | RUN zypper update -y && zypper install -y --allow-downgrade \ 28 | clang \ 29 | cmake \ 30 | ffmpeg \ 31 | gcc \ 32 | gcc-c++ \ 33 | git \ 34 | glibc-devel \ 35 | iproute \ 36 | jemalloc \ 37 | kernel-devel \ 38 | kernel-macros \ 39 | lbzip2 \ 40 | libarchive-devel \ 41 | libffi-devel \ 42 | libjpeg-devel \ 43 | libksba \ 44 | linux-glibc-devel \ 45 | llvm \ 46 | lsof \ 47 | Mesa-libGL-devel \ 48 | Mesa-libGL1 \ 49 | openssh-clients \ 50 | openssh-server \ 51 | openssl \ 52 | openssl-devel \ 53 | python311 \ 54 | python311-devel \ 55 | unzip \ 56 | wget \ 57 | zlib-devel && \ 58 | zypper clean && \ 59 | rm -f /etc/ssh/ssh_host_*_key* 60 | 61 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 62 | ENV PIP_NO_CACHE_DIR=on 63 | 64 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 65 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 66 | alternatives --set python3 /usr/bin/python3.11 67 | 68 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 69 | python3 get-pip.py && \ 70 | rm -f get-pip.py && \ 71 | python3 -m pip install setuptools==76.1.0 wheel && \ 72 | python3 -m pip install --upgrade Jinja2 73 | 74 | COPY install_efa.sh . 
75 | RUN ./install_efa.sh && rm -f install_efa.sh /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 76 | 77 | ENV MPI_ROOT=/opt/amazon/openmpi 78 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 79 | ENV PATH=${MPI_ROOT}/bin:$PATH 80 | ENV OPAL_PREFIX=${MPI_ROOT} 81 | ENV MPICC=${MPI_ROOT}/bin/mpicc 82 | ENV RDMA_FORK_SAFE=1 83 | ENV FI_EFA_USE_DEVICE_RDMA=1 84 | 85 | RUN echo "[habanalabs]" > /etc/zypp/repos.d/habanalabs.repo && \ 86 | echo "name=Habana SUSE Linux repo" >> /etc/zypp/repos.d/habanalabs.repo && \ 87 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/sles/15/15.5" >> /etc/zypp/repos.d/habanalabs.repo && \ 88 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/sles/15/15.5/repodata/repomd.xml.key" >> /etc/zypp/repos.d/habanalabs.repo && \ 89 | echo "gpgcheck=1" >> /etc/zypp/repos.d/habanalabs.repo 90 | 91 | RUN zypper --gpg-auto-import-keys install -y \ 92 | habanalabs-rdma-core-"$VERSION"-"$REVISION" \ 93 | habanalabs-thunk-"$VERSION"-"$REVISION" \ 94 | habanalabs-firmware-tools-"$VERSION"-"$REVISION" \ 95 | habanalabs-graph-"$VERSION"-"$REVISION" && \ 96 | zypper clean && \ 97 | rm -f /etc/zypp/repos.d/habanalabs.repo 98 | 99 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 100 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 101 | 102 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 103 | 104 | # SSH configuration necessary to support mpi-operator v2 105 | RUN mkdir -p /var/run/sshd && \ 106 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 107 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 108 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 109 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 110 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 111 | 112 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 113 | ENV HABANA_LOGS=/var/log/habana_logs/ 114 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 115 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 116 | -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.tencentos3.1: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Tencentos 3.1 6 | FROM tencentos/tencentos_server31_mini:20230630 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | # for RHEL certification 12 | LABEL vendor="Habanalabs Ltd." 13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 17 | dnf clean all && rm -rf /var/cache/yum 18 | 19 | RUN dnf install -y \ 20 | python3-dnf-plugin-versionlock && \ 21 | dnf versionlock add redhat-release* && \ 22 | dnf clean all 23 | 24 | RUN dnf update -y && dnf install -y \ 25 | clang \ 26 | cmake3 \ 27 | cpp \ 28 | gcc \ 29 | gcc-c++ \ 30 | git \ 31 | glibc \ 32 | glibc-devel \ 33 | glibc-headers \ 34 | iproute \ 35 | jemalloc \ 36 | libarchive \ 37 | libjpeg-devel \ 38 | libksba \ 39 | llvm \ 40 | lsof \ 41 | mesa-libGL \ 42 | openssh-clients \ 43 | openssh-server \ 44 | redhat-lsb-core \ 45 | unzip \ 46 | wget && \ 47 | dnf clean all && \ 48 | rm -f /etc/ssh/ssh_host_*_key* 49 | 50 | COPY install-python310.sh . 
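# Note (editorial comment, not in the upstream Dockerfile): on TencentOS, install-python310.sh
# builds OpenSSL 1.1.1w and then CPython 3.10.14 from source, installs it via `make altinstall`,
# and registers /usr/local/bin/python3.10 as the default python3 through `alternatives`
# (see dockerfiles/base/install-python310.sh later in this listing).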
51 | RUN ./install-python310.sh tencentos3.1 && rm -f install-python310.sh 52 | RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig 53 | ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 54 | 55 | COPY install_efa.sh . 56 | COPY tencentos_efa_patch.txt /tmp/tencentos_efa_patch.txt 57 | RUN ./install_efa.sh && rm -f install_efa.sh /tmp/tencentos_efa_patch.txt /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 58 | 59 | ENV MPI_ROOT=/usr/mpi/gcc/openmpi-4.1.5a1 60 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib64:/usr/lib/habanalabs:$LD_LIBRARY_PATH 61 | ENV PATH=${MPI_ROOT}/bin:$PATH 62 | ENV OPAL_PREFIX=${MPI_ROOT} 63 | ENV MPICC=${MPI_ROOT}/bin/mpicc 64 | ENV RDMAV_FORK_SAFE=1 65 | ENV FI_EFA_USE_DEVICE_RDMA=1 66 | 67 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 68 | echo "name=Habana TC31 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 69 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1" >> /etc/yum.repos.d/habanalabs.repo && \ 70 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo 71 | 72 | RUN dnf install -y \ 73 | habanalabs-rdma-core-"$VERSION"-"$REVISION".tl3 \ 74 | habanalabs-thunk-"$VERSION"-"$REVISION".tl3 \ 75 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".tl3 \ 76 | habanalabs-graph-"$VERSION"-"$REVISION".tl3 && \ 77 | rm -f /etc/yum.repos.d/habanalabs.repo && \ 78 | dnf clean all 79 | 80 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 81 | ENV PIP_NO_CACHE_DIR=on 82 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 83 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 84 | 85 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 && \ 86 | python3 -m pip install --upgrade Jinja2 87 | 88 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 89 | 90 | # SSH configuration necessary to support mpi-operator v2 91 | RUN mkdir -p /var/run/sshd && \ 92 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 93 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 94 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 95 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 96 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 97 | 98 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 99 | ENV HABANA_LOGS=/var/log/habana_logs/ 100 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 101 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.ubuntu22.04: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Ubuntu 22.04 6 | FROM ubuntu:jammy 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 13 | ENV HABANA_LOGS=/var/log/habana_logs/ 14 | ENV OS_NUMBER=2204 15 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 16 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends \ 19 | apt-transport-https \ 20 | apt-utils \ 21 | bc \ 22 | build-essential \ 23 | ca-certificates \ 24 | dkms \ 25 | ethtool \ 26 | gcc \ 27 | git \ 28 | gnupg \ 29 | gpg-agent \ 30 | graphviz \ 31 | libgl1 \ 32 | libgnutls30 \ 33 | libgoogle-glog0v5 \ 34 | libjemalloc2 \ 35 | libjpeg-dev \ 36 | libkrb5-3 \ 37 | libpq-dev \ 38 | lsof \ 39 | make \ 40 | openssh-client \ 41 | openssh-server \ 42 | protobuf-compiler \ 43 | python3 \ 44 | python3-dev \ 45 | python3-pip \ 46 | python3-tk \ 47 | python3-venv \ 48 | unzip \ 49 | vim \ 50 | wget && \ 51 | apt-get upgrade -y libc6 && \ 52 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 53 | rm -f /etc/ssh/ssh_host_*_key* 54 | 55 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 56 | ENV PIP_NO_CACHE_DIR=on 57 | 58 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 && \ 59 | python3 -m pip install --upgrade Jinja2 60 | 61 | COPY install_efa.sh . 62 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 63 | 64 | ENV MPI_ROOT=/opt/amazon/openmpi 65 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 66 | ENV PATH=${MPI_ROOT}/bin:$PATH 67 | ENV OPAL_PREFIX=${MPI_ROOT} 68 | ENV MPICC=${MPI_ROOT}/bin/mpicc 69 | ENV RDMAV_FORK_SAFE=1 70 | ENV FI_EFA_USE_DEVICE_RDMA=1 71 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 72 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 73 | 74 | RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ 75 | chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ 76 | chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ 77 | echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 78 | cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ 79 | sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ 80 | apt-get update && apt-get install -y --no-install-recommends \ 81 | habanalabs-rdma-core="$VERSION"-"$REVISION" \ 82 | habanalabs-thunk="$VERSION"-"$REVISION" \ 83 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 84 | habanalabs-graph="$VERSION"-"$REVISION" && \ 85 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 86 | mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ 87 | sed -i "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 88 | 89 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 90 | 91 | # SSH configuration necessary to support mpi-operator v2 92 | RUN mkdir -p /var/run/sshd && \ 93 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 94 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 95 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 96 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 97 | echo 
"/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ 98 | sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.ubuntu24.04: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Ubuntu 24.04 6 | FROM ubuntu:noble 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 13 | ENV HABANA_LOGS=/var/log/habana_logs/ 14 | ENV OS_NUMBER=2404 15 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 16 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends \ 19 | apt-transport-https \ 20 | apt-utils \ 21 | bc \ 22 | build-essential \ 23 | ca-certificates \ 24 | dkms \ 25 | ethtool \ 26 | gcc \ 27 | git \ 28 | gnupg \ 29 | gpg-agent \ 30 | graphviz \ 31 | libgl1 \ 32 | libgnutls30 \ 33 | libgoogle-glog0v6t64 \ 34 | libjemalloc2 \ 35 | libjpeg-dev \ 36 | libkrb5-3 \ 37 | libpq-dev \ 38 | lsof \ 39 | make \ 40 | openssh-client \ 41 | openssh-server \ 42 | protobuf-compiler \ 43 | python3 \ 44 | python3-dev \ 45 | unzip \ 46 | vim \ 47 | wget && \ 48 | apt-get update && apt-get upgrade -y libtasn1-6 && \ 49 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 50 | rm -f /etc/ssh/ssh_host_*_key* 51 | 52 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 53 | ENV PIP_NO_CACHE_DIR=on 54 | 55 | RUN mv /usr/lib/python3.12/EXTERNALLY-MANAGED /usr/lib/python3.12/EXTERNALLY-MANAGED.old && \ 56 | wget https://bootstrap.pypa.io/get-pip.py && \ 57 | python3 get-pip.py && \ 58 | rm -f get-pip.py && \ 59 | python3 -m pip install setuptools wheel && \ 60 | python3 -m pip install --upgrade Jinja2 61 | 62 | COPY install_efa.sh . 
63 | RUN ./install_efa.sh && rm -f install_efa.sh /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 64 | 65 | ENV MPI_ROOT=/opt/amazon/openmpi 66 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 67 | ENV PATH=${MPI_ROOT}/bin:$PATH 68 | ENV OPAL_PREFIX=${MPI_ROOT} 69 | ENV MPICC=${MPI_ROOT}/bin/mpicc 70 | ENV RDMAV_FORK_SAFE=1 71 | ENV FI_EFA_USE_DEVICE_RDMA=1 72 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 73 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 74 | 75 | RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ 76 | chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ 77 | chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ 78 | echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian noble main" | tee -a /etc/apt/sources.list && \ 79 | cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ 80 | sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ 81 | apt-get update && apt-get install -y --no-install-recommends \ 82 | habanalabs-rdma-core="$VERSION"-"$REVISION" \ 83 | habanalabs-thunk="$VERSION"-"$REVISION" \ 84 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 85 | habanalabs-graph="$VERSION"-"$REVISION" && \ 86 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 87 | mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ 88 | sed -i "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 89 | 90 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 91 | 92 | # SSH configuration necessary to support mpi-operator v2 93 | RUN mkdir -p /var/run/sshd && \ 94 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 95 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 96 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 97 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 98 | echo "/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ 99 | sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc -------------------------------------------------------------------------------- /dockerfiles/base/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /dockerfiles/base/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = base-installer-${BUILD_OS} 5 | 6 | ifdef REPO_NAME 7 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg REPO_NAME=$(REPO_NAME) 8 | endif 9 | 10 | init: 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/LICENSE $(BUILD_DIR)/ 13 | $(HIDE)cp $(CURDIR)/*.sh $(BUILD_DIR)/ 14 | $(HIDE)cp $(CURDIR)/tencentos_efa_patch.txt $(BUILD_DIR)/ 15 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS) $(BUILD_DIR)/Dockerfile 16 | 17 | build: init 18 | -------------------------------------------------------------------------------- /dockerfiles/base/install-python310.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | _BASE_NAME=${1:-"ubuntu22.04"} 5 | _SSL_LIB="" 6 | 7 | # preinstall dependencies and define variables 8 | case "${_BASE_NAME}" in 9 | *ubuntu22.04* | *ubuntu24.04*) 10 | echo "Skip installation of Python 3.10 from sources on Ubuntu 22.04 and Ubuntu 24.04" 11 | exit 0; 12 | ;; 13 | *rhel*) 14 | dnf install -y sqlite-devel readline-devel xz-devel 15 | ;; 16 | *tencentos3.1*) 17 | dnf install -y sqlite-devel readline-devel zlib-devel xz-devel bzip2-devel libffi-devel 18 | wget -nv -O /opt/openssl-1.1.1w.tar.gz https://github.com/openssl/openssl/releases/download/OpenSSL_1_1_1w/openssl-1.1.1w.tar.gz && \ 19 | cd /opt/ && \ 20 | tar xzf openssl-1.1.1w.tar.gz && \ 21 | rm -rf openssl-1.1.1w.tar.gz && \ 22 | cd openssl-1.1.1w && \ 23 | ./config --prefix=/usr/local/openssl-1.1.1w shared zlib && \ 24 | make && make install 25 | ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem 26 | 27 | PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin 28 | LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH 29 | _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w" 30 | ;; 31 | esac 32 | 33 | # install Python 34 | wget -nv -O /opt/Python-3.10.14.tgz https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz 35 | cd /opt/ 36 | tar xzf Python-3.10.14.tgz 37 | rm -f Python-3.10.14.tgz 38 | cd Python-3.10.14 39 | ./configure --enable-optimizations --enable-loadable-sqlite-extensions --enable-shared $_SSL_LIB 40 | make -j && make altinstall 41 | 42 | # post install 43 | case "${_BASE_NAME}" in 44 | *rhel9*) 45 | alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 2 && \ 46 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 47 | alternatives --set python3 /usr/local/bin/python3.10 48 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 49 | ;; 50 | *tencentos3.1*) 51 | alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 4 && \ 52 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 3 && \ 53 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 54 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 55 | alternatives --set python3 /usr/local/bin/python3.10 56 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 57 | ;; 58 | esac 59 | 60 | python3 -m pip install --upgrade pip setuptools 61 | 62 | -------------------------------------------------------------------------------- /dockerfiles/base/install_efa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | 
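# Installs the AWS EFA user-space stack for the base images. Behaviour follows
# /etc/os-release: RHEL installs only the bundled rdma-core RPMs, TencentOS swaps
# in MLNX OFED user-space packages, and both skip the upstream efa_installer.sh
# run; Ubuntu executes the installer directly.
# Usage sketch (the single optional argument pins the installer version):
#   ./install_efa.sh            # uses DEFAULT_EFA_INSTALLER_VER below
#   ./install_efa.sh 1.34.0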
DEFAULT_EFA_INSTALLER_VER=1.34.0 4 | efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER} 5 | 6 | tmp_dir=$(mktemp -d) 7 | wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-$efa_installer_version.tar.gz -P $tmp_dir 8 | tar -xf $tmp_dir/aws-efa-installer-$efa_installer_version.tar.gz -C $tmp_dir 9 | RUN_EFA_INSTALLER="./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify" 10 | pushd $tmp_dir/aws-efa-installer 11 | . /etc/os-release 12 | case $ID in 13 | rhel) 14 | # we cannot install dkms packages on RHEL images due to OCP rules 15 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 16 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 17 | case $VERSION_ID in 18 | 8*) 19 | dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm 20 | ;; 21 | 9*) 22 | dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm 23 | ;; 24 | *) 25 | echo "Unsupported RHEL version: $VERSION_ID" 26 | exit 1 27 | ;; 28 | esac 29 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on RHEL'" 30 | ;; 31 | tencentos) 32 | # dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm 33 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 34 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 35 | rm -rf RPMS/ROCKYLINUX8/x86_64/rdma-core/rdma* 36 | patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch 37 | tmp_dir_ofed=$(mktemp -d) 38 | wget -O $tmp_dir_ofed/MLNX_OFED.tgz https://${ARTIFACTORY_URL}/artifactory/gaudi-installer/deps/MLNX_OFED_LINUX-5.8-3.0.7.0-rhel8.4-x86_64.tgz 39 | pushd $tmp_dir_ofed 40 | tar xf MLNX_OFED.tgz 41 | ofed_packages_path="mlnx-ofed" 42 | pushd mlnx-ofed 43 | yum install pciutils-libs tcsh tk python36 gcc-gfortran kernel-modules fuse-libs numactl-libs -y 44 | ./mlnxofedinstall --distro RHEL8.4 --skip-distro-check --user-space-only --skip-repo --force 45 | popd 46 | popd 47 | rm -rf $tmp_dir_ofed 48 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on tencentos'" 49 | ;; 50 | ubuntu) 51 | apt-get update 52 | ;; 53 | esac 54 | 55 | eval $RUN_EFA_INSTALLER 56 | 57 | case $ID in 58 | ubuntu) 59 | apt-get autoremove && rm -rf /var/lib/apt/lists/* 60 | ;; 61 | esac 62 | 63 | popd 64 | rm -rf $tmp_dir 65 | -------------------------------------------------------------------------------- /dockerfiles/base/tencentos_efa_patch.txt: -------------------------------------------------------------------------------- 1 | diff --git a/common.sh b/common.sh 2 | index 3c3a0e4..b463f42 100755 3 | --- a/common.sh 4 | +++ b/common.sh 5 | @@ -50,6 +50,15 @@ has_substring() { 6 | fi 7 | } 8 | 9 | +is_tencentos_3() { 10 | + . /etc/os-release 11 | + if [ "$NAME" = "TencentOS Server" ] && [ "$VERSION_ID" = "3.1" ]; then 12 | + return 0 13 | + else 14 | + return 1 15 | + fi 16 | +} 17 | + 18 | is_amazon_linux_2() { 19 | . 
/etc/os-release 20 | if [ "$NAME" = "Amazon Linux" ] && [ "$VERSION_ID" = "2" ]; then 21 | @@ -164,7 +173,7 @@ is_suse_15() { 22 | } 23 | 24 | install_cmd() { 25 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 26 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 27 | if [ $1 == "localinstall" ]; then 28 | shift 29 | yum -y localinstall $@ 30 | @@ -181,7 +190,7 @@ install_cmd() { 31 | fi 32 | } 33 | search_cmd() { 34 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 35 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 36 | yum list installed $@ 37 | elif is_suse_15; then 38 | zypper search --installed-only --match-exact $@ 39 | @@ -194,7 +203,7 @@ search_cmd() { 40 | } 41 | remove_cmd() { 42 | # we don't remove the dependencies of the efa packages as it may have reverse dependencies on other system packages 43 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 44 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 45 | rpm --erase --nodeps $@ 46 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 47 | # purge is identical to remove except that packages are removed and purged 48 | @@ -207,7 +216,7 @@ remove_cmd() { 49 | } 50 | # Get the list of file installed by the package name 51 | query_file_list_cmd() { 52 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 53 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 54 | rpm -ql $@ 55 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 56 | dpkg -L $@ 57 | @@ -220,7 +229,7 @@ query_file_list_cmd() { 58 | # reverse dependencies (some other installed packages depend on them) 59 | # this command will return non-zero 60 | remove_dryrun_cmd() { 61 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 62 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 63 | rpm --erase --test $@ 64 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 65 | dpkg -r --dry-run $@ 66 | diff --git a/efa_installer.sh b/efa_installer.sh 67 | index 544673f..faf3369 100755 68 | --- a/efa_installer.sh 69 | +++ b/efa_installer.sh 70 | @@ -97,7 +97,7 @@ select_mpi() { 71 | } 72 | 73 | detect_os() { 74 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 75 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 76 | PACKAGE_TYPE="rpm" 77 | KERNEL_SEARCH_STRING=kernel 78 | INSTALL_ARGS="--setopt=skip_missing_names_on_install=False" 79 | @@ -209,7 +209,7 @@ setup_install_package_paths() { 80 | local kmod_path 81 | 82 | if [ "${PACKAGE_TYPE}" = "rpm" ]; then 83 | - if 
is_rhel_8 || is_rockylinux_8; then 84 | + if is_rhel_8 || is_rockylinux_8|| is_tencentos_3; then 85 | base_dir="RPMS/ROCKYLINUX8/${arch}" 86 | debug_dir="RPMS/ROCKYLINUX8/${arch}/debug" 87 | elif is_rockylinux_9 || is_rhel_9; then 88 | @@ -465,7 +465,7 @@ install_apt_package() { 89 | install_dependencies() { 90 | local packages 91 | 92 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 93 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 94 | packages="pciutils rpmdevtools" 95 | if [ ${SKIP_KMOD} -eq 0 ]; then 96 | for kernel in ${INSTALLED_KERNELS[@]}; do 97 | @@ -785,7 +785,7 @@ uninstall_efa() { 98 | 99 | uninstall_old_efa_packages() { 100 | # Uninstall 'openmpi' and 'libfabric' if packaged by AWS. 101 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 102 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 103 | for pkg in openmpi libfabric libfabric-debuginfo; do 104 | rpm -ql $pkg | grep -q /opt/amazon 105 | if [ $? -eq 0 ]; then 106 | -------------------------------------------------------------------------------- /dockerfiles/common.mk: -------------------------------------------------------------------------------- 1 | VERBOSE ?= FALSE 2 | DOCKER ?= docker 3 | DOCKER_CACHE ?= FALSE 4 | BUILD_OS ?= ubuntu22.04 5 | BUILD_DIR ?= $(CURDIR)/dockerbuild 6 | 7 | REPO_SERVER ?= vault.habana.ai 8 | PT_VERSION ?= 2.6.0 9 | RELEASE_VERSION ?= 1.21.0 10 | RELEASE_BUILD_ID ?= 555 11 | 12 | BASE_IMAGE_URL ?= base-installer-$(BUILD_OS) 13 | IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) 14 | 15 | DOCKER_BUILD_ARGS := --build-arg ARTIFACTORY_URL=$(REPO_SERVER) --build-arg VERSION=$(RELEASE_VERSION) --build-arg REVISION=$(RELEASE_BUILD_ID) --build-arg BASE_NAME=$(BASE_IMAGE_URL) 16 | 17 | # Hide or not the calls depending of VERBOSE 18 | ifeq ($(VERBOSE),TRUE) 19 | HIDE = 20 | else 21 | HIDE = @ 22 | endif 23 | 24 | # Use cache for build depending of DOCKER_CACHE 25 | ifeq ($(DOCKER_CACHE),TRUE) 26 | CACH_FLAG = 27 | else 28 | CACH_FLAG = --no-cache 29 | endif 30 | 31 | .PHONY: help build clean 32 | 33 | help: ## Prints this help. 34 | @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) 35 | 36 | .DEFAULT_GOAL := help 37 | 38 | clean: ## clean the build dir 39 | $(HIDE)rm -rf $(BUILD_DIR) 40 | 41 | build: ## build docker image 42 | @echo Building image - $(IMAGE_NAME) 43 | $(HIDE)$(DOCKER) build --network=host $(CACH_FLAG) --tag $(IMAGE_URL) $(DOCKER_BUILD_ARGS) $(BUILD_DIR) 44 | @echo -n $(IMAGE_URL) | tee $(BUILD_DIR)/image_name 45 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel8.6: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
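# (Build arguments below are supplied by ../common.mk. A typical local build might
#  look like `cd dockerfiles/pytorch && make build BUILD_OS=rhel8.6`, which copies
#  this file into the build directory as Dockerfile.)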
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 8.6 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for RHEL8.6" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth 21 | 22 | RUN dnf update -y && dnf install -y \ 23 | cairo-devel \ 24 | curl \ 25 | gcc-toolset-11 \ 26 | gperftools-devel \ 27 | iproute \ 28 | jq \ 29 | lapack-devel \ 30 | numactl \ 31 | numactl-devel \ 32 | openblas-devel \ 33 | pdsh \ 34 | which \ 35 | zlib-devel && \ 36 | dnf clean all 37 | 38 | COPY install_packages.sh . 39 | 40 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 41 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 42 | 43 | # Configure GCC 11 44 | ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:${PATH} 45 | ENV MANPATH=/opt/rh/gcc-toolset-11/root/usr/share/man:${MANPATH} 46 | ENV INFOPATH=/opt/rh/gcc-toolset-11/root/usr/share/info:${INFOPATH} 47 | ENV PCP_DIR=/opt/rh/gcc-toolset-11/root 48 | ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:/opt/rh/gcc-toolset-11/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH} 49 | ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH} 50 | 51 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 52 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel9.2: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.2 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth 21 | 22 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 23 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 24 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 25 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 26 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 27 | 28 | RUN dnf update -y && dnf install --nobest --allowerasing -y \ 29 | cairo-devel \ 30 | curl \ 31 | gperftools-devel \ 32 | iproute \ 33 | jq \ 34 | lapack-devel \ 35 | numactl \ 36 | numactl-devel \ 37 | openblas-devel \ 38 | which \ 39 | zlib-devel && \ 40 | dnf clean all 41 | 42 | COPY install_packages.sh . 
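# install_packages.sh fetches the matching pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz
# bundle from the artifactory (OS string "rhel92" for this image) and runs the install.sh it contains.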
43 | 44 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 45 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 46 | 47 | # Set LD_PRELOAD after all required installations to 48 | # avoid warnings during docker creation 49 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 50 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.4 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt 21 | 22 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 23 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 24 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 25 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 26 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 27 | 28 | RUN dnf update -y && dnf install --nobest --nodocs --setopt=install_weak_deps=false --allowerasing -y \ 29 | cairo-devel \ 30 | gperftools-devel \ 31 | iproute \ 32 | jq \ 33 | lapack-devel \ 34 | numactl \ 35 | numactl-devel \ 36 | openblas-devel \ 37 | which \ 38 | zlib-devel && \ 39 | dnf clean all 40 | 41 | COPY install_packages.sh . 42 | 43 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 44 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 45 | 46 | # Set LD_PRELOAD after all required installations to 47 | # avoid warnings during docker creation 48 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 49 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.suse15.5: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for SUSE 15.5 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | # for RHEL certification 17 | LABEL name="PyTorch Installer" 18 | LABEL summary="Habanalabs PyTorch installer layer for SUSE 15.5" 19 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 20 | 21 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 22 | 23 | RUN zypper update -y && zypper install -y --allow-downgrade \ 24 | cairo-devel \ 25 | gperftools-devel \ 26 | jq \ 27 | lapack-devel \ 28 | numactl && \ 29 | zypper clean 30 | 31 | COPY install_packages.sh . 
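# (Note: on SUSE the tcmalloc preload set at the end of this file lives under
#  /usr/lib64 rather than /lib64 as in the RHEL images.)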
32 | 33 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 34 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 35 | 36 | # Set LD_PRELOAD after all required installations to 37 | # avoid warnings during docker creation 38 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 39 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.tencentos3.1: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 8.6 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for Tencentos 3.1" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 21 | 22 | RUN dnf versionlock add openmpi* perftest* 23 | 24 | RUN dnf update -y && dnf install -y \ 25 | cairo-devel \ 26 | curl \ 27 | gcc-toolset-11 \ 28 | gperftools-devel \ 29 | iproute \ 30 | jq \ 31 | lapack-devel \ 32 | numactl \ 33 | numactl-devel \ 34 | openblas-devel \ 35 | libevent \ 36 | pdsh \ 37 | which \ 38 | zlib-devel && \ 39 | dnf clean all 40 | 41 | # Configure GCC 11 42 | ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:${PATH} 43 | ENV MANPATH=/opt/rh/gcc-toolset-11/root/usr/share/man:${MANPATH} 44 | ENV INFOPATH=/opt/rh/gcc-toolset-11/root/usr/share/info:${INFOPATH} 45 | ENV PCP_DIR=/opt/rh/gcc-toolset-11/root 46 | ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.5a1/lib64:/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:/opt/rh/gcc-toolset-11/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH} 47 | ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:/usr/mpi/gcc/openmpi-4.1.5a1/lib64/pkgconfig:${PKG_CONFIG_PATH} 48 | ENV CMAKE_PREFIX_PATH=/usr/mpi/gcc/openmpi-4.1.5a1/include:${CMAKE_PREFIX_PATH} 49 | 50 | COPY install_packages.sh . 51 | 52 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 53 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 54 | 55 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 56 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 57 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.ubuntu: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
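# (This Dockerfile is shared by the ubuntu22.04 and ubuntu24.04 builds: the pytorch
#  Makefile copies Dockerfile.ubuntu for any ubuntu BUILD_OS, and the case statement
#  below selects the matching python alternative.)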
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for Ubuntu22.04 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends \ 19 | curl \ 20 | iproute2 \ 21 | jq \ 22 | libcurl4 \ 23 | libgoogle-perftools-dev \ 24 | libhdf5-dev \ 25 | libjpeg-dev \ 26 | liblapack-dev \ 27 | libnuma-dev \ 28 | libopenblas-dev \ 29 | moreutils \ 30 | numactl \ 31 | pdsh && \ 32 | apt-get autoremove && rm -rf /var/lib/apt/lists/* 33 | 34 | RUN bash -c "\ 35 | case $BASE_NAME in \ 36 | *ubuntu22.04*) \ 37 | update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \ 38 | ;; \ 39 | *ubuntu24.04*) \ 40 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 \ 41 | ;; \ 42 | esac" 43 | 44 | COPY install_packages.sh . 45 | 46 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 47 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 48 | 49 | ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 50 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = pytorch-installer-${BUILD_OS}-$(PT_VERSION) 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | base: 8 | ifneq ($(shell $(DOCKER) image inspect $(BASE_IMAGE_URL):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) --format="image_exists" 2>/dev/null), image_exists) 9 | cd ../base; \ 10 | make build; \ 11 | cd ../pytorch 12 | endif 13 | 14 | init: base 15 | $(HIDE)mkdir -p $(BUILD_DIR) 16 | $(HIDE)cp $(CURDIR)/install_packages.sh $(BUILD_DIR)/ 17 | ifneq (,$(findstring ubuntu,$(BUILD_OS))) 18 | $(HIDE)cp $(CURDIR)/Dockerfile.ubuntu $(BUILD_DIR)/Dockerfile 19 | else 20 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS) $(BUILD_DIR)/Dockerfile 21 | endif 22 | 23 | build: init 24 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/install_packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | PT_PACKAGE_NAME="pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz" 5 | OS_STRING="ubuntu${OS_NUMBER}" 6 | case "${BASE_NAME}" in 7 | *sles15.5* | *suse15.5*) 8 | OS_STRING="suse155" 9 | ;; 10 | *rhel9.2*) 11 | OS_STRING="rhel92" 12 | ;; 13 | *rhel9.4*) 14 | OS_STRING="rhel94" 15 | ;; 16 | *rhel8*) 17 | OS_STRING="rhel86" 18 | ;; 19 | *tencentos*) 20 | OS_STRING="tencentos31" 21 | ;; 22 | esac 23 | PT_ARTIFACT_PATH="https://${ARTIFACTORY_URL}/artifactory/gaudi-pt-modules/${VERSION}/${REVISION}/pytorch/${OS_STRING}" 24 | 25 | TMP_PATH=$(mktemp --directory) 26 | wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" 27 | tar -zxf "${PT_PACKAGE_NAME}" -C "${TMP_PATH}"/. 
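# The extracted bundle ships its own install.sh, which is invoked below with the
# same VERSION/REVISION to perform the actual installation.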
28 | pushd "${TMP_PATH}" 29 | ./install.sh $VERSION $REVISION 30 | popd 31 | 32 | rm -rf "${TMP_PATH}" "${PT_PACKAGE_NAME}" 33 | -------------------------------------------------------------------------------- /dockerfiles/triton/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile triton installer layer for Ubuntu 22.04 6 | FROM nvcr.io/nvidia/tritonserver:23.12-py3 7 | ARG ARTIFACTORY_URL 8 | ARG PT_VERSION 9 | ARG VERSION 10 | ARG REVISION 11 | ARG HABANA_PIP_VERSION="22.3" 12 | ARG PT_BUILD_REPO=gaudi-pt-modules 13 | ARG PT_PACKAGE_NAME="pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz" 14 | ARG PT_ARTIFACT_PATH="https://"${ARTIFACTORY_URL}"/artifactory/${PT_BUILD_REPO}/"${VERSION}"/"${REVISION}"/pytorch/ubuntu2204" 15 | ARG PT_EXTRACT_PATH="/root/habanalabs/pytorch_temp" 16 | 17 | ENV DEBIAN_FRONTEND=noninteractive 18 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 19 | ENV HABANA_LOGS=/var/log/habana_logs/ 20 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 21 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 22 | ENV PIP_NO_CACHE_DIR=on 23 | ENV PIP_DEFAULT_TIMEOUT=1000 24 | ENV MPI_ROOT=/opt/hpcx/ompi 25 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 26 | ENV PATH=${MPI_ROOT}/bin:$PATH 27 | ENV OPAL_PREFIX=${MPI_ROOT} 28 | ENV MPICC=${MPI_ROOT}/bin/mpicc 29 | ENV RDMAV_FORK_SAFE=1 30 | ENV PYTHONPATH=/root:/usr/lib/habanalabs 31 | RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 32 | wget "https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public" && \ 33 | apt-key add public && rm public && apt-get update && \ 34 | apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ 35 | habanalabs-thunk="$VERSION"-"$REVISION" \ 36 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 37 | habanalabs-graph="$VERSION"-"$REVISION" && \ 38 | apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ 39 | sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 40 | 41 | RUN apt-get update && apt-get install -y \ 42 | libjemalloc2 \ 43 | libcairo2-dev \ 44 | libglib2.0-dev \ 45 | libhdf5-dev \ 46 | libnuma-dev \ 47 | libpcre2-dev \ 48 | libjpeg-dev \ 49 | liblapack-dev \ 50 | libopenblas-dev \ 51 | numactl \ 52 | libgoogle-perftools-dev && \ 53 | apt-get clean && rm -rf /var/lib/apt/lists/* 54 | 55 | RUN python3 -m pip install pip==24.2 --disable-pip-version-check && \ 56 | python3 -m pip install setuptools==75.1.0 --disable-pip-version-check && \ 57 | python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check 58 | 59 | RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ 60 | mkdir -p /root/habanalabs/pytorch_temp && \ 61 | tar -xf pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz -C ${PT_EXTRACT_PATH}/. 
&& \ 62 | python3 -m pip install pip=="${HABANA_PIP_VERSION}" && \ 63 | pip install mpi4py==3.1.4 --disable-pip-version-check && \ 64 | grep -ivE "#|lightning" ${PT_EXTRACT_PATH}/requirements-pytorch.txt > ${PT_EXTRACT_PATH}/requirements-pytorch-nolightning.txt && \ 65 | pip install -r ${PT_EXTRACT_PATH}/requirements-pytorch-nolightning.txt --no-warn-script-location --disable-pip-version-check && \ 66 | pip install ${PT_EXTRACT_PATH}/*.whl --disable-pip-version-check && \ 67 | grep "lightning" ${PT_EXTRACT_PATH}/requirements-pytorch.txt > ${PT_EXTRACT_PATH}/requirements-pytorch-lightning.txt && \ 68 | pip install -r ${PT_EXTRACT_PATH}/requirements-pytorch-lightning.txt --disable-pip-version-check && \ 69 | echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ 70 | pip uninstall -y pillow && \ 71 | pip uninstall -y pillow-simd && \ 72 | pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ 73 | rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* 74 | 75 | ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 76 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/triton/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = triton-installer-$(PT_VERSION)-${BUILD_OS} 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | init: 8 | ifneq ($(BUILD_OS), ubuntu22.04) 9 | $(error triton is only supported on ubuntu22.04) 10 | endif 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/Dockerfile $(BUILD_DIR)/Dockerfile 13 | 14 | build: init 15 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile triton installer layer for Ubuntu 22.04 6 | FROM nvcr.io/nvidia/tritonserver:24.06-py3 7 | ARG ARTIFACTORY_URL 8 | ARG PT_VERSION 9 | ARG VERSION 10 | ARG REVISION 11 | ARG HABANA_PIP_VERSION="22.3" 12 | ARG PT_BUILD_REPO=gaudi-pt-modules 13 | ARG PT_PACKAGE_NAME="pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz" 14 | ARG PT_ARTIFACT_PATH="https://"${ARTIFACTORY_URL}"/artifactory/${PT_BUILD_REPO}/"${VERSION}"/"${REVISION}"/pytorch/ubuntu2204" 15 | ENV DEBIAN_FRONTEND=noninteractive 16 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 17 | ENV HABANA_LOGS=/var/log/habana_logs/ 18 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 19 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 20 | ENV PIP_NO_CACHE_DIR=on 21 | ENV PIP_DEFAULT_TIMEOUT=1000 22 | ENV MPI_ROOT=/opt/hpcx/ompi 23 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 24 | ENV PATH=${MPI_ROOT}/bin:$PATH 25 | ENV OPAL_PREFIX=${MPI_ROOT} 26 | ENV MPICC=${MPI_ROOT}/bin/mpicc 27 | ENV RDMAV_FORK_SAFE=1 28 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 29 | 30 | ADD model.py . 
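# model.py provides the vLLM model implementation for the Triton backend; it is
# added here and copied again into /opt/tritonserver/backends/vllm/ further down.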
31 | RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 32 | wget "https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public" && \ 33 | apt-key add public && rm public && apt-get update && \ 34 | apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ 35 | habanalabs-thunk="$VERSION"-"$REVISION" \ 36 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 37 | habanalabs-graph="$VERSION"-"$REVISION" && \ 38 | apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ 39 | sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 40 | 41 | RUN apt-get update && apt-get install -y \ 42 | libjemalloc2 \ 43 | libcairo2-dev \ 44 | libglib2.0-dev \ 45 | libhdf5-dev \ 46 | libnuma-dev \ 47 | libpcre2-dev \ 48 | libjpeg-dev \ 49 | liblapack-dev \ 50 | libopenblas-dev \ 51 | numactl \ 52 | libgoogle-perftools-dev && \ 53 | apt-get clean && rm -rf /var/lib/apt/lists/* 54 | 55 | RUN python3 -m pip install pip==23.3.1 --disable-pip-version-check && \ 56 | python3 -m pip install setuptools==67.3.3 --disable-pip-version-check && \ 57 | python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check 58 | 59 | RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ 60 | mkdir -p /root/habanalabs/pytorch_temp && \ 61 | tar -xf pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz -C /root/habanalabs/pytorch_temp/. && \ 62 | python3 -m pip install pip=="${HABANA_PIP_VERSION}" && \ 63 | pip install mpi4py==3.1.4 --disable-pip-version-check && \ 64 | #pip install $(grep -ivE "#|lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt | grep .) --no-warn-script-location --disable-pip-version-check && \ 65 | pip install /root/habanalabs/pytorch_temp/*.whl --disable-pip-version-check && \ 66 | pip install $(grep "lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt) --disable-pip-version-check && \ 67 | echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ 68 | pip uninstall -y pillow && \ 69 | pip uninstall -y pillow-simd && \ 70 | pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ 71 | rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* 72 | #RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@v0.4.2-Gaudi-1.16.0 73 | RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@275e3250ba6ed8cc13b2d6e4928db73df420e64b 74 | 75 | RUN mkdir -p /opt/tritonserver/backends/vllm 76 | COPY model.py /opt/tritonserver/backends/vllm/ 77 | 78 | ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 79 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 80 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = triton-installer-$(PT_VERSION)-${BUILD_OS} 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | init: 8 | ifneq ($(BUILD_OS), ubuntu22.04) 9 | $(error triton is only supported on ubuntu22.04) 10 | endif 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/Dockerfile $(BUILD_DIR)/Dockerfile 13 | $(HIDE)cp $(CURDIR)/model.py $(BUILD_DIR)/model.py 14 | 15 | build: init 16 | 
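# Build sketch based on the defaults in ../common.mk (the init target above
# restricts this image to ubuntu22.04):
#   cd dockerfiles/triton_vllm_backend
#   make build BUILD_OS=ubuntu22.04 PT_VERSION=2.6.0 RELEASE_VERSION=1.21.0 RELEASE_BUILD_ID=555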
-------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
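# Usage sketch (flags are the argparse options defined at the bottom of this file,
# shown here with their default values; add -s/--streaming-mode to enable streaming):
#
#   python3 client.py --model vllm_model --url localhost:8001 \
#       --input-prompts prompts.txt --results-file results.txt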
28 | 29 | import argparse 30 | import asyncio 31 | import json 32 | import sys 33 | 34 | import numpy as np 35 | import tritonclient.grpc.aio as grpcclient 36 | from tritonclient.utils import * 37 | 38 | 39 | class LLMClient: 40 | def __init__(self, flags: argparse.Namespace): 41 | self._client = grpcclient.InferenceServerClient( 42 | url=flags.url, verbose=flags.verbose 43 | ) 44 | self._flags = flags 45 | self._loop = asyncio.get_event_loop() 46 | self._results_dict = {} 47 | 48 | async def async_request_iterator( 49 | self, prompts, sampling_parameters, exclude_input_in_output 50 | ): 51 | try: 52 | for iter in range(self._flags.iterations): 53 | for i, prompt in enumerate(prompts): 54 | prompt_id = self._flags.offset + (len(prompts) * iter) + i 55 | self._results_dict[str(prompt_id)] = [] 56 | yield self.create_request( 57 | prompt, 58 | self._flags.streaming_mode, 59 | prompt_id, 60 | sampling_parameters, 61 | exclude_input_in_output, 62 | ) 63 | except Exception as error: 64 | print(f"Caught an error in the request iterator: {error}") 65 | 66 | async def stream_infer(self, prompts, sampling_parameters, exclude_input_in_output): 67 | try: 68 | # Start streaming 69 | response_iterator = self._client.stream_infer( 70 | inputs_iterator=self.async_request_iterator( 71 | prompts, sampling_parameters, exclude_input_in_output 72 | ), 73 | stream_timeout=self._flags.stream_timeout, 74 | ) 75 | async for response in response_iterator: 76 | yield response 77 | except InferenceServerException as error: 78 | print(error) 79 | sys.exit(1) 80 | 81 | async def process_stream( 82 | self, prompts, sampling_parameters, exclude_input_in_output 83 | ): 84 | # Clear results in between process_stream calls 85 | self.results_dict = [] 86 | success = True 87 | # Read response from the stream 88 | async for response in self.stream_infer( 89 | prompts, sampling_parameters, exclude_input_in_output 90 | ): 91 | result, error = response 92 | if error: 93 | print(f"Encountered error while processing: {error}") 94 | success = False 95 | else: 96 | output = result.as_numpy("text_output") 97 | for i in output: 98 | self._results_dict[result.get_response().id].append(i) 99 | return success 100 | 101 | async def run(self): 102 | # Sampling parameters for text generation 103 | # including `temperature`, `top_p`, top_k`, `max_tokens`, `early_stopping`. 
104 | # Full list available at: 105 | # https://github.com/vllmproject/vllm/blob/5255d99dc595f9ae7647842242d6542aa4145a4f/vllm/sampling_params.py#L23 106 | sampling_parameters = { 107 | "temperature": "0.1", 108 | "top_p": "0.95", 109 | "max_tokens": "100", 110 | } 111 | exclude_input_in_output = self._flags.exclude_inputs_in_outputs 112 | if self._flags.lora_name is not None: 113 | sampling_parameters["lora_name"] = self._flags.lora_name 114 | with open(self._flags.input_prompts, "r") as file: 115 | print(f"Loading inputs from `{self._flags.input_prompts}`...") 116 | prompts = file.readlines() 117 | 118 | success = await self.process_stream( 119 | prompts, sampling_parameters, exclude_input_in_output 120 | ) 121 | 122 | with open(self._flags.results_file, "w") as file: 123 | for id in self._results_dict.keys(): 124 | for result in self._results_dict[id]: 125 | file.write(result.decode("utf-8")) 126 | 127 | file.write("\n") 128 | file.write("\n=========\n\n") 129 | print(f"Storing results into `{self._flags.results_file}`...") 130 | 131 | if self._flags.verbose: 132 | with open(self._flags.results_file, "r") as file: 133 | print(f"\nContents of `{self._flags.results_file}` ===>") 134 | print(file.read()) 135 | if success: 136 | print("PASS: vLLM example") 137 | else: 138 | print("FAIL: vLLM example") 139 | 140 | def run_async(self): 141 | self._loop.run_until_complete(self.run()) 142 | 143 | def create_request( 144 | self, 145 | prompt, 146 | stream, 147 | request_id, 148 | sampling_parameters, 149 | exclude_input_in_output, 150 | send_parameters_as_tensor=True, 151 | ): 152 | inputs = [] 153 | prompt_data = np.array([prompt.encode("utf-8")], dtype=np.object_) 154 | try: 155 | inputs.append(grpcclient.InferInput("text_input", [1], "BYTES")) 156 | inputs[-1].set_data_from_numpy(prompt_data) 157 | except Exception as error: 158 | print(f"Encountered an error during request creation: {error}") 159 | 160 | stream_data = np.array([stream], dtype=bool) 161 | inputs.append(grpcclient.InferInput("stream", [1], "BOOL")) 162 | inputs[-1].set_data_from_numpy(stream_data) 163 | 164 | # Request parameters are not yet supported via BLS. Provide an 165 | # optional mechanism to send serialized parameters as an input 166 | # tensor until support is added 167 | 168 | if send_parameters_as_tensor: 169 | sampling_parameters_data = np.array( 170 | [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_ 171 | ) 172 | inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES")) 173 | inputs[-1].set_data_from_numpy(sampling_parameters_data) 174 | 175 | inputs.append(grpcclient.InferInput("exclude_input_in_output", [1], "BOOL")) 176 | inputs[-1].set_data_from_numpy(np.array([exclude_input_in_output], dtype=bool)) 177 | 178 | # Add requested outputs 179 | outputs = [] 180 | outputs.append(grpcclient.InferRequestedOutput("text_output")) 181 | 182 | # Issue the asynchronous sequence inference. 
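        # (The dict below is one element of the request stream: async_request_iterator
        #  yields it and stream_infer forwards it to the server.)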
183 | return { 184 | "model_name": self._flags.model, 185 | "inputs": inputs, 186 | "outputs": outputs, 187 | "request_id": str(request_id), 188 | "parameters": sampling_parameters, 189 | } 190 | 191 | 192 | if __name__ == "__main__": 193 | parser = argparse.ArgumentParser() 194 | parser.add_argument( 195 | "-m", 196 | "--model", 197 | type=str, 198 | required=False, 199 | default="vllm_model", 200 | help="Model name", 201 | ) 202 | parser.add_argument( 203 | "-v", 204 | "--verbose", 205 | action="store_true", 206 | required=False, 207 | default=False, 208 | help="Enable verbose output", 209 | ) 210 | parser.add_argument( 211 | "-u", 212 | "--url", 213 | type=str, 214 | required=False, 215 | default="localhost:8001", 216 | help="Inference server URL and its gRPC port. Default is localhost:8001.", 217 | ) 218 | parser.add_argument( 219 | "-t", 220 | "--stream-timeout", 221 | type=float, 222 | required=False, 223 | default=None, 224 | help="Stream timeout in seconds. Default is None.", 225 | ) 226 | parser.add_argument( 227 | "--offset", 228 | type=int, 229 | required=False, 230 | default=0, 231 | help="Add offset to request IDs used", 232 | ) 233 | parser.add_argument( 234 | "--input-prompts", 235 | type=str, 236 | required=False, 237 | default="prompts.txt", 238 | help="Text file with input prompts", 239 | ) 240 | parser.add_argument( 241 | "--results-file", 242 | type=str, 243 | required=False, 244 | default="results.txt", 245 | help="The file with output results", 246 | ) 247 | parser.add_argument( 248 | "--iterations", 249 | type=int, 250 | required=False, 251 | default=1, 252 | help="Number of iterations through the prompts file", 253 | ) 254 | parser.add_argument( 255 | "-s", 256 | "--streaming-mode", 257 | action="store_true", 258 | required=False, 259 | default=False, 260 | help="Enable streaming mode", 261 | ) 262 | parser.add_argument( 263 | "--exclude-inputs-in-outputs", 264 | action="store_true", 265 | required=False, 266 | default=False, 267 | help="Exclude prompt from outputs", 268 | ) 269 | parser.add_argument( 270 | "-l", 271 | "--lora-name", 272 | type=str, 273 | required=False, 274 | default=None, 275 | help="The querying LoRA name", 276 | ) 277 | FLAGS = parser.parse_args() 278 | 279 | client = LLMClient(FLAGS) 280 | client.run_async() 281 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-7b-hf", 3 | "tokenizer":"meta-llama/Llama-2-7b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/prompts.txt: -------------------------------------------------------------------------------- 1 | Hello, my name is 2 | The most dangerous animal is 3 | The capital of France is 4 | The future of AI is 5 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-70b-hf", 3 | "tokenizer":"meta-llama/Llama-2-70b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 8, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-7b-hf", 3 | "tokenizer":"meta-llama/Llama-2-7b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"Qwen/Qwen2-7B-Instruct", 3 | "tokenizer":"Qwen/Qwen2-7B-Instruct", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 131072, 12 | "chat_template": "true" 13 | } 14 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /legal-disclaimer.md: -------------------------------------------------------------------------------- 1 | ## Legal Notice and Disclaimer 2 | 3 | No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document. 
4 | 5 | Habana Labs disclaims all warranties, including without limitation, the implied warranties of merchantability, fitness for a particular purpose, and non-infringement, as well as any warranty arising from course of performance, course of dealing, or usage in trade. 6 | 7 | All information provided here is subject to change without notice. Habana Labs may make changes to its test conditions and internal reliability goals at any time. Contact your Habana Labs representative to obtain the latest Habana Labs product specifications and roadmaps. Your costs and results may vary. 8 | 9 | The products described may contain design defects or errors known as errata which may cause the product to deviate from published specifications. Current characterized errata are available on request. 10 | 11 | Software and workloads used in performance tests may have been optimized for performance only on Habana Labs hardware. Performance tests, such as SYSmark and MobileMark, are measured using specific computer systems, components, software, operations and functions. Any change to any of those factors may cause the results to vary. You should consult other information and performance tests to assist you in fully evaluating your contemplated purchases, including the performance of that product when combined with other products. 12 | 13 | No product or component can be absolutely secure. 14 | 15 | Habana Labs, Gaudi and SynapseAI are trademarks of Habana Labs in the U.S. and/or other countries. 16 | 17 | *Other names and brands may be claimed as the property of others. 18 | 19 | © 2021 Habana Labs 20 | -------------------------------------------------------------------------------- /utils/README.md: -------------------------------------------------------------------------------- 1 | # Gaudi Utils 2 | 3 | By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). 4 | 5 | ## Table of Contents 6 | 7 | - [Gaudi Utils](#gaudi-utils) 8 | - [Table of Contents](#table-of-contents) 9 | - [Overview](#overview) 10 | - [manage\_network\_ifs](#manage_network_ifs) 11 | - [Operations](#operations) 12 | - [Up](#up) 13 | - [Down](#down) 14 | - [Status](#status) 15 | - [Set IP](#set-ip) 16 | - [Unset IP](#unset-ip) 17 | - [check\_framework\_env](#check_framework_env) 18 | - [Intel Gaudi Health Screen (IGHS)](#intel-gaudi-health-screen-ighs) 19 | 20 | ## Overview 21 | 22 | Welcome to Intel Gaudi's Util Scripts! 23 | 24 | This folder contains some Intel Gaudi utility scripts that users can access as reference. 25 | 26 | ## manage_network_ifs 27 | 28 | Moved to habanalabs-qual Example: (/opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh or /opt/habanalabs/qual/gaudi3/bin/manage_network_ifs.sh). 29 | 30 | This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Intel Gaudi network interfaces. 
31 | 32 | The following is the usage of the script: 33 | 34 | ``` 35 | usage: ./manage_network_ifs.sh [options] 36 | 37 | options: 38 | --up toggle up all Intel Gaudi network interfaces 39 | --down toggle down all Intel Gaudi network interfaces 40 | --status print status of all Intel Gaudi network interfaces 41 | --set-pfc set PFC (enabled=0,1,2,3) 42 | --unset-pfc unset PFC (enabled=none) 43 | --check-pfc dump PFC configuration 44 | --no-progbar do not show progress bar 45 | -v, --verbose print more logs 46 | -h, --help print this help 47 | 48 | Note: Please run this script with one operation at a time 49 | ``` 50 | ## Operations 51 | 52 | Before executing any operation, this script finds all the Intel Gaudi network interfaces available on the system and stores the Intel Gaudi interface information into a list. 53 | The list will be used for the operations. If no Intel Gaudi network interface is found, the script will exit. 54 | 55 | ### Up 56 | 57 | Use the following command to bring all Intel Gaudi network interfaces online: 58 | ``` 59 | sudo manage_network_ifs.sh --up 60 | ``` 61 | ### Down 62 | 63 | Use the following command to bring all Intel Gaudi network interfaces offline: 64 | ``` 65 | sudo manage_network_ifs.sh --down 66 | ``` 67 | ### Status 68 | 69 | Print the current operational state of all Intel Gaudi network interfaces such as how many ports are up/down: 70 | ``` 71 | sudo manage_network_ifs.sh --status 72 | ``` 73 | ### Set PFC 74 | 75 | Use the following command to set PFC for all Intel Gaudi network interfaces: 76 | ``` 77 | sudo manage_network_ifs.sh --set-pfc 78 | ``` 79 | ### Unset PFC 80 | 81 | Use the following command to unset PFC for all Intel Gaudi network interfaces: 82 | ``` 83 | sudo manage_network_ifs.sh --unset-pfc 84 | ``` 85 | 86 | ### Check current PFC configuration 87 | 88 | Use the following command to check current PFC status for all Intel Gaudi network interfaces: 89 | ``` 90 | sudo manage_network_ifs.sh --check-pfc 91 | ``` 92 | 93 | ## check_framework_env 94 | 95 | This script can be used as reference to check the environment for running PyTorch on Intel Gaudi. 96 | 97 | The following is the usage of the script: 98 | 99 | ``` 100 | usage: check_framework_env.py [-h] [--cards CARDS] 101 | 102 | Check health of Intel Gaudi for PyTorch 103 | 104 | optional arguments: 105 | -h, --help show this help message and exit 106 | --cards CARDS Set number of cards to test (default: 1) 107 | ``` 108 | 109 | ## Intel Gaudi Health Screen (IGHS) 110 | 111 | **Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test 112 | includes checking gaudi port status, running small workloads, and running standard collective operations arcoss multiple systems. 
113 | 114 | ``` bash 115 | usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] 116 | [--job-id JOB_ID] [--round ROUND] [--config CONFIG] 117 | [--ighs-check [{node,hccl-demo,none}]] [--node-write-report] 118 | [--node-name NODE_NAME] [--logs-dir LOGS_DIR] 119 | 120 | optional arguments: 121 | -h, --help show this help message and exit 122 | --initialize Downloads Necessary Repos and Creates Report Template 123 | --screen Starts Health Screen for Cluster 124 | --target-nodes TARGET_NODES 125 | List of target nodes 126 | --job-id JOB_ID Needed to identify hccl-demo running log 127 | --round ROUND Needed to identify hccl-demo running round log 128 | --config CONFIG Configuration file for Health Screener 129 | --ighs-check [{node,hccl-demo,none}] 130 | Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce 131 | (HCCL_DEMO between paris of nodes) 132 | --node-write-report Write Individual Node Health Report 133 | --node-name NODE_NAME Name of Node 134 | --logs-dir LOGS_DIR Output directory of health screen results 135 | ``` 136 | 137 | To run a full IGHS test, run the below command: 138 | 139 | ``` bash 140 | # Creates IGHS Report and screens clusters for any infected nodes. 141 | # Will check Level 1 and 2 by default 142 | python screen.py --initialize --screen 143 | ``` 144 | 145 | IGHS can alternatively be run through below script: 146 | 147 | ``` bash 148 | # Creates IGHS Report and screens clusters for any infected nodes. 149 | # Will check Level 1 and 2 by default 150 | ./run_ighs.sh 151 | ``` 152 | -------------------------------------------------------------------------------- /utils/check_framework_env.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (C) 2022 Habana Labs, Ltd. an Intel Company 3 | # All Rights Reserved. 4 | # 5 | # Unauthorized copying of this file or any element(s) within it, via any medium 6 | # is strictly prohibited. 7 | # This file contains Habana Labs, Ltd. proprietary and confidential information 8 | # and is subject to the confidentiality and license agreements under which it 9 | # was provided. 10 | # 11 | ############################################################################### 12 | 13 | import argparse 14 | import os 15 | import concurrent.futures 16 | 17 | def parse_arguments(): 18 | parser = argparse.ArgumentParser(description="Check health of Intel Gaudi for PyTorch") 19 | 20 | parser.add_argument("--cards", 21 | default=1, 22 | type=int, 23 | required=False, 24 | help="Set number of cards to test (default: 1)") 25 | 26 | args = parser.parse_args() 27 | print(f"Configuration: {args}") 28 | 29 | return args 30 | 31 | def pytorch_test(device_id=0): 32 | """ Checks health of Intel Gaudi through running a basic 33 | PyTorch example on Intel Gaudi 34 | 35 | Args: 36 | device_id (int, optional): ID of Intel Gaudi. Defaults to 0. 
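
    Returns:
        int: the device ID that was tested, returned when the sanity check passes.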
37 | """ 38 | 39 | os.environ["HLS_MODULE_ID"] = str(device_id) 40 | os.environ["HABANA_VISIBLE_MODULES"] = str(device_id) 41 | 42 | try: 43 | import torch 44 | import habana_frameworks.torch.core 45 | except Exception as e: 46 | print(f"Card {device_id} Failed to initialize Intel Gaudi PyTorch: {str(e)}") 47 | raise 48 | 49 | try: 50 | x = torch.tensor([2]).to('hpu') 51 | y = x + x 52 | 53 | assert y == 4, 'Sanity check failed: Wrong Add output' 54 | assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card' 55 | except (RuntimeError, AssertionError) as e: 56 | print(f"Card Module ID {device_id} Failure: {e}") 57 | raise 58 | 59 | return device_id 60 | 61 | if __name__ == '__main__': 62 | args = parse_arguments() 63 | passed_cards = set() 64 | 65 | with concurrent.futures.ProcessPoolExecutor() as executor: 66 | futures = [executor.submit(pytorch_test, device_id) for device_id in range(args.cards)] 67 | for future in concurrent.futures.as_completed(futures): 68 | try: 69 | dev_id = future.result() 70 | passed_cards.add(dev_id) 71 | print(f"Card module_id {dev_id} PASSED") 72 | 73 | except Exception as e: 74 | print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") 75 | 76 | failed_cards = set(range(args.cards)) - passed_cards 77 | 78 | print(f"Failed cards Module ID: {failed_cards}") 79 | print(f"Passed cards Module ID: {passed_cards}") -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/.gitignore: -------------------------------------------------------------------------------- 1 | tmp/* 2 | build/* 3 | logs/* 4 | .graph_dump/* 5 | __pycache__* -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/HealthReport.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, csv, time, shutil, fcntl, glob, copy 14 | from collections import defaultdict 15 | from tempfile import NamedTemporaryFile 16 | 17 | from utilities import copy_files 18 | 19 | import logging 20 | 21 | _logger = logging.getLogger("health_screener") 22 | 23 | class HealthReport(): 24 | 25 | def __init__(self, f_dir="tmp", report_name="health_report.csv"): 26 | """ Initialize Health Report Class 27 | 28 | Args: 29 | f_dir (str, optional): File Directory to store Health Report logs and results. Defaults to "tmp". 30 | report_name (str, optional): File name of Health Report csv. Defaults to "health_report.csv". 
31 | """ 32 | self.header = ["node_id", "index", "module_id", "pci_address", "temperature_C", "temperature_state_C", "device_acquire_fail", "down_links", "multi_node_fail", "missing"] 33 | 34 | self.f_dir = f_dir 35 | self.report_name = report_name 36 | self.f_path = f"{self.f_dir}/{self.report_name}" 37 | 38 | self.header_hccl_demo = ["round","group_id", "node_ids", "num_nodes", "multi_node_fail", "missing", "qpc_fail"] 39 | self.f_path_hccl_demo = f"{self.f_dir}/{os.path.splitext(self.report_name)[0]}_hccl_demo.csv" 40 | 41 | 42 | def create(self, create_base=True, create_hccl_demo=False): 43 | """Create CSV Health Report Files. One for Base Health Checks and HCCL Demo Checks 44 | 45 | Args: 46 | create_base (bool, optional): Create Base Health_Report CSV file. Defaults to True. 47 | create_hccl_demo (bool, optional): Create HCCL_DEMO_Health_Report if it doesn't exist. Defaults to False. 48 | """ 49 | 50 | dir_name = os.path.dirname(self.f_path) 51 | if not os.path.exists(dir_name): 52 | os.makedirs(dir_name) 53 | 54 | if create_base: 55 | with open(self.f_path, "w+", newline='') as f: 56 | writer = csv.DictWriter(f, fieldnames=self.header, extrasaction='ignore') 57 | writer.writeheader() 58 | _logger.info(f"Created {self.f_path} with header: {self.header}") 59 | 60 | if create_hccl_demo and not self.exist(level=2): 61 | with open(self.f_path_hccl_demo, "w+", newline='') as f: 62 | writer = csv.DictWriter(f, fieldnames=self.header_hccl_demo, extrasaction='ignore') 63 | writer.writeheader() 64 | _logger.info(f"Created {self.f_path_hccl_demo} with header: {self.header_hccl_demo}") 65 | 66 | def exist(self, level=1): 67 | """Checks to see if Base Health Report exist 68 | 69 | Args: 70 | level (int, optional): Health Screen level report csv to check. Defaults to 1. 71 | 72 | Returns: 73 | bool: Returns True if the Base Health Report (self.f_path) or HCCL_DEMO Health Report (self.f_path_hccl_demo) exist 74 | """ 75 | f_path = self.f_path 76 | 77 | if level == 2: 78 | f_path = self.f_path_hccl_demo 79 | 80 | return os.path.exists(f_path) 81 | 82 | def write_rows(self, data=list(), level=1): 83 | """ Write health check results to Health Report CSV. Can write multiple rows at once 84 | 85 | Args: 86 | data (_type_, optional): Health Report CSV Row data. Defaults to list(). 87 | level (int, optional): Health Screen Level. Defaults to 1. 
88 | """ 89 | 90 | if level == 1: 91 | f_path = self.f_path 92 | header = self.header 93 | 94 | 95 | elif level == 2: 96 | f_path = self.f_path_hccl_demo 97 | header = self.header_hccl_demo 98 | 99 | with open(f_path, "a", newline='') as f: 100 | fcntl.flock(f, fcntl.LOCK_EX) 101 | writer = csv.DictWriter(f, fieldnames=header, extrasaction='ignore') 102 | writer.writerows(data) 103 | time.sleep(0.1) 104 | fcntl.flock(f, fcntl.LOCK_UN) 105 | 106 | def update_health_report(self, detected_nodes, infected_nodes, missing_nodes): 107 | """ Update health_report with hccl_demo results 108 | 109 | Args: 110 | detected_nodes (list[str]): List of detected node_ids 111 | infected_nodes (list[str]): List of infected node_ids 112 | missing_nodes (list[str]): List of missing node_ids 113 | """ 114 | temp_file = NamedTemporaryFile(mode='w', delete=False) 115 | detected_nodes_cp = detected_nodes.copy() 116 | 117 | with open(self.f_path, 'r', newline='') as csv_file, temp_file: 118 | reader = csv.DictReader(csv_file) 119 | writer = csv.DictWriter(temp_file, fieldnames=self.header) 120 | 121 | writer.writeheader() 122 | for row in reader: 123 | if row["node_id"] in infected_nodes or row["node_id"] in missing_nodes: 124 | row["multi_node_fail"] = True 125 | elif row["node_id"] in detected_nodes_cp: 126 | row["multi_node_fail"] = False 127 | row["missing"] = False 128 | 129 | writer.writerow(row) 130 | 131 | missing_nodes.discard(row["node_id"]) 132 | detected_nodes_cp.discard(row["node_id"]) 133 | 134 | # These are unreported Detected Nodes. Add to Report 135 | if len(detected_nodes_cp): 136 | for n in detected_nodes_cp: 137 | writer.writerow({"node_id": n, "multi_node_fail": False, "missing": False}) 138 | 139 | # These are unreported Missing Nodes. Add to Report 140 | if len(missing_nodes): 141 | for n in missing_nodes: 142 | writer.writerow({"node_id": n, "multi_node_fail": True, "missing": True}) 143 | 144 | shutil.move(temp_file.name, self.f_path) 145 | 146 | def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail, qpc_fail, missing_nodes): 147 | """ Update health_report with hccl_demo results, based on infected_nodes. 148 | 149 | Args: 150 | all_node_pairs (list[str]): List of all Node Pairs reported by Level 2 round 151 | multi_node_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test 152 | qpc_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test due to QPC error 153 | missing_nodes (list[str]): List of Node Pairs that couldn't run HCCL_Demo 154 | """ 155 | temp_file = NamedTemporaryFile(mode='w', delete=False) 156 | 157 | with open(self.f_path_hccl_demo, 'r', newline='') as csv_file, temp_file: 158 | reader = csv.DictReader(csv_file) 159 | writer = csv.DictWriter(temp_file, fieldnames=self.header_hccl_demo, extrasaction='ignore') 160 | 161 | writer.writeheader() 162 | for row in reader: 163 | if(row["round"] == round): 164 | row["multi_node_fail"] = (row["node_ids"] in multi_node_fail) 165 | row["qpc_fail"] = (row["node_ids"] in qpc_fail) 166 | row["missing"] = (row["node_ids"] in missing_nodes) 167 | 168 | if row["node_ids"] in all_node_pairs: 169 | del all_node_pairs[row["node_ids"]] 170 | 171 | writer.writerow(row) 172 | 173 | # These are unreported node_pairs. 
Add remaining node pairs 174 | if len(all_node_pairs): 175 | writer.writerows(list(all_node_pairs.values())) 176 | 177 | shutil.move(temp_file.name, self.f_path_hccl_demo) 178 | 179 | def check_screen_complete(self, num_nodes, hccl_demo=False, round=0): 180 | """ Check on status of Health Screen Check. 181 | Screen considered done if all nodes health checks are done 182 | 183 | Args: 184 | num_nodes (int): Number of Nodes screened 185 | hccl_demo (bool, optional): Status of HCCL_DEMO all reduce test. Defaults to False. 186 | round (int, optional): Level 2 Round. This will only check Level 2 round results. This is ignored for Level 1 runs. 187 | 188 | Returns: 189 | bool: Status of Screen. If all nodes are found, screening is done 190 | """ 191 | f_path = self.f_path if (not hccl_demo) else self.f_path_hccl_demo 192 | n_cards_per_node = 8 193 | 194 | with open(f_path, "r", newline='') as f: 195 | reader = csv.DictReader(f) 196 | 197 | if hccl_demo: 198 | n_cards = 0 199 | for row in reader: 200 | if(int(row["round"]) == round): 201 | n_cards += (int(row["num_nodes"]) * n_cards_per_node) 202 | else: 203 | n_cards = len(list(reader)) 204 | 205 | total_cards = n_cards_per_node * num_nodes 206 | has_all_nodes_info = (n_cards == total_cards) 207 | num_found_nodes = n_cards // n_cards_per_node 208 | 209 | return has_all_nodes_info, num_found_nodes 210 | 211 | def extract_node_info(self): 212 | """ Extracts Detected, Infected, and Missing Nodes from Health Report. 213 | 214 | Returns: 215 | (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes) 216 | """ 217 | detected_nodes = set() 218 | missing_nodes = set() 219 | device_acquire_fail_set = set() 220 | down_links_set = set() 221 | temperature_fail_set = set() 222 | temperature_warn_set = set() 223 | 224 | with open(self.f_path, "r", newline='') as f: 225 | reader = csv.DictReader(f) 226 | for row in reader: 227 | detected_nodes.add(row["node_id"]) 228 | 229 | if row["device_acquire_fail"] == "True": 230 | device_acquire_fail_set.add(row["node_id"]) 231 | if row["down_links"] != "[]" and row["down_links"] != "": 232 | down_links_set.add(row["node_id"]) 233 | if row["missing"] == "True": 234 | missing_nodes.add(row["node_id"]) 235 | if row["temperature_state_C"] == "CRITICAL": 236 | temperature_fail_set.add(row["node_id"]) 237 | if row["temperature_state_C"] == "WARN": 238 | temperature_warn_set.add(row["node_id"]) 239 | 240 | if(len(device_acquire_fail_set)): 241 | _logger.info(f"{len(device_acquire_fail_set)} Infected (Device Acquire fail): {sorted(list(device_acquire_fail_set))}") 242 | if(len(down_links_set)): 243 | _logger.info(f"{len(down_links_set)} Infected (Down Links): {sorted(list(down_links_set))}") 244 | if(len(temperature_warn_set)): 245 | _logger.info(f"{len(temperature_warn_set)} Infected (Temperature WARN): {sorted(list(temperature_warn_set))}") 246 | if(len(temperature_fail_set)): 247 | _logger.info(f"{len(temperature_fail_set)} Infected (Temperature CRITICAL): {sorted(list(temperature_fail_set))}") 248 | 249 | infected_nodes = set() 250 | infected_nodes.update(device_acquire_fail_set) 251 | infected_nodes.update(down_links_set) 252 | infected_nodes.update(temperature_fail_set) 253 | infected_nodes.update(temperature_warn_set) 254 | 255 | return detected_nodes, infected_nodes, missing_nodes 256 | 257 | 258 | def extract_hccl_demo_info(self): 259 | """ Extracts Detected, Infected, and Missing Nodes from HCCL DEMO Health Report 260 | 261 | Returns: 262 | (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes) 263 
| """ 264 | detected_nodes = set() 265 | infected_nodes = set() 266 | missing_nodes = set() 267 | fail_checks = defaultdict(list) 268 | missing_checks = defaultdict(list) 269 | 270 | with open(self.f_path_hccl_demo, "r", newline='') as f: 271 | reader = csv.DictReader(f) 272 | for row in reader: 273 | node_ids = row["node_ids"].strip("[']").replace("'","").split(', ') 274 | detected_nodes.update(node_ids) 275 | 276 | for n in node_ids: 277 | fail_status = int(row["multi_node_fail"] == "True") 278 | fail_checks[n].append(fail_status) 279 | 280 | missing_status = int(row["missing"] == "True") 281 | missing_checks[n].append(missing_status) 282 | 283 | for n, v in fail_checks.items(): 284 | if sum(v) == len(v): 285 | infected_nodes.add(n) 286 | 287 | for n, v in missing_checks.items(): 288 | if sum(v) == len(v): 289 | missing_nodes.add(n) 290 | 291 | detected_nodes -= missing_nodes 292 | infected_nodes -= missing_nodes 293 | 294 | _logger.info(f"{len(infected_nodes)} Infected (HCCL): {sorted(list(infected_nodes))}") 295 | 296 | return detected_nodes, infected_nodes, missing_nodes 297 | 298 | def gather_health_report(self, level, remote_path, hosts): 299 | """ Gathers Health Report from all hosts 300 | 301 | Args: 302 | level (str): IGHS Level 303 | remote_path (str): Remote Destintation of IGHS Report 304 | hosts (list, optional): List of IP Addresses to gather IGHS Reports 305 | """ 306 | copy_files(src=f"{remote_path}/intel_gaudi_health_screen/{self.f_dir}/L{level}", 307 | dst=f"{self.f_dir}", 308 | hosts=hosts, 309 | to_remote=False) 310 | 311 | def consolidate_health_report(self, level, report_dir): 312 | """ Consolidates the health_report_*.csv from worker pods into a single master csv file 313 | 314 | Args: 315 | level (str): IGHS Level 316 | report_dir (str): Directory of CSV files to merge 317 | """ 318 | data = list() 319 | path = f"{report_dir}/L{level}/health_report_*.csv" 320 | csv_files = glob.glob(path) 321 | 322 | for f in csv_files: 323 | with open(f, 'r', newline='') as csv_file: 324 | reader = csv.DictReader(csv_file) 325 | for row in reader: 326 | data.append(row) 327 | 328 | self.write_rows(data=data, level=level) 329 | 330 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/IGNodes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, time, csv, json 14 | import logging 15 | import multiprocessing 16 | 17 | from HealthReport import HealthReport 18 | from utilities import run_cmd, create_logger 19 | 20 | _logger = logging.getLogger("health_screener") 21 | 22 | 23 | class IGNodes(): 24 | 25 | def __init__(self, health_report=HealthReport()): 26 | """ Keeps Track of Nodes and their current states 27 | 28 | Args: 29 | health_report (HealthReport, optional): IGHS Health Report. Defaults to creating a new HealthReport(). 
30 | """ 31 | self.all_nodes = list() 32 | self.launcher_nodes = list() 33 | self.worker_nodes = list() 34 | self.healthy_nodes = set() 35 | self.watch_nodes = set() 36 | self.infected_nodes = set() 37 | self.missing_nodes = set() 38 | 39 | self.groups_tracker = list() 40 | self.current_node_groups = list() 41 | 42 | self.health_report = health_report 43 | self.log_dir = health_report.f_dir 44 | 45 | def update_node_status(self, healthy_nodes, infected_nodes, missing_nodes, undetected_nodes=[]): 46 | """Update the node lists status based on current node groups. If a node 47 | paring fails with known healthy node, then the other node is considered 48 | infected. Otherwise it will be moved to the healthy node list 49 | 50 | Args: 51 | healthy_nodes ([str]): List of Healthy nodes that pass IGHS testing 52 | infected_nodes ([str]): List of nodes that failed to pass IGHS testing 53 | missing_nodes ([str]): List of nodes that IGHS did not run testing on 54 | undetected_nodes ([str]): List of nodes that IGHS did not run testing on b/c it wasn't scheduled on 55 | """ 56 | watch_nodes = self.watch_nodes.copy() 57 | 58 | # Remove Nodes that haven't been tested yet from the healthy list 59 | for n in undetected_nodes: 60 | if n in watch_nodes and n in healthy_nodes: 61 | healthy_nodes.remove(n) 62 | 63 | self.healthy_nodes.update(healthy_nodes) 64 | 65 | for group in self.current_node_groups: 66 | n1, n2 = group 67 | self.determine_node_health(infected_nodes, missing_nodes, n1, n2) 68 | self.determine_node_health(infected_nodes, missing_nodes, n2, n1) 69 | 70 | self.watch_nodes = self.watch_nodes.difference(self.healthy_nodes) 71 | 72 | def determine_node_health(self, infected_nodes, missing_nodes, n1, n2): 73 | """Determine whether a node is healthy . 74 | 75 | Args: 76 | infected_nodes ([str]): List of nodes that failed to pass IGHS testing 77 | missing_nodes ([str]): List of nodes that IGHS did not run testing on 78 | n1 (str): Node name to investigate if it passes the IGHS test 79 | n2 (str): Node name that should be considered healthy. 
This assist in verifying status of N1 80 | """ 81 | if n2 in self.healthy_nodes: 82 | remove_from_watch = False 83 | 84 | if n1 in infected_nodes: 85 | self.infected_nodes.add(n1) 86 | remove_from_watch = True 87 | if n1 in missing_nodes: 88 | self.missing_nodes.add(n1) 89 | remove_from_watch = True 90 | 91 | if remove_from_watch and n1 in self.watch_nodes: 92 | self.watch_nodes.remove(n1) 93 | 94 | class IGNode(): 95 | 96 | def __init__(self, name="", health_report=HealthReport(), num_checks_link_state=10, log_level=logging.INFO, write_dir="/tmp/ighs"): 97 | self.name = name 98 | if name == "" and "MY_NODE_NAME" in os.environ: 99 | self.name = os.environ["MY_NODE_NAME"] 100 | 101 | self.cards = dict() 102 | self.num_checks_link_state = num_checks_link_state 103 | self.write_dir = write_dir 104 | if(not os.path.exists(self.write_dir)): 105 | os.makedirs(self.write_dir) 106 | 107 | self.health_report = health_report 108 | if not self.health_report.exist(): 109 | self.health_report.create() 110 | 111 | self.logger, _ = create_logger(logger_name=self.name, logger_file_name=self.name, f_path=f"{write_dir}", level=log_level) 112 | 113 | 114 | def scan_cards(self): 115 | self.logger.info(f"Scanning cards info on Node: {self.name}") 116 | 117 | cmd = "hl-smi -Q index,module_id,bus_id,memory.used,temperature.aip,name -f csv,noheader" 118 | output = run_cmd(cmd) 119 | 120 | reader = csv.reader(output.split('\n'), delimiter=',') 121 | for row in reader: 122 | if len(row) == 0: 123 | continue 124 | elif len(row) < 6: 125 | _logger.error(f"hl-smi output is not correct: Recieved output: {row}") 126 | continue 127 | 128 | i = row[0] 129 | module_id = row[1].strip() 130 | pci_address = row[2] 131 | memory_used = int(row[3].split()[0]) 132 | temperature_C = int(row[4].split()[0]) 133 | system_name = row[5] 134 | 135 | card = IGCard(system_name=system_name, index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger) 136 | self.cards[i] = card 137 | 138 | self.cards = dict(sorted(self.cards.items())) 139 | 140 | def record_dmesg(self): 141 | cmd = f"dmesg -T" 142 | output = run_cmd(cmd) 143 | 144 | self.logger.info("***** START of DMESG *****") 145 | self.logger.info(output) 146 | self.logger.info("***** END of DMESG *****") 147 | 148 | def health_check(self, target_cards=[], write_report=False): 149 | checked_cards = list() 150 | processes = list() 151 | card_queue = multiprocessing.Queue() 152 | 153 | if len(target_cards) == 0: 154 | target_cards = self.cards.keys() 155 | 156 | for i in target_cards: 157 | card = self.cards[str(i)] 158 | p = multiprocessing.Process(target=card.check_health, args=(self.num_checks_link_state,card_queue)) 159 | 160 | p.start() 161 | processes.append((card,p)) 162 | 163 | for card,p in processes: 164 | p.join() 165 | card_queue.put(None) 166 | 167 | for card in iter(card_queue.get, None): 168 | card.node_id = self.name 169 | checked_cards.append(card) 170 | self.logger.info(card) 171 | 172 | self.record_dmesg() 173 | checked_cards_dict = self.write_json(checked_cards) 174 | if(write_report): 175 | self.health_report.write_rows(data=checked_cards_dict) 176 | 177 | def write_json(self, cards): 178 | node_status = dict() 179 | node_status["name"] = self.name 180 | node_status["is_infected"] = False 181 | node_status["cards"] = list() 182 | 183 | for c in cards: 184 | c_status = c.__dict__ 185 | del c_status["logger"] 186 | node_status["cards"].append(c.__dict__) 187 | 188 | if c.is_infected: 189 | 
node_status["is_infected"] = True 190 | 191 | self.logger.info("***** START of Node Report *****") 192 | self.logger.info(json.dumps(node_status)) 193 | self.logger.info("***** END of Node Report *****") 194 | 195 | return node_status["cards"] 196 | 197 | class IGCard(): 198 | 199 | def __init__(self, system_name="", index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None): 200 | self.system_name = system_name 201 | self.node_id = "" 202 | self.logger = logger 203 | self.index = index 204 | self.module_id = module_id 205 | self.pci_address = pci_address 206 | self.memory_used = memory_used 207 | self.temperature_C = temperature 208 | self.temperature_state_C = "" 209 | 210 | self.framework = framework 211 | self.down_links = list() 212 | self.device_acquire_fail = False 213 | self.multi_node_fail = False 214 | self.is_infected = False 215 | 216 | self.internal_ports = list() 217 | self.external_ports = list() 218 | 219 | def check_health(self,num_checks_link_state=10, checked_cards=[]): 220 | self.check_port_type() 221 | self.check_link_state(attempts=num_checks_link_state, sleep_sec=0.2) 222 | self.check_device_acquire_fail() 223 | self.check_temperature_state() 224 | 225 | checked_cards.put(self) 226 | 227 | def check_link_state(self, attempts=10, sleep_sec=0.5): 228 | self.logger.debug(f"Checking {self.pci_address} Link State. Will check {attempts} times") 229 | all_ports = self.internal_ports + self.external_ports 230 | all_ports_txt = ",".join(all_ports) 231 | 232 | cmd = f"hl-smi -n link -i {self.pci_address} -P {all_ports_txt}" 233 | down_links = set() 234 | 235 | for a in range(attempts): 236 | output = run_cmd(cmd) 237 | links_state = output.strip().split("\n") 238 | 239 | for i, status in enumerate(links_state): 240 | if ("DOWN" in status): 241 | down_links.add(i) 242 | self.logger.debug(f"Attempt: {a} Port: {i} DOWN") 243 | self.is_infected = True 244 | 245 | time.sleep(sleep_sec) 246 | 247 | self.down_links = list(down_links) 248 | 249 | return self.down_links 250 | 251 | 252 | def check_port_type(self): 253 | self.logger.debug(f"Checking {self.pci_address} Port Types (Internal|External)") 254 | 255 | cmd = f"hl-smi -n ports -i {self.pci_address}" 256 | output = run_cmd(cmd) 257 | output_list = output.strip().split("\n") 258 | 259 | for output in output_list: 260 | port_txt, port_type = output.split(":") 261 | port = port_txt.split(" ")[1] 262 | 263 | if "external" in port_type: 264 | self.external_ports.append(port) 265 | else: 266 | self.internal_ports.append(port) 267 | 268 | def check_device_acquire_fail(self): 269 | self.logger.debug(f"Checking {self.pci_address} for Device Acquire Issues") 270 | self.device_acquire_fail = False 271 | 272 | os.environ["ID"] = str(self.module_id) 273 | os.environ["HABANA_VISIBLE_MODULES"] = str(self.module_id) 274 | 275 | try: 276 | import torch 277 | import habana_frameworks.torch.core 278 | except Exception as e: 279 | self.logger.error(f"Card {self.module_id} {self.pci_address} Failed to initialize Intel Gaudi PyTorch: {str(e)}") 280 | self.device_acquire_fail = True 281 | self.is_infected = True 282 | 283 | try: 284 | x = torch.tensor([2]).to('hpu') 285 | y = x + x 286 | 287 | assert y == 4, 'Sanity check failed: Wrong Add output' 288 | assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Habana Device' 289 | except (RuntimeError, AssertionError, Exception) as e: 290 | self.logger.error(f"{self.pci_address} Device Acquire Failure: {e}") 291 | 
            self.device_acquire_fail = True
292 |             self.is_infected = True
293 | 
294 |         return self.device_acquire_fail
295 | 
296 |     def check_temperature_state(self):
297 |         if "HL-325" in self.system_name:
298 |             # Gaudi-3 System
299 |             max_good_temperature = 200
300 |             base_temperature = 45
301 |             max_delta = 80
302 |         else:
303 |             # Gaudi-2 System
304 |             max_good_temperature = 83
305 |             base_temperature = 25
306 |             max_delta = 25
307 | 
308 | 
309 |         if self.temperature_C >= max_good_temperature:
310 |             self.temperature_state_C = "CRITICAL"
311 |             self.is_infected = True
312 |         elif abs(self.temperature_C - base_temperature) >= max_delta:
313 |             self.temperature_state_C = "WARN"
314 |             self.is_infected = True
315 |         else:
316 |             self.temperature_state_C = "NORMAL"
317 | 
318 |     def __str__(self):
319 |         report_str = f""" Index: {self.index}
320 |         Module Id: {self.module_id}
321 |         PCI Address: {self.pci_address}
322 |         Temperature: {self.temperature_C} C
323 |         Temperature State: {self.temperature_state_C}
324 |         Down Links: {self.down_links}
325 |         Device Acquire Fail: {self.device_acquire_fail}"""
326 | 
327 |         return report_str
328 | 
329 | 
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/README.md:
--------------------------------------------------------------------------------
1 | # Intel Gaudi Health Screen 2.2.2
2 | 
3 | A large scale Intel Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the
4 | cluster network health. Troubleshooting issues on a large cluster can be a tedious task. To simplify the debugging process, the
5 | **Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The tests
6 | include checking Gaudi port status, running small workloads, and running standard collective operations across multiple systems.
7 | 
8 | IGHS is capable of running on a Kubernetes cluster or on a bare-metal cluster. It is an active scan, which will block other users from training
9 | on Gaudi systems until the scans are complete. At the end of the scans, IGHS produces a CSV report detailing the state of each Gaudi card.
10 | 
11 | It is recommended to run IGHS in the below scenarios:
12 | 
13 | * After a system upgrade/update
14 | * Before running a long-term training
15 | * Pinpointing problematic systems in a cluster if a problem can't be isolated to a single system
16 | 
17 | IGHS runs a multi-tiered configurable scan:
18 | 
19 | * Level 1 - Individual System Diagnostics
20 | * Level 2 - Multi-System Communication Diagnostics
21 | 
22 | ## Level 1 - Individual System Diagnostics
23 | 
24 | Level 1 focuses on individual Gaudi card health diagnostics.
25 | 
26 | | Test                      | Description                                               |
27 | | ------------------------- | --------------------------------------------------------- |
28 | | Gaudi Ports Status        | Checks if ports are DOWN                                   |
29 | | Device Acquire Failures   | Checks if devices are busy                                 |
30 | | Device Temperature        | Checks if device temperatures are in an acceptable range  |
31 | 
32 | **2-System Cluster Example**
33 | 
34 | Here is an example of running IGHS on a 2-system cluster. It identifies the Gaudi Cards that have down links, device acquire issues, and
35 | flags multi-node communication failures.
36 | 
37 | | node_id  | index | module_id | pci_address  | temperature_C | temperature_state_C | device_acquire_fail | down_links | multi_node_fail | missing |
38 | | -------- | ----- | --------- | ------------ | ------------- | ------------------- | ------------------- | ---------- | --------------- | ------- |
39 | | sys-9-05 | 0     | 3         | 0000:19:00.0 | 22            |                     | False               | [9]        | True            | False   |
40 | | sys-9-05 | 1     | 7         | 0000:b3:00.0 | 60            | WARN                | False               | [7]        | True            | False   |
41 | | sys-9-05 | 2     | 2         | 0000:1a:00.0 | 84            | CRITICAL            | False               | [5, 7]     | True            | False   |
42 | | sys-9-05 | 3     | 6         | 0000:b4:00.0 | 23            |                     | False               | [4]        | True            | False   |
43 | | sys-9-05 | 4     | 1         | 0000:33:00.0 | 25            |                     | False               | [4, 5]     | True            | False   |
44 | | sys-9-05 | 5     | 5         | 0000:cc:00.0 | 24            |                     | False               | [4, 5]     | True            | False   |
45 | | sys-9-05 | 6     | 0         | 0000:34:00.0 | 27            |                     | False               | [4, 5]     | True            | False   |
46 | | sys-4-04 | 7     | 4         | 0000:cd:00.0 | 28            |                     | False               | []         | False           | False   |
47 | | sys-4-04 | 0     | 3         | 0000:19:00.0 | 28            |                     | False               | []         | False           | False   |
48 | | sys-4-04 | 1     | 7         | 0000:b3:00.0 | 28            |                     | False               | []         | False           | False   |
49 | | sys-4-04 | 2     | 2         | 0000:1a:00.0 | 28            |                     | False               | []         | False           | False   |
50 | | sys-4-04 | 3     | 0         | 0000:34:00.0 | 24            |                     | False               | []         | False           | False   |
51 | | sys-4-04 | 4     | 6         | 0000:b4:00.0 | 24            |                     | False               | []         | False           | False   |
52 | | sys-4-04 | 5     | 1         | 0000:33:00.0 | 21            |                     | False               | []         | False           | False   |
53 | | sys-4-04 | 6     | 5         | 0000:cc:00.0 | 21            |                     | False               | []         | False           | False   |
54 | | sys-4-04 | 7     | 4         | 0000:cd:00.0 | 26            |                     | False               | []         | False           | False   |
55 | 
56 | ``` log
57 | [2023-02-07 09:02:39] INFO Infected (Temperature WARN) 1 Node: ['sys-9-05']
58 | [2023-02-07 09:02:39] INFO Infected (Temperature CRITICAL) 1 Node: ['sys-9-05']
59 | [2023-02-07 09:02:39] INFO Infected 1 Node: ['sys-9-05']
60 | [2023-02-07 09:02:39] INFO Missing 0 Node: []
61 | [2023-02-07 09:02:39] INFO Healthy 1 Node: ["sys-4-04"]
62 | 
63 | [2023-02-07 09:02:39] INFO Detected 2 Node: ["sys-4-04","sys-9-05"]
64 | 
65 | ```
66 | 
67 | 
68 | ## Level 2 - Multi-System Communication Diagnostics
69 | 
70 | Level 2 performs a collective communication all-reduce test between multiple systems through the [HCCL_DEMO](https://github.com/HabanaAI/hccl_demo) repo.
71 | It runs a configurable number of rounds with unique pairs of systems, ensuring that a system is able to communicate across different sets of systems. If no
72 | system pairs have failed, the testing stops. If a system has communication issues, it will be flagged in the
73 | first round.
74 | 
75 | **Multi-Node Cluster Example**
76 | 
77 | Here is an example of running IGHS for 2 rounds; the results get recorded to `hccl_demo_health_report.csv`. It identifies node pairs that failed the all_reduce test. If "True" is flagged
78 | in the multi_node_fail column, then one of the nodes has a communication issue. The list of infected nodes is printed to
79 | the log as well as to the `health_report.csv` multi_node_fail column.
80 | 81 | | round | group_id | node_ids | num_nodes | multi_node_fail | missing | qpc_fail | 82 | | ----- | -------- | ------------------------ | --------- | --------------- | ------- | -------- | 83 | | 0 | 11 | ['sys-7-01', 'sys-9-05'] | 2 | True | False | True | 84 | | 0 | 4 | ['sys-2-03', 'sys-4-04'] | 2 | True | True | False | 85 | | 0 | 13 | ['sys-6-06', 'sys-9-06'] | 2 | False | False | False | 86 | | 0 | 1 | ['sys-3-01', 'sys-9-01'] | 2 | False | False | False | 87 | | 0 | 2 | ['sys-6-03', 'sys-8-01'] | 2 | False | False | False | 88 | | 0 | 0 | ['sys-3-06', 'sys-6-02'] | 2 | False | False | False | 89 | | 0 | 10 | ['sys-2-01', 'sys-4-01'] | 2 | False | False | False | 90 | | 0 | 6 | ['sys-6-05', 'sys-9-03'] | 2 | False | False | False | 91 | | 0 | 14 | ['sys-4-05', 'sys-8-03'] | 2 | False | False | False | 92 | | 0 | 12 | ['sys-6-04', 'sys-8-05'] | 2 | False | False | False | 93 | | 0 | 8 | ['sys-7-06', 'sys-9-02'] | 2 | False | False | False | 94 | | 0 | 5 | ['sys-3-04', 'sys-7-02'] | 2 | False | False | False | 95 | | 0 | 3 | ['sys-4-03', 'sys-6-01'] | 2 | False | False | False | 96 | | 0 | 7 | ['sys-2-06', 'sys-3-03'] | 2 | False | False | False | 97 | | 0 | 9 | ['sys-2-04', 'sys-9-04'] | 2 | False | False | False | 98 | | 1 | 1 | ['sys-3-04', 'sys-4-05'] | 2 | False | False | False | 99 | | 1 | 20 | ['sys-2-03', 'sys-7-02'] | 2 | True | True | False | 100 | | 1 | 19 | ['sys-3-01', 'sys-9-03'] | 2 | False | False | False | 101 | | 1 | 0 | ['sys-3-03', 'sys-9-04'] | 2 | False | False | False | 102 | | 1 | 12 | ['sys-4-04', 'sys-6-02'] | 2 | False | False | False | 103 | | 1 | 9 | ['sys-4-03', 'sys-6-05'] | 2 | False | False | False | 104 | | 1 | 14 | ['sys-3-06', 'sys-6-04'] | 2 | False | False | False | 105 | | 1 | 15 | ['sys-4-01', 'sys-8-03'] | 2 | False | False | False | 106 | | 1 | 3 | ['sys-8-01', 'sys-9-05'] | 2 | True | False | False | 107 | | 1 | 8 | ['sys-6-03', 'sys-9-02'] | 2 | False | False | False | 108 | | 1 | 7 | ['sys-2-06', 'sys-6-01'] | 2 | False | False | False | 109 | | 1 | 10 | ['sys-6-06', 'sys-8-06'] | 2 | False | False | False | 110 | | 1 | 11 | ['sys-3-02', 'sys-7-04'] | 2 | False | False | False | 111 | | 1 | 17 | ['sys-8-04', 'sys-8-05'] | 2 | False | False | False | 112 | | 1 | 18 | ['sys-4-02', 'sys-9-01'] | 2 | False | False | False | 113 | | 1 | 16 | ['sys-2-02', 'sys-9-06'] | 2 | False | False | False | 114 | 115 | Logs show that we have 1 Infected Nodes and 1 Missing Node. Missing node represents a node that hasn't been tested yet and there are standard checks to see why it hasn't 116 | been tested, such as having missing cards, it is occupied by another session, or it is a MISC use case. 
117 | 118 | ``` log 119 | [2023-02-07 09:02:39] INFO Infected 1 Node: ['sys-9-05'] 120 | [2023-02-07 09:02:39] INFO Missing 1 Node: ['sys-2-03'] 121 | [2023-02-07 09:02:39] INFO Healthy 34 Node: ["sys-2-01","sys-2-02","sys-2-03","sys-2-04","sys-2-06","sys-3-01","sys-3-02","sys-3-03","sys-3-04","sys-3-06","sys-4-01","sys-4-02","sys-4-03","sys-4-04","sys-4-05","sys-6-01","sys-6-02","sys-6-03","sys-6-04","sys-6-05","sys-6-06","sys-7-01","sys-7-02","sys-7-04","sys-7-06","sys-8-01","sys-8-03","sys-8-04","sys-8-05","sys-8-06","sys-9-01","sys-9-02","sys-9-03","sys-9-04","sys-9-06"] 122 | 123 | [2023-02-07 09:02:39] INFO Detected 36 Node: ["sys-2-01","sys-2-02","sys-2-03","sys-2-04","sys-2-06","sys-3-01","sys-3-02","sys-3-03","sys-3-04","sys-3-06","sys-4-01","sys-4-02","sys-4-03","sys-4-04","sys-4-05","sys-6-01","sys-6-02","sys-6-03","sys-6-04","sys-6-05","sys-6-06","sys-7-01","sys-7-02","sys-7-04","sys-7-06","sys-8-01","sys-8-03","sys-8-04","sys-8-05","sys-8-06","sys-9-01","sys-9-02","sys-9-03","sys-9-04","sys-9-05","sys-9-06"] 124 | [2023-02-07 09:02:39] INFO 1 Nodes w/ missing cards: ['sys-2-03'] 125 | ``` 126 | 127 | ## Setup 128 | 129 | IGHS is compatible with python3 default packages and does not require additional packages 130 | to be installed. 131 | 132 | If your setup Environment requires custom configruation, update the yaml files located in the templates folder. 133 | 134 | If running on bare metal system, then install `pdsh` to your system. 135 | 136 | Update [config.yaml](config.yaml) to match your system Environment 137 | 138 | ``` yaml 139 | # Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal). 140 | system-info: 141 | type: "k8s" 142 | # Namespace is only required for k8s settings 143 | namespace: "intelgaudi" 144 | # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile 145 | # hostfile: "./hostfile" 146 | 147 | # Bare Metal Configurations 148 | ssh-path: "./ssh" 149 | tcp-interface: "10.3.124.0/24" 150 | 151 | # Image to run Intel Gaudi Health Screen 152 | image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" 153 | 154 | # Node Label used to identify a Intel Gaudi Node 155 | gaudi-node-label: "habana.ai/gaudi=NoSchedule" 156 | 157 | # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) 158 | log-level: "DEBUG" 159 | 160 | # Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature) 161 | level-1: 162 | run: true 163 | timeout_s: 150 164 | # Number of times to check Port Status 165 | num-checks-link-state: 10 166 | 167 | # Level 2 - Checks All Reduce between node pairs in the cluster. 
168 | level-2: 169 | run: true 170 | timeout_s: 130 171 | # Number of times to check Network connections between nodes 172 | num-rounds: 5 173 | ``` 174 | 175 | To learn the features of IGHS, run the below command: 176 | 177 | ``` bash 178 | python screen.py --help 179 | 180 | usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] 181 | [--job-id JOB_ID] [--round ROUND] [--config CONFIG] 182 | [--ighs-check [{node,hccl-demo,none}]] [--node-write-report] 183 | [--node-name NODE_NAME] [--logs-dir LOGS_DIR] 184 | 185 | optional arguments: 186 | -h, --help show this help message and exit 187 | --initialize Downloads Necessary Repos and Creates Report Template 188 | --screen Starts Health Screen for Cluster 189 | --target-nodes TARGET_NODES 190 | List of target nodes 191 | --job-id JOB_ID Needed to identify hccl-demo running log 192 | --round ROUND Needed to identify hccl-demo running round log 193 | --config CONFIG Configuration file for Health Screener 194 | --ighs-check [{node,hccl-demo,none}] 195 | Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce 196 | (HCCL_DEMO between paris of nodes) 197 | --node-write-report Write Individual Node Health Report 198 | --node-name NODE_NAME Name of Node 199 | --logs-dir LOGS_DIR Output directory of health screen results 200 | ``` 201 | 202 | To Run IGHS, run the below command: 203 | 204 | ``` bash 205 | # Creates IGHS Report and screens clusters for any infected nodes. 206 | # Will check Level 1 and 2 by default 207 | python screen.py --initialize --screen 208 | ``` 209 | 210 | IGHS can alternatively be run through below script: 211 | 212 | ``` bash 213 | # Creates IGHS Report and screens clusters for any infected nodes. 214 | # Will check Level 1 and 2 by default 215 | ./run_ighs.sh 216 | ``` 217 | 218 | ### Run on BareMetal 219 | 220 | To run on bare-metal systems update the [config.yaml](config.yaml) to use bare-metal configuration. 221 | 222 | ``` yaml 223 | # Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal). 224 | system-info: 225 | type: "bare-metal" 226 | # Namespace is only required for k8s settings 227 | namespace: "intelgaudi" 228 | # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile 229 | hostfile: "./hostfile" 230 | 231 | # Bare Metal Configurations 232 | ssh-path: "./ssh" 233 | tcp-interface: "10.3.124.0/24" 234 | 235 | # Image to run Intel Gaudi Health Screen 236 | image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" 237 | 238 | # Node Label used to identify a Intel Gaudi Node 239 | gaudi-node-label: "habana.ai/gaudi=NoSchedule" 240 | 241 | # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) 242 | log-level: "DEBUG" 243 | 244 | # Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature) 245 | level-1: 246 | run: true 247 | timeout_s: 150 248 | # Number of times to check Port Status 249 | num-checks-link-state: 10 250 | 251 | # Level 2 - Checks All Reduce between node pairs in the cluster. 
252 | level-2: 253 | run: true 254 | timeout_s: 130 255 | # Number of times to check Network connections between nodes 256 | num-rounds: 5 257 | ``` 258 | 259 | Before running the screening test, you need to generate the ssh key used for passwordless ssh: 260 | 261 | ``` bash 262 | # Keys to setup initial bare-metal passwordless ssh connection between systems 263 | mkdir -p ssh; 264 | ssh-keygen -t rsa -f ssh/ighs_rsa; 265 | chmod 600 ssh/ighs_rsa; 266 | chmod 644 ssh/ighs_rsa.pub; 267 | 268 | # Keys to setup containers passwordless ssh connection 269 | mkdir -p template/bare-metal/ssh; 270 | ssh-keygen -t rsa -f template/bare-metal/ssh/id_rsa; 271 | chmod 600 template/bare-metal/ssh/id_rsa; 272 | chmod 644 template/bare-metal/ssh/id_rsa.pub; 273 | 274 | cat template/bare-metal/ssh/id_rsa.pub > template/bare-metal/ssh/authorized_keys; 275 | ``` 276 | 277 | ## Recovery Steps 278 | 279 | | Issue | Description | 280 | | ------------------------- | --------------------------------------------------------------------------------------- | 281 | | Down Internal Links | Need to investigate Gaudi Card Health | 282 | | Down External Links | Check Cable, switches, and Gaudi Card Health | 283 | | QPC Issues | Network Configuration issue (stale gaudinet.json, stale NIC configurations, etc... ) | 284 | | Missing Cards | Need to investigate Gaudi Card Health | 285 | | k8s Issues | Node Resources are not set/configured properly | 286 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/config.yaml: -------------------------------------------------------------------------------- 1 | # Sets IGHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). k8s does not require any system info 2 | system-info: 3 | type: "k8s" 4 | # Namespace is only required for k8s settings 5 | namespace: "intelgaudi" 6 | 7 | # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile 8 | # hostfile: "./hostfile" 9 | 10 | # Bare Metal Configurations 11 | ssh-path: "./ssh" 12 | tcp-interface: "10.3.124.0/24" 13 | 14 | # Image to run Intel Gaudi Health Screen 15 | image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" 16 | 17 | # Node Label used to identify a Intel Gaudi Node 18 | gaudi-node-label: "habana.ai/gaudi=NoSchedule" 19 | 20 | # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) 21 | log-level: "DEBUG" 22 | 23 | # Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure, Device Temperature) 24 | level-1: 25 | run: true 26 | timeout_s: 150 27 | # Number of times to check Port Status 28 | num-checks-link-state: 12 29 | 30 | # Level 2 - Checks All Reduce between node pairs in the cluster. 31 | level-2: 32 | run: true 33 | timeout_s: 130 34 | # Number of times to check Network connections between nodes 35 | num-rounds: 5 36 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/hccl_demo_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import random, math, os, yaml, glob, json
14 | 
15 | import logging
16 | _logger = logging.getLogger("health_screener")
17 | 
18 | def find_groups(healthy_nodes, watch_nodes, groups_tracker):
19 |     """ Find a list of node groups to run the hccl_demo all_reduce test
20 | 
21 |     Args:
22 |         healthy_nodes ([str]): Nodes that previously passed a pair testing of hccl_demo
23 |         watch_nodes ([str]): Nodes that haven't had a passing round of hccl_demo
24 |         groups_tracker ([str]): History of used groups. A group has to be unique
25 | 
26 |     Returns:
27 |         ([[str]], [str]): Unique list of node groups, and the updated history of used groups
28 |     """
29 |     random.shuffle(healthy_nodes)
30 |     random.shuffle(watch_nodes)
31 | 
32 |     found_unique = True
33 |     num_nodes = len(healthy_nodes) + len(watch_nodes)
34 |     node_groups = list()
35 |     max_num_groups = num_nodes // 2
36 |     max_combinations = (math.factorial(num_nodes)) / (math.factorial(num_nodes-2) * 2)
37 |     max_attempts = 10
38 |     groups_tracker = set(groups_tracker)
39 | 
40 |     if num_nodes == 1:
41 |         _logger.warning(f"Need more than 1 Node to test pair all_reduce")
42 |         return node_groups, list(groups_tracker)
43 | 
44 |     while len(node_groups) < max_num_groups and found_unique:
45 |         i = 0
46 |         h_i, w_i = 0,0
47 | 
48 |         if len(groups_tracker) >= max_combinations:
49 |             _logger.info(f"Reached maximum combinations {max_combinations} for {num_nodes} Nodes")
50 |             break
51 | 
52 |         node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i)
53 |         i += 1
54 |         if len(node_group) < 2 or node_group[0] == node_group[1]:
55 |             _logger.info(f"Found invalid node_group {node_group}. Exiting group id search")
56 |             found_unique = False
57 |             break
58 | 
59 |         while group_id in groups_tracker:
60 |             if i >= max_attempts:
61 |                 _logger.warning(f"Max attempt {max_attempts} reached for finding unique pair combination.")
62 |                 found_unique = False
63 |                 break
64 | 
65 |             node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i)
66 |             i += 1
67 |             if len(node_group) < 2 or node_group[0] == node_group[1]:
68 |                 _logger.info(f"Inner loop found invalid node_group {node_group}. Exiting group id search")
69 |                 found_unique = False
70 |                 break
71 | 
72 |         if found_unique:
73 |             groups_tracker.add(group_id)
74 |             node_groups.append(node_group)
75 | 
76 |             for n in node_group:
77 |                 if n in healthy_nodes:
78 |                     healthy_nodes.remove(n)
79 |                 if n in watch_nodes:
80 |                     watch_nodes.remove(n)
81 | 
82 |         if len(watch_nodes) == 0:
83 |             break
84 | 
85 |     return node_groups, list(groups_tracker)
86 | 
87 | def find_group_id(healthy_nodes, watch_nodes, h_i=0, w_i=0):
88 |     """ Finds a group of nodes and combines them to form a group id
89 | 
90 |     Args:
91 |         healthy_nodes ([str]): Nodes that previously passed a pair testing of hccl_demo
92 |         watch_nodes ([str]): Nodes that haven't had a passing round of hccl_demo
93 |         h_i (int): Index of next potential node id for healthy_nodes
94 |         w_i (int): Index of next potential node id for watch_nodes
95 | 
96 |     Returns:
97 |         ([str], str, (int, int)): Potential node group, its group id, and the updated (h_i, w_i) indexes
98 |     """
99 |     group_id = ""
100 |     node_group = []
101 |     max_attempt = 10
102 | 
103 |     # Goal of testing is to test watch_nodes and pair them with a healthy_node if available
104 |     if len(watch_nodes) == 0 or (len(watch_nodes) == 1 and len(healthy_nodes)==0):
105 |         return node_group, group_id, (h_i, w_i)
106 | 
107 |     for i in range(max_attempt):
108 |         if len(watch_nodes) and w_i < len(watch_nodes):
109 |             node_group.append(watch_nodes[w_i])
110 |             w_i += 1
111 |         if len(healthy_nodes) and h_i < len(healthy_nodes):
112 |             node_group.append(healthy_nodes[h_i])
113 |             h_i += 1
114 | 
115 |         if h_i >= len(healthy_nodes):
116 |             random.shuffle(healthy_nodes)
117 |             h_i = 0
118 |         if w_i >= len(watch_nodes):
119 |             random.shuffle(watch_nodes)
120 |             w_i = 0
121 | 
122 |         if len(node_group) >= 2:
123 |             break
124 | 
125 |     if len(node_group) > 1:
126 |         node_group.sort()
127 |         group_id = "-".join(node_group)
128 | 
129 |     return node_group, group_id, (h_i, w_i)
130 | 
131 | def gather_hccl_logs(job_path, round, log_dir, health_report):
132 |     """ Retrieve hccl_demo log files based on the job yamls executed
133 | 
134 |     Args:
135 |         job_path (str): Base directory of job yamls executed
136 |         round (int): Round to retrieve HCCL_Demo logs
137 |         log_dir (str): Base directory of HCCL_Demo logs
138 |         health_report (HealthReport): Tracks and reports health of hccl_demo
139 |     """
140 |     path = f"{job_path}/**/r{round}/*.yaml"
141 |     job_files = glob.glob(path, recursive=True)
142 |     hccl_results = dict()
143 | 
144 |     for f_name in job_files:
145 |         with open(f_name, 'r', newline='') as f:
146 |             job_data = yaml.safe_load(f)
147 | 
148 |         launcher_template = job_data["spec"]["mpiReplicaSpecs"]["Launcher"]["template"]
149 | 
150 |         job_id = launcher_template["metadata"]["labels"]["name"]
151 |         target_nodes = launcher_template["spec"]["containers"][0]["env"][4]["value"]
152 |         target_nodes = target_nodes.split(',')
153 | 
154 |         hccl_results[f"{target_nodes}"] = hccl_demo_check(job_id=f"{log_dir}/L2/r{round}/{job_id}",
155 |                                                           target_nodes=target_nodes, health_report=health_report, write=False)
156 | 
157 |     multi_node_fail = set()
158 |     qpc_fail = set()
159 |     missing_nodes = set()
160 | 
161 |     for results in hccl_results.values():
162 |         if results["multi_node_fail"]:
163 |             multi_node_fail.add(f"{results['node_ids']}")
164 | 
165 |         if results["qpc_fail"]:
166 |             qpc_fail.add(f"{results['node_ids']}")
167 | 
168 |         if results["missing"]:
169 |             missing_nodes.add(f"{results['node_ids']}")
170 | 
171 |     health_report.update_hccl_demo_health_report(round=round, all_node_pairs=hccl_results, multi_node_fail=multi_node_fail, qpc_fail=qpc_fail, missing_nodes=missing_nodes)
172 | 
173 | def hccl_demo_check(job_id, health_report, target_nodes=[], hccl_log=[], write=True):
174 |     """ Check on HCCL Demo status. Reads the output log; if it
175 |     contains "Exiting HCCL demo with code: 1", the run is treated as a
176 |     failure
177 | 
178 |     Args:
179 |         job_id (str): Metadata name of the Job
180 |         health_report (HealthReport): Tracks and reports health of hccl_demo
181 |         target_nodes ([str], optional): Nodes that are used in hccl_demo testing
182 |         hccl_log ([str], optional): Log of HCCL_DEMO run
183 |         write (bool, optional): Writes to Report. Used to collect hccl results and update Base Health Report. Defaults to True
184 | 
185 |     Returns:
186 |         dict: HCCL Demo Health Report result data.
187 |     """
188 |     f_name_log = f"{job_id}.log"
189 |     round = os.path.basename(job_id).split("-")[2][1:]
190 |     group_id = os.path.basename(job_id).split("-")[3]
191 |     hccl_demo_fail = True
192 |     missing = False
193 |     qpc_fail = False
194 | 
195 |     if len(hccl_log) == 0:
196 |         if not os.path.exists(f_name_log):
197 |             _logger.error(f"{f_name_log} can't be found or has no data")
198 |             hccl_demo_fail = True
199 |             missing = True
200 |         else:
201 |             with open(f_name_log, "r", newline='') as f:
202 |                 lines = f.readlines()
203 |                 hccl_demo_fail, qpc_fail, missing, _ = analyze_hccl_log(lines)
204 |     else:
205 |         hccl_demo_fail, qpc_fail, missing, target_nodes = analyze_hccl_log(hccl_log)
206 | 
207 |     target_nodes.sort()
208 |     data = {
209 |         "round": round,
210 |         "group_id": group_id,
211 |         "node_ids": target_nodes,
212 |         "num_nodes": len(target_nodes),
213 |         "multi_node_fail": hccl_demo_fail,
214 |         "missing": missing,
215 |         "qpc_fail": qpc_fail
216 |     }
217 | 
218 |     if write:
219 |         _logger.info("***** START of Node Report *****")
220 |         _logger.info(json.dumps(data))
221 |         _logger.info("***** END of Node Report *****")
222 |         health_report.write_rows(data=[data], level=2)
223 | 
224 |     return data
225 | 
226 | def analyze_hccl_log(data):
227 |     err_phrase = "Exiting HCCL demo with code: 1"
228 |     err_phrase_other = "During handling of the above exception, another exception occurred:"
229 |     err_phrase_ssh = "ssh: Could not resolve hostname"
230 |     err_phrase_qpc = "Source: QPC, error"
231 |     pass_phrase = "Bandwidth"
232 | 
233 |     target_phrase = "Target Nodes: "
234 | 
235 |     hccl_demo_fail = True
236 |     missing = False
237 |     qpc_fail = False
238 |     target_nodes = []
239 | 
240 |     for l in data:
241 |         if l.find(err_phrase_ssh) != -1:
242 |             hccl_demo_fail = True
243 |             missing = True
244 |         elif l.find(err_phrase_qpc) != -1:
245 |             hccl_demo_fail = True
246 |             qpc_fail = True
247 |         elif l.find(err_phrase) != -1 or l.find(err_phrase_other) != -1:
248 |             hccl_demo_fail = True
249 |         elif l.find(pass_phrase) != -1:
250 |             hccl_demo_fail = False
251 |         elif l.find(target_phrase) != -1:
252 |             colon_index = l.index(":")
253 |             target_nodes = l[colon_index+2:].split(",")
254 | 
255 |     return hccl_demo_fail, qpc_fail, missing, target_nodes
256 | 
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/hostfile:
--------------------------------------------------------------------------------
1 | sys-01
2 | sys-02
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/run_ighs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2024 Habana Labs, Ltd.
an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | LOG_DIR=logs/$(date +'%m-%Y/%m-%d-%Y/%m-%d-%Y_%H-%M') 15 | 16 | python3 screen.py --initialize --logs-dir $LOG_DIR; 17 | python3 screen.py --screen --logs-dir $LOG_DIR; 18 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/screen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, datetime, yaml, sys, time, json 14 | import argparse 15 | import logging 16 | 17 | from utilities import download_repos, create_logger, get_logging_level 18 | from hccl_demo_helper import hccl_demo_check 19 | from system_utils import KubeUtils, BareMetalUtils 20 | 21 | from HealthReport import HealthReport 22 | from IGNodes import IGNodes, IGNode 23 | 24 | 25 | _logger = None 26 | 27 | def monitor_ighs_status(system_mode, level, nodes, timeout_s=240, round=0): 28 | sleep_time_s = 2 29 | max_attempts = (timeout_s // sleep_time_s) + min(timeout_s % sleep_time_s, 1) 30 | current_run_status = dict() 31 | lvl_check_msg = f"Checking IGHS Level {level}" 32 | 33 | num_nodes = len(nodes.all_nodes) 34 | if level == 2: 35 | num_nodes = len(nodes.current_node_groups) * 2 36 | lvl_check_msg += f" Round {round}" 37 | 38 | _logger.info(f"{lvl_check_msg} Status") 39 | 40 | for attempt in range(max_attempts): 41 | num_found_nodes = system_mode.check_screen_complete(current_run_status=current_run_status, health_report=nodes.health_report, level=level, round=round) 42 | 43 | if num_found_nodes == num_nodes: 44 | _logger.info(f"Found {num_found_nodes}/{num_nodes} Nodes during Health Screen") 45 | break 46 | 47 | _logger.info(f"Attempt {attempt}/{max_attempts}: Found {num_found_nodes}/{num_nodes} Nodes - Will Check again in {sleep_time_s} seconds") 48 | time.sleep(sleep_time_s) 49 | num_found_nodes = system_mode.check_screen_complete(current_run_status=current_run_status, health_report=nodes.health_report, level=level, round=round, final_check=True) 50 | 51 | if level == 1: 52 | detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_node_info() 53 | missing_nodes.update(set(nodes.all_nodes).difference(detected_nodes)) 54 | undetected_nodes = [] 55 | 56 | nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, 
missing_nodes=missing_nodes) 57 | elif level == 2: 58 | detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_hccl_demo_info() 59 | undetected_nodes = set(nodes.all_nodes).difference(detected_nodes) 60 | 61 | nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, missing_nodes=missing_nodes) 62 | 63 | detected_nodes_l1, infected_nodes_l1, missing_nodes = nodes.health_report.extract_node_info() 64 | detected_nodes.update(detected_nodes_l1) 65 | infected_nodes.update(infected_nodes_l1) 66 | 67 | healthy_nodes = detected_nodes.difference(infected_nodes).difference(missing_nodes) 68 | 69 | healthy_nodes = sorted(list(healthy_nodes)) 70 | missing_nodes = sorted(list(missing_nodes)) 71 | infected_nodes = sorted(list(infected_nodes)) 72 | nodes.update_node_status(healthy_nodes, infected_nodes, missing_nodes, undetected_nodes=undetected_nodes) 73 | 74 | watch_nodes = sorted(list(nodes.watch_nodes)) 75 | detected_nodes = sorted(list(detected_nodes)) 76 | 77 | if level == 1: 78 | nodes.healthy_nodes = set(healthy_nodes) 79 | 80 | _logger.info(f"Detected {len(detected_nodes)} Node: {detected_nodes}") 81 | _logger.info(f" Healthy {len(healthy_nodes)} Node: {healthy_nodes}") 82 | _logger.info(f" Infected {len(infected_nodes)} Node: {infected_nodes}") 83 | _logger.info(f"Missing {len(missing_nodes)} Node: {missing_nodes}") 84 | _logger.info(f"Unverified {len(watch_nodes)} Node: {watch_nodes}") 85 | 86 | return healthy_nodes, infected_nodes, missing_nodes 87 | 88 | 89 | def main(args): 90 | global _logger 91 | 92 | if args.logs_dir == "": 93 | c_time = datetime.datetime.now() 94 | date_year_format = c_time.strftime("%m-%Y") 95 | date_format = c_time.strftime("%m-%d-%Y") 96 | time_format = c_time.strftime("%H-%M") 97 | args.logs_dir = f"logs/{date_year_format}/{date_format}/{date_format}_{time_format}" 98 | 99 | 100 | ighs_report_name = "health_report.csv" 101 | ighs_log_dir = args.logs_dir 102 | 103 | if args.node_name: 104 | ighs_level = os.environ["IGHS_LEVEL"] if "IGHS_LEVEL" in os.environ else 1 105 | ighs_report_name = f"health_report_{args.node_name}.csv" 106 | ighs_log_dir = f"{args.logs_dir}/L{ighs_level}" 107 | 108 | health_report = HealthReport(f_dir=ighs_log_dir, report_name=ighs_report_name) 109 | job_path = "tmp/jobs" 110 | 111 | with open(args.config, 'r') as f: 112 | config_data = yaml.safe_load(f) 113 | 114 | hostfile = "" 115 | if "hostfile" in config_data["system-info"]: 116 | hostfile = config_data["system-info"]["hostfile"] 117 | 118 | log_level = get_logging_level(config_data["log-level"]) 119 | _logger, _ = create_logger(logger_name="health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level) 120 | 121 | if config_data["system-info"]["type"] == "k8s": 122 | system_mode = KubeUtils(image=config_data["image"], 123 | hostfile=hostfile, 124 | namespace=config_data["system-info"]["namespace"], 125 | log_dir=args.logs_dir) 126 | elif config_data["system-info"]["type"] == "bare-metal": 127 | system_mode = BareMetalUtils(image=config_data["image"], 128 | hostfile=hostfile, 129 | ssh_path=config_data["system-info"]["ssh-path"], 130 | tcp_interface=config_data["system-info"]["tcp-interface"], 131 | log_dir=args.logs_dir) 132 | else: 133 | _logger.error(f"system_mode: {system_mode} in {args.config} is not set correctly. 
system_mode has to be set to k8s or bare-metal") 134 | sys.exit(1) 135 | 136 | 137 | if args.initialize: 138 | _logger.info(f"Loaded Configuration File: {args.config}") 139 | _logger.info(f"{config_data}") 140 | 141 | health_report.create(create_base=True, create_hccl_demo=True) 142 | download_repos() 143 | 144 | system_mode.initialize_system() 145 | 146 | if args.screen: 147 | start_time = datetime.datetime.now() 148 | 149 | intel_gaudi_nodes = IGNodes(health_report=health_report) 150 | intel_gaudi_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"]) 151 | healthy_nodes, infected_nodes, missing_nodes = list(), list(), list() 152 | occupied_nodes, missing_cards_nodes, misc_nodes = list(), list(), list() 153 | 154 | if config_data["level-1"]["run"]: 155 | _logger.info("Running Level 1 Checks: Card Diagnostics") 156 | if not os.path.exists(f"{health_report.f_dir}/L1"): 157 | os.makedirs(f"{health_report.f_dir}/L1") 158 | 159 | nodes_initialized = system_mode.initialize_node_jobs(level=1, 160 | nodes=intel_gaudi_nodes, 161 | job_base_path=job_path) 162 | if nodes_initialized: 163 | healthy_nodes, infected_nodes, missing_nodes = monitor_ighs_status(system_mode=system_mode, 164 | level=1, 165 | nodes=intel_gaudi_nodes, 166 | timeout_s=config_data["level-1"]["timeout_s"]) 167 | occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) 168 | system_mode.clear_ighs_pods() 169 | 170 | summary = { 171 | "level": 1, 172 | "infected": infected_nodes, 173 | "missing": missing_nodes, 174 | "occupied": occupied_nodes, 175 | "missing_cards": missing_cards_nodes, 176 | "untested": misc_nodes, 177 | "healthy": healthy_nodes 178 | } 179 | 180 | with open(f"{args.logs_dir}/ighs_L1_summary.json", 'w', encoding ='utf8') as f: 181 | json.dump(summary, f, indent=4) 182 | 183 | if config_data["level-2"]["run"]: 184 | _logger.info("Running Level 2 Checks: Pair HCCL_DEMO All Reduce") 185 | if not os.path.exists(f"{health_report.f_dir}/L2"): 186 | os.makedirs(f"{health_report.f_dir}/L2") 187 | 188 | intel_gaudi_nodes.healthy_nodes = set() 189 | intel_gaudi_nodes.watch_nodes = set(intel_gaudi_nodes.all_nodes).difference(set(missing_nodes)) 190 | intel_gaudi_nodes.missing_nodes = set(missing_nodes) 191 | 192 | for i in range(config_data["level-2"]["num-rounds"]): 193 | nodes_initialized = system_mode.initialize_node_jobs(level=2, 194 | nodes=intel_gaudi_nodes, 195 | job_base_path=job_path, 196 | round=i) 197 | if not nodes_initialized: 198 | _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No other Nodes to screen. Exit screening early.") 199 | break 200 | 201 | healthy_nodes, infected_nodes, missing_nodes = monitor_ighs_status(system_mode=system_mode, 202 | level=2, 203 | nodes=intel_gaudi_nodes, 204 | timeout_s=config_data["level-2"]["timeout_s"], 205 | round=i) 206 | occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) 207 | system_mode.clear_ighs_pods(job_type="mpijobs") 208 | 209 | if len(intel_gaudi_nodes.watch_nodes) == 0: 210 | _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No other Nodes to screen. 
Exit screening early.") 211 | break 212 | 213 | summary = { 214 | "level": 2, 215 | "infected": infected_nodes, 216 | "missing": missing_nodes, 217 | "occupied": occupied_nodes, 218 | "missing_cards": missing_cards_nodes, 219 | "untested": misc_nodes, 220 | "healthy": healthy_nodes 221 | } 222 | 223 | with open(f"{args.logs_dir}/ighs_L2_summary.json", 'w', encoding ='utf8') as f: 224 | json.dump(summary, f, indent=4) 225 | 226 | end_time = datetime.datetime.now() 227 | diff_time = (end_time - start_time) 228 | _logger.info(f"Total Run Time: {diff_time}") 229 | 230 | if args.ighs_check == "node": 231 | node = IGNode(health_report=health_report, 232 | num_checks_link_state=config_data["level-1"]["num-checks-link-state"], 233 | log_level=log_level, 234 | name=args.node_name) 235 | node.scan_cards() 236 | node.health_check(write_report=args.node_write_report) 237 | elif args.ighs_check == "hccl-demo": 238 | health_report.create(create_base=False, create_hccl_demo=True) 239 | 240 | target_nodes = args.target_nodes.strip("[']").replace("'","").split(',') 241 | hccl_demo_check(job_id=f"{health_report.f_dir}/L2/{args.round}/{args.job_id}", 242 | target_nodes=target_nodes, health_report=health_report) 243 | 244 | if __name__=="__main__": 245 | parser = argparse.ArgumentParser() 246 | 247 | parser.add_argument("--initialize", action="store_true", help="Downloads Necessary Repos and Creates Report Template") 248 | parser.add_argument("--screen", action="store_true", help="Starts Health Screen for Cluster") 249 | parser.add_argument("--target-nodes", type=str, default="", help="List of target nodes") 250 | parser.add_argument("--job-id", type=str, default="", help="Needed to identify hccl-demo running log") 251 | parser.add_argument("--round", type=str, default="", help="Needed to identify hccl-demo running round log") 252 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file for Health Screener") 253 | parser.add_argument("--ighs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"], 254 | help="Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce (HCCL_DEMO between paris of nodes)") 255 | 256 | parser.add_argument("--node-write-report", action="store_true", help="Write Individual Node Health Report") 257 | parser.add_argument("--node-name", type=str, default="", help="Name of Node") 258 | parser.add_argument("--logs-dir", type=str, default="", help="Output directory of health screen results") 259 | 260 | args = parser.parse_args() 261 | 262 | 263 | main(args) 264 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM ${BASE_IMAGE} 3 | 4 | RUN mkdir ~/.ssh && \ 5 | cd ~/.ssh && \ 6 | ssh-keygen -A && \ 7 | sed -i 's/#Port 22/Port 3122/g' /etc/ssh/sshd_config && \ 8 | sed -i 's/# Port 22/ Port 3122/g' /etc/ssh/ssh_config && \ 9 | sed -i 's/3022/3122/g' ~/.bashrc && \ 10 | echo "Host *" >> ~/.ssh/config && \ 11 | echo "ForwardAgent yes" >> ~/.ssh/config && \ 12 | echo "StrictHostKeyChecking no" >> ~/.ssh/config && \ 13 | echo "UserKnownHostsFile /dev/null" >> ~/.ssh/config && \ 14 | echo "LogLevel ERROR" >> ~/.ssh/config && \ 15 | service ssh start && \ 16 | chmod 600 ~/.ssh/config 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- 
/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level1: 3 | image: ighs_level1 4 | build: 5 | context: . 6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level1 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=1 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | command: > 26 | bash -c "python screen.py --ighs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} && \ 27 | chmod 777 -R $${LOG_DIR}" 28 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level2_launcher: 3 | image: ighs_level2 4 | build: 5 | context: . 6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level2_launcher 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=2 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | command: > 26 | template/bare-metal/run_hccl_demo.sh -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level2_worker: 3 | image: ighs_level2 4 | build: 5 | context: . 
6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level2_worker 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=2 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | tty: true 26 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_NODES="${NUM_NODES:-1}"; 4 | HOME_DIR="${HOME_DIR:-/tmp/ighs/intel_gaudi_health_screen}"; 5 | WORK_DIR="${WORK_DIR:-/tmp/ighs/intel_gaudi_health_screen/build/hccl_demo}"; 6 | 7 | NGPU_PER_NODE=8; 8 | N_CARDS=$((NUM_NODES*NGPU_PER_NODE)); 9 | 10 | cd ${WORK_DIR}; 11 | CMD="python ${WORK_DIR}/run_hccl_demo.py \ 12 | --test all_reduce \ 13 | --loop 1000 \ 14 | --size 32m \ 15 | -clean \ 16 | -mpi "; 17 | 18 | mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/; 19 | cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 20 | touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 21 | echo "Target Nodes: $TARGET_NODES" >> $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 22 | 23 | $CMD \ 24 | -np ${N_CARDS} \ 25 | --allow-run-as-root \ 26 | --bind-to core \ 27 | --map-by ppr:4:socket:PE=6 \ 28 | --rank-by core --report-bindings \ 29 | --tag-output \ 30 | --merge-stderr-to-stdout --prefix $MPI_ROOT \ 31 | -H ${TARGET_NODES//,/:48,}:48 \ 32 | --mca btl_tcp_if_include $TCP_INTERFACE \ 33 | -x MASTER_ADDR \ 34 | -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ 35 | -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ 36 | 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 37 | 38 | cd ${HOME_DIR}; 39 | python $HOME_DIR/screen.py --ighs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --round $ROUND; 40 | 41 | chmod 777 -R $HOME_DIR/$LOG_DIR 42 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: template-metadata-name 5 | namespace: default 6 | labels: 7 | app: ighs 8 | spec: 9 | template: 10 | metadata: 11 | labels: 12 | app: ighs 13 | spec: 14 | restartPolicy: "Never" 15 | affinity: 16 | nodeAffinity: 17 | requiredDuringSchedulingIgnoredDuringExecution: 18 | nodeSelectorTerms: 19 | - matchExpressions: 20 | - key: kubernetes.io/hostname 21 | operator: In 22 | values: 23 | - IGHS-DUMMY-VAL 24 | volumes: 25 | - name: mydir 26 | emptyDir: {} 27 | tolerations: 28 | - key: "" 29 | operator: "Exists" 30 | effect: "NoSchedule" 31 | containers: 32 | - name: template-container-name 33 | image: template-container-image 34 | imagePullPolicy: IfNotPresent 35 | workingDir: /workdir 36 | command: ["/bin/bash", "-c"] 37 | args: 38 | - >- 39 | ssh-keygen -A; 40 | service ssh start; 41 | 42 | while [ ! 
-d /workdir/intel_gaudi_health_screen ]; do 43 | sleep 2s; 44 | done; 45 | sleep 10s; 46 | 47 | cd /workdir/intel_gaudi_health_screen; 48 | python /workdir/intel_gaudi_health_screen/screen.py --ighs-check node --logs-dir $LOG_DIR; 49 | volumeMounts: 50 | - name: mydir 51 | mountPath: /workdir 52 | securityContext: 53 | capabilities: 54 | add: 55 | - SYSLOG 56 | env: 57 | - name: IGHS_LEVEL 58 | value: "1" 59 | - name: MY_POD_IP 60 | valueFrom: 61 | fieldRef: 62 | fieldPath: status.podIP 63 | - name: MY_NODE_NAME 64 | valueFrom: 65 | fieldRef: 66 | fieldPath: spec.nodeName 67 | - name: MY_POD_NAMESPACE 68 | valueFrom: 69 | fieldRef: 70 | fieldPath: metadata.namespace 71 | resources: 72 | limits: 73 | habana.ai/gaudi: 8 74 | hugepages-2Mi: 29000Mi 75 | memory: 200Gi 76 | cpu: 95 77 | requests: 78 | habana.ai/gaudi: 8 79 | hugepages-2Mi: 29000Mi 80 | memory: 200Gi 81 | cpu: 95 82 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v2beta1 2 | kind: MPIJob 3 | metadata: 4 | name: template-metadata-name 5 | namespace: default 6 | labels: 7 | app: ighs-hccl 8 | spec: 9 | slotsPerWorker: 8 10 | runPolicy: 11 | cleanPodPolicy: Running 12 | mpiReplicaSpecs: 13 | Launcher: 14 | replicas: 1 15 | template: 16 | metadata: 17 | labels: 18 | app: ighs-hccl 19 | spec: 20 | volumes: 21 | - name: mydir 22 | emptyDir: {} 23 | containers: 24 | - image: template-container-image 25 | name: ighs-launcher 26 | imagePullPolicy: IfNotPresent 27 | workingDir: /workdir 28 | volumeMounts: 29 | - name: mydir 30 | mountPath: /workdir 31 | securityContext: 32 | capabilities: 33 | add: 34 | - SYSLOG 35 | env: 36 | - name: JOB_ID 37 | valueFrom: 38 | fieldRef: 39 | fieldPath: metadata.labels['name'] 40 | - name: MY_NODE_NAME 41 | valueFrom: 42 | fieldRef: 43 | fieldPath: spec.nodeName 44 | - name: HOME_DIR 45 | value: "/workdir/intel_gaudi_health_screen" 46 | - name: IGHS_LEVEL 47 | value: "2" 48 | command: ["/bin/bash", "-c"] 49 | args: 50 | - >- 51 | set -eo pipefail; 52 | echo "Target Nodes: $TARGET_NODES"; 53 | ssh-keygen -A; 54 | service ssh start; 55 | 56 | while [ ! 
-d /workdir/intel_gaudi_health_screen ]; do 57 | sleep 2s; 58 | done; 59 | sleep 10s; 60 | 61 | declare -xr HOSTSFILE=$OMPI_MCA_orte_default_hostfile; 62 | 63 | declare -xr NUM_NODES=$(wc -l < $HOSTSFILE); 64 | declare -xr NGPU_PER_NODE=8; 65 | declare -xr N_CARDS=$((NUM_NODES*NGPU_PER_NODE)); 66 | 67 | cd ${HOME_DIR}/build/hccl_demo; 68 | declare -xr CMD="python ${HOME_DIR}/build/hccl_demo/run_hccl_demo.py \ 69 | --test all_reduce \ 70 | --loop 1000 \ 71 | --size 32m \ 72 | -mpi "; 73 | 74 | mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/; 75 | cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 76 | touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 77 | echo "Target Nodes: $TARGET_NODES" > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 78 | 79 | $CMD \ 80 | -np ${N_CARDS} \ 81 | --allow-run-as-root \ 82 | --bind-to core \ 83 | --map-by ppr:4:socket:PE=6 \ 84 | --rank-by core --report-bindings \ 85 | --tag-output \ 86 | --merge-stderr-to-stdout --prefix $MPI_ROOT \ 87 | --mca btl_tcp_if_include eth0 \ 88 | -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ 89 | -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ 90 | -x MAX_TIMEOUT=60 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 91 | 92 | cd ${HOME_DIR}; 93 | python ${HOME_DIR}/screen.py --ighs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND; 94 | 95 | Worker: 96 | replicas: template-num-nodes 97 | template: 98 | metadata: 99 | labels: 100 | app: ighs-hccl 101 | spec: 102 | affinity: 103 | nodeAffinity: 104 | requiredDuringSchedulingIgnoredDuringExecution: 105 | nodeSelectorTerms: 106 | - matchExpressions: 107 | - key: kubernetes.io/hostname 108 | operator: In 109 | values: 110 | - IGHS-DUMMY-VAL 111 | volumes: 112 | - name: mydir 113 | emptyDir: {} 114 | tolerations: 115 | - key: "" 116 | operator: "Exists" 117 | effect: "NoSchedule" 118 | - key: "" 119 | operator: "Exists" 120 | effect: "NoExecute" 121 | containers: 122 | - image: template-container-image 123 | name: ighs-worker 124 | imagePullPolicy: IfNotPresent 125 | securityContext: 126 | capabilities: 127 | add: 128 | - SYSLOG 129 | resources: 130 | limits: 131 | habana.ai/gaudi: 8 132 | hugepages-2Mi: 29000Mi 133 | cpu: 95 134 | memory: 200Gi 135 | requests: 136 | habana.ai/gaudi: 8 137 | hugepages-2Mi: 29000Mi 138 | memory: 200Gi 139 | cpu: 95 140 | volumeMounts: 141 | - name: mydir 142 | mountPath: /workdir 143 | env: 144 | - name: IGHS_LEVEL 145 | value: "2" 146 | - name: MY_POD_IP 147 | valueFrom: 148 | fieldRef: 149 | fieldPath: status.podIP 150 | - name: MY_NODE_NAME 151 | valueFrom: 152 | fieldRef: 153 | fieldPath: spec.nodeName 154 | - name: MY_POD_NAMESPACE 155 | valueFrom: 156 | fieldRef: 157 | fieldPath: metadata.namespace 158 | command: ["/bin/bash", "-c"] 159 | args: 160 | - >- 161 | printenv | grep "MY" >> /etc/environment; 162 | ssh-keygen -A; 163 | service ssh start; 164 | sleep 365d; 165 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import os, time, sys
14 | import subprocess, shlex
15 | from datetime import datetime
16 | 
17 | import logging
18 | from logging import handlers
19 | 
20 | _logger = logging.getLogger("health_screener")
21 | 
22 | def get_logging_level(log_level):
23 |     log_level = log_level.lower()
24 |     num_level = logging.INFO
25 | 
26 |     if log_level == "info":
27 |         num_level = logging.INFO
28 |     elif log_level == "debug":
29 |         num_level = logging.DEBUG
30 |     elif log_level == "warn":
31 |         num_level = logging.WARN
32 |     elif log_level == "error":
33 |         num_level = logging.ERROR
34 |     elif log_level == "critical":
35 |         num_level = logging.CRITICAL
36 | 
37 |     return num_level
38 | 
39 | def create_logger(logger_name, logger_file_name, f_path="", level=logging.INFO, max_bytes=5e6, backup_count=10):
40 |     """ Creates a Logger that writes to the logs directory
41 | 
42 |     Args:
43 |         logger_name (str): Name of the Logger. The log file is written to {f_path}/{logger_file_name}.log, or to logs/{current_time}/{logger_file_name}.log if f_path is empty
44 |         level (int, optional): Logging Level. Defaults to logging.INFO.
45 |         max_bytes (int, optional): Max size of log file. Rolls over once the max is reached. Defaults to 5e6.
46 |         backup_count (int, optional): Rollover Limit. Defaults to 10.
47 | 
48 |     Returns:
49 |         (Logger, str): Logger object used to log details to the designated logger file, and the directory containing that file
50 |     """
51 |     t_logger = logging.getLogger(logger_name)
52 |     t_logger.setLevel(level)
53 | 
54 |     c_time = datetime.now()
55 |     date_format = c_time.strftime("%m-%d-%Y")
56 |     time_format = c_time.strftime("%H-%M")
57 | 
58 |     file_path = f"{f_path}/{logger_file_name}.log" if f_path != "" else f"logs/{date_format}/{date_format}_{time_format}/{logger_file_name}.log"
59 |     d_path = os.path.dirname(file_path)
60 |     _logger.debug(f"d_path: {d_path} file_path: {file_path}")
61 | 
62 |     if(not os.path.exists(d_path)):
63 |         os.makedirs(d_path)
64 | 
65 |     formatter = logging.Formatter("[%(asctime)s] %(levelname)s %(message)s",datefmt='%Y-%m-%d %H:%M:%S')
66 |     handler = logging.handlers.RotatingFileHandler(file_path, maxBytes=max_bytes, backupCount=backup_count)
67 |     handler.setFormatter(formatter)
68 | 
69 |     stream_handler = logging.StreamHandler(sys.stdout)
70 |     stream_handler.setFormatter(formatter)
71 | 
72 |     t_logger.addHandler(handler)
73 |     t_logger.addHandler(stream_handler)
74 | 
75 |     return t_logger, d_path
76 | 
77 | def run_cmd(cmd, timeout_s=900, verbose=False):
78 |     """ Run a command through subprocess.run()
79 | 
80 |     Args:
81 |         cmd (str): CMD to run
82 |         timeout_s (int, optional): Timeout of CMD. Defaults to 900.
83 |         verbose (bool, optional): Log the command and its output at debug level. Defaults to False
84 | 
85 |     Returns:
86 |         str: Captured stdout of the command (stderr is merged into stdout)
87 |     """
88 | 
89 |     cmd = shlex.split(cmd)
90 |     result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=timeout_s)
91 | 
92 |     if (verbose):
93 |         _logger.debug(f"Running cmd: {cmd}")
94 |         _logger.debug(result.stdout)
95 | 
96 |     return result.stdout
97 | 
98 | def download_repos():
99 |     """ Download the HCCL_DEMO repo to assist in health checks
100 |     """
101 |     if not os.path.exists("build"):
102 |         os.makedirs("build")
103 | 
104 |     if not os.path.exists("build/hccl_demo"):
105 |         _logger.info(f"Downloading hccl_demo into build/")
106 |         cmd = "git clone https://github.com/HabanaAI/hccl_demo.git build/hccl_demo"
107 |         run_cmd(cmd)
108 | 
109 |     os.environ["MPI"]="1"
110 |     cmd = "make -C build/hccl_demo"
111 |     run_cmd(cmd)
112 | 
113 | def copy_files(src, dst, to_remote=True, hosts=[], exclude={}):
114 |     """ Copies files through rsync from src to dst over the list of hosts
115 | 
116 |     Args:
117 |         src (str): Source file/directory to copy
118 |         dst (str): Destination to copy files/directory
119 |         to_remote (bool, optional): rsync to remote destination (src -> host:dst). False will rsync from the remote source (host:src -> dst). Defaults to True.
120 |         hosts (list, optional): List of IP Addresses to copy to/from. Defaults to [].
121 |         exclude (dict, optional): Files/Directory to ignore. Follows rsync rules for exclusions. Defaults to {}.
122 |     """
123 |     rsync_cmd = f"rsync -ahzgop --exclude={exclude}"
124 | 
125 |     for h in hosts:
126 |         if (to_remote):
127 |             src_path = src
128 |             dst_path = f"{h}:{dst}"
129 |         else:
130 |             src_path = f"{h}:{src}"
131 |             dst_path = dst
132 | 
133 |         _logger.debug(f"Copying {src_path} to {dst_path}")
134 |         cmd = f"{rsync_cmd} {src_path} {dst_path}"
135 |         output = run_cmd(cmd)
136 | 
137 | 
138 | def clear_job(job):
139 |     """ Clear MPIJobs based on Job Name
140 | 
141 |     Args:
142 |         job (str): Job Name to delete
143 |     """
144 |     _logger.info(f"Checking for existing MPIJobs {job}")
145 |     cmd = f"kubectl get mpijobs -n default {job} -o=custom-columns='NAME:.metadata.name' --no-headers"
146 |     output = run_cmd(cmd)
147 | 
148 |     if job in output:
149 |         _logger.info(f"Found MPIJobs {job}. Will delete.")
150 |         cmd = f"kubectl delete mpijobs -n default {job}"
151 |         output = run_cmd(cmd)
152 | 
153 |     cmd = f"kubectl get pods -n default --selector=training.kubeflow.org/job-name={job} -o=custom-columns='NAME:.metadata.name' --no-headers"
154 | 
155 |     max_attempt = 15
156 |     for attempts in range(max_attempt):
157 |         output = run_cmd(cmd).strip()
158 | 
159 |         if(len(output) == 0):
160 |             break
161 | 
162 |         _logger.info(f"Attempt {attempts} Pods are still up. Will wait 10 seconds to check again")
163 |         time.sleep(10)
164 | 
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/version.txt:
--------------------------------------------------------------------------------
1 | 2.2.2
--------------------------------------------------------------------------------