├── LICENSE ├── README.md ├── dockerfiles ├── README.md ├── base │ ├── Dockerfile.rhel8.6 │ ├── Dockerfile.rhel9.2 │ ├── Dockerfile.rhel9.4 │ ├── Dockerfile.suse15.5 │ ├── Dockerfile.tencentos3.1 │ ├── Dockerfile.ubuntu22.04 │ ├── Dockerfile.ubuntu24.04 │ ├── LICENSE │ ├── Makefile │ ├── install-python310.sh │ ├── install_efa.sh │ └── tencentos_efa_patch.txt ├── common.mk ├── pytorch │ ├── Dockerfile.rhel8.6 │ ├── Dockerfile.rhel9.2 │ ├── Dockerfile.rhel9.4 │ ├── Dockerfile.suse15.5 │ ├── Dockerfile.tencentos3.1 │ ├── Dockerfile.ubuntu │ ├── Makefile │ └── install_packages.sh ├── triton │ ├── Dockerfile │ └── Makefile └── triton_vllm_backend │ ├── Dockerfile │ ├── Makefile │ ├── model.py │ └── samples │ ├── client.py │ ├── model_repository │ └── vllm_model │ │ ├── 1 │ │ └── model.json │ │ └── config.pbtxt │ ├── prompts.txt │ └── test_models │ ├── llama70b_8x │ ├── 1 │ │ └── model.json │ └── config.pbtxt │ ├── llama7b_1x │ ├── 1 │ │ └── model.json │ └── config.pbtxt │ └── qwen_7b_chat │ ├── 1 │ └── model.json │ └── config.pbtxt ├── legal-disclaimer.md └── utils ├── README.md ├── check_framework_env.py └── intel_gaudi_health_screen ├── .gitignore ├── HealthReport.py ├── IGNodes.py ├── README.md ├── config.yaml ├── hccl_demo_helper.py ├── hostfile ├── run_ighs.sh ├── screen.py ├── system_utils.py ├── template ├── bare-metal │ ├── dockerfile │ ├── intel-gaudi-docker-compose-L1.yaml │ ├── intel-gaudi-docker-compose-L2-launcher.yaml │ ├── intel-gaudi-docker-compose-L2-worker.yaml │ └── run_hccl_demo.sh └── k8s │ ├── intel-gaudi-health-screen-L1.yaml │ └── intel-gaudi-health-screen-L2_hccl-demo.yaml ├── utilities.py └── version.txt /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel® Gaudi® Accelerator Setup and Installation 2 | 3 |
4 | 5 | --- 6 | 7 |
8 | 9 | By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). 10 | 11 |
12 | 13 | --- 14 | 15 |
16 | 17 | ## Overview 18 | 19 | Welcome to the Setup and Installation GitHub repository! 20 | 21 | The full installation documentation has been consolidated into the Installation Guide in the Intel Gaudi documentation. Please refer to the [Intel Gaudi docs](https://docs.habana.ai/en/latest/Installation_Guide/GAUDI_Installation_Guide.html) for the full installation guide. 22 | 23 | This repository contains the following references: 24 | - dockerfiles -- Reference Dockerfiles and build scripts for building Gaudi Docker images 25 | - utils -- Reference utility scripts 26 | -------------------------------------------------------------------------------- /dockerfiles/README.md: -------------------------------------------------------------------------------- 1 | # Gaudi Docker Images Builder 2 | 3 | ## Table of Contents 4 | - [Overview](#overview) 5 | - [Build Docker](#build-docker) 6 | 7 | 8 |
9 | 10 | --- 11 | 12 |
13 | 14 | ## Overview 15 | 16 | This folder contains the Dockerfiles and Makefiles used to build Habana Labs Docker images for Gaudi. 17 | 18 |
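Roughly speaking, `make build` stages the Dockerfile for the selected OS together with its helper scripts into a build directory and then drives a standard `docker build` (the shared rules come from `common.mk`). A rough hand-rolled equivalent for the Ubuntu 22.04 base image is sketched below; the build-arg names come straight from the Dockerfiles in `base/`, while the artifactory host, version, revision, and image tag are placeholder values to adjust for your release:

```bash
# Sketch only: the artifactory host, VERSION/REVISION, and image tag are placeholders for your Habana release.
docker build \
  --build-arg ARTIFACTORY_URL=vault.habana.ai \
  --build-arg VERSION=1.19.0 \
  --build-arg REVISION=561 \
  -f base/Dockerfile.ubuntu22.04 \
  -t base-installer-ubuntu22.04:local \
  base/
```

The Makefile flow described in the next section performs this staging for you and exposes the same knobs as `make` variables.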
19 | 20 | --- 21 | 22 |
23 | 24 | ## Build Docker 25 | 26 | The Makefiles in this folder can be used as a reference for building Docker images for Gaudi. 27 | 28 | ### How to Build Docker Images from Habana Dockerfiles 29 | 30 | 1. Go into the folder of the image type you would like to build: 31 | * base 32 | * pytorch 33 | * triton 34 | 35 | 2. Run the build command to generate the Docker image: 36 | ``` 37 | make build 38 | ``` 39 | Examples: 40 | #### Build pytorch image for rhel9.2: 41 | ``` 42 | cd pytorch 43 | make build BUILD_OS=rhel9.2 44 | ``` 45 | 46 | #### Build triton image (default OS - ubuntu22.04): 47 | ``` 48 | cd triton 49 | make build 50 | ``` 51 | 52 | #### Build triton vllm backend (default OS - ubuntu22.04): 53 | ``` 54 | cd triton_vllm_backend 55 | make build BUILD_OS=ubuntu22.04 56 | ``` 57 | 58 | 3. Build command variables 59 | 60 | #### Optional Parameters 61 | * BUILD_OS - set the OS to build (default ubuntu22.04) 62 | * BUILD_DIR - the folder where the build is executed from (default dockerbuild in the image folder) 63 | * VERBOSE - set to TRUE to echo the commands (default FALSE) 64 | * DOCKER_CACHE - set to TRUE to use the Docker cache when building the image (default FALSE) 65 | 66 | 4. Instructions for the triton_vllm_backend server 67 | 68 | * Run the backend container as described in the [Habana docs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Triton_Inference.html?highlight=triton%20inference#run-the-backend-container) 69 | * Start the Triton server: 70 | ```bash 71 | tritonserver --model-repository samples/model_repository 72 | ``` 73 | The current samples/model_repository/vllm_model contains a llama2 7B 1x configuration. Sample model files for llama2 7B/70B and qwen2 7B are also provided under the samples/test_models folder. To use one of them, copy its model.json and config.pbtxt into the vllm_model folder structure. 74 | * To test with a client, follow the instructions [here](https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#sending-your-first-inference) 75 | 76 | -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel8.6: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 8.6 6 | FROM registry.access.redhat.com/ubi8/ubi:8.6 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | LABEL vendor="Habanalabs Ltd."
12 | LABEL release="${VERSION}-${REVISION}" 13 | 14 | COPY LICENSE /licenses/ 15 | 16 | RUN dnf install -y \ 17 | python3-dnf-plugin-versionlock && \ 18 | dnf versionlock add redhat-release* && \ 19 | dnf clean all 20 | 21 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 22 | dnf clean all 23 | 24 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 25 | echo "name=CentOS Linux 8 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 26 | echo "baseurl=https://vault.centos.org/8-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 27 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 28 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 29 | 30 | RUN dnf update -y && dnf install -y \ 31 | bzip2 \ 32 | bzip2-devel \ 33 | clang \ 34 | cmake3 \ 35 | cpp \ 36 | gcc \ 37 | gcc-c++ \ 38 | git \ 39 | glibc \ 40 | glibc-devel \ 41 | glibc-headers \ 42 | iproute \ 43 | jemalloc \ 44 | libarchive \ 45 | libjpeg-devel \ 46 | libksba \ 47 | llvm \ 48 | lsof \ 49 | mesa-libGL \ 50 | openssh-clients \ 51 | openssh-server \ 52 | python3.11-devel \ 53 | python3.11-pip \ 54 | redhat-lsb-core \ 55 | unzip \ 56 | wget && \ 57 | dnf clean all && \ 58 | rm -f /etc/ssh/ssh_host_*_key* 59 | 60 | # CVE-2023-47038 RHSA-2024:3128 61 | RUN dnf module reset -y perl && \ 62 | dnf module enable -y perl:5.32 && \ 63 | dnf module install -y --allowerasing perl:5.32 && \ 64 | dnf clean all 65 | 66 | RUN echo "[appstream]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 67 | echo "name=CentOS Linux 8 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 68 | echo "baseurl=https://vault.centos.org/8-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 69 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 70 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 71 | 72 | COPY install_efa.sh . 
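# Note (editorial comment, not part of the upstream Dockerfile): install_efa.sh is expected to
# install the AWS EFA user-space stack (libfabric and related libraries) used for scale-out
# networking. The EFA-provided ld.so.conf.d/profile.d hooks are removed right after the install;
# the library and binary paths this image actually needs are set explicitly via the ENV lines below.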
73 | RUN ./install_efa.sh && rm install_efa.sh && rm -f /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 74 | ENV OPENMPI_VERSION=4.1.6 75 | ENV MPI_ROOT=/opt/habanalabs/openmpi 76 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 77 | ENV PATH=${MPI_ROOT}/bin:$PATH 78 | ENV OPAL_PREFIX=${MPI_ROOT} 79 | ENV MPICC=${MPI_ROOT}/bin/mpicc 80 | ENV RDMAV_FORK_SAFE=1 81 | ENV FI_EFA_USE_DEVICE_RDMA=0 82 | ENV OMPI_MCA_btl=^openib 83 | 84 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 85 | echo "name=Habana RH8 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 86 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6" >> /etc/yum.repos.d/habanalabs.repo && \ 87 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo 88 | 89 | RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \ 90 | echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \ 91 | echo "baseurl=https://vault.centos.org/8-stream/PowerTools/x86_64/os/" >> /etc/yum.repos.d/powertools.repo && \ 92 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/powertools.repo && \ 93 | echo "gpgcheck=1" >> /etc/yum.repos.d/powertools.repo 94 | 95 | ENV PYTHON_VERSION=3.11 96 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 97 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 98 | alternatives --set python3 /usr/bin/python3.11 99 | 100 | RUN dnf install -y \ 101 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el8 \ 102 | habanalabs-thunk-"$VERSION"-"$REVISION".el8 \ 103 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el8 \ 104 | habanalabs-graph-"$VERSION"-"$REVISION".el8 && \ 105 | dnf clean all && \ 106 | rm -f /etc/yum.repos.d/habanalabs.repo 107 | 108 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 109 | ENV PIP_NO_CACHE_DIR=on 110 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 111 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 112 | 113 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 114 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 115 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 116 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 117 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 118 | 119 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 120 | 121 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 122 | 123 | # SSH configuration necessary to support mpi-operator v2 124 | RUN mkdir -p /var/run/sshd && \ 125 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 126 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 127 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 128 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 129 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 130 | 131 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 132 | ENV HABANA_LOGS=/var/log/habana_logs/ 133 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 134 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel9.2: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 9.2 6 | FROM registry.access.redhat.com/ubi9/ubi:9.2 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | LABEL vendor="Habanalabs Ltd." 12 | LABEL release="${VERSION}-${REVISION}" 13 | 14 | COPY LICENSE /licenses/ 15 | 16 | RUN dnf install -y \ 17 | python3-dnf-plugin-versionlock && \ 18 | dnf versionlock add redhat-release* && \ 19 | dnf clean all 20 | 21 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 22 | dnf clean all 23 | 24 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 25 | echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 26 | echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 27 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 28 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 29 | 30 | RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 31 | echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 32 | echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 33 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 34 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 35 | 36 | RUN dnf update -y && dnf install -y \ 37 | bzip2 \ 38 | bzip2-devel \ 39 | clang \ 40 | cmake3 \ 41 | cpp \ 42 | gcc \ 43 | gcc-c++ \ 44 | git \ 45 | glibc \ 46 | glibc-devel \ 47 | glibc-headers \ 48 | iproute \ 49 | jemalloc \ 50 | libarchive \ 51 | libffi-devel \ 52 | libjpeg-devel \ 53 | libksba \ 54 | llvm \ 55 | lsb_release \ 56 | lsof \ 57 | mesa-libGL \ 58 | openssh-clients \ 59 | openssh-server \ 60 | openssl \ 61 | openssl-devel \ 62 | python3-devel \ 63 | unzip \ 64 | wget \ 65 | zlib-devel && \ 66 | dnf clean all && \ 67 | rm -f /etc/ssh/ssh_host_*_key* 68 | 69 | ENV PYTHON_VERSION=3.10 70 | COPY install-python310.sh . 71 | RUN ./install-python310.sh rhel9.2 && rm install-python310.sh 72 | RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig 73 | ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 74 | 75 | COPY install_efa.sh . 
76 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 77 | 78 | ENV OPENMPI_VERSION=4.1.6 79 | ENV MPI_ROOT=/opt/habanalabs/openmpi 80 | ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 81 | ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH 82 | ENV OPAL_PREFIX=${MPI_ROOT} 83 | ENV MPICC=${MPI_ROOT}/bin/mpicc 84 | ENV RDMAV_FORK_SAFE=1 85 | ENV FI_EFA_USE_DEVICE_RDMA=0 86 | ENV OMPI_MCA_btl=^openib 87 | 88 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 89 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 90 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \ 91 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 92 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 93 | 94 | # for Habana GPG key with SHA-1 signature 95 | RUN update-crypto-policies --set DEFAULT:SHA1 96 | 97 | RUN dnf install -y \ 98 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 99 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 100 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 101 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 102 | dnf clean all && \ 103 | rm -f /etc/yum.repos.d/habanalabs.repo 104 | 105 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 106 | ENV PIP_NO_CACHE_DIR=on 107 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 108 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 109 | 110 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 111 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 112 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 113 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 114 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 115 | 116 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 117 | 118 | RUN ln -s /usr/bin/python3 /usr/bin/python 119 | 120 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 121 | 122 | # SSH configuration necessary to support mpi-operator v2 123 | RUN mkdir -p /var/run/sshd && \ 124 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 125 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 126 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 127 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 128 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 129 | 130 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 131 | ENV HABANA_LOGS=/var/log/habana_logs/ 132 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 133 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 9.4 6 | FROM registry.access.redhat.com/ubi9/ubi:9.4 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | LABEL vendor="Habanalabs Ltd." 
12 | LABEL release="${VERSION}-${REVISION}" 13 | 14 | COPY LICENSE /licenses/ 15 | 16 | RUN dnf install -y \ 17 | python3-dnf-plugin-versionlock && \ 18 | dnf versionlock add redhat-release* && \ 19 | dnf clean all 20 | 21 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 22 | dnf clean all 23 | 24 | RUN dnf update -y && dnf install -y \ 25 | openssl \ 26 | openssl-devel && \ 27 | dnf versionlock add openssl* openssl-devel* && \ 28 | dnf clean all 29 | 30 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 31 | echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 32 | echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 33 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 34 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 35 | 36 | RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 37 | echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 38 | echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 39 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 40 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 41 | 42 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 43 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 44 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 45 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 46 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 47 | 48 | RUN dnf install -y \ 49 | bzip2 \ 50 | bzip2-devel \ 51 | clang \ 52 | cmake3 \ 53 | cpp \ 54 | ffmpeg-free \ 55 | gcc \ 56 | gcc-c++ \ 57 | git \ 58 | glibc \ 59 | glibc-devel \ 60 | glibc-headers \ 61 | iproute \ 62 | jemalloc \ 63 | libarchive \ 64 | libffi-devel \ 65 | libjpeg-devel \ 66 | libksba \ 67 | llvm \ 68 | lsb_release \ 69 | lsof \ 70 | mesa-libGL \ 71 | openssh-clients \ 72 | openssh-server \ 73 | python3-devel \ 74 | python3.11 \ 75 | python3.11-devel \ 76 | python3.11-pip \ 77 | python3.11-rpm \ 78 | unzip \ 79 | wget \ 80 | zlib-devel && \ 81 | dnf versionlock add \ 82 | python3-rpm \ 83 | rpm* && \ 84 | dnf clean all && \ 85 | rm -f /etc/ssh/ssh_host_*_key* 86 | 87 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 88 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 89 | alternatives --set python3 /usr/bin/python3.11 && \ 90 | alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.11 2 && \ 91 | alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.9 1 && \ 92 | alternatives --set pip3 /usr/bin/pip3.11 93 | 94 | COPY install_efa.sh . 
95 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 96 | 97 | ENV OPENMPI_VERSION=4.1.6 98 | ENV MPI_ROOT=/opt/habanalabs/openmpi 99 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 100 | ENV PATH=${MPI_ROOT}/bin:$PATH 101 | ENV OPAL_PREFIX=${MPI_ROOT} 102 | ENV MPICC=${MPI_ROOT}/bin/mpicc 103 | ENV RDMAV_FORK_SAFE=1 104 | ENV FI_EFA_USE_DEVICE_RDMA=0 105 | ENV OMPI_MCA_btl=^openib 106 | 107 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 108 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 109 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ 110 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 111 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 112 | 113 | # for Habana GPG key with SHA-1 signature 114 | RUN update-crypto-policies --set DEFAULT:SHA1 115 | 116 | RUN dnf install -y \ 117 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 118 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 119 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 120 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 121 | dnf clean all && \ 122 | chmod +t /var/log/habana_logs && \ 123 | rm -f /etc/yum.repos.d/habanalabs.repo 124 | 125 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 126 | ENV PIP_NO_CACHE_DIR=on 127 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 128 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 129 | 130 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 131 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 132 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 133 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 134 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 135 | 136 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 137 | 138 | RUN ln -s /usr/bin/python3 /usr/bin/python 139 | 140 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 141 | 142 | # SSH configuration necessary to support mpi-operator v2 143 | RUN mkdir -p /var/run/sshd && \ 144 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 145 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 146 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 147 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 148 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 149 | 150 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 151 | ENV HABANA_LOGS=/var/log/habana_logs/ 152 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 153 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.suse15.5: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for SUSE 15.5 6 | FROM registry.suse.com/suse/sle15:15.5 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | # for RHEL certification 12 | LABEL vendor="Habanalabs Ltd." 
13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | 17 | RUN zypper addrepo -f http://download.opensuse.org/distribution/leap/15.5/repo/oss/ OpenSUSI && \ 18 | echo "gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSI.repo && \ 19 | echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSI.repo 20 | 21 | RUN zypper addrepo -f http://download.opensuse.org/source/distribution/leap/15.5/repo/oss/ OpenSUSISrc && \ 22 | echo "gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSISrc.repo && \ 23 | echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSISrc.repo 24 | 25 | RUN zypper mr -p 99 SLE_BCI 26 | 27 | RUN zypper update -y && zypper install -y --allow-downgrade \ 28 | clang \ 29 | cmake \ 30 | ffmpeg \ 31 | gcc \ 32 | gcc-c++ \ 33 | git \ 34 | glibc-devel \ 35 | iproute \ 36 | jemalloc \ 37 | kernel-devel \ 38 | kernel-macros \ 39 | lbzip2 \ 40 | libarchive-devel \ 41 | libffi-devel \ 42 | libjpeg-devel \ 43 | libksba \ 44 | linux-glibc-devel \ 45 | llvm \ 46 | lsof \ 47 | Mesa-libGL-devel \ 48 | Mesa-libGL1 \ 49 | openssh-clients \ 50 | openssh-server \ 51 | openssl \ 52 | openssl-devel \ 53 | python311 \ 54 | python311-devel \ 55 | unzip \ 56 | wget \ 57 | zlib-devel && \ 58 | zypper clean && \ 59 | rm -f /etc/ssh/ssh_host_*_key* 60 | 61 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 62 | ENV PIP_NO_CACHE_DIR=on 63 | 64 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 65 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 66 | alternatives --set python3 /usr/bin/python3.11 67 | 68 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 69 | python3 get-pip.py && \ 70 | rm -f get-pip.py && \ 71 | python3 -m pip install setuptools==76.1.0 wheel && \ 72 | python3 -m pip install --upgrade Jinja2 73 | 74 | COPY install_efa.sh . 
75 | RUN ./install_efa.sh && rm -f install_efa.sh /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 76 | 77 | ENV MPI_ROOT=/opt/amazon/openmpi 78 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 79 | ENV PATH=${MPI_ROOT}/bin:$PATH 80 | ENV OPAL_PREFIX=${MPI_ROOT} 81 | ENV MPICC=${MPI_ROOT}/bin/mpicc 82 | ENV RDMA_FORK_SAFE=1 83 | ENV FI_EFA_USE_DEVICE_RDMA=1 84 | 85 | RUN echo "[habanalabs]" > /etc/zypp/repos.d/habanalabs.repo && \ 86 | echo "name=Habana SUSE Linux repo" >> /etc/zypp/repos.d/habanalabs.repo && \ 87 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/sles/15/15.5" >> /etc/zypp/repos.d/habanalabs.repo && \ 88 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/sles/15/15.5/repodata/repomd.xml.key" >> /etc/zypp/repos.d/habanalabs.repo && \ 89 | echo "gpgcheck=1" >> /etc/zypp/repos.d/habanalabs.repo 90 | 91 | RUN zypper --gpg-auto-import-keys install -y \ 92 | habanalabs-rdma-core-"$VERSION"-"$REVISION" \ 93 | habanalabs-thunk-"$VERSION"-"$REVISION" \ 94 | habanalabs-firmware-tools-"$VERSION"-"$REVISION" \ 95 | habanalabs-graph-"$VERSION"-"$REVISION" && \ 96 | zypper clean && \ 97 | rm -f /etc/zypp/repos.d/habanalabs.repo 98 | 99 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 100 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 101 | 102 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 103 | 104 | # SSH configuration necessary to support mpi-operator v2 105 | RUN mkdir -p /var/run/sshd && \ 106 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 107 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 108 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 109 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 110 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 111 | 112 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 113 | ENV HABANA_LOGS=/var/log/habana_logs/ 114 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 115 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 116 | -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.tencentos3.1: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Tencentos 3.1 6 | FROM tencentos/tencentos_server31_mini:20230630 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | # for RHEL certification 12 | LABEL vendor="Habanalabs Ltd." 13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 17 | dnf clean all && rm -rf /var/cache/yum 18 | 19 | RUN dnf install -y \ 20 | python3-dnf-plugin-versionlock && \ 21 | dnf versionlock add redhat-release* && \ 22 | dnf clean all 23 | 24 | RUN dnf update -y && dnf install -y \ 25 | clang \ 26 | cmake3 \ 27 | cpp \ 28 | gcc \ 29 | gcc-c++ \ 30 | git \ 31 | glibc \ 32 | glibc-devel \ 33 | glibc-headers \ 34 | iproute \ 35 | jemalloc \ 36 | libarchive \ 37 | libjpeg-devel \ 38 | libksba \ 39 | llvm \ 40 | lsof \ 41 | mesa-libGL \ 42 | openssh-clients \ 43 | openssh-server \ 44 | redhat-lsb-core \ 45 | unzip \ 46 | wget && \ 47 | dnf clean all && \ 48 | rm -f /etc/ssh/ssh_host_*_key* 49 | 50 | COPY install-python310.sh . 
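# Note (editorial comment, not in the upstream Dockerfile): on TencentOS, install-python310.sh
# builds OpenSSL 1.1.1w and then CPython 3.10.14 from source, installs it via `make altinstall`,
# and registers /usr/local/bin/python3.10 as the default python3 through `alternatives`
# (see dockerfiles/base/install-python310.sh later in this listing).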
51 | RUN ./install-python310.sh tencentos3.1 && rm -f install-python310.sh 52 | RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig 53 | ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 54 | 55 | COPY install_efa.sh . 56 | COPY tencentos_efa_patch.txt /tmp/tencentos_efa_patch.txt 57 | RUN ./install_efa.sh && rm -f install_efa.sh /tmp/tencentos_efa_patch.txt /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 58 | 59 | ENV MPI_ROOT=/usr/mpi/gcc/openmpi-4.1.5a1 60 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib64:/usr/lib/habanalabs:$LD_LIBRARY_PATH 61 | ENV PATH=${MPI_ROOT}/bin:$PATH 62 | ENV OPAL_PREFIX=${MPI_ROOT} 63 | ENV MPICC=${MPI_ROOT}/bin/mpicc 64 | ENV RDMAV_FORK_SAFE=1 65 | ENV FI_EFA_USE_DEVICE_RDMA=1 66 | 67 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 68 | echo "name=Habana TC31 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 69 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1" >> /etc/yum.repos.d/habanalabs.repo && \ 70 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo 71 | 72 | RUN dnf install -y \ 73 | habanalabs-rdma-core-"$VERSION"-"$REVISION".tl3 \ 74 | habanalabs-thunk-"$VERSION"-"$REVISION".tl3 \ 75 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".tl3 \ 76 | habanalabs-graph-"$VERSION"-"$REVISION".tl3 && \ 77 | rm -f /etc/yum.repos.d/habanalabs.repo && \ 78 | dnf clean all 79 | 80 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 81 | ENV PIP_NO_CACHE_DIR=on 82 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 83 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 84 | 85 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 && \ 86 | python3 -m pip install --upgrade Jinja2 87 | 88 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 89 | 90 | # SSH configuration necessary to support mpi-operator v2 91 | RUN mkdir -p /var/run/sshd && \ 92 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 93 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 94 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 95 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 96 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 97 | 98 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 99 | ENV HABANA_LOGS=/var/log/habana_logs/ 100 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 101 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.ubuntu22.04: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Ubuntu 22.04 6 | FROM ubuntu:jammy 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 13 | ENV HABANA_LOGS=/var/log/habana_logs/ 14 | ENV OS_NUMBER=2204 15 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 16 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends \ 19 | apt-transport-https \ 20 | apt-utils \ 21 | bc \ 22 | build-essential \ 23 | ca-certificates \ 24 | dkms \ 25 | ethtool \ 26 | gcc \ 27 | git \ 28 | gnupg \ 29 | gpg-agent \ 30 | graphviz \ 31 | libgl1 \ 32 | libgnutls30 \ 33 | libgoogle-glog0v5 \ 34 | libjemalloc2 \ 35 | libjpeg-dev \ 36 | libkrb5-3 \ 37 | libpq-dev \ 38 | lsof \ 39 | make \ 40 | openssh-client \ 41 | openssh-server \ 42 | protobuf-compiler \ 43 | python3 \ 44 | python3-dev \ 45 | python3-pip \ 46 | python3-tk \ 47 | python3-venv \ 48 | unzip \ 49 | vim \ 50 | wget && \ 51 | apt-get upgrade -y libc6 && \ 52 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 53 | rm -f /etc/ssh/ssh_host_*_key* 54 | 55 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 56 | ENV PIP_NO_CACHE_DIR=on 57 | 58 | RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 && \ 59 | python3 -m pip install --upgrade Jinja2 60 | 61 | COPY install_efa.sh . 62 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 63 | 64 | ENV MPI_ROOT=/opt/amazon/openmpi 65 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 66 | ENV PATH=${MPI_ROOT}/bin:$PATH 67 | ENV OPAL_PREFIX=${MPI_ROOT} 68 | ENV MPICC=${MPI_ROOT}/bin/mpicc 69 | ENV RDMAV_FORK_SAFE=1 70 | ENV FI_EFA_USE_DEVICE_RDMA=1 71 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 72 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 73 | 74 | RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ 75 | chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ 76 | chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ 77 | echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 78 | cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ 79 | sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ 80 | apt-get update && apt-get install -y --no-install-recommends \ 81 | habanalabs-rdma-core="$VERSION"-"$REVISION" \ 82 | habanalabs-thunk="$VERSION"-"$REVISION" \ 83 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 84 | habanalabs-graph="$VERSION"-"$REVISION" && \ 85 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 86 | mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ 87 | sed -i "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 88 | 89 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 90 | 91 | # SSH configuration necessary to support mpi-operator v2 92 | RUN mkdir -p /var/run/sshd && \ 93 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 94 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 95 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 96 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 97 | echo 
"/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ 98 | sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.ubuntu24.04: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Ubuntu 24.04 6 | FROM ubuntu:noble 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 13 | ENV HABANA_LOGS=/var/log/habana_logs/ 14 | ENV OS_NUMBER=2404 15 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 16 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends \ 19 | apt-transport-https \ 20 | apt-utils \ 21 | bc \ 22 | build-essential \ 23 | ca-certificates \ 24 | dkms \ 25 | ethtool \ 26 | gcc \ 27 | git \ 28 | gnupg \ 29 | gpg-agent \ 30 | graphviz \ 31 | libgl1 \ 32 | libgnutls30 \ 33 | libgoogle-glog0v6t64 \ 34 | libjemalloc2 \ 35 | libjpeg-dev \ 36 | libkrb5-3 \ 37 | libpq-dev \ 38 | lsof \ 39 | make \ 40 | openssh-client \ 41 | openssh-server \ 42 | protobuf-compiler \ 43 | python3 \ 44 | python3-dev \ 45 | unzip \ 46 | vim \ 47 | wget && \ 48 | apt-get update && apt-get upgrade -y libtasn1-6 && \ 49 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 50 | rm -f /etc/ssh/ssh_host_*_key* 51 | 52 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 53 | ENV PIP_NO_CACHE_DIR=on 54 | 55 | RUN mv /usr/lib/python3.12/EXTERNALLY-MANAGED /usr/lib/python3.12/EXTERNALLY-MANAGED.old && \ 56 | wget https://bootstrap.pypa.io/get-pip.py && \ 57 | python3 get-pip.py && \ 58 | rm -f get-pip.py && \ 59 | python3 -m pip install setuptools wheel && \ 60 | python3 -m pip install --upgrade Jinja2 61 | 62 | COPY install_efa.sh . 
63 | RUN ./install_efa.sh && rm -f install_efa.sh /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 64 | 65 | ENV MPI_ROOT=/opt/amazon/openmpi 66 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 67 | ENV PATH=${MPI_ROOT}/bin:$PATH 68 | ENV OPAL_PREFIX=${MPI_ROOT} 69 | ENV MPICC=${MPI_ROOT}/bin/mpicc 70 | ENV RDMAV_FORK_SAFE=1 71 | ENV FI_EFA_USE_DEVICE_RDMA=1 72 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 73 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 74 | 75 | RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ 76 | chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ 77 | chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ 78 | echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian noble main" | tee -a /etc/apt/sources.list && \ 79 | cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ 80 | sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ 81 | apt-get update && apt-get install -y --no-install-recommends \ 82 | habanalabs-rdma-core="$VERSION"-"$REVISION" \ 83 | habanalabs-thunk="$VERSION"-"$REVISION" \ 84 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 85 | habanalabs-graph="$VERSION"-"$REVISION" && \ 86 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 87 | mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ 88 | sed -i "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 89 | 90 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" 91 | 92 | # SSH configuration necessary to support mpi-operator v2 93 | RUN mkdir -p /var/run/sshd && \ 94 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 95 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 96 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 97 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 98 | echo "/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ 99 | sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc -------------------------------------------------------------------------------- /dockerfiles/base/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /dockerfiles/base/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = base-installer-${BUILD_OS} 5 | 6 | ifdef REPO_NAME 7 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg REPO_NAME=$(REPO_NAME) 8 | endif 9 | 10 | init: 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/LICENSE $(BUILD_DIR)/ 13 | $(HIDE)cp $(CURDIR)/*.sh $(BUILD_DIR)/ 14 | $(HIDE)cp $(CURDIR)/tencentos_efa_patch.txt $(BUILD_DIR)/ 15 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS) $(BUILD_DIR)/Dockerfile 16 | 17 | build: init 18 | -------------------------------------------------------------------------------- /dockerfiles/base/install-python310.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | _BASE_NAME=${1:-"ubuntu22.04"} 5 | _SSL_LIB="" 6 | 7 | # preinstall dependencies and define variables 8 | case "${_BASE_NAME}" in 9 | *ubuntu22.04* | *ubuntu24.04*) 10 | echo "Skip installation of Python 3.10 from sources on Ubuntu 22.04 and Ubuntu 24.04" 11 | exit 0; 12 | ;; 13 | *rhel*) 14 | dnf install -y sqlite-devel readline-devel xz-devel 15 | ;; 16 | *tencentos3.1*) 17 | dnf install -y sqlite-devel readline-devel zlib-devel xz-devel bzip2-devel libffi-devel 18 | wget -nv -O /opt/openssl-1.1.1w.tar.gz https://github.com/openssl/openssl/releases/download/OpenSSL_1_1_1w/openssl-1.1.1w.tar.gz && \ 19 | cd /opt/ && \ 20 | tar xzf openssl-1.1.1w.tar.gz && \ 21 | rm -rf openssl-1.1.1w.tar.gz && \ 22 | cd openssl-1.1.1w && \ 23 | ./config --prefix=/usr/local/openssl-1.1.1w shared zlib && \ 24 | make && make install 25 | ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem 26 | 27 | PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin 28 | LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH 29 | _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w" 30 | ;; 31 | esac 32 | 33 | # install Python 34 | wget -nv -O /opt/Python-3.10.14.tgz https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz 35 | cd /opt/ 36 | tar xzf Python-3.10.14.tgz 37 | rm -f Python-3.10.14.tgz 38 | cd Python-3.10.14 39 | ./configure --enable-optimizations --enable-loadable-sqlite-extensions --enable-shared $_SSL_LIB 40 | make -j && make altinstall 41 | 42 | # post install 43 | case "${_BASE_NAME}" in 44 | *rhel9*) 45 | alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 2 && \ 46 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 47 | alternatives --set python3 /usr/local/bin/python3.10 48 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 49 | ;; 50 | *tencentos3.1*) 51 | alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 4 && \ 52 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 3 && \ 53 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 54 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 55 | alternatives --set python3 /usr/local/bin/python3.10 56 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 57 | ;; 58 | esac 59 | 60 | python3 -m pip install --upgrade pip setuptools 61 | 62 | -------------------------------------------------------------------------------- /dockerfiles/base/install_efa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | 
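# Installs the AWS EFA user-space stack for the base images. Behaviour follows
# /etc/os-release: RHEL installs only the bundled rdma-core RPMs, TencentOS swaps
# in MLNX OFED user-space packages, and both skip the upstream efa_installer.sh
# run; Ubuntu executes the installer directly.
# Usage sketch (the single optional argument pins the installer version):
#   ./install_efa.sh            # uses DEFAULT_EFA_INSTALLER_VER below
#   ./install_efa.sh 1.34.0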
DEFAULT_EFA_INSTALLER_VER=1.34.0 4 | efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER} 5 | 6 | tmp_dir=$(mktemp -d) 7 | wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-$efa_installer_version.tar.gz -P $tmp_dir 8 | tar -xf $tmp_dir/aws-efa-installer-$efa_installer_version.tar.gz -C $tmp_dir 9 | RUN_EFA_INSTALLER="./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify" 10 | pushd $tmp_dir/aws-efa-installer 11 | . /etc/os-release 12 | case $ID in 13 | rhel) 14 | # we cannot install dkms packages on RHEL images due to OCP rules 15 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 16 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 17 | case $VERSION_ID in 18 | 8*) 19 | dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm 20 | ;; 21 | 9*) 22 | dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm 23 | ;; 24 | *) 25 | echo "Unsupported RHEL version: $VERSION_ID" 26 | exit 1 27 | ;; 28 | esac 29 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on RHEL'" 30 | ;; 31 | tencentos) 32 | # dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm 33 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 34 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 35 | rm -rf RPMS/ROCKYLINUX8/x86_64/rdma-core/rdma* 36 | patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch 37 | tmp_dir_ofed=$(mktemp -d) 38 | wget -O $tmp_dir_ofed/MLNX_OFED.tgz https://${ARTIFACTORY_URL}/artifactory/gaudi-installer/deps/MLNX_OFED_LINUX-5.8-3.0.7.0-rhel8.4-x86_64.tgz 39 | pushd $tmp_dir_ofed 40 | tar xf MLNX_OFED.tgz 41 | ofed_packages_path="mlnx-ofed" 42 | pushd mlnx-ofed 43 | yum install pciutils-libs tcsh tk python36 gcc-gfortran kernel-modules fuse-libs numactl-libs -y 44 | ./mlnxofedinstall --distro RHEL8.4 --skip-distro-check --user-space-only --skip-repo --force 45 | popd 46 | popd 47 | rm -rf $tmp_dir_ofed 48 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on tencentos'" 49 | ;; 50 | ubuntu) 51 | apt-get update 52 | ;; 53 | esac 54 | 55 | eval $RUN_EFA_INSTALLER 56 | 57 | case $ID in 58 | ubuntu) 59 | apt-get autoremove && rm -rf /var/lib/apt/lists/* 60 | ;; 61 | esac 62 | 63 | popd 64 | rm -rf $tmp_dir 65 | -------------------------------------------------------------------------------- /dockerfiles/base/tencentos_efa_patch.txt: -------------------------------------------------------------------------------- 1 | diff --git a/common.sh b/common.sh 2 | index 3c3a0e4..b463f42 100755 3 | --- a/common.sh 4 | +++ b/common.sh 5 | @@ -50,6 +50,15 @@ has_substring() { 6 | fi 7 | } 8 | 9 | +is_tencentos_3() { 10 | + . /etc/os-release 11 | + if [ "$NAME" = "TencentOS Server" ] && [ "$VERSION_ID" = "3.1" ]; then 12 | + return 0 13 | + else 14 | + return 1 15 | + fi 16 | +} 17 | + 18 | is_amazon_linux_2() { 19 | . 
/etc/os-release 20 | if [ "$NAME" = "Amazon Linux" ] && [ "$VERSION_ID" = "2" ]; then 21 | @@ -164,7 +173,7 @@ is_suse_15() { 22 | } 23 | 24 | install_cmd() { 25 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 26 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 27 | if [ $1 == "localinstall" ]; then 28 | shift 29 | yum -y localinstall $@ 30 | @@ -181,7 +190,7 @@ install_cmd() { 31 | fi 32 | } 33 | search_cmd() { 34 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 35 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 36 | yum list installed $@ 37 | elif is_suse_15; then 38 | zypper search --installed-only --match-exact $@ 39 | @@ -194,7 +203,7 @@ search_cmd() { 40 | } 41 | remove_cmd() { 42 | # we don't remove the dependencies of the efa packages as it may have reverse dependencies on other system packages 43 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 44 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 45 | rpm --erase --nodeps $@ 46 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 47 | # purge is identical to remove except that packages are removed and purged 48 | @@ -207,7 +216,7 @@ remove_cmd() { 49 | } 50 | # Get the list of file installed by the package name 51 | query_file_list_cmd() { 52 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 53 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 54 | rpm -ql $@ 55 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 56 | dpkg -L $@ 57 | @@ -220,7 +229,7 @@ query_file_list_cmd() { 58 | # reverse dependencies (some other installed packages depend on them) 59 | # this command will return non-zero 60 | remove_dryrun_cmd() { 61 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 62 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 63 | rpm --erase --test $@ 64 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 65 | dpkg -r --dry-run $@ 66 | diff --git a/efa_installer.sh b/efa_installer.sh 67 | index 544673f..faf3369 100755 68 | --- a/efa_installer.sh 69 | +++ b/efa_installer.sh 70 | @@ -97,7 +97,7 @@ select_mpi() { 71 | } 72 | 73 | detect_os() { 74 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 75 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 76 | PACKAGE_TYPE="rpm" 77 | KERNEL_SEARCH_STRING=kernel 78 | INSTALL_ARGS="--setopt=skip_missing_names_on_install=False" 79 | @@ -209,7 +209,7 @@ setup_install_package_paths() { 80 | local kmod_path 81 | 82 | if [ "${PACKAGE_TYPE}" = "rpm" ]; then 83 | - if 
is_rhel_8 || is_rockylinux_8; then 84 | + if is_rhel_8 || is_rockylinux_8|| is_tencentos_3; then 85 | base_dir="RPMS/ROCKYLINUX8/${arch}" 86 | debug_dir="RPMS/ROCKYLINUX8/${arch}/debug" 87 | elif is_rockylinux_9 || is_rhel_9; then 88 | @@ -465,7 +465,7 @@ install_apt_package() { 89 | install_dependencies() { 90 | local packages 91 | 92 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 93 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 94 | packages="pciutils rpmdevtools" 95 | if [ ${SKIP_KMOD} -eq 0 ]; then 96 | for kernel in ${INSTALLED_KERNELS[@]}; do 97 | @@ -785,7 +785,7 @@ uninstall_efa() { 98 | 99 | uninstall_old_efa_packages() { 100 | # Uninstall 'openmpi' and 'libfabric' if packaged by AWS. 101 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 102 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 103 | for pkg in openmpi libfabric libfabric-debuginfo; do 104 | rpm -ql $pkg | grep -q /opt/amazon 105 | if [ $? -eq 0 ]; then 106 | -------------------------------------------------------------------------------- /dockerfiles/common.mk: -------------------------------------------------------------------------------- 1 | VERBOSE ?= FALSE 2 | DOCKER ?= docker 3 | DOCKER_CACHE ?= FALSE 4 | BUILD_OS ?= ubuntu22.04 5 | BUILD_DIR ?= $(CURDIR)/dockerbuild 6 | 7 | REPO_SERVER ?= vault.habana.ai 8 | PT_VERSION ?= 2.6.0 9 | RELEASE_VERSION ?= 1.21.0 10 | RELEASE_BUILD_ID ?= 555 11 | 12 | BASE_IMAGE_URL ?= base-installer-$(BUILD_OS) 13 | IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) 14 | 15 | DOCKER_BUILD_ARGS := --build-arg ARTIFACTORY_URL=$(REPO_SERVER) --build-arg VERSION=$(RELEASE_VERSION) --build-arg REVISION=$(RELEASE_BUILD_ID) --build-arg BASE_NAME=$(BASE_IMAGE_URL) 16 | 17 | # Hide or not the calls depending of VERBOSE 18 | ifeq ($(VERBOSE),TRUE) 19 | HIDE = 20 | else 21 | HIDE = @ 22 | endif 23 | 24 | # Use cache for build depending of DOCKER_CACHE 25 | ifeq ($(DOCKER_CACHE),TRUE) 26 | CACH_FLAG = 27 | else 28 | CACH_FLAG = --no-cache 29 | endif 30 | 31 | .PHONY: help build clean 32 | 33 | help: ## Prints this help. 34 | @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) 35 | 36 | .DEFAULT_GOAL := help 37 | 38 | clean: ## clean the build dir 39 | $(HIDE)rm -rf $(BUILD_DIR) 40 | 41 | build: ## build docker image 42 | @echo Building image - $(IMAGE_NAME) 43 | $(HIDE)$(DOCKER) build --network=host $(CACH_FLAG) --tag $(IMAGE_URL) $(DOCKER_BUILD_ARGS) $(BUILD_DIR) 44 | @echo -n $(IMAGE_URL) | tee $(BUILD_DIR)/image_name 45 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel8.6: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
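# (Build arguments below are supplied by ../common.mk. A typical local build might
#  look like `cd dockerfiles/pytorch && make build BUILD_OS=rhel8.6`, which copies
#  this file into the build directory as Dockerfile.)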
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 8.6 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for RHEL8.6" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth 21 | 22 | RUN dnf update -y && dnf install -y \ 23 | cairo-devel \ 24 | curl \ 25 | gcc-toolset-11 \ 26 | gperftools-devel \ 27 | iproute \ 28 | jq \ 29 | lapack-devel \ 30 | numactl \ 31 | numactl-devel \ 32 | openblas-devel \ 33 | pdsh \ 34 | which \ 35 | zlib-devel && \ 36 | dnf clean all 37 | 38 | COPY install_packages.sh . 39 | 40 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 41 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 42 | 43 | # Configure GCC 11 44 | ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:${PATH} 45 | ENV MANPATH=/opt/rh/gcc-toolset-11/root/usr/share/man:${MANPATH} 46 | ENV INFOPATH=/opt/rh/gcc-toolset-11/root/usr/share/info:${INFOPATH} 47 | ENV PCP_DIR=/opt/rh/gcc-toolset-11/root 48 | ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:/opt/rh/gcc-toolset-11/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH} 49 | ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH} 50 | 51 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 52 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel9.2: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.2 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth 21 | 22 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 23 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 24 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 25 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 26 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 27 | 28 | RUN dnf update -y && dnf install --nobest --allowerasing -y \ 29 | cairo-devel \ 30 | curl \ 31 | gperftools-devel \ 32 | iproute \ 33 | jq \ 34 | lapack-devel \ 35 | numactl \ 36 | numactl-devel \ 37 | openblas-devel \ 38 | which \ 39 | zlib-devel && \ 40 | dnf clean all 41 | 42 | COPY install_packages.sh . 
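# install_packages.sh fetches the matching pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz
# bundle from the artifactory (OS string "rhel92" for this image) and runs the install.sh it contains.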
43 | 44 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 45 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 46 | 47 | # Set LD_PRELOAD after all required installations to 48 | # avoid warnings during docker creation 49 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 50 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.4 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt 21 | 22 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 23 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 24 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 25 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 26 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 27 | 28 | RUN dnf update -y && dnf install --nobest --nodocs --setopt=install_weak_deps=false --allowerasing -y \ 29 | cairo-devel \ 30 | gperftools-devel \ 31 | iproute \ 32 | jq \ 33 | lapack-devel \ 34 | numactl \ 35 | numactl-devel \ 36 | openblas-devel \ 37 | which \ 38 | zlib-devel && \ 39 | dnf clean all 40 | 41 | COPY install_packages.sh . 42 | 43 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 44 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 45 | 46 | # Set LD_PRELOAD after all required installations to 47 | # avoid warnings during docker creation 48 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 49 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.suse15.5: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for SUSE 15.5 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | # for RHEL certification 17 | LABEL name="PyTorch Installer" 18 | LABEL summary="Habanalabs PyTorch installer layer for SUSE 15.5" 19 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 20 | 21 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 22 | 23 | RUN zypper update -y && zypper install -y --allow-downgrade \ 24 | cairo-devel \ 25 | gperftools-devel \ 26 | jq \ 27 | lapack-devel \ 28 | numactl && \ 29 | zypper clean 30 | 31 | COPY install_packages.sh . 
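# (Note: on SUSE the tcmalloc preload set at the end of this file lives under
#  /usr/lib64 rather than /lib64 as in the RHEL images.)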
32 | 33 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 34 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 35 | 36 | # Set LD_PRELOAD after all required installations to 37 | # avoid warnings during docker creation 38 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 39 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.tencentos3.1: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 8.6 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | LABEL name="PyTorch Installer" 17 | LABEL summary="Habanalabs PyTorch installer layer for Tencentos 3.1" 18 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 19 | 20 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 21 | 22 | RUN dnf versionlock add openmpi* perftest* 23 | 24 | RUN dnf update -y && dnf install -y \ 25 | cairo-devel \ 26 | curl \ 27 | gcc-toolset-11 \ 28 | gperftools-devel \ 29 | iproute \ 30 | jq \ 31 | lapack-devel \ 32 | numactl \ 33 | numactl-devel \ 34 | openblas-devel \ 35 | libevent \ 36 | pdsh \ 37 | which \ 38 | zlib-devel && \ 39 | dnf clean all 40 | 41 | # Configure GCC 11 42 | ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:${PATH} 43 | ENV MANPATH=/opt/rh/gcc-toolset-11/root/usr/share/man:${MANPATH} 44 | ENV INFOPATH=/opt/rh/gcc-toolset-11/root/usr/share/info:${INFOPATH} 45 | ENV PCP_DIR=/opt/rh/gcc-toolset-11/root 46 | ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.5a1/lib64:/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:/opt/rh/gcc-toolset-11/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH} 47 | ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:/usr/mpi/gcc/openmpi-4.1.5a1/lib64/pkgconfig:${PKG_CONFIG_PATH} 48 | ENV CMAKE_PREFIX_PATH=/usr/mpi/gcc/openmpi-4.1.5a1/include:${CMAKE_PREFIX_PATH} 49 | 50 | COPY install_packages.sh . 51 | 52 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 53 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 54 | 55 | ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 56 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 57 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.ubuntu: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
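# (This Dockerfile is shared by the ubuntu22.04 and ubuntu24.04 builds: the pytorch
#  Makefile copies Dockerfile.ubuntu for any ubuntu BUILD_OS, and the case statement
#  below selects the matching python alternative.)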
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for Ubuntu22.04 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | 16 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends \ 19 | curl \ 20 | iproute2 \ 21 | jq \ 22 | libcurl4 \ 23 | libgoogle-perftools-dev \ 24 | libhdf5-dev \ 25 | libjpeg-dev \ 26 | liblapack-dev \ 27 | libnuma-dev \ 28 | libopenblas-dev \ 29 | moreutils \ 30 | numactl \ 31 | pdsh && \ 32 | apt-get autoremove && rm -rf /var/lib/apt/lists/* 33 | 34 | RUN bash -c "\ 35 | case $BASE_NAME in \ 36 | *ubuntu22.04*) \ 37 | update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \ 38 | ;; \ 39 | *ubuntu24.04*) \ 40 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 \ 41 | ;; \ 42 | esac" 43 | 44 | COPY install_packages.sh . 45 | 46 | RUN ./install_packages.sh && rm -f install_packages.sh && \ 47 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 48 | 49 | ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 50 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = pytorch-installer-${BUILD_OS}-$(PT_VERSION) 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | base: 8 | ifneq ($(shell $(DOCKER) image inspect $(BASE_IMAGE_URL):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) --format="image_exists" 2>/dev/null), image_exists) 9 | cd ../base; \ 10 | make build; \ 11 | cd ../pytorch 12 | endif 13 | 14 | init: base 15 | $(HIDE)mkdir -p $(BUILD_DIR) 16 | $(HIDE)cp $(CURDIR)/install_packages.sh $(BUILD_DIR)/ 17 | ifneq (,$(findstring ubuntu,$(BUILD_OS))) 18 | $(HIDE)cp $(CURDIR)/Dockerfile.ubuntu $(BUILD_DIR)/Dockerfile 19 | else 20 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS) $(BUILD_DIR)/Dockerfile 21 | endif 22 | 23 | build: init 24 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/install_packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | PT_PACKAGE_NAME="pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz" 5 | OS_STRING="ubuntu${OS_NUMBER}" 6 | case "${BASE_NAME}" in 7 | *sles15.5* | *suse15.5*) 8 | OS_STRING="suse155" 9 | ;; 10 | *rhel9.2*) 11 | OS_STRING="rhel92" 12 | ;; 13 | *rhel9.4*) 14 | OS_STRING="rhel94" 15 | ;; 16 | *rhel8*) 17 | OS_STRING="rhel86" 18 | ;; 19 | *tencentos*) 20 | OS_STRING="tencentos31" 21 | ;; 22 | esac 23 | PT_ARTIFACT_PATH="https://${ARTIFACTORY_URL}/artifactory/gaudi-pt-modules/${VERSION}/${REVISION}/pytorch/${OS_STRING}" 24 | 25 | TMP_PATH=$(mktemp --directory) 26 | wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" 27 | tar -zxf "${PT_PACKAGE_NAME}" -C "${TMP_PATH}"/. 
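# The extracted bundle ships its own install.sh, which is invoked below with the
# same VERSION/REVISION to perform the actual installation.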
28 | pushd "${TMP_PATH}" 29 | ./install.sh $VERSION $REVISION 30 | popd 31 | 32 | rm -rf "${TMP_PATH}" "${PT_PACKAGE_NAME}" 33 | -------------------------------------------------------------------------------- /dockerfiles/triton/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile triton installer layer for Ubuntu 22.04 6 | FROM nvcr.io/nvidia/tritonserver:23.12-py3 7 | ARG ARTIFACTORY_URL 8 | ARG PT_VERSION 9 | ARG VERSION 10 | ARG REVISION 11 | ARG HABANA_PIP_VERSION="22.3" 12 | ARG PT_BUILD_REPO=gaudi-pt-modules 13 | ARG PT_PACKAGE_NAME="pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz" 14 | ARG PT_ARTIFACT_PATH="https://"${ARTIFACTORY_URL}"/artifactory/${PT_BUILD_REPO}/"${VERSION}"/"${REVISION}"/pytorch/ubuntu2204" 15 | ARG PT_EXTRACT_PATH="/root/habanalabs/pytorch_temp" 16 | 17 | ENV DEBIAN_FRONTEND=noninteractive 18 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 19 | ENV HABANA_LOGS=/var/log/habana_logs/ 20 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 21 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 22 | ENV PIP_NO_CACHE_DIR=on 23 | ENV PIP_DEFAULT_TIMEOUT=1000 24 | ENV MPI_ROOT=/opt/hpcx/ompi 25 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 26 | ENV PATH=${MPI_ROOT}/bin:$PATH 27 | ENV OPAL_PREFIX=${MPI_ROOT} 28 | ENV MPICC=${MPI_ROOT}/bin/mpicc 29 | ENV RDMAV_FORK_SAFE=1 30 | ENV PYTHONPATH=/root:/usr/lib/habanalabs 31 | RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 32 | wget "https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public" && \ 33 | apt-key add public && rm public && apt-get update && \ 34 | apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ 35 | habanalabs-thunk="$VERSION"-"$REVISION" \ 36 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 37 | habanalabs-graph="$VERSION"-"$REVISION" && \ 38 | apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ 39 | sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 40 | 41 | RUN apt-get update && apt-get install -y \ 42 | libjemalloc2 \ 43 | libcairo2-dev \ 44 | libglib2.0-dev \ 45 | libhdf5-dev \ 46 | libnuma-dev \ 47 | libpcre2-dev \ 48 | libjpeg-dev \ 49 | liblapack-dev \ 50 | libopenblas-dev \ 51 | numactl \ 52 | libgoogle-perftools-dev && \ 53 | apt-get clean && rm -rf /var/lib/apt/lists/* 54 | 55 | RUN python3 -m pip install pip==24.2 --disable-pip-version-check && \ 56 | python3 -m pip install setuptools==75.1.0 --disable-pip-version-check && \ 57 | python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check 58 | 59 | RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ 60 | mkdir -p /root/habanalabs/pytorch_temp && \ 61 | tar -xf pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz -C ${PT_EXTRACT_PATH}/. 
&& \ 62 | python3 -m pip install pip=="${HABANA_PIP_VERSION}" && \ 63 | pip install mpi4py==3.1.4 --disable-pip-version-check && \ 64 | grep -ivE "#|lightning" ${PT_EXTRACT_PATH}/requirements-pytorch.txt > ${PT_EXTRACT_PATH}/requirements-pytorch-nolightning.txt && \ 65 | pip install -r ${PT_EXTRACT_PATH}/requirements-pytorch-nolightning.txt --no-warn-script-location --disable-pip-version-check && \ 66 | pip install ${PT_EXTRACT_PATH}/*.whl --disable-pip-version-check && \ 67 | grep "lightning" ${PT_EXTRACT_PATH}/requirements-pytorch.txt > ${PT_EXTRACT_PATH}/requirements-pytorch-lightning.txt && \ 68 | pip install -r ${PT_EXTRACT_PATH}/requirements-pytorch-lightning.txt --disable-pip-version-check && \ 69 | echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ 70 | pip uninstall -y pillow && \ 71 | pip uninstall -y pillow-simd && \ 72 | pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ 73 | rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* 74 | 75 | ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 76 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/triton/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = triton-installer-$(PT_VERSION)-${BUILD_OS} 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | init: 8 | ifneq ($(BUILD_OS), ubuntu22.04) 9 | $(error triton is only supported on ubuntu22.04) 10 | endif 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/Dockerfile $(BUILD_DIR)/Dockerfile 13 | 14 | build: init 15 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile triton installer layer for Ubuntu 22.04 6 | FROM nvcr.io/nvidia/tritonserver:24.06-py3 7 | ARG ARTIFACTORY_URL 8 | ARG PT_VERSION 9 | ARG VERSION 10 | ARG REVISION 11 | ARG HABANA_PIP_VERSION="22.3" 12 | ARG PT_BUILD_REPO=gaudi-pt-modules 13 | ARG PT_PACKAGE_NAME="pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz" 14 | ARG PT_ARTIFACT_PATH="https://"${ARTIFACTORY_URL}"/artifactory/${PT_BUILD_REPO}/"${VERSION}"/"${REVISION}"/pytorch/ubuntu2204" 15 | ENV DEBIAN_FRONTEND=noninteractive 16 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 17 | ENV HABANA_LOGS=/var/log/habana_logs/ 18 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 19 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 20 | ENV PIP_NO_CACHE_DIR=on 21 | ENV PIP_DEFAULT_TIMEOUT=1000 22 | ENV MPI_ROOT=/opt/hpcx/ompi 23 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 24 | ENV PATH=${MPI_ROOT}/bin:$PATH 25 | ENV OPAL_PREFIX=${MPI_ROOT} 26 | ENV MPICC=${MPI_ROOT}/bin/mpicc 27 | ENV RDMAV_FORK_SAFE=1 28 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 29 | 30 | ADD model.py . 
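# model.py provides the vLLM model implementation for the Triton backend; it is
# added here and copied again into /opt/tritonserver/backends/vllm/ further down.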
31 | RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 32 | wget "https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public" && \ 33 | apt-key add public && rm public && apt-get update && \ 34 | apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ 35 | habanalabs-thunk="$VERSION"-"$REVISION" \ 36 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 37 | habanalabs-graph="$VERSION"-"$REVISION" && \ 38 | apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ 39 | sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 40 | 41 | RUN apt-get update && apt-get install -y \ 42 | libjemalloc2 \ 43 | libcairo2-dev \ 44 | libglib2.0-dev \ 45 | libhdf5-dev \ 46 | libnuma-dev \ 47 | libpcre2-dev \ 48 | libjpeg-dev \ 49 | liblapack-dev \ 50 | libopenblas-dev \ 51 | numactl \ 52 | libgoogle-perftools-dev && \ 53 | apt-get clean && rm -rf /var/lib/apt/lists/* 54 | 55 | RUN python3 -m pip install pip==23.3.1 --disable-pip-version-check && \ 56 | python3 -m pip install setuptools==67.3.3 --disable-pip-version-check && \ 57 | python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check 58 | 59 | RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ 60 | mkdir -p /root/habanalabs/pytorch_temp && \ 61 | tar -xf pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz -C /root/habanalabs/pytorch_temp/. && \ 62 | python3 -m pip install pip=="${HABANA_PIP_VERSION}" && \ 63 | pip install mpi4py==3.1.4 --disable-pip-version-check && \ 64 | #pip install $(grep -ivE "#|lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt | grep .) --no-warn-script-location --disable-pip-version-check && \ 65 | pip install /root/habanalabs/pytorch_temp/*.whl --disable-pip-version-check && \ 66 | pip install $(grep "lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt) --disable-pip-version-check && \ 67 | echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ 68 | pip uninstall -y pillow && \ 69 | pip uninstall -y pillow-simd && \ 70 | pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ 71 | rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* 72 | #RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@v0.4.2-Gaudi-1.16.0 73 | RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@275e3250ba6ed8cc13b2d6e4928db73df420e64b 74 | 75 | RUN mkdir -p /opt/tritonserver/backends/vllm 76 | COPY model.py /opt/tritonserver/backends/vllm/ 77 | 78 | ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 79 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 80 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = triton-installer-$(PT_VERSION)-${BUILD_OS} 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | init: 8 | ifneq ($(BUILD_OS), ubuntu22.04) 9 | $(error triton is only supported on ubuntu22.04) 10 | endif 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/Dockerfile $(BUILD_DIR)/Dockerfile 13 | $(HIDE)cp $(CURDIR)/model.py $(BUILD_DIR)/model.py 14 | 15 | build: init 16 | 
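# Build sketch based on the defaults in ../common.mk (the init target above
# restricts this image to ubuntu22.04):
#   cd dockerfiles/triton_vllm_backend
#   make build BUILD_OS=ubuntu22.04 PT_VERSION=2.6.0 RELEASE_VERSION=1.21.0 RELEASE_BUILD_ID=555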
-------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
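# Usage sketch (flags are the argparse options defined at the bottom of this file,
# shown here with their default values; add -s/--streaming-mode to enable streaming):
#
#   python3 client.py --model vllm_model --url localhost:8001 \
#       --input-prompts prompts.txt --results-file results.txt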
28 | 29 | import argparse 30 | import asyncio 31 | import json 32 | import sys 33 | 34 | import numpy as np 35 | import tritonclient.grpc.aio as grpcclient 36 | from tritonclient.utils import * 37 | 38 | 39 | class LLMClient: 40 | def __init__(self, flags: argparse.Namespace): 41 | self._client = grpcclient.InferenceServerClient( 42 | url=flags.url, verbose=flags.verbose 43 | ) 44 | self._flags = flags 45 | self._loop = asyncio.get_event_loop() 46 | self._results_dict = {} 47 | 48 | async def async_request_iterator( 49 | self, prompts, sampling_parameters, exclude_input_in_output 50 | ): 51 | try: 52 | for iter in range(self._flags.iterations): 53 | for i, prompt in enumerate(prompts): 54 | prompt_id = self._flags.offset + (len(prompts) * iter) + i 55 | self._results_dict[str(prompt_id)] = [] 56 | yield self.create_request( 57 | prompt, 58 | self._flags.streaming_mode, 59 | prompt_id, 60 | sampling_parameters, 61 | exclude_input_in_output, 62 | ) 63 | except Exception as error: 64 | print(f"Caught an error in the request iterator: {error}") 65 | 66 | async def stream_infer(self, prompts, sampling_parameters, exclude_input_in_output): 67 | try: 68 | # Start streaming 69 | response_iterator = self._client.stream_infer( 70 | inputs_iterator=self.async_request_iterator( 71 | prompts, sampling_parameters, exclude_input_in_output 72 | ), 73 | stream_timeout=self._flags.stream_timeout, 74 | ) 75 | async for response in response_iterator: 76 | yield response 77 | except InferenceServerException as error: 78 | print(error) 79 | sys.exit(1) 80 | 81 | async def process_stream( 82 | self, prompts, sampling_parameters, exclude_input_in_output 83 | ): 84 | # Clear results in between process_stream calls 85 | self.results_dict = [] 86 | success = True 87 | # Read response from the stream 88 | async for response in self.stream_infer( 89 | prompts, sampling_parameters, exclude_input_in_output 90 | ): 91 | result, error = response 92 | if error: 93 | print(f"Encountered error while processing: {error}") 94 | success = False 95 | else: 96 | output = result.as_numpy("text_output") 97 | for i in output: 98 | self._results_dict[result.get_response().id].append(i) 99 | return success 100 | 101 | async def run(self): 102 | # Sampling parameters for text generation 103 | # including `temperature`, `top_p`, top_k`, `max_tokens`, `early_stopping`. 
104 | # Full list available at: 105 | # https://github.com/vllmproject/vllm/blob/5255d99dc595f9ae7647842242d6542aa4145a4f/vllm/sampling_params.py#L23 106 | sampling_parameters = { 107 | "temperature": "0.1", 108 | "top_p": "0.95", 109 | "max_tokens": "100", 110 | } 111 | exclude_input_in_output = self._flags.exclude_inputs_in_outputs 112 | if self._flags.lora_name is not None: 113 | sampling_parameters["lora_name"] = self._flags.lora_name 114 | with open(self._flags.input_prompts, "r") as file: 115 | print(f"Loading inputs from `{self._flags.input_prompts}`...") 116 | prompts = file.readlines() 117 | 118 | success = await self.process_stream( 119 | prompts, sampling_parameters, exclude_input_in_output 120 | ) 121 | 122 | with open(self._flags.results_file, "w") as file: 123 | for id in self._results_dict.keys(): 124 | for result in self._results_dict[id]: 125 | file.write(result.decode("utf-8")) 126 | 127 | file.write("\n") 128 | file.write("\n=========\n\n") 129 | print(f"Storing results into `{self._flags.results_file}`...") 130 | 131 | if self._flags.verbose: 132 | with open(self._flags.results_file, "r") as file: 133 | print(f"\nContents of `{self._flags.results_file}` ===>") 134 | print(file.read()) 135 | if success: 136 | print("PASS: vLLM example") 137 | else: 138 | print("FAIL: vLLM example") 139 | 140 | def run_async(self): 141 | self._loop.run_until_complete(self.run()) 142 | 143 | def create_request( 144 | self, 145 | prompt, 146 | stream, 147 | request_id, 148 | sampling_parameters, 149 | exclude_input_in_output, 150 | send_parameters_as_tensor=True, 151 | ): 152 | inputs = [] 153 | prompt_data = np.array([prompt.encode("utf-8")], dtype=np.object_) 154 | try: 155 | inputs.append(grpcclient.InferInput("text_input", [1], "BYTES")) 156 | inputs[-1].set_data_from_numpy(prompt_data) 157 | except Exception as error: 158 | print(f"Encountered an error during request creation: {error}") 159 | 160 | stream_data = np.array([stream], dtype=bool) 161 | inputs.append(grpcclient.InferInput("stream", [1], "BOOL")) 162 | inputs[-1].set_data_from_numpy(stream_data) 163 | 164 | # Request parameters are not yet supported via BLS. Provide an 165 | # optional mechanism to send serialized parameters as an input 166 | # tensor until support is added 167 | 168 | if send_parameters_as_tensor: 169 | sampling_parameters_data = np.array( 170 | [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_ 171 | ) 172 | inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES")) 173 | inputs[-1].set_data_from_numpy(sampling_parameters_data) 174 | 175 | inputs.append(grpcclient.InferInput("exclude_input_in_output", [1], "BOOL")) 176 | inputs[-1].set_data_from_numpy(np.array([exclude_input_in_output], dtype=bool)) 177 | 178 | # Add requested outputs 179 | outputs = [] 180 | outputs.append(grpcclient.InferRequestedOutput("text_output")) 181 | 182 | # Issue the asynchronous sequence inference. 
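        # (The dict below is one element of the request stream: async_request_iterator
        #  yields it and stream_infer forwards it to the server.)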
183 | return { 184 | "model_name": self._flags.model, 185 | "inputs": inputs, 186 | "outputs": outputs, 187 | "request_id": str(request_id), 188 | "parameters": sampling_parameters, 189 | } 190 | 191 | 192 | if __name__ == "__main__": 193 | parser = argparse.ArgumentParser() 194 | parser.add_argument( 195 | "-m", 196 | "--model", 197 | type=str, 198 | required=False, 199 | default="vllm_model", 200 | help="Model name", 201 | ) 202 | parser.add_argument( 203 | "-v", 204 | "--verbose", 205 | action="store_true", 206 | required=False, 207 | default=False, 208 | help="Enable verbose output", 209 | ) 210 | parser.add_argument( 211 | "-u", 212 | "--url", 213 | type=str, 214 | required=False, 215 | default="localhost:8001", 216 | help="Inference server URL and its gRPC port. Default is localhost:8001.", 217 | ) 218 | parser.add_argument( 219 | "-t", 220 | "--stream-timeout", 221 | type=float, 222 | required=False, 223 | default=None, 224 | help="Stream timeout in seconds. Default is None.", 225 | ) 226 | parser.add_argument( 227 | "--offset", 228 | type=int, 229 | required=False, 230 | default=0, 231 | help="Add offset to request IDs used", 232 | ) 233 | parser.add_argument( 234 | "--input-prompts", 235 | type=str, 236 | required=False, 237 | default="prompts.txt", 238 | help="Text file with input prompts", 239 | ) 240 | parser.add_argument( 241 | "--results-file", 242 | type=str, 243 | required=False, 244 | default="results.txt", 245 | help="The file with output results", 246 | ) 247 | parser.add_argument( 248 | "--iterations", 249 | type=int, 250 | required=False, 251 | default=1, 252 | help="Number of iterations through the prompts file", 253 | ) 254 | parser.add_argument( 255 | "-s", 256 | "--streaming-mode", 257 | action="store_true", 258 | required=False, 259 | default=False, 260 | help="Enable streaming mode", 261 | ) 262 | parser.add_argument( 263 | "--exclude-inputs-in-outputs", 264 | action="store_true", 265 | required=False, 266 | default=False, 267 | help="Exclude prompt from outputs", 268 | ) 269 | parser.add_argument( 270 | "-l", 271 | "--lora-name", 272 | type=str, 273 | required=False, 274 | default=None, 275 | help="The querying LoRA name", 276 | ) 277 | FLAGS = parser.parse_args() 278 | 279 | client = LLMClient(FLAGS) 280 | client.run_async() 281 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-7b-hf", 3 | "tokenizer":"meta-llama/Llama-2-7b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/prompts.txt: -------------------------------------------------------------------------------- 1 | Hello, my name is 2 | The most dangerous animal is 3 | The capital of France is 4 | The future of AI is 5 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-70b-hf", 3 | "tokenizer":"meta-llama/Llama-2-70b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 8, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-7b-hf", 3 | "tokenizer":"meta-llama/Llama-2-7b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"Qwen/Qwen2-7B-Instruct", 3 | "tokenizer":"Qwen/Qwen2-7B-Instruct", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 131072, 12 | "chat_template": "true" 13 | } 14 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /legal-disclaimer.md: -------------------------------------------------------------------------------- 1 | ## Legal Notice and Disclaimer 2 | 3 | No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document. 
4 | 5 | Habana Labs disclaims all warranties, including without limitation, the implied warranties of merchantability, fitness for a particular purpose, and non-infringement, as well as any warranty arising from course of performance, course of dealing, or usage in trade. 6 | 7 | All information provided here is subject to change without notice. Habana Labs may make changes to its test conditions and internal reliability goals at any time. Contact your Habana Labs representative to obtain the latest Habana Labs product specifications and roadmaps. Your costs and results may vary. 8 | 9 | The products described may contain design defects or errors known as errata which may cause the product to deviate from published specifications. Current characterized errata are available on request. 10 | 11 | Software and workloads used in performance tests may have been optimized for performance only on Habana Labs hardware. Performance tests, such as SYSmark and MobileMark, are measured using specific computer systems, components, software, operations and functions. Any change to any of those factors may cause the results to vary. You should consult other information and performance tests to assist you in fully evaluating your contemplated purchases, including the performance of that product when combined with other products. 12 | 13 | No product or component can be absolutely secure. 14 | 15 | Habana Labs, Gaudi and SynapseAI are trademarks of Habana Labs in the U.S. and/or other countries. 16 | 17 | *Other names and brands may be claimed as the property of others. 18 | 19 | © 2021 Habana Labs 20 | -------------------------------------------------------------------------------- /utils/README.md: -------------------------------------------------------------------------------- 1 | # Gaudi Utils 2 | 3 | By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). 4 | 5 | ## Table of Contents 6 | 7 | - [Gaudi Utils](#gaudi-utils) 8 | - [Table of Contents](#table-of-contents) 9 | - [Overview](#overview) 10 | - [manage\_network\_ifs](#manage_network_ifs) 11 | - [Operations](#operations) 12 | - [Up](#up) 13 | - [Down](#down) 14 | - [Status](#status) 15 | - [Set IP](#set-ip) 16 | - [Unset IP](#unset-ip) 17 | - [check\_framework\_env](#check_framework_env) 18 | - [Intel Gaudi Health Screen (IGHS)](#intel-gaudi-health-screen-ighs) 19 | 20 | ## Overview 21 | 22 | Welcome to Intel Gaudi's Util Scripts! 23 | 24 | This folder contains some Intel Gaudi utility scripts that users can access as reference. 25 | 26 | ## manage_network_ifs 27 | 28 | Moved to habanalabs-qual Example: (/opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh or /opt/habanalabs/qual/gaudi3/bin/manage_network_ifs.sh). 29 | 30 | This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Intel Gaudi network interfaces. 
31 | 32 | The following is the usage of the script: 33 | 34 | ``` 35 | usage: ./manage_network_ifs.sh [options] 36 | 37 | options: 38 | --up toggle up all Intel Gaudi network interfaces 39 | --down toggle down all Intel Gaudi network interfaces 40 | --status print status of all Intel Gaudi network interfaces 41 | --set-pfc set PFC (enabled=0,1,2,3) 42 | --unset-pfc unset PFC (enabled=none) 43 | --check-pfc dump PFC configuration 44 | --no-progbar do not show progress bar 45 | -v, --verbose print more logs 46 | -h, --help print this help 47 | 48 | Note: Please run this script with one operation at a time 49 | ``` 50 | ## Operations 51 | 52 | Before executing any operation, this script finds all the Intel Gaudi network interfaces available on the system and stores the Intel Gaudi interface information into a list. 53 | The list will be used for the operations. If no Intel Gaudi network interface is found, the script will exit. 54 | 55 | ### Up 56 | 57 | Use the following command to bring all Intel Gaudi network interfaces online: 58 | ``` 59 | sudo manage_network_ifs.sh --up 60 | ``` 61 | ### Down 62 | 63 | Use the following command to bring all Intel Gaudi network interfaces offline: 64 | ``` 65 | sudo manage_network_ifs.sh --down 66 | ``` 67 | ### Status 68 | 69 | Print the current operational state of all Intel Gaudi network interfaces such as how many ports are up/down: 70 | ``` 71 | sudo manage_network_ifs.sh --status 72 | ``` 73 | ### Set PFC 74 | 75 | Use the following command to set PFC for all Intel Gaudi network interfaces: 76 | ``` 77 | sudo manage_network_ifs.sh --set-pfc 78 | ``` 79 | ### Unset PFC 80 | 81 | Use the following command to unset PFC for all Intel Gaudi network interfaces: 82 | ``` 83 | sudo manage_network_ifs.sh --unset-pfc 84 | ``` 85 | 86 | ### Check current PFC configuration 87 | 88 | Use the following command to check current PFC status for all Intel Gaudi network interfaces: 89 | ``` 90 | sudo manage_network_ifs.sh --check-pfc 91 | ``` 92 | 93 | ## check_framework_env 94 | 95 | This script can be used as reference to check the environment for running PyTorch on Intel Gaudi. 96 | 97 | The following is the usage of the script: 98 | 99 | ``` 100 | usage: check_framework_env.py [-h] [--cards CARDS] 101 | 102 | Check health of Intel Gaudi for PyTorch 103 | 104 | optional arguments: 105 | -h, --help show this help message and exit 106 | --cards CARDS Set number of cards to test (default: 1) 107 | ``` 108 | 109 | ## Intel Gaudi Health Screen (IGHS) 110 | 111 | **Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test 112 | includes checking gaudi port status, running small workloads, and running standard collective operations arcoss multiple systems. 
113 | 114 | ``` bash 115 | usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] 116 | [--job-id JOB_ID] [--round ROUND] [--config CONFIG] 117 | [--ighs-check [{node,hccl-demo,none}]] [--node-write-report] 118 | [--node-name NODE_NAME] [--logs-dir LOGS_DIR] 119 | 120 | optional arguments: 121 | -h, --help show this help message and exit 122 | --initialize Downloads Necessary Repos and Creates Report Template 123 | --screen Starts Health Screen for Cluster 124 | --target-nodes TARGET_NODES 125 | List of target nodes 126 | --job-id JOB_ID Needed to identify hccl-demo running log 127 | --round ROUND Needed to identify hccl-demo running round log 128 | --config CONFIG Configuration file for Health Screener 129 | --ighs-check [{node,hccl-demo,none}] 130 | Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce 131 | (HCCL_DEMO between paris of nodes) 132 | --node-write-report Write Individual Node Health Report 133 | --node-name NODE_NAME Name of Node 134 | --logs-dir LOGS_DIR Output directory of health screen results 135 | ``` 136 | 137 | To run a full IGHS test, run the below command: 138 | 139 | ``` bash 140 | # Creates IGHS Report and screens clusters for any infected nodes. 141 | # Will check Level 1 and 2 by default 142 | python screen.py --initialize --screen 143 | ``` 144 | 145 | IGHS can alternatively be run through below script: 146 | 147 | ``` bash 148 | # Creates IGHS Report and screens clusters for any infected nodes. 149 | # Will check Level 1 and 2 by default 150 | ./run_ighs.sh 151 | ``` 152 | -------------------------------------------------------------------------------- /utils/check_framework_env.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (C) 2022 Habana Labs, Ltd. an Intel Company 3 | # All Rights Reserved. 4 | # 5 | # Unauthorized copying of this file or any element(s) within it, via any medium 6 | # is strictly prohibited. 7 | # This file contains Habana Labs, Ltd. proprietary and confidential information 8 | # and is subject to the confidentiality and license agreements under which it 9 | # was provided. 10 | # 11 | ############################################################################### 12 | 13 | import argparse 14 | import os 15 | import concurrent.futures 16 | 17 | def parse_arguments(): 18 | parser = argparse.ArgumentParser(description="Check health of Intel Gaudi for PyTorch") 19 | 20 | parser.add_argument("--cards", 21 | default=1, 22 | type=int, 23 | required=False, 24 | help="Set number of cards to test (default: 1)") 25 | 26 | args = parser.parse_args() 27 | print(f"Configuration: {args}") 28 | 29 | return args 30 | 31 | def pytorch_test(device_id=0): 32 | """ Checks health of Intel Gaudi through running a basic 33 | PyTorch example on Intel Gaudi 34 | 35 | Args: 36 | device_id (int, optional): ID of Intel Gaudi. Defaults to 0. 
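
    Returns:
        int: the device ID that was tested, returned when the sanity check passes.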
37 | """ 38 | 39 | os.environ["HLS_MODULE_ID"] = str(device_id) 40 | os.environ["HABANA_VISIBLE_MODULES"] = str(device_id) 41 | 42 | try: 43 | import torch 44 | import habana_frameworks.torch.core 45 | except Exception as e: 46 | print(f"Card {device_id} Failed to initialize Intel Gaudi PyTorch: {str(e)}") 47 | raise 48 | 49 | try: 50 | x = torch.tensor([2]).to('hpu') 51 | y = x + x 52 | 53 | assert y == 4, 'Sanity check failed: Wrong Add output' 54 | assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card' 55 | except (RuntimeError, AssertionError) as e: 56 | print(f"Card Module ID {device_id} Failure: {e}") 57 | raise 58 | 59 | return device_id 60 | 61 | if __name__ == '__main__': 62 | args = parse_arguments() 63 | passed_cards = set() 64 | 65 | with concurrent.futures.ProcessPoolExecutor() as executor: 66 | futures = [executor.submit(pytorch_test, device_id) for device_id in range(args.cards)] 67 | for future in concurrent.futures.as_completed(futures): 68 | try: 69 | dev_id = future.result() 70 | passed_cards.add(dev_id) 71 | print(f"Card module_id {dev_id} PASSED") 72 | 73 | except Exception as e: 74 | print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") 75 | 76 | failed_cards = set(range(args.cards)) - passed_cards 77 | 78 | print(f"Failed cards Module ID: {failed_cards}") 79 | print(f"Passed cards Module ID: {passed_cards}") -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/.gitignore: -------------------------------------------------------------------------------- 1 | tmp/* 2 | build/* 3 | logs/* 4 | .graph_dump/* 5 | __pycache__* -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/HealthReport.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, csv, time, shutil, fcntl, glob, copy 14 | from collections import defaultdict 15 | from tempfile import NamedTemporaryFile 16 | 17 | from utilities import copy_files 18 | 19 | import logging 20 | 21 | _logger = logging.getLogger("health_screener") 22 | 23 | class HealthReport(): 24 | 25 | def __init__(self, f_dir="tmp", report_name="health_report.csv"): 26 | """ Initialize Health Report Class 27 | 28 | Args: 29 | f_dir (str, optional): File Directory to store Health Report logs and results. Defaults to "tmp". 30 | report_name (str, optional): File name of Health Report csv. Defaults to "health_report.csv". 
31 | """ 32 | self.header = ["node_id", "index", "module_id", "pci_address", "temperature_C", "temperature_state_C", "device_acquire_fail", "down_links", "multi_node_fail", "missing"] 33 | 34 | self.f_dir = f_dir 35 | self.report_name = report_name 36 | self.f_path = f"{self.f_dir}/{self.report_name}" 37 | 38 | self.header_hccl_demo = ["round","group_id", "node_ids", "num_nodes", "multi_node_fail", "missing", "qpc_fail"] 39 | self.f_path_hccl_demo = f"{self.f_dir}/{os.path.splitext(self.report_name)[0]}_hccl_demo.csv" 40 | 41 | 42 | def create(self, create_base=True, create_hccl_demo=False): 43 | """Create CSV Health Report Files. One for Base Health Checks and HCCL Demo Checks 44 | 45 | Args: 46 | create_base (bool, optional): Create Base Health_Report CSV file. Defaults to True. 47 | create_hccl_demo (bool, optional): Create HCCL_DEMO_Health_Report if it doesn't exist. Defaults to False. 48 | """ 49 | 50 | dir_name = os.path.dirname(self.f_path) 51 | if not os.path.exists(dir_name): 52 | os.makedirs(dir_name) 53 | 54 | if create_base: 55 | with open(self.f_path, "w+", newline='') as f: 56 | writer = csv.DictWriter(f, fieldnames=self.header, extrasaction='ignore') 57 | writer.writeheader() 58 | _logger.info(f"Created {self.f_path} with header: {self.header}") 59 | 60 | if create_hccl_demo and not self.exist(level=2): 61 | with open(self.f_path_hccl_demo, "w+", newline='') as f: 62 | writer = csv.DictWriter(f, fieldnames=self.header_hccl_demo, extrasaction='ignore') 63 | writer.writeheader() 64 | _logger.info(f"Created {self.f_path_hccl_demo} with header: {self.header_hccl_demo}") 65 | 66 | def exist(self, level=1): 67 | """Checks to see if Base Health Report exist 68 | 69 | Args: 70 | level (int, optional): Health Screen level report csv to check. Defaults to 1. 71 | 72 | Returns: 73 | bool: Returns True if the Base Health Report (self.f_path) or HCCL_DEMO Health Report (self.f_path_hccl_demo) exist 74 | """ 75 | f_path = self.f_path 76 | 77 | if level == 2: 78 | f_path = self.f_path_hccl_demo 79 | 80 | return os.path.exists(f_path) 81 | 82 | def write_rows(self, data=list(), level=1): 83 | """ Write health check results to Health Report CSV. Can write multiple rows at once 84 | 85 | Args: 86 | data (_type_, optional): Health Report CSV Row data. Defaults to list(). 87 | level (int, optional): Health Screen Level. Defaults to 1. 
88 | """ 89 | 90 | if level == 1: 91 | f_path = self.f_path 92 | header = self.header 93 | 94 | 95 | elif level == 2: 96 | f_path = self.f_path_hccl_demo 97 | header = self.header_hccl_demo 98 | 99 | with open(f_path, "a", newline='') as f: 100 | fcntl.flock(f, fcntl.LOCK_EX) 101 | writer = csv.DictWriter(f, fieldnames=header, extrasaction='ignore') 102 | writer.writerows(data) 103 | time.sleep(0.1) 104 | fcntl.flock(f, fcntl.LOCK_UN) 105 | 106 | def update_health_report(self, detected_nodes, infected_nodes, missing_nodes): 107 | """ Update health_report with hccl_demo results 108 | 109 | Args: 110 | detected_nodes (list[str]): List of detected node_ids 111 | infected_nodes (list[str]): List of infected node_ids 112 | missing_nodes (list[str]): List of missing node_ids 113 | """ 114 | temp_file = NamedTemporaryFile(mode='w', delete=False) 115 | detected_nodes_cp = detected_nodes.copy() 116 | 117 | with open(self.f_path, 'r', newline='') as csv_file, temp_file: 118 | reader = csv.DictReader(csv_file) 119 | writer = csv.DictWriter(temp_file, fieldnames=self.header) 120 | 121 | writer.writeheader() 122 | for row in reader: 123 | if row["node_id"] in infected_nodes or row["node_id"] in missing_nodes: 124 | row["multi_node_fail"] = True 125 | elif row["node_id"] in detected_nodes_cp: 126 | row["multi_node_fail"] = False 127 | row["missing"] = False 128 | 129 | writer.writerow(row) 130 | 131 | missing_nodes.discard(row["node_id"]) 132 | detected_nodes_cp.discard(row["node_id"]) 133 | 134 | # These are unreported Detected Nodes. Add to Report 135 | if len(detected_nodes_cp): 136 | for n in detected_nodes_cp: 137 | writer.writerow({"node_id": n, "multi_node_fail": False, "missing": False}) 138 | 139 | # These are unreported Missing Nodes. Add to Report 140 | if len(missing_nodes): 141 | for n in missing_nodes: 142 | writer.writerow({"node_id": n, "multi_node_fail": True, "missing": True}) 143 | 144 | shutil.move(temp_file.name, self.f_path) 145 | 146 | def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail, qpc_fail, missing_nodes): 147 | """ Update health_report with hccl_demo results, based on infected_nodes. 148 | 149 | Args: 150 | all_node_pairs (list[str]): List of all Node Pairs reported by Level 2 round 151 | multi_node_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test 152 | qpc_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test due to QPC error 153 | missing_nodes (list[str]): List of Node Pairs that couldn't run HCCL_Demo 154 | """ 155 | temp_file = NamedTemporaryFile(mode='w', delete=False) 156 | 157 | with open(self.f_path_hccl_demo, 'r', newline='') as csv_file, temp_file: 158 | reader = csv.DictReader(csv_file) 159 | writer = csv.DictWriter(temp_file, fieldnames=self.header_hccl_demo, extrasaction='ignore') 160 | 161 | writer.writeheader() 162 | for row in reader: 163 | if(row["round"] == round): 164 | row["multi_node_fail"] = (row["node_ids"] in multi_node_fail) 165 | row["qpc_fail"] = (row["node_ids"] in qpc_fail) 166 | row["missing"] = (row["node_ids"] in missing_nodes) 167 | 168 | if row["node_ids"] in all_node_pairs: 169 | del all_node_pairs[row["node_ids"]] 170 | 171 | writer.writerow(row) 172 | 173 | # These are unreported node_pairs. 
Add remaining node pairs 174 | if len(all_node_pairs): 175 | writer.writerows(list(all_node_pairs.values())) 176 | 177 | shutil.move(temp_file.name, self.f_path_hccl_demo) 178 | 179 | def check_screen_complete(self, num_nodes, hccl_demo=False, round=0): 180 | """ Check on status of Health Screen Check. 181 | Screen considered done if all nodes health checks are done 182 | 183 | Args: 184 | num_nodes (int): Number of Nodes screened 185 | hccl_demo (bool, optional): Status of HCCL_DEMO all reduce test. Defaults to False. 186 | round (int, optional): Level 2 Round. This will only check Level 2 round results. This is ignored for Level 1 runs. 187 | 188 | Returns: 189 | bool: Status of Screen. If all nodes are found, screening is done 190 | """ 191 | f_path = self.f_path if (not hccl_demo) else self.f_path_hccl_demo 192 | n_cards_per_node = 8 193 | 194 | with open(f_path, "r", newline='') as f: 195 | reader = csv.DictReader(f) 196 | 197 | if hccl_demo: 198 | n_cards = 0 199 | for row in reader: 200 | if(int(row["round"]) == round): 201 | n_cards += (int(row["num_nodes"]) * n_cards_per_node) 202 | else: 203 | n_cards = len(list(reader)) 204 | 205 | total_cards = n_cards_per_node * num_nodes 206 | has_all_nodes_info = (n_cards == total_cards) 207 | num_found_nodes = n_cards // n_cards_per_node 208 | 209 | return has_all_nodes_info, num_found_nodes 210 | 211 | def extract_node_info(self): 212 | """ Extracts Detected, Infected, and Missing Nodes from Health Report. 213 | 214 | Returns: 215 | (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes) 216 | """ 217 | detected_nodes = set() 218 | missing_nodes = set() 219 | device_acquire_fail_set = set() 220 | down_links_set = set() 221 | temperature_fail_set = set() 222 | temperature_warn_set = set() 223 | 224 | with open(self.f_path, "r", newline='') as f: 225 | reader = csv.DictReader(f) 226 | for row in reader: 227 | detected_nodes.add(row["node_id"]) 228 | 229 | if row["device_acquire_fail"] == "True": 230 | device_acquire_fail_set.add(row["node_id"]) 231 | if row["down_links"] != "[]" and row["down_links"] != "": 232 | down_links_set.add(row["node_id"]) 233 | if row["missing"] == "True": 234 | missing_nodes.add(row["node_id"]) 235 | if row["temperature_state_C"] == "CRITICAL": 236 | temperature_fail_set.add(row["node_id"]) 237 | if row["temperature_state_C"] == "WARN": 238 | temperature_warn_set.add(row["node_id"]) 239 | 240 | if(len(device_acquire_fail_set)): 241 | _logger.info(f"{len(device_acquire_fail_set)} Infected (Device Acquire fail): {sorted(list(device_acquire_fail_set))}") 242 | if(len(down_links_set)): 243 | _logger.info(f"{len(down_links_set)} Infected (Down Links): {sorted(list(down_links_set))}") 244 | if(len(temperature_warn_set)): 245 | _logger.info(f"{len(temperature_warn_set)} Infected (Temperature WARN): {sorted(list(temperature_warn_set))}") 246 | if(len(temperature_fail_set)): 247 | _logger.info(f"{len(temperature_fail_set)} Infected (Temperature CRITICAL): {sorted(list(temperature_fail_set))}") 248 | 249 | infected_nodes = set() 250 | infected_nodes.update(device_acquire_fail_set) 251 | infected_nodes.update(down_links_set) 252 | infected_nodes.update(temperature_fail_set) 253 | infected_nodes.update(temperature_warn_set) 254 | 255 | return detected_nodes, infected_nodes, missing_nodes 256 | 257 | 258 | def extract_hccl_demo_info(self): 259 | """ Extracts Detected, Infected, and Missing Nodes from HCCL DEMO Health Report 260 | 261 | Returns: 262 | (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes) 263 
| """ 264 | detected_nodes = set() 265 | infected_nodes = set() 266 | missing_nodes = set() 267 | fail_checks = defaultdict(list) 268 | missing_checks = defaultdict(list) 269 | 270 | with open(self.f_path_hccl_demo, "r", newline='') as f: 271 | reader = csv.DictReader(f) 272 | for row in reader: 273 | node_ids = row["node_ids"].strip("[']").replace("'","").split(', ') 274 | detected_nodes.update(node_ids) 275 | 276 | for n in node_ids: 277 | fail_status = int(row["multi_node_fail"] == "True") 278 | fail_checks[n].append(fail_status) 279 | 280 | missing_status = int(row["missing"] == "True") 281 | missing_checks[n].append(missing_status) 282 | 283 | for n, v in fail_checks.items(): 284 | if sum(v) == len(v): 285 | infected_nodes.add(n) 286 | 287 | for n, v in missing_checks.items(): 288 | if sum(v) == len(v): 289 | missing_nodes.add(n) 290 | 291 | detected_nodes -= missing_nodes 292 | infected_nodes -= missing_nodes 293 | 294 | _logger.info(f"{len(infected_nodes)} Infected (HCCL): {sorted(list(infected_nodes))}") 295 | 296 | return detected_nodes, infected_nodes, missing_nodes 297 | 298 | def gather_health_report(self, level, remote_path, hosts): 299 | """ Gathers Health Report from all hosts 300 | 301 | Args: 302 | level (str): IGHS Level 303 | remote_path (str): Remote Destintation of IGHS Report 304 | hosts (list, optional): List of IP Addresses to gather IGHS Reports 305 | """ 306 | copy_files(src=f"{remote_path}/intel_gaudi_health_screen/{self.f_dir}/L{level}", 307 | dst=f"{self.f_dir}", 308 | hosts=hosts, 309 | to_remote=False) 310 | 311 | def consolidate_health_report(self, level, report_dir): 312 | """ Consolidates the health_report_*.csv from worker pods into a single master csv file 313 | 314 | Args: 315 | level (str): IGHS Level 316 | report_dir (str): Directory of CSV files to merge 317 | """ 318 | data = list() 319 | path = f"{report_dir}/L{level}/health_report_*.csv" 320 | csv_files = glob.glob(path) 321 | 322 | for f in csv_files: 323 | with open(f, 'r', newline='') as csv_file: 324 | reader = csv.DictReader(csv_file) 325 | for row in reader: 326 | data.append(row) 327 | 328 | self.write_rows(data=data, level=level) 329 | 330 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/IGNodes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, time, csv, json 14 | import logging 15 | import multiprocessing 16 | 17 | from HealthReport import HealthReport 18 | from utilities import run_cmd, create_logger 19 | 20 | _logger = logging.getLogger("health_screener") 21 | 22 | 23 | class IGNodes(): 24 | 25 | def __init__(self, health_report=HealthReport()): 26 | """ Keeps Track of Nodes and their current states 27 | 28 | Args: 29 | health_report (HealthReport, optional): IGHS Health Report. Defaults to creating a new HealthReport(). 
30 | """ 31 | self.all_nodes = list() 32 | self.launcher_nodes = list() 33 | self.worker_nodes = list() 34 | self.healthy_nodes = set() 35 | self.watch_nodes = set() 36 | self.infected_nodes = set() 37 | self.missing_nodes = set() 38 | 39 | self.groups_tracker = list() 40 | self.current_node_groups = list() 41 | 42 | self.health_report = health_report 43 | self.log_dir = health_report.f_dir 44 | 45 | def update_node_status(self, healthy_nodes, infected_nodes, missing_nodes, undetected_nodes=[]): 46 | """Update the node lists status based on current node groups. If a node 47 | paring fails with known healthy node, then the other node is considered 48 | infected. Otherwise it will be moved to the healthy node list 49 | 50 | Args: 51 | healthy_nodes ([str]): List of Healthy nodes that pass IGHS testing 52 | infected_nodes ([str]): List of nodes that failed to pass IGHS testing 53 | missing_nodes ([str]): List of nodes that IGHS did not run testing on 54 | undetected_nodes ([str]): List of nodes that IGHS did not run testing on b/c it wasn't scheduled on 55 | """ 56 | watch_nodes = self.watch_nodes.copy() 57 | 58 | # Remove Nodes that haven't been tested yet from the healthy list 59 | for n in undetected_nodes: 60 | if n in watch_nodes and n in healthy_nodes: 61 | healthy_nodes.remove(n) 62 | 63 | self.healthy_nodes.update(healthy_nodes) 64 | 65 | for group in self.current_node_groups: 66 | n1, n2 = group 67 | self.determine_node_health(infected_nodes, missing_nodes, n1, n2) 68 | self.determine_node_health(infected_nodes, missing_nodes, n2, n1) 69 | 70 | self.watch_nodes = self.watch_nodes.difference(self.healthy_nodes) 71 | 72 | def determine_node_health(self, infected_nodes, missing_nodes, n1, n2): 73 | """Determine whether a node is healthy . 74 | 75 | Args: 76 | infected_nodes ([str]): List of nodes that failed to pass IGHS testing 77 | missing_nodes ([str]): List of nodes that IGHS did not run testing on 78 | n1 (str): Node name to investigate if it passes the IGHS test 79 | n2 (str): Node name that should be considered healthy. 
This assist in verifying status of N1 80 | """ 81 | if n2 in self.healthy_nodes: 82 | remove_from_watch = False 83 | 84 | if n1 in infected_nodes: 85 | self.infected_nodes.add(n1) 86 | remove_from_watch = True 87 | if n1 in missing_nodes: 88 | self.missing_nodes.add(n1) 89 | remove_from_watch = True 90 | 91 | if remove_from_watch and n1 in self.watch_nodes: 92 | self.watch_nodes.remove(n1) 93 | 94 | class IGNode(): 95 | 96 | def __init__(self, name="", health_report=HealthReport(), num_checks_link_state=10, log_level=logging.INFO, write_dir="/tmp/ighs"): 97 | self.name = name 98 | if name == "" and "MY_NODE_NAME" in os.environ: 99 | self.name = os.environ["MY_NODE_NAME"] 100 | 101 | self.cards = dict() 102 | self.num_checks_link_state = num_checks_link_state 103 | self.write_dir = write_dir 104 | if(not os.path.exists(self.write_dir)): 105 | os.makedirs(self.write_dir) 106 | 107 | self.health_report = health_report 108 | if not self.health_report.exist(): 109 | self.health_report.create() 110 | 111 | self.logger, _ = create_logger(logger_name=self.name, logger_file_name=self.name, f_path=f"{write_dir}", level=log_level) 112 | 113 | 114 | def scan_cards(self): 115 | self.logger.info(f"Scanning cards info on Node: {self.name}") 116 | 117 | cmd = "hl-smi -Q index,module_id,bus_id,memory.used,temperature.aip,name -f csv,noheader" 118 | output = run_cmd(cmd) 119 | 120 | reader = csv.reader(output.split('\n'), delimiter=',') 121 | for row in reader: 122 | if len(row) == 0: 123 | continue 124 | elif len(row) < 6: 125 | _logger.error(f"hl-smi output is not correct: Recieved output: {row}") 126 | continue 127 | 128 | i = row[0] 129 | module_id = row[1].strip() 130 | pci_address = row[2] 131 | memory_used = int(row[3].split()[0]) 132 | temperature_C = int(row[4].split()[0]) 133 | system_name = row[5] 134 | 135 | card = IGCard(system_name=system_name, index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger) 136 | self.cards[i] = card 137 | 138 | self.cards = dict(sorted(self.cards.items())) 139 | 140 | def record_dmesg(self): 141 | cmd = f"dmesg -T" 142 | output = run_cmd(cmd) 143 | 144 | self.logger.info("***** START of DMESG *****") 145 | self.logger.info(output) 146 | self.logger.info("***** END of DMESG *****") 147 | 148 | def health_check(self, target_cards=[], write_report=False): 149 | checked_cards = list() 150 | processes = list() 151 | card_queue = multiprocessing.Queue() 152 | 153 | if len(target_cards) == 0: 154 | target_cards = self.cards.keys() 155 | 156 | for i in target_cards: 157 | card = self.cards[str(i)] 158 | p = multiprocessing.Process(target=card.check_health, args=(self.num_checks_link_state,card_queue)) 159 | 160 | p.start() 161 | processes.append((card,p)) 162 | 163 | for card,p in processes: 164 | p.join() 165 | card_queue.put(None) 166 | 167 | for card in iter(card_queue.get, None): 168 | card.node_id = self.name 169 | checked_cards.append(card) 170 | self.logger.info(card) 171 | 172 | self.record_dmesg() 173 | checked_cards_dict = self.write_json(checked_cards) 174 | if(write_report): 175 | self.health_report.write_rows(data=checked_cards_dict) 176 | 177 | def write_json(self, cards): 178 | node_status = dict() 179 | node_status["name"] = self.name 180 | node_status["is_infected"] = False 181 | node_status["cards"] = list() 182 | 183 | for c in cards: 184 | c_status = c.__dict__ 185 | del c_status["logger"] 186 | node_status["cards"].append(c.__dict__) 187 | 188 | if c.is_infected: 189 | 
node_status["is_infected"] = True 190 | 191 | self.logger.info("***** START of Node Report *****") 192 | self.logger.info(json.dumps(node_status)) 193 | self.logger.info("***** END of Node Report *****") 194 | 195 | return node_status["cards"] 196 | 197 | class IGCard(): 198 | 199 | def __init__(self, system_name="", index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None): 200 | self.system_name = system_name 201 | self.node_id = "" 202 | self.logger = logger 203 | self.index = index 204 | self.module_id = module_id 205 | self.pci_address = pci_address 206 | self.memory_used = memory_used 207 | self.temperature_C = temperature 208 | self.temperature_state_C = "" 209 | 210 | self.framework = framework 211 | self.down_links = list() 212 | self.device_acquire_fail = False 213 | self.multi_node_fail = False 214 | self.is_infected = False 215 | 216 | self.internal_ports = list() 217 | self.external_ports = list() 218 | 219 | def check_health(self,num_checks_link_state=10, checked_cards=[]): 220 | self.check_port_type() 221 | self.check_link_state(attempts=num_checks_link_state, sleep_sec=0.2) 222 | self.check_device_acquire_fail() 223 | self.check_temperature_state() 224 | 225 | checked_cards.put(self) 226 | 227 | def check_link_state(self, attempts=10, sleep_sec=0.5): 228 | self.logger.debug(f"Checking {self.pci_address} Link State. Will check {attempts} times") 229 | all_ports = self.internal_ports + self.external_ports 230 | all_ports_txt = ",".join(all_ports) 231 | 232 | cmd = f"hl-smi -n link -i {self.pci_address} -P {all_ports_txt}" 233 | down_links = set() 234 | 235 | for a in range(attempts): 236 | output = run_cmd(cmd) 237 | links_state = output.strip().split("\n") 238 | 239 | for i, status in enumerate(links_state): 240 | if ("DOWN" in status): 241 | down_links.add(i) 242 | self.logger.debug(f"Attempt: {a} Port: {i} DOWN") 243 | self.is_infected = True 244 | 245 | time.sleep(sleep_sec) 246 | 247 | self.down_links = list(down_links) 248 | 249 | return self.down_links 250 | 251 | 252 | def check_port_type(self): 253 | self.logger.debug(f"Checking {self.pci_address} Port Types (Internal|External)") 254 | 255 | cmd = f"hl-smi -n ports -i {self.pci_address}" 256 | output = run_cmd(cmd) 257 | output_list = output.strip().split("\n") 258 | 259 | for output in output_list: 260 | port_txt, port_type = output.split(":") 261 | port = port_txt.split(" ")[1] 262 | 263 | if "external" in port_type: 264 | self.external_ports.append(port) 265 | else: 266 | self.internal_ports.append(port) 267 | 268 | def check_device_acquire_fail(self): 269 | self.logger.debug(f"Checking {self.pci_address} for Device Acquire Issues") 270 | self.device_acquire_fail = False 271 | 272 | os.environ["ID"] = str(self.module_id) 273 | os.environ["HABANA_VISIBLE_MODULES"] = str(self.module_id) 274 | 275 | try: 276 | import torch 277 | import habana_frameworks.torch.core 278 | except Exception as e: 279 | self.logger.error(f"Card {self.module_id} {self.pci_address} Failed to initialize Intel Gaudi PyTorch: {str(e)}") 280 | self.device_acquire_fail = True 281 | self.is_infected = True 282 | 283 | try: 284 | x = torch.tensor([2]).to('hpu') 285 | y = x + x 286 | 287 | assert y == 4, 'Sanity check failed: Wrong Add output' 288 | assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Habana Device' 289 | except (RuntimeError, AssertionError, Exception) as e: 290 | self.logger.error(f"{self.pci_address} Device Acquire Failure: {e}") 291 | 
            self.device_acquire_fail = True
292 |             self.is_infected = True
293 | 
294 |         return self.device_acquire_fail
295 | 
296 |     def check_temperature_state(self):
297 |         if "HL-325" in self.system_name:
298 |             # Gaudi-3 System
299 |             max_good_temperature = 200
300 |             base_temperature = 45
301 |             max_delta = 80
302 |         else:
303 |             # Gaudi-2 System
304 |             max_good_temperature = 83
305 |             base_temperature = 25
306 |             max_delta = 25
307 | 
308 | 
309 |         if self.temperature_C >= max_good_temperature:
310 |             self.temperature_state_C = "CRITICAL"
311 |             self.is_infected = True
312 |         elif abs(self.temperature_C - base_temperature) >= max_delta:
313 |             self.temperature_state_C = "WARN"
314 |             self.is_infected = True
315 |         else:
316 |             self.temperature_state_C = "NORMAL"
317 | 
318 |     def __str__(self):
319 |         report_str = f""" Index: {self.index}
320 |         Module Id: {self.module_id}
321 |         PCI Address: {self.pci_address}
322 |         Temperature: {self.temperature_C} C
323 |         Temperature State: {self.temperature_state_C}
324 |         Down Links: {self.down_links}
325 |         Device Acquire Fail: {self.device_acquire_fail}"""
326 | 
327 |         return report_str
328 | 
329 | 
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/README.md:
--------------------------------------------------------------------------------
1 | # Intel Gaudi Health Screen 2.2.2
2 | 
3 | A large scale Intel Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the
4 | cluster network health. Troubleshooting issues on a large cluster can be a tedious task. To simplify the debugging process, the
5 | **Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The tests
6 | include checking Gaudi port status, running small workloads, and running standard collective operations across multiple systems.
7 | 
8 | IGHS is capable of running on a Kubernetes cluster or on a bare-metal cluster. It is an active scan, which will block other users from training
9 | on Gaudi systems until the scans are complete. At the end of the scans, IGHS produces a CSV report detailing the state of each Gaudi card.
10 | 
11 | It is recommended to run IGHS in the below scenarios:
12 | 
13 | * After a system upgrade/update
14 | * Before running a long-term training
15 | * Pinpointing problematic systems in a cluster if a problem can't be isolated to a single system
16 | 
17 | IGHS runs a multi-tiered configurable scan:
18 | 
19 | * Level 1 - Individual System Diagnostics
20 | * Level 2 - Multi-System Communication Diagnostics
21 | 
22 | ## Level 1 - Individual System Diagnostics
23 | 
24 | Level 1 focuses on individual Gaudi card health diagnostics.
25 | 
26 | | Test                      | Description                                               |
27 | | ------------------------- | --------------------------------------------------------- |
28 | | Gaudi Ports Status        | Checks if ports are DOWN                                   |
29 | | Device Acquire Failures   | Checks if devices are busy                                 |
30 | | Device Temperature        | Checks if device temperatures are in an acceptable range  |
31 | 
32 | **2-System Cluster Example**
33 | 
34 | Here is an example of running IGHS on a 2-system cluster. It identifies the Gaudi Cards that have down links, device acquire issues, and
35 | flags multi-node communication failures.
36 | 
37 | | node_id  | index | module_id | pci_address  | temperature_C | temperature_state_C | device_acquire_fail | down_links | multi_node_fail | missing |
38 | | -------- | ----- | --------- | ------------ | ------------- | ------------------- | ------------------- | ---------- | --------------- | ------- |
39 | | sys-9-05 | 0     | 3         | 0000:19:00.0 | 22            |                     | False               | [9]        | True            | False   |
40 | | sys-9-05 | 1     | 7         | 0000:b3:00.0 | 60            | WARN                | False               | [7]        | True            | False   |
41 | | sys-9-05 | 2     | 2         | 0000:1a:00.0 | 84            | CRITICAL            | False               | [5, 7]     | True            | False   |
42 | | sys-9-05 | 3     | 6         | 0000:b4:00.0 | 23            |                     | False               | [4]        | True            | False   |
43 | | sys-9-05 | 4     | 1         | 0000:33:00.0 | 25            |                     | False               | [4, 5]     | True            | False   |
44 | | sys-9-05 | 5     | 5         | 0000:cc:00.0 | 24            |                     | False               | [4, 5]     | True            | False   |
45 | | sys-9-05 | 6     | 0         | 0000:34:00.0 | 27            |                     | False               | [4, 5]     | True            | False   |
46 | | sys-4-04 | 7     | 4         | 0000:cd:00.0 | 28            |                     | False               | []         | False           | False   |
47 | | sys-4-04 | 0     | 3         | 0000:19:00.0 | 28            |                     | False               | []         | False           | False   |
48 | | sys-4-04 | 1     | 7         | 0000:b3:00.0 | 28            |                     | False               | []         | False           | False   |
49 | | sys-4-04 | 2     | 2         | 0000:1a:00.0 | 28            |                     | False               | []         | False           | False   |
50 | | sys-4-04 | 3     | 0         | 0000:34:00.0 | 24            |                     | False               | []         | False           | False   |
51 | | sys-4-04 | 4     | 6         | 0000:b4:00.0 | 24            |                     | False               | []         | False           | False   |
52 | | sys-4-04 | 5     | 1         | 0000:33:00.0 | 21            |                     | False               | []         | False           | False   |
53 | | sys-4-04 | 6     | 5         | 0000:cc:00.0 | 21            |                     | False               | []         | False           | False   |
54 | | sys-4-04 | 7     | 4         | 0000:cd:00.0 | 26            |                     | False               | []         | False           | False   |
55 | 
56 | ``` log
57 | [2023-02-07 09:02:39] INFO Infected (Temperature WARN) 1 Node: ['sys-9-05']
58 | [2023-02-07 09:02:39] INFO Infected (Temperature CRITICAL) 1 Node: ['sys-9-05']
59 | [2023-02-07 09:02:39] INFO Infected 1 Node: ['sys-9-05']
60 | [2023-02-07 09:02:39] INFO Missing 0 Node: []
61 | [2023-02-07 09:02:39] INFO Healthy 1 Node: ["sys-4-04"]
62 | 
63 | [2023-02-07 09:02:39] INFO Detected 2 Node: ["sys-4-04","sys-9-05"]
64 | 
65 | ```
66 | 
67 | 
68 | ## Level 2 - Multi-System Communication Diagnostics
69 | 
70 | Level 2 performs a collective communication all-reduce test between multiple systems through the [HCCL_DEMO](https://github.com/HabanaAI/hccl_demo) repo.
71 | It runs a configurable number of rounds with unique pairs of systems, ensuring that a system is able to communicate across different sets of systems. If no
72 | system pairs have failed, the testing stops. If a system has communication issues, it will be flagged in the
73 | first round.
74 | 
75 | **Multi-Node Cluster Example**
76 | 
77 | Here is an example of running IGHS for 2 rounds; the results get recorded to `hccl_demo_health_report.csv`. It identifies node pairs that failed the all_reduce test. If "True" is flagged
78 | in the multi_node_fail column, then one of the nodes has a communication issue. The list of infected nodes is printed to
79 | the log as well as to the `health_report.csv` multi_node_fail column.
80 | 81 | | round | group_id | node_ids | num_nodes | multi_node_fail | missing | qpc_fail | 82 | | ----- | -------- | ------------------------ | --------- | --------------- | ------- | -------- | 83 | | 0 | 11 | ['sys-7-01', 'sys-9-05'] | 2 | True | False | True | 84 | | 0 | 4 | ['sys-2-03', 'sys-4-04'] | 2 | True | True | False | 85 | | 0 | 13 | ['sys-6-06', 'sys-9-06'] | 2 | False | False | False | 86 | | 0 | 1 | ['sys-3-01', 'sys-9-01'] | 2 | False | False | False | 87 | | 0 | 2 | ['sys-6-03', 'sys-8-01'] | 2 | False | False | False | 88 | | 0 | 0 | ['sys-3-06', 'sys-6-02'] | 2 | False | False | False | 89 | | 0 | 10 | ['sys-2-01', 'sys-4-01'] | 2 | False | False | False | 90 | | 0 | 6 | ['sys-6-05', 'sys-9-03'] | 2 | False | False | False | 91 | | 0 | 14 | ['sys-4-05', 'sys-8-03'] | 2 | False | False | False | 92 | | 0 | 12 | ['sys-6-04', 'sys-8-05'] | 2 | False | False | False | 93 | | 0 | 8 | ['sys-7-06', 'sys-9-02'] | 2 | False | False | False | 94 | | 0 | 5 | ['sys-3-04', 'sys-7-02'] | 2 | False | False | False | 95 | | 0 | 3 | ['sys-4-03', 'sys-6-01'] | 2 | False | False | False | 96 | | 0 | 7 | ['sys-2-06', 'sys-3-03'] | 2 | False | False | False | 97 | | 0 | 9 | ['sys-2-04', 'sys-9-04'] | 2 | False | False | False | 98 | | 1 | 1 | ['sys-3-04', 'sys-4-05'] | 2 | False | False | False | 99 | | 1 | 20 | ['sys-2-03', 'sys-7-02'] | 2 | True | True | False | 100 | | 1 | 19 | ['sys-3-01', 'sys-9-03'] | 2 | False | False | False | 101 | | 1 | 0 | ['sys-3-03', 'sys-9-04'] | 2 | False | False | False | 102 | | 1 | 12 | ['sys-4-04', 'sys-6-02'] | 2 | False | False | False | 103 | | 1 | 9 | ['sys-4-03', 'sys-6-05'] | 2 | False | False | False | 104 | | 1 | 14 | ['sys-3-06', 'sys-6-04'] | 2 | False | False | False | 105 | | 1 | 15 | ['sys-4-01', 'sys-8-03'] | 2 | False | False | False | 106 | | 1 | 3 | ['sys-8-01', 'sys-9-05'] | 2 | True | False | False | 107 | | 1 | 8 | ['sys-6-03', 'sys-9-02'] | 2 | False | False | False | 108 | | 1 | 7 | ['sys-2-06', 'sys-6-01'] | 2 | False | False | False | 109 | | 1 | 10 | ['sys-6-06', 'sys-8-06'] | 2 | False | False | False | 110 | | 1 | 11 | ['sys-3-02', 'sys-7-04'] | 2 | False | False | False | 111 | | 1 | 17 | ['sys-8-04', 'sys-8-05'] | 2 | False | False | False | 112 | | 1 | 18 | ['sys-4-02', 'sys-9-01'] | 2 | False | False | False | 113 | | 1 | 16 | ['sys-2-02', 'sys-9-06'] | 2 | False | False | False | 114 | 115 | Logs show that we have 1 Infected Nodes and 1 Missing Node. Missing node represents a node that hasn't been tested yet and there are standard checks to see why it hasn't 116 | been tested, such as having missing cards, it is occupied by another session, or it is a MISC use case. 
117 | 118 | ``` log 119 | [2023-02-07 09:02:39] INFO Infected 1 Node: ['sys-9-05'] 120 | [2023-02-07 09:02:39] INFO Missing 1 Node: ['sys-2-03'] 121 | [2023-02-07 09:02:39] INFO Healthy 34 Node: ["sys-2-01","sys-2-02","sys-2-03","sys-2-04","sys-2-06","sys-3-01","sys-3-02","sys-3-03","sys-3-04","sys-3-06","sys-4-01","sys-4-02","sys-4-03","sys-4-04","sys-4-05","sys-6-01","sys-6-02","sys-6-03","sys-6-04","sys-6-05","sys-6-06","sys-7-01","sys-7-02","sys-7-04","sys-7-06","sys-8-01","sys-8-03","sys-8-04","sys-8-05","sys-8-06","sys-9-01","sys-9-02","sys-9-03","sys-9-04","sys-9-06"] 122 | 123 | [2023-02-07 09:02:39] INFO Detected 36 Node: ["sys-2-01","sys-2-02","sys-2-03","sys-2-04","sys-2-06","sys-3-01","sys-3-02","sys-3-03","sys-3-04","sys-3-06","sys-4-01","sys-4-02","sys-4-03","sys-4-04","sys-4-05","sys-6-01","sys-6-02","sys-6-03","sys-6-04","sys-6-05","sys-6-06","sys-7-01","sys-7-02","sys-7-04","sys-7-06","sys-8-01","sys-8-03","sys-8-04","sys-8-05","sys-8-06","sys-9-01","sys-9-02","sys-9-03","sys-9-04","sys-9-05","sys-9-06"] 124 | [2023-02-07 09:02:39] INFO 1 Nodes w/ missing cards: ['sys-2-03'] 125 | ``` 126 | 127 | ## Setup 128 | 129 | IGHS is compatible with python3 default packages and does not require additional packages 130 | to be installed. 131 | 132 | If your setup Environment requires custom configruation, update the yaml files located in the templates folder. 133 | 134 | If running on bare metal system, then install `pdsh` to your system. 135 | 136 | Update [config.yaml](config.yaml) to match your system Environment 137 | 138 | ``` yaml 139 | # Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal). 140 | system-info: 141 | type: "k8s" 142 | # Namespace is only required for k8s settings 143 | namespace: "intelgaudi" 144 | # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile 145 | # hostfile: "./hostfile" 146 | 147 | # Bare Metal Configurations 148 | ssh-path: "./ssh" 149 | tcp-interface: "10.3.124.0/24" 150 | 151 | # Image to run Intel Gaudi Health Screen 152 | image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" 153 | 154 | # Node Label used to identify a Intel Gaudi Node 155 | gaudi-node-label: "habana.ai/gaudi=NoSchedule" 156 | 157 | # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) 158 | log-level: "DEBUG" 159 | 160 | # Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature) 161 | level-1: 162 | run: true 163 | timeout_s: 150 164 | # Number of times to check Port Status 165 | num-checks-link-state: 10 166 | 167 | # Level 2 - Checks All Reduce between node pairs in the cluster. 
168 | level-2: 169 | run: true 170 | timeout_s: 130 171 | # Number of times to check Network connections between nodes 172 | num-rounds: 5 173 | ``` 174 | 175 | To learn the features of IGHS, run the below command: 176 | 177 | ``` bash 178 | python screen.py --help 179 | 180 | usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] 181 | [--job-id JOB_ID] [--round ROUND] [--config CONFIG] 182 | [--ighs-check [{node,hccl-demo,none}]] [--node-write-report] 183 | [--node-name NODE_NAME] [--logs-dir LOGS_DIR] 184 | 185 | optional arguments: 186 | -h, --help show this help message and exit 187 | --initialize Downloads Necessary Repos and Creates Report Template 188 | --screen Starts Health Screen for Cluster 189 | --target-nodes TARGET_NODES 190 | List of target nodes 191 | --job-id JOB_ID Needed to identify hccl-demo running log 192 | --round ROUND Needed to identify hccl-demo running round log 193 | --config CONFIG Configuration file for Health Screener 194 | --ighs-check [{node,hccl-demo,none}] 195 | Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce 196 | (HCCL_DEMO between paris of nodes) 197 | --node-write-report Write Individual Node Health Report 198 | --node-name NODE_NAME Name of Node 199 | --logs-dir LOGS_DIR Output directory of health screen results 200 | ``` 201 | 202 | To Run IGHS, run the below command: 203 | 204 | ``` bash 205 | # Creates IGHS Report and screens clusters for any infected nodes. 206 | # Will check Level 1 and 2 by default 207 | python screen.py --initialize --screen 208 | ``` 209 | 210 | IGHS can alternatively be run through below script: 211 | 212 | ``` bash 213 | # Creates IGHS Report and screens clusters for any infected nodes. 214 | # Will check Level 1 and 2 by default 215 | ./run_ighs.sh 216 | ``` 217 | 218 | ### Run on BareMetal 219 | 220 | To run on bare-metal systems update the [config.yaml](config.yaml) to use bare-metal configuration. 221 | 222 | ``` yaml 223 | # Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal). 224 | system-info: 225 | type: "bare-metal" 226 | # Namespace is only required for k8s settings 227 | namespace: "intelgaudi" 228 | # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile 229 | hostfile: "./hostfile" 230 | 231 | # Bare Metal Configurations 232 | ssh-path: "./ssh" 233 | tcp-interface: "10.3.124.0/24" 234 | 235 | # Image to run Intel Gaudi Health Screen 236 | image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" 237 | 238 | # Node Label used to identify a Intel Gaudi Node 239 | gaudi-node-label: "habana.ai/gaudi=NoSchedule" 240 | 241 | # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) 242 | log-level: "DEBUG" 243 | 244 | # Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature) 245 | level-1: 246 | run: true 247 | timeout_s: 150 248 | # Number of times to check Port Status 249 | num-checks-link-state: 10 250 | 251 | # Level 2 - Checks All Reduce between node pairs in the cluster. 
252 | level-2: 253 | run: true 254 | timeout_s: 130 255 | # Number of times to check Network connections between nodes 256 | num-rounds: 5 257 | ``` 258 | 259 | Before running the screening test, you need to generate the ssh key used for passwordless ssh: 260 | 261 | ``` bash 262 | # Keys to setup initial bare-metal passwordless ssh connection between systems 263 | mkdir -p ssh; 264 | ssh-keygen -t rsa -f ssh/ighs_rsa; 265 | chmod 600 ssh/ighs_rsa; 266 | chmod 644 ssh/ighs_rsa.pub; 267 | 268 | # Keys to setup containers passwordless ssh connection 269 | mkdir -p template/bare-metal/ssh; 270 | ssh-keygen -t rsa -f template/bare-metal/ssh/id_rsa; 271 | chmod 600 template/bare-metal/ssh/id_rsa; 272 | chmod 644 template/bare-metal/ssh/id_rsa.pub; 273 | 274 | cat template/bare-metal/ssh/id_rsa.pub > template/bare-metal/ssh/authorized_keys; 275 | ``` 276 | 277 | ## Recovery Steps 278 | 279 | | Issue | Description | 280 | | ------------------------- | --------------------------------------------------------------------------------------- | 281 | | Down Internal Links | Need to investigate Gaudi Card Health | 282 | | Down External Links | Check Cable, switches, and Gaudi Card Health | 283 | | QPC Issues | Network Configuration issue (stale gaudinet.json, stale NIC configurations, etc... ) | 284 | | Missing Cards | Need to investigate Gaudi Card Health | 285 | | k8s Issues | Node Resources are not set/configured properly | 286 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/config.yaml: -------------------------------------------------------------------------------- 1 | # Sets IGHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). k8s does not require any system info 2 | system-info: 3 | type: "k8s" 4 | # Namespace is only required for k8s settings 5 | namespace: "intelgaudi" 6 | 7 | # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile 8 | # hostfile: "./hostfile" 9 | 10 | # Bare Metal Configurations 11 | ssh-path: "./ssh" 12 | tcp-interface: "10.3.124.0/24" 13 | 14 | # Image to run Intel Gaudi Health Screen 15 | image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" 16 | 17 | # Node Label used to identify a Intel Gaudi Node 18 | gaudi-node-label: "habana.ai/gaudi=NoSchedule" 19 | 20 | # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) 21 | log-level: "DEBUG" 22 | 23 | # Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure, Device Temperature) 24 | level-1: 25 | run: true 26 | timeout_s: 150 27 | # Number of times to check Port Status 28 | num-checks-link-state: 12 29 | 30 | # Level 2 - Checks All Reduce between node pairs in the cluster. 31 | level-2: 32 | run: true 33 | timeout_s: 130 34 | # Number of times to check Network connections between nodes 35 | num-rounds: 5 36 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/hccl_demo_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import random, math, os, yaml, glob, json
14 | 
15 | import logging
16 | _logger = logging.getLogger("health_screener")
17 | 
18 | def find_groups(healthy_nodes, watch_nodes, groups_tracker):
19 |     """ Find a list of node groups to run the hccl_demo all_reduce test
20 | 
21 |     Args:
22 |         healthy_nodes ([str]): Nodes that previously passed a pair testing of hccl_demo
23 |         watch_nodes ([str]): Nodes that haven't had a passing round of hccl_demo
24 |         groups_tracker ([str]): History of used groups. A group has to be unique
25 | 
26 |     Returns:
27 |         ([[str]], [str]): Unique list of node groups, and the updated history of used groups
28 |     """
29 |     random.shuffle(healthy_nodes)
30 |     random.shuffle(watch_nodes)
31 | 
32 |     found_unique = True
33 |     num_nodes = len(healthy_nodes) + len(watch_nodes)
34 |     node_groups = list()
35 |     max_num_groups = num_nodes // 2
36 |     max_combinations = (math.factorial(num_nodes)) / (math.factorial(num_nodes-2) * 2)
37 |     max_attempts = 10
38 |     groups_tracker = set(groups_tracker)
39 | 
40 |     if num_nodes == 1:
41 |         _logger.warning(f"Need more than 1 Node to test pair all_reduce")
42 |         return node_groups, list(groups_tracker)
43 | 
44 |     while len(node_groups) < max_num_groups and found_unique:
45 |         i = 0
46 |         h_i, w_i = 0,0
47 | 
48 |         if len(groups_tracker) >= max_combinations:
49 |             _logger.info(f"Reached maximum combinations {max_combinations} for {num_nodes} Nodes")
50 |             break
51 | 
52 |         node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i)
53 |         i += 1
54 |         if len(node_group) < 2 or node_group[0] == node_group[1]:
55 |             _logger.info(f"Found invalid node_group {node_group}. Exiting group id search")
56 |             found_unique = False
57 |             break
58 | 
59 |         while group_id in groups_tracker:
60 |             if i >= max_attempts:
61 |                 _logger.warning(f"Max attempt {max_attempts} reached for finding unique pair combination.")
62 |                 found_unique = False
63 |                 break
64 | 
65 |             node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i)
66 |             i += 1
67 |             if len(node_group) < 2 or node_group[0] == node_group[1]:
68 |                 _logger.info(f"Inner loop found invalid node_group {node_group}. Exiting group id search")
69 |                 found_unique = False
70 |                 break
71 | 
72 |         if found_unique:
73 |             groups_tracker.add(group_id)
74 |             node_groups.append(node_group)
75 | 
76 |             for n in node_group:
77 |                 if n in healthy_nodes:
78 |                     healthy_nodes.remove(n)
79 |                 if n in watch_nodes:
80 |                     watch_nodes.remove(n)
81 | 
82 |         if len(watch_nodes) == 0:
83 |             break
84 | 
85 |     return node_groups, list(groups_tracker)
86 | 
87 | def find_group_id(healthy_nodes, watch_nodes, h_i=0, w_i=0):
88 |     """ Finds a group of nodes and combines them to form a group id
89 | 
90 |     Args:
91 |         healthy_nodes ([str]): Nodes that previously passed a pair testing of hccl_demo
92 |         watch_nodes ([str]): Nodes that haven't had a passing round of hccl_demo
93 |         h_i (int): Index of next potential node id for healthy_nodes
94 |         w_i (int): Index of next potential node id for watch_nodes
95 | 
96 |     Returns:
97 |         ([str], str, (int, int)): Potential node group, its group id, and the updated (h_i, w_i) indexes
98 |     """
99 |     group_id = ""
100 |     node_group = []
101 |     max_attempt = 10
102 | 
103 |     # Goal of testing is to test watch_nodes and pair them with a healthy_node if available
104 |     if len(watch_nodes) == 0 or (len(watch_nodes) == 1 and len(healthy_nodes)==0):
105 |         return node_group, group_id, (h_i, w_i)
106 | 
107 |     for i in range(max_attempt):
108 |         if len(watch_nodes) and w_i < len(watch_nodes):
109 |             node_group.append(watch_nodes[w_i])
110 |             w_i += 1
111 |         if len(healthy_nodes) and h_i < len(healthy_nodes):
112 |             node_group.append(healthy_nodes[h_i])
113 |             h_i += 1
114 | 
115 |         if h_i >= len(healthy_nodes):
116 |             random.shuffle(healthy_nodes)
117 |             h_i = 0
118 |         if w_i >= len(watch_nodes):
119 |             random.shuffle(watch_nodes)
120 |             w_i = 0
121 | 
122 |         if len(node_group) >= 2:
123 |             break
124 | 
125 |     if len(node_group) > 1:
126 |         node_group.sort()
127 |         group_id = "-".join(node_group)
128 | 
129 |     return node_group, group_id, (h_i, w_i)
130 | 
131 | def gather_hccl_logs(job_path, round, log_dir, health_report):
132 |     """ Retrieve hccl_demo log files based on the job yamls executed
133 | 
134 |     Args:
135 |         job_path (str): Base directory of job yamls executed
136 |         round (int): Round to retrieve HCCL_Demo logs
137 |         log_dir (str): Base directory of HCCL_Demo logs
138 |         health_report (HealthReport): Tracks and reports health of hccl_demo
139 |     """
140 |     path = f"{job_path}/**/r{round}/*.yaml"
141 |     job_files = glob.glob(path, recursive=True)
142 |     hccl_results = dict()
143 | 
144 |     for f_name in job_files:
145 |         with open(f_name, 'r', newline='') as f:
146 |             job_data = yaml.safe_load(f)
147 | 
148 |         launcher_template = job_data["spec"]["mpiReplicaSpecs"]["Launcher"]["template"]
149 | 
150 |         job_id = launcher_template["metadata"]["labels"]["name"]
151 |         target_nodes = launcher_template["spec"]["containers"][0]["env"][4]["value"]
152 |         target_nodes = target_nodes.split(',')
153 | 
154 |         hccl_results[f"{target_nodes}"] = hccl_demo_check(job_id=f"{log_dir}/L2/r{round}/{job_id}",
155 |                                                           target_nodes=target_nodes, health_report=health_report, write=False)
156 | 
157 |     multi_node_fail = set()
158 |     qpc_fail = set()
159 |     missing_nodes = set()
160 | 
161 |     for results in hccl_results.values():
162 |         if results["multi_node_fail"]:
163 |             multi_node_fail.add(f"{results['node_ids']}")
164 | 
165 |         if results["qpc_fail"]:
166 |             qpc_fail.add(f"{results['node_ids']}")
167 | 
168 |         if results["missing"]:
169 |             missing_nodes.add(f"{results['node_ids']}")
170 | 
171 |     health_report.update_hccl_demo_health_report(round=round, all_node_pairs=hccl_results, multi_node_fail=multi_node_fail, qpc_fail=qpc_fail, missing_nodes=missing_nodes)
172 | 
173 | def hccl_demo_check(job_id, health_report, target_nodes=[], hccl_log=[], write=True):
174 |     """ Check on HCCL Demo status. Reads the output log; if it
175 |     contains "Exiting HCCL demo with code: 1", the run is treated as a
176 |     failure
177 | 
178 |     Args:
179 |         job_id (str): Metadata name of the Job
180 |         health_report (HealthReport): Tracks and reports health of hccl_demo
181 |         target_nodes ([str], optional): Nodes that are used in hccl_demo testing
182 |         hccl_log ([str], optional): Log of HCCL_DEMO run
183 |         write (bool, optional): Writes to Report. Used to collect hccl results and update Base Health Report. Defaults to True
184 | 
185 |     Returns:
186 |         dict: HCCL Demo Health Report result data.
187 |     """
188 |     f_name_log = f"{job_id}.log"
189 |     round = os.path.basename(job_id).split("-")[2][1:]
190 |     group_id = os.path.basename(job_id).split("-")[3]
191 |     hccl_demo_fail = True
192 |     missing = False
193 |     qpc_fail = False
194 | 
195 |     if len(hccl_log) == 0:
196 |         if not os.path.exists(f_name_log):
197 |             _logger.error(f"{f_name_log} can't be found or has no data")
198 |             hccl_demo_fail = True
199 |             missing = True
200 |         else:
201 |             with open(f_name_log, "r", newline='') as f:
202 |                 lines = f.readlines()
203 |                 hccl_demo_fail, qpc_fail, missing, _ = analyze_hccl_log(lines)
204 |     else:
205 |         hccl_demo_fail, qpc_fail, missing, target_nodes = analyze_hccl_log(hccl_log)
206 | 
207 |     target_nodes.sort()
208 |     data = {
209 |         "round": round,
210 |         "group_id": group_id,
211 |         "node_ids": target_nodes,
212 |         "num_nodes": len(target_nodes),
213 |         "multi_node_fail": hccl_demo_fail,
214 |         "missing": missing,
215 |         "qpc_fail": qpc_fail
216 |     }
217 | 
218 |     if write:
219 |         _logger.info("***** START of Node Report *****")
220 |         _logger.info(json.dumps(data))
221 |         _logger.info("***** END of Node Report *****")
222 |         health_report.write_rows(data=[data], level=2)
223 | 
224 |     return data
225 | 
226 | def analyze_hccl_log(data):
227 |     err_phrase = "Exiting HCCL demo with code: 1"
228 |     err_phrase_other = "During handling of the above exception, another exception occurred:"
229 |     err_phrase_ssh = "ssh: Could not resolve hostname"
230 |     err_phrase_qpc = "Source: QPC, error"
231 |     pass_phrase = "Bandwidth"
232 | 
233 |     target_phrase = "Target Nodes: "
234 | 
235 |     hccl_demo_fail = True
236 |     missing = False
237 |     qpc_fail = False
238 |     target_nodes = []
239 | 
240 |     for l in data:
241 |         if l.find(err_phrase_ssh) != -1:
242 |             hccl_demo_fail = True
243 |             missing = True
244 |         elif l.find(err_phrase_qpc) != -1:
245 |             hccl_demo_fail = True
246 |             qpc_fail = True
247 |         elif l.find(err_phrase) != -1 or l.find(err_phrase_other) != -1:
248 |             hccl_demo_fail = True
249 |         elif l.find(pass_phrase) != -1:
250 |             hccl_demo_fail = False
251 |         elif l.find(target_phrase) != -1:
252 |             colon_index = l.index(":")
253 |             target_nodes = l[colon_index+2:].split(",")
254 | 
255 |     return hccl_demo_fail, qpc_fail, missing, target_nodes
256 | 
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/hostfile:
--------------------------------------------------------------------------------
1 | sys-01
2 | sys-02
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/run_ighs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2024 Habana Labs, Ltd.
an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | LOG_DIR=logs/$(date +'%m-%Y/%m-%d-%Y/%m-%d-%Y_%H-%M') 15 | 16 | python3 screen.py --initialize --logs-dir $LOG_DIR; 17 | python3 screen.py --screen --logs-dir $LOG_DIR; 18 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/screen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, datetime, yaml, sys, time, json 14 | import argparse 15 | import logging 16 | 17 | from utilities import download_repos, create_logger, get_logging_level 18 | from hccl_demo_helper import hccl_demo_check 19 | from system_utils import KubeUtils, BareMetalUtils 20 | 21 | from HealthReport import HealthReport 22 | from IGNodes import IGNodes, IGNode 23 | 24 | 25 | _logger = None 26 | 27 | def monitor_ighs_status(system_mode, level, nodes, timeout_s=240, round=0): 28 | sleep_time_s = 2 29 | max_attempts = (timeout_s // sleep_time_s) + min(timeout_s % sleep_time_s, 1) 30 | current_run_status = dict() 31 | lvl_check_msg = f"Checking IGHS Level {level}" 32 | 33 | num_nodes = len(nodes.all_nodes) 34 | if level == 2: 35 | num_nodes = len(nodes.current_node_groups) * 2 36 | lvl_check_msg += f" Round {round}" 37 | 38 | _logger.info(f"{lvl_check_msg} Status") 39 | 40 | for attempt in range(max_attempts): 41 | num_found_nodes = system_mode.check_screen_complete(current_run_status=current_run_status, health_report=nodes.health_report, level=level, round=round) 42 | 43 | if num_found_nodes == num_nodes: 44 | _logger.info(f"Found {num_found_nodes}/{num_nodes} Nodes during Health Screen") 45 | break 46 | 47 | _logger.info(f"Attempt {attempt}/{max_attempts}: Found {num_found_nodes}/{num_nodes} Nodes - Will Check again in {sleep_time_s} seconds") 48 | time.sleep(sleep_time_s) 49 | num_found_nodes = system_mode.check_screen_complete(current_run_status=current_run_status, health_report=nodes.health_report, level=level, round=round, final_check=True) 50 | 51 | if level == 1: 52 | detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_node_info() 53 | missing_nodes.update(set(nodes.all_nodes).difference(detected_nodes)) 54 | undetected_nodes = [] 55 | 56 | nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, 
missing_nodes=missing_nodes) 57 | elif level == 2: 58 | detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_hccl_demo_info() 59 | undetected_nodes = set(nodes.all_nodes).difference(detected_nodes) 60 | 61 | nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, missing_nodes=missing_nodes) 62 | 63 | detected_nodes_l1, infected_nodes_l1, missing_nodes = nodes.health_report.extract_node_info() 64 | detected_nodes.update(detected_nodes_l1) 65 | infected_nodes.update(infected_nodes_l1) 66 | 67 | healthy_nodes = detected_nodes.difference(infected_nodes).difference(missing_nodes) 68 | 69 | healthy_nodes = sorted(list(healthy_nodes)) 70 | missing_nodes = sorted(list(missing_nodes)) 71 | infected_nodes = sorted(list(infected_nodes)) 72 | nodes.update_node_status(healthy_nodes, infected_nodes, missing_nodes, undetected_nodes=undetected_nodes) 73 | 74 | watch_nodes = sorted(list(nodes.watch_nodes)) 75 | detected_nodes = sorted(list(detected_nodes)) 76 | 77 | if level == 1: 78 | nodes.healthy_nodes = set(healthy_nodes) 79 | 80 | _logger.info(f"Detected {len(detected_nodes)} Node: {detected_nodes}") 81 | _logger.info(f" Healthy {len(healthy_nodes)} Node: {healthy_nodes}") 82 | _logger.info(f" Infected {len(infected_nodes)} Node: {infected_nodes}") 83 | _logger.info(f"Missing {len(missing_nodes)} Node: {missing_nodes}") 84 | _logger.info(f"Unverified {len(watch_nodes)} Node: {watch_nodes}") 85 | 86 | return healthy_nodes, infected_nodes, missing_nodes 87 | 88 | 89 | def main(args): 90 | global _logger 91 | 92 | if args.logs_dir == "": 93 | c_time = datetime.datetime.now() 94 | date_year_format = c_time.strftime("%m-%Y") 95 | date_format = c_time.strftime("%m-%d-%Y") 96 | time_format = c_time.strftime("%H-%M") 97 | args.logs_dir = f"logs/{date_year_format}/{date_format}/{date_format}_{time_format}" 98 | 99 | 100 | ighs_report_name = "health_report.csv" 101 | ighs_log_dir = args.logs_dir 102 | 103 | if args.node_name: 104 | ighs_level = os.environ["IGHS_LEVEL"] if "IGHS_LEVEL" in os.environ else 1 105 | ighs_report_name = f"health_report_{args.node_name}.csv" 106 | ighs_log_dir = f"{args.logs_dir}/L{ighs_level}" 107 | 108 | health_report = HealthReport(f_dir=ighs_log_dir, report_name=ighs_report_name) 109 | job_path = "tmp/jobs" 110 | 111 | with open(args.config, 'r') as f: 112 | config_data = yaml.safe_load(f) 113 | 114 | hostfile = "" 115 | if "hostfile" in config_data["system-info"]: 116 | hostfile = config_data["system-info"]["hostfile"] 117 | 118 | log_level = get_logging_level(config_data["log-level"]) 119 | _logger, _ = create_logger(logger_name="health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level) 120 | 121 | if config_data["system-info"]["type"] == "k8s": 122 | system_mode = KubeUtils(image=config_data["image"], 123 | hostfile=hostfile, 124 | namespace=config_data["system-info"]["namespace"], 125 | log_dir=args.logs_dir) 126 | elif config_data["system-info"]["type"] == "bare-metal": 127 | system_mode = BareMetalUtils(image=config_data["image"], 128 | hostfile=hostfile, 129 | ssh_path=config_data["system-info"]["ssh-path"], 130 | tcp_interface=config_data["system-info"]["tcp-interface"], 131 | log_dir=args.logs_dir) 132 | else: 133 | _logger.error(f"system_mode: {system_mode} in {args.config} is not set correctly. 
system_mode has to be set to k8s or bare-metal") 134 | sys.exit(1) 135 | 136 | 137 | if args.initialize: 138 | _logger.info(f"Loaded Configuration File: {args.config}") 139 | _logger.info(f"{config_data}") 140 | 141 | health_report.create(create_base=True, create_hccl_demo=True) 142 | download_repos() 143 | 144 | system_mode.initialize_system() 145 | 146 | if args.screen: 147 | start_time = datetime.datetime.now() 148 | 149 | intel_gaudi_nodes = IGNodes(health_report=health_report) 150 | intel_gaudi_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"]) 151 | healthy_nodes, infected_nodes, missing_nodes = list(), list(), list() 152 | occupied_nodes, missing_cards_nodes, misc_nodes = list(), list(), list() 153 | 154 | if config_data["level-1"]["run"]: 155 | _logger.info("Running Level 1 Checks: Card Diagnostics") 156 | if not os.path.exists(f"{health_report.f_dir}/L1"): 157 | os.makedirs(f"{health_report.f_dir}/L1") 158 | 159 | nodes_initialized = system_mode.initialize_node_jobs(level=1, 160 | nodes=intel_gaudi_nodes, 161 | job_base_path=job_path) 162 | if nodes_initialized: 163 | healthy_nodes, infected_nodes, missing_nodes = monitor_ighs_status(system_mode=system_mode, 164 | level=1, 165 | nodes=intel_gaudi_nodes, 166 | timeout_s=config_data["level-1"]["timeout_s"]) 167 | occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) 168 | system_mode.clear_ighs_pods() 169 | 170 | summary = { 171 | "level": 1, 172 | "infected": infected_nodes, 173 | "missing": missing_nodes, 174 | "occupied": occupied_nodes, 175 | "missing_cards": missing_cards_nodes, 176 | "untested": misc_nodes, 177 | "healthy": healthy_nodes 178 | } 179 | 180 | with open(f"{args.logs_dir}/ighs_L1_summary.json", 'w', encoding ='utf8') as f: 181 | json.dump(summary, f, indent=4) 182 | 183 | if config_data["level-2"]["run"]: 184 | _logger.info("Running Level 2 Checks: Pair HCCL_DEMO All Reduce") 185 | if not os.path.exists(f"{health_report.f_dir}/L2"): 186 | os.makedirs(f"{health_report.f_dir}/L2") 187 | 188 | intel_gaudi_nodes.healthy_nodes = set() 189 | intel_gaudi_nodes.watch_nodes = set(intel_gaudi_nodes.all_nodes).difference(set(missing_nodes)) 190 | intel_gaudi_nodes.missing_nodes = set(missing_nodes) 191 | 192 | for i in range(config_data["level-2"]["num-rounds"]): 193 | nodes_initialized = system_mode.initialize_node_jobs(level=2, 194 | nodes=intel_gaudi_nodes, 195 | job_base_path=job_path, 196 | round=i) 197 | if not nodes_initialized: 198 | _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No other Nodes to screen. Exit screening early.") 199 | break 200 | 201 | healthy_nodes, infected_nodes, missing_nodes = monitor_ighs_status(system_mode=system_mode, 202 | level=2, 203 | nodes=intel_gaudi_nodes, 204 | timeout_s=config_data["level-2"]["timeout_s"], 205 | round=i) 206 | occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) 207 | system_mode.clear_ighs_pods(job_type="mpijobs") 208 | 209 | if len(intel_gaudi_nodes.watch_nodes) == 0: 210 | _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No other Nodes to screen. 
Exit screening early.") 211 | break 212 | 213 | summary = { 214 | "level": 2, 215 | "infected": infected_nodes, 216 | "missing": missing_nodes, 217 | "occupied": occupied_nodes, 218 | "missing_cards": missing_cards_nodes, 219 | "untested": misc_nodes, 220 | "healthy": healthy_nodes 221 | } 222 | 223 | with open(f"{args.logs_dir}/ighs_L2_summary.json", 'w', encoding ='utf8') as f: 224 | json.dump(summary, f, indent=4) 225 | 226 | end_time = datetime.datetime.now() 227 | diff_time = (end_time - start_time) 228 | _logger.info(f"Total Run Time: {diff_time}") 229 | 230 | if args.ighs_check == "node": 231 | node = IGNode(health_report=health_report, 232 | num_checks_link_state=config_data["level-1"]["num-checks-link-state"], 233 | log_level=log_level, 234 | name=args.node_name) 235 | node.scan_cards() 236 | node.health_check(write_report=args.node_write_report) 237 | elif args.ighs_check == "hccl-demo": 238 | health_report.create(create_base=False, create_hccl_demo=True) 239 | 240 | target_nodes = args.target_nodes.strip("[']").replace("'","").split(',') 241 | hccl_demo_check(job_id=f"{health_report.f_dir}/L2/{args.round}/{args.job_id}", 242 | target_nodes=target_nodes, health_report=health_report) 243 | 244 | if __name__=="__main__": 245 | parser = argparse.ArgumentParser() 246 | 247 | parser.add_argument("--initialize", action="store_true", help="Downloads Necessary Repos and Creates Report Template") 248 | parser.add_argument("--screen", action="store_true", help="Starts Health Screen for Cluster") 249 | parser.add_argument("--target-nodes", type=str, default="", help="List of target nodes") 250 | parser.add_argument("--job-id", type=str, default="", help="Needed to identify hccl-demo running log") 251 | parser.add_argument("--round", type=str, default="", help="Needed to identify hccl-demo running round log") 252 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file for Health Screener") 253 | parser.add_argument("--ighs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"], 254 | help="Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce (HCCL_DEMO between paris of nodes)") 255 | 256 | parser.add_argument("--node-write-report", action="store_true", help="Write Individual Node Health Report") 257 | parser.add_argument("--node-name", type=str, default="", help="Name of Node") 258 | parser.add_argument("--logs-dir", type=str, default="", help="Output directory of health screen results") 259 | 260 | args = parser.parse_args() 261 | 262 | 263 | main(args) 264 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM ${BASE_IMAGE} 3 | 4 | RUN mkdir ~/.ssh && \ 5 | cd ~/.ssh && \ 6 | ssh-keygen -A && \ 7 | sed -i 's/#Port 22/Port 3122/g' /etc/ssh/sshd_config && \ 8 | sed -i 's/# Port 22/ Port 3122/g' /etc/ssh/ssh_config && \ 9 | sed -i 's/3022/3122/g' ~/.bashrc && \ 10 | echo "Host *" >> ~/.ssh/config && \ 11 | echo "ForwardAgent yes" >> ~/.ssh/config && \ 12 | echo "StrictHostKeyChecking no" >> ~/.ssh/config && \ 13 | echo "UserKnownHostsFile /dev/null" >> ~/.ssh/config && \ 14 | echo "LogLevel ERROR" >> ~/.ssh/config && \ 15 | service ssh start && \ 16 | chmod 600 ~/.ssh/config 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- 
/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level1: 3 | image: ighs_level1 4 | build: 5 | context: . 6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level1 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=1 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | command: > 26 | bash -c "python screen.py --ighs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} && \ 27 | chmod 777 -R $${LOG_DIR}" 28 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level2_launcher: 3 | image: ighs_level2 4 | build: 5 | context: . 6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level2_launcher 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=2 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | command: > 26 | template/bare-metal/run_hccl_demo.sh -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level2_worker: 3 | image: ighs_level2 4 | build: 5 | context: . 
6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level2_worker 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=2 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | tty: true 26 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_NODES="${NUM_NODES:-1}"; 4 | HOME_DIR="${HOME_DIR:-/tmp/ighs/intel_gaudi_health_screen}"; 5 | WORK_DIR="${WORK_DIR:-/tmp/ighs/intel_gaudi_health_screen/build/hccl_demo}"; 6 | 7 | NGPU_PER_NODE=8; 8 | N_CARDS=$((NUM_NODES*NGPU_PER_NODE)); 9 | 10 | cd ${WORK_DIR}; 11 | CMD="python ${WORK_DIR}/run_hccl_demo.py \ 12 | --test all_reduce \ 13 | --loop 1000 \ 14 | --size 32m \ 15 | -clean \ 16 | -mpi "; 17 | 18 | mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/; 19 | cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 20 | touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 21 | echo "Target Nodes: $TARGET_NODES" >> $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 22 | 23 | $CMD \ 24 | -np ${N_CARDS} \ 25 | --allow-run-as-root \ 26 | --bind-to core \ 27 | --map-by ppr:4:socket:PE=6 \ 28 | --rank-by core --report-bindings \ 29 | --tag-output \ 30 | --merge-stderr-to-stdout --prefix $MPI_ROOT \ 31 | -H ${TARGET_NODES//,/:48,}:48 \ 32 | --mca btl_tcp_if_include $TCP_INTERFACE \ 33 | -x MASTER_ADDR \ 34 | -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ 35 | -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ 36 | 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 37 | 38 | cd ${HOME_DIR}; 39 | python $HOME_DIR/screen.py --ighs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --round $ROUND; 40 | 41 | chmod 777 -R $HOME_DIR/$LOG_DIR 42 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: template-metadata-name 5 | namespace: default 6 | labels: 7 | app: ighs 8 | spec: 9 | template: 10 | metadata: 11 | labels: 12 | app: ighs 13 | spec: 14 | restartPolicy: "Never" 15 | affinity: 16 | nodeAffinity: 17 | requiredDuringSchedulingIgnoredDuringExecution: 18 | nodeSelectorTerms: 19 | - matchExpressions: 20 | - key: kubernetes.io/hostname 21 | operator: In 22 | values: 23 | - IGHS-DUMMY-VAL 24 | volumes: 25 | - name: mydir 26 | emptyDir: {} 27 | tolerations: 28 | - key: "" 29 | operator: "Exists" 30 | effect: "NoSchedule" 31 | containers: 32 | - name: template-container-name 33 | image: template-container-image 34 | imagePullPolicy: IfNotPresent 35 | workingDir: /workdir 36 | command: ["/bin/bash", "-c"] 37 | args: 38 | - >- 39 | ssh-keygen -A; 40 | service ssh start; 41 | 42 | while [ ! 
-d /workdir/intel_gaudi_health_screen ]; do 43 | sleep 2s; 44 | done; 45 | sleep 10s; 46 | 47 | cd /workdir/intel_gaudi_health_screen; 48 | python /workdir/intel_gaudi_health_screen/screen.py --ighs-check node --logs-dir $LOG_DIR; 49 | volumeMounts: 50 | - name: mydir 51 | mountPath: /workdir 52 | securityContext: 53 | capabilities: 54 | add: 55 | - SYSLOG 56 | env: 57 | - name: IGHS_LEVEL 58 | value: "1" 59 | - name: MY_POD_IP 60 | valueFrom: 61 | fieldRef: 62 | fieldPath: status.podIP 63 | - name: MY_NODE_NAME 64 | valueFrom: 65 | fieldRef: 66 | fieldPath: spec.nodeName 67 | - name: MY_POD_NAMESPACE 68 | valueFrom: 69 | fieldRef: 70 | fieldPath: metadata.namespace 71 | resources: 72 | limits: 73 | habana.ai/gaudi: 8 74 | hugepages-2Mi: 29000Mi 75 | memory: 200Gi 76 | cpu: 95 77 | requests: 78 | habana.ai/gaudi: 8 79 | hugepages-2Mi: 29000Mi 80 | memory: 200Gi 81 | cpu: 95 82 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v2beta1 2 | kind: MPIJob 3 | metadata: 4 | name: template-metadata-name 5 | namespace: default 6 | labels: 7 | app: ighs-hccl 8 | spec: 9 | slotsPerWorker: 8 10 | runPolicy: 11 | cleanPodPolicy: Running 12 | mpiReplicaSpecs: 13 | Launcher: 14 | replicas: 1 15 | template: 16 | metadata: 17 | labels: 18 | app: ighs-hccl 19 | spec: 20 | volumes: 21 | - name: mydir 22 | emptyDir: {} 23 | containers: 24 | - image: template-container-image 25 | name: ighs-launcher 26 | imagePullPolicy: IfNotPresent 27 | workingDir: /workdir 28 | volumeMounts: 29 | - name: mydir 30 | mountPath: /workdir 31 | securityContext: 32 | capabilities: 33 | add: 34 | - SYSLOG 35 | env: 36 | - name: JOB_ID 37 | valueFrom: 38 | fieldRef: 39 | fieldPath: metadata.labels['name'] 40 | - name: MY_NODE_NAME 41 | valueFrom: 42 | fieldRef: 43 | fieldPath: spec.nodeName 44 | - name: HOME_DIR 45 | value: "/workdir/intel_gaudi_health_screen" 46 | - name: IGHS_LEVEL 47 | value: "2" 48 | command: ["/bin/bash", "-c"] 49 | args: 50 | - >- 51 | set -eo pipefail; 52 | echo "Target Nodes: $TARGET_NODES"; 53 | ssh-keygen -A; 54 | service ssh start; 55 | 56 | while [ ! 
-d /workdir/intel_gaudi_health_screen ]; do 57 | sleep 2s; 58 | done; 59 | sleep 10s; 60 | 61 | declare -xr HOSTSFILE=$OMPI_MCA_orte_default_hostfile; 62 | 63 | declare -xr NUM_NODES=$(wc -l < $HOSTSFILE); 64 | declare -xr NGPU_PER_NODE=8; 65 | declare -xr N_CARDS=$((NUM_NODES*NGPU_PER_NODE)); 66 | 67 | cd ${HOME_DIR}/build/hccl_demo; 68 | declare -xr CMD="python ${HOME_DIR}/build/hccl_demo/run_hccl_demo.py \ 69 | --test all_reduce \ 70 | --loop 1000 \ 71 | --size 32m \ 72 | -mpi "; 73 | 74 | mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/; 75 | cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 76 | touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 77 | echo "Target Nodes: $TARGET_NODES" > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 78 | 79 | $CMD \ 80 | -np ${N_CARDS} \ 81 | --allow-run-as-root \ 82 | --bind-to core \ 83 | --map-by ppr:4:socket:PE=6 \ 84 | --rank-by core --report-bindings \ 85 | --tag-output \ 86 | --merge-stderr-to-stdout --prefix $MPI_ROOT \ 87 | --mca btl_tcp_if_include eth0 \ 88 | -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ 89 | -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ 90 | -x MAX_TIMEOUT=60 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 91 | 92 | cd ${HOME_DIR}; 93 | python ${HOME_DIR}/screen.py --ighs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND; 94 | 95 | Worker: 96 | replicas: template-num-nodes 97 | template: 98 | metadata: 99 | labels: 100 | app: ighs-hccl 101 | spec: 102 | affinity: 103 | nodeAffinity: 104 | requiredDuringSchedulingIgnoredDuringExecution: 105 | nodeSelectorTerms: 106 | - matchExpressions: 107 | - key: kubernetes.io/hostname 108 | operator: In 109 | values: 110 | - IGHS-DUMMY-VAL 111 | volumes: 112 | - name: mydir 113 | emptyDir: {} 114 | tolerations: 115 | - key: "" 116 | operator: "Exists" 117 | effect: "NoSchedule" 118 | - key: "" 119 | operator: "Exists" 120 | effect: "NoExecute" 121 | containers: 122 | - image: template-container-image 123 | name: ighs-worker 124 | imagePullPolicy: IfNotPresent 125 | securityContext: 126 | capabilities: 127 | add: 128 | - SYSLOG 129 | resources: 130 | limits: 131 | habana.ai/gaudi: 8 132 | hugepages-2Mi: 29000Mi 133 | cpu: 95 134 | memory: 200Gi 135 | requests: 136 | habana.ai/gaudi: 8 137 | hugepages-2Mi: 29000Mi 138 | memory: 200Gi 139 | cpu: 95 140 | volumeMounts: 141 | - name: mydir 142 | mountPath: /workdir 143 | env: 144 | - name: IGHS_LEVEL 145 | value: "2" 146 | - name: MY_POD_IP 147 | valueFrom: 148 | fieldRef: 149 | fieldPath: status.podIP 150 | - name: MY_NODE_NAME 151 | valueFrom: 152 | fieldRef: 153 | fieldPath: spec.nodeName 154 | - name: MY_POD_NAMESPACE 155 | valueFrom: 156 | fieldRef: 157 | fieldPath: metadata.namespace 158 | command: ["/bin/bash", "-c"] 159 | args: 160 | - >- 161 | printenv | grep "MY" >> /etc/environment; 162 | ssh-keygen -A; 163 | service ssh start; 164 | sleep 365d; 165 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import os, time, sys
14 | import subprocess, shlex
15 | from datetime import datetime
16 | 
17 | import logging
18 | from logging import handlers
19 | 
20 | _logger = logging.getLogger("health_screener")
21 | 
22 | def get_logging_level(log_level):
23 |     log_level = log_level.lower()
24 |     num_level = logging.INFO
25 | 
26 |     if log_level == "info":
27 |         num_level = logging.INFO
28 |     elif log_level == "debug":
29 |         num_level = logging.DEBUG
30 |     elif log_level == "warn":
31 |         num_level = logging.WARN
32 |     elif log_level == "error":
33 |         num_level = logging.ERROR
34 |     elif log_level == "critical":
35 |         num_level = logging.CRITICAL
36 | 
37 |     return num_level
38 | 
39 | def create_logger(logger_name, logger_file_name, f_path="", level=logging.INFO, max_bytes=5e6, backup_count=10):
40 |     """ Creates a Logger that writes to the logs directory
41 | 
42 |     Args:
43 |         logger_name (str): Name of the Logger. The log file is written to {f_path}/{logger_file_name}.log, or to logs/{current_time}/{logger_file_name}.log if f_path is empty
44 |         level (int, optional): Logging Level. Defaults to logging.INFO.
45 |         max_bytes (int, optional): Max size of log file. Rolls over once the max is reached. Defaults to 5e6.
46 |         backup_count (int, optional): Rollover Limit. Defaults to 10.
47 | 
48 |     Returns:
49 |         (Logger, str): Logger object used to log details to the designated logger file, and the directory containing that file
50 |     """
51 |     t_logger = logging.getLogger(logger_name)
52 |     t_logger.setLevel(level)
53 | 
54 |     c_time = datetime.now()
55 |     date_format = c_time.strftime("%m-%d-%Y")
56 |     time_format = c_time.strftime("%H-%M")
57 | 
58 |     file_path = f"{f_path}/{logger_file_name}.log" if f_path != "" else f"logs/{date_format}/{date_format}_{time_format}/{logger_file_name}.log"
59 |     d_path = os.path.dirname(file_path)
60 |     _logger.debug(f"d_path: {d_path} file_path: {file_path}")
61 | 
62 |     if(not os.path.exists(d_path)):
63 |         os.makedirs(d_path)
64 | 
65 |     formatter = logging.Formatter("[%(asctime)s] %(levelname)s %(message)s",datefmt='%Y-%m-%d %H:%M:%S')
66 |     handler = logging.handlers.RotatingFileHandler(file_path, maxBytes=max_bytes, backupCount=backup_count)
67 |     handler.setFormatter(formatter)
68 | 
69 |     stream_handler = logging.StreamHandler(sys.stdout)
70 |     stream_handler.setFormatter(formatter)
71 | 
72 |     t_logger.addHandler(handler)
73 |     t_logger.addHandler(stream_handler)
74 | 
75 |     return t_logger, d_path
76 | 
77 | def run_cmd(cmd, timeout_s=900, verbose=False):
78 |     """ Run a command through subprocess.run()
79 | 
80 |     Args:
81 |         cmd (str): CMD to run
82 |         timeout_s (int, optional): Timeout of CMD. Defaults to 900.
83 |         verbose (bool, optional): Log the command and its output at debug level. Defaults to False
84 | 
85 |     Returns:
86 |         str: Captured stdout of the command (stderr is merged into stdout)
87 |     """
88 | 
89 |     cmd = shlex.split(cmd)
90 |     result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=timeout_s)
91 | 
92 |     if (verbose):
93 |         _logger.debug(f"Running cmd: {cmd}")
94 |         _logger.debug(result.stdout)
95 | 
96 |     return result.stdout
97 | 
98 | def download_repos():
99 |     """ Download the HCCL_DEMO repo to assist in health checks
100 |     """
101 |     if not os.path.exists("build"):
102 |         os.makedirs("build")
103 | 
104 |     if not os.path.exists("build/hccl_demo"):
105 |         _logger.info(f"Downloading hccl_demo into build/")
106 |         cmd = "git clone https://github.com/HabanaAI/hccl_demo.git build/hccl_demo"
107 |         run_cmd(cmd)
108 | 
109 |     os.environ["MPI"]="1"
110 |     cmd = "make -C build/hccl_demo"
111 |     run_cmd(cmd)
112 | 
113 | def copy_files(src, dst, to_remote=True, hosts=[], exclude={}):
114 |     """ Copies files through rsync from src to dst over the list of hosts
115 | 
116 |     Args:
117 |         src (str): Source file/directory to copy
118 |         dst (str): Destination to copy files/directory
119 |         to_remote (bool, optional): rsync to remote destination (src -> host:dst). False will rsync from the remote source (host:src -> dst). Defaults to True.
120 |         hosts (list, optional): List of IP Addresses to copy to/from. Defaults to [].
121 |         exclude (dict, optional): Files/Directory to ignore. Follows rsync rules for exclusions. Defaults to {}.
122 |     """
123 |     rsync_cmd = f"rsync -ahzgop --exclude={exclude}"
124 | 
125 |     for h in hosts:
126 |         if (to_remote):
127 |             src_path = src
128 |             dst_path = f"{h}:{dst}"
129 |         else:
130 |             src_path = f"{h}:{src}"
131 |             dst_path = dst
132 | 
133 |         _logger.debug(f"Copying {src_path} to {dst_path}")
134 |         cmd = f"{rsync_cmd} {src_path} {dst_path}"
135 |         output = run_cmd(cmd)
136 | 
137 | 
138 | def clear_job(job):
139 |     """ Clear MPIJobs based on Job Name
140 | 
141 |     Args:
142 |         job (str): Job Name to delete
143 |     """
144 |     _logger.info(f"Checking for existing MPIJobs {job}")
145 |     cmd = f"kubectl get mpijobs -n default {job} -o=custom-columns='NAME:.metadata.name' --no-headers"
146 |     output = run_cmd(cmd)
147 | 
148 |     if job in output:
149 |         _logger.info(f"Found MPIJobs {job}. Will delete.")
150 |         cmd = f"kubectl delete mpijobs -n default {job}"
151 |         output = run_cmd(cmd)
152 | 
153 |     cmd = f"kubectl get pods -n default --selector=training.kubeflow.org/job-name={job} -o=custom-columns='NAME:.metadata.name' --no-headers"
154 | 
155 |     max_attempt = 15
156 |     for attempts in range(max_attempt):
157 |         output = run_cmd(cmd).strip()
158 | 
159 |         if(len(output) == 0):
160 |             break
161 | 
162 |         _logger.info(f"Attempt {attempts} Pods are still up. Will wait 10 seconds to check again")
163 |         time.sleep(10)
164 | 
--------------------------------------------------------------------------------
/utils/intel_gaudi_health_screen/version.txt:
--------------------------------------------------------------------------------
1 | 2.2.2
--------------------------------------------------------------------------------