├── utils ├── intel_gaudi_health_screen │ ├── version.txt │ ├── hostfile │ ├── .gitignore │ ├── template │ │ ├── bare-metal │ │ │ ├── dockerfile │ │ │ ├── intel-gaudi-docker-compose-L2-worker.yaml │ │ │ ├── intel-gaudi-docker-compose-L2-launcher.yaml │ │ │ ├── intel-gaudi-docker-compose-L1.yaml │ │ │ └── run_hccl_demo.sh │ │ └── k8s │ │ │ ├── intel-gaudi-health-screen-L1.yaml │ │ │ └── intel-gaudi-health-screen-L2_hccl-demo.yaml │ ├── run_ighs.sh │ ├── config.yaml │ ├── utilities.py │ ├── hccl_demo_helper.py │ ├── IGNodes.py │ ├── screen.py │ └── HealthReport.py ├── check_framework_env.py └── README.md ├── dockerfiles ├── triton_vllm_backend │ ├── samples │ │ ├── prompts.txt │ │ ├── test_models │ │ │ ├── llama7b_1x │ │ │ │ ├── 1 │ │ │ │ │ └── model.json │ │ │ │ └── config.pbtxt │ │ │ ├── llama70b_8x │ │ │ │ ├── 1 │ │ │ │ │ └── model.json │ │ │ │ └── config.pbtxt │ │ │ └── qwen_7b_chat │ │ │ │ ├── 1 │ │ │ │ └── model.json │ │ │ │ └── config.pbtxt │ │ ├── model_repository │ │ │ └── vllm_model │ │ │ │ ├── 1 │ │ │ │ └── model.json │ │ │ │ └── config.pbtxt │ │ └── client.py │ ├── Makefile │ └── Dockerfile ├── triton │ ├── Makefile │ └── Dockerfile ├── base │ ├── Makefile │ ├── install-python310.sh │ ├── install_efa.sh │ ├── Dockerfile.ubuntu22.04 │ ├── Dockerfile.ubuntu24.04 │ ├── Dockerfile.ubuntu22.04-py311 │ ├── tencentos_efa_patch.txt │ ├── Dockerfile.opencloudos9.2 │ ├── Dockerfile.navix9.4 │ ├── Dockerfile.tencentos3.1 │ ├── Dockerfile.rhel9.6 │ ├── Dockerfile.rhel9.4-py312 │ ├── Dockerfile.rhel9.4 │ └── LICENSE ├── pytorch │ ├── install_packages.sh │ ├── Makefile │ ├── Dockerfile.ubuntu22.04-py311 │ ├── Dockerfile.opencloudos9.2 │ ├── Dockerfile.navix9.4 │ ├── Dockerfile.rhel9.4 │ ├── Dockerfile.rhel9.6 │ ├── Dockerfile.rhel9.4-py312 │ ├── Dockerfile.ubuntu │ └── Dockerfile.tencentos3.1 ├── common.mk └── README.md ├── README.md ├── legal-disclaimer.md └── LICENSE /utils/intel_gaudi_health_screen/version.txt: -------------------------------------------------------------------------------- 1 | 2.2.2 -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/hostfile: -------------------------------------------------------------------------------- 1 | sys-01 2 | sys-02 -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/.gitignore: -------------------------------------------------------------------------------- 1 | tmp/* 2 | build/* 3 | logs/* 4 | .graph_dump/* 5 | __pycache__* -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/prompts.txt: -------------------------------------------------------------------------------- 1 | Hello, my name is 2 | The most dangerous animal is 3 | The capital of France is 4 | The future of AI is 5 | -------------------------------------------------------------------------------- /dockerfiles/triton/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = triton-installer-$(PT_VERSION)-${BUILD_OS} 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | init: 8 | ifneq ($(BUILD_OS), ubuntu22.04) 9 | $(error triton is only supported on ubuntu22.04) 10 | endif 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/Dockerfile $(BUILD_DIR)/Dockerfile 13 | 14 | build: init 15 | -------------------------------------------------------------------------------- 
/dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-7b-hf", 3 | "tokenizer":"meta-llama/Llama-2-7b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-7b-hf", 3 | "tokenizer":"meta-llama/Llama-2-7b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-70b-hf", 3 | "tokenizer":"meta-llama/Llama-2-70b-hf", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 8, 11 | "max_num_batched_tokens": 8192 12 | } 13 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"Qwen/Qwen2-7B-Instruct", 3 | "tokenizer":"Qwen/Qwen2-7B-Instruct", 4 | "disable_log_requests": "false", 5 | "gpu_memory_utilization": 0.5, 6 | "enforce_eager": "true", 7 | "max_num_seqs": 512, 8 | "swap_space": 16, 9 | "dtype": "bfloat16", 10 | "tensor_parallel_size": 1, 11 | "max_num_batched_tokens": 131072, 12 | "chat_template": "true" 13 | } 14 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = triton-installer-$(PT_VERSION)-${BUILD_OS} 5 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 6 | 7 | init: 8 | ifneq ($(BUILD_OS), ubuntu22.04) 9 | $(error triton is only supported on ubuntu22.04) 10 | endif 11 | $(HIDE)mkdir -p $(BUILD_DIR) 12 | $(HIDE)cp $(CURDIR)/Dockerfile $(BUILD_DIR)/Dockerfile 13 | $(HIDE)cp $(CURDIR)/model.py $(BUILD_DIR)/model.py 14 | 15 | build: init 16 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | FROM ${BASE_IMAGE} 3 | 4 | RUN mkdir ~/.ssh && \ 5 | cd ~/.ssh && \ 6 | ssh-keygen -A && \ 7 | sed -i 's/#Port 22/Port 3122/g' /etc/ssh/sshd_config && \ 8 | sed -i 's/# Port 22/ Port 3122/g' /etc/ssh/ssh_config && \ 9 | sed -i 's/3022/3122/g' ~/.bashrc && \ 10 | echo "Host *" >> ~/.ssh/config && \ 11 | echo "ForwardAgent yes" >> 
~/.ssh/config && \ 12 | echo "StrictHostKeyChecking no" >> ~/.ssh/config && \ 13 | echo "UserKnownHostsFile /dev/null" >> ~/.ssh/config && \ 14 | echo "LogLevel ERROR" >> ~/.ssh/config && \ 15 | service ssh start && \ 16 | chmod 600 ~/.ssh/config 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level2_worker: 3 | image: ighs_level2 4 | build: 5 | context: . 6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level2_worker 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=2 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | tty: true 26 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/run_ighs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | LOG_DIR=logs/$(date +'%m-%Y/%m-%d-%Y/%m-%d-%Y_%H-%M') 15 | 16 | python3 screen.py --initialize --logs-dir $LOG_DIR; 17 | python3 screen.py --screen --logs-dir $LOG_DIR; 18 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level2_launcher: 3 | image: ighs_level2 4 | build: 5 | context: . 6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level2_launcher 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=2 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | command: > 26 | template/bare-metal/run_hccl_demo.sh -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | ighs_level1: 3 | image: ighs_level1 4 | build: 5 | context: . 
6 | network: host 7 | args: 8 | BASE_IMAGE: "${BASE_IMAGE}" 9 | container_name: ighs_level1 10 | runtime: habana 11 | environment: 12 | - HABANA_VISIBLE_DEVICES=all 13 | - OMPI_MCA_btl_vader_single_copy_mechanism=none 14 | - IGHS_LEVEL=1 15 | cap_add: 16 | - SYS_NICE 17 | - SYSLOG 18 | ipc: host 19 | network_mode: host 20 | working_dir: /tmp/ighs/intel_gaudi_health_screen 21 | volumes: 22 | - ./ssh:/root/.ssh/ 23 | - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen 24 | - /etc/localtime:/etc/localtime:ro 25 | command: > 26 | bash -c "python screen.py --ighs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} && \ 27 | chmod 777 -R $${LOG_DIR}" 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel® Gaudi® Accelerator Setup and Installation 2 | 3 |
4 | 5 | --- 6 | 7 |
8 | 9 | By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). 10 | 11 |
12 | 13 | --- 14 | 15 |
16 | 17 | ## Overview 18 | 19 | Welcome to the Setup and Installation GitHub repository! 20 | 21 | The full installation documentation has been consolidated into the Installation Guide in our Intel Gaudi documentation. Please refer to the [Intel Gaudi docs](https://docs.habana.ai/en/latest/Installation_Guide/GAUDI_Installation_Guide.html) for the full installation guide. 22 | 23 | This repository contains the following references: 24 | - dockerfiles -- Reference Dockerfiles and build scripts to build Gaudi Docker images 25 | - utils -- Reference utility scripts 26 | -------------------------------------------------------------------------------- /dockerfiles/base/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = base-installer-${BUILD_OS} 5 | ifdef CUSTOM_PYTHON_VERSION 6 | IMAGE_NAME = base-installer-${BUILD_OS}-${PYTHON_SUFFIX} 7 | endif 8 | 9 | ifdef REPO_NAME 10 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg REPO_NAME=$(REPO_NAME) 11 | endif 12 | 13 | check_base: 14 | ifeq ($(BUILD_OS),navix9.4) 15 | ifneq ($(shell $(DOCKER) image inspect navix-container-base-9.4-20241121.0.x86_64 --format="image_exists" 2>/dev/null), image_exists) 16 | wget https://dlnavix.navercorp.com/cloud-images/Navix-Container-Base-9.4-20241121.0.x86_64.tar.xz 17 | docker load -i Navix-Container-Base-9.4-20241121.0.x86_64.tar.xz 18 | rm -f Navix-Container-Base-9.4-20241121.0.x86_64.tar.xz 19 | endif 20 | endif 21 | 22 | init: check_base 23 | $(HIDE)mkdir -p $(BUILD_DIR) 24 | $(HIDE)cp $(CURDIR)/LICENSE $(BUILD_DIR)/ 25 | $(HIDE)cp $(CURDIR)/*.sh $(BUILD_DIR)/ 26 | $(HIDE)cp $(CURDIR)/tencentos_efa_patch.txt $(BUILD_DIR)/ 27 | ifdef CUSTOM_PYTHON_VERSION 28 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS)-${PYTHON_SUFFIX} $(BUILD_DIR)/Dockerfile 29 | else 30 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS) $(BUILD_DIR)/Dockerfile 31 | endif 32 | 33 | build: init 34 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/install_packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | PYTHON_SUFFIX="${PYTHON_SUFFIX:-}" 4 | TORCH_TYPE="${TORCH_TYPE:-fork}" 5 | PYPI_URL="${PYPI_URL:-https://pypi.org/simple/}" 6 | if [ -z "$PYTHON_SUFFIX" ]; then 7 | PT_PACKAGE_NAME="pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz" 8 | else 9 | PT_PACKAGE_NAME="pytorch_modules_${PYTHON_SUFFIX}-v${PT_VERSION}_${VERSION}_${REVISION}.tgz" 10 | fi 11 | OS_STRING="ubuntu${OS_NUMBER}" 12 | case "${BASE_NAME}" in 13 | *rhel9.4*) 14 | OS_STRING="rhel94" 15 | ;; 16 | *rhel9.6*) 17 | OS_STRING="rhel96" 18 | ;; 19 | *tencentos*) 20 | OS_STRING="tencentos31" 21 | ;; 22 | *opencloudos9*) 23 | OS_STRING="opencloudos92" 24 | ;; 25 | *navix9*) 26 | OS_STRING="navix94" 27 | ;; 28 | esac 29 | PT_ARTIFACT_PATH="https://${ARTIFACTORY_URL}/artifactory/gaudi-pt-modules/${VERSION}/${REVISION}/pytorch/${OS_STRING}" 30 | 31 | TMP_PATH=$(mktemp --directory) 32 | wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" 33 | tar -zxf "${PT_PACKAGE_NAME}" -C "${TMP_PATH}"/.
34 | pushd "${TMP_PATH}" 35 | PYTHON_INDEX_URL="--extra-index-url ${PYPI_URL}" ./install.sh $VERSION $REVISION $TORCH_TYPE 36 | popd 37 | 38 | rm -rf "${TMP_PATH}" "${PT_PACKAGE_NAME}" 39 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/config.yaml: -------------------------------------------------------------------------------- 1 | # Sets IGHS to screen a K8s or Bare Metal environment (k8s, bare-metal). k8s does not require any system info 2 | system-info: 3 | type: "k8s" 4 | # Namespace is only required for k8s settings 5 | namespace: "intelgaudi" 6 | 7 | # Can specify specific systems via a hostfile. For k8s, comment out the hostfile to scan the entire cluster 8 | # hostfile: "./hostfile" 9 | 10 | # Bare Metal Configurations 11 | ssh-path: "./ssh" 12 | tcp-interface: "10.3.124.0/24" 13 | 14 | # Image to run Intel Gaudi Health Screen 15 | image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" 16 | 17 | # Node Label used to identify an Intel Gaudi Node 18 | gaudi-node-label: "habana.ai/gaudi=NoSchedule" 19 | 20 | # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) 21 | log-level: "DEBUG" 22 | 23 | # Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure, Device Temperature) 24 | level-1: 25 | run: true 26 | timeout_s: 150 27 | # Number of times to check Port Status 28 | num-checks-link-state: 12 29 | 30 | # Level 2 - Checks All Reduce between node pairs in the cluster. 31 | level-2: 32 | run: true 33 | timeout_s: 130 34 | # Number of times to check Network connections between nodes 35 | num-rounds: 5 36 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | include ../common.mk 3 | 4 | IMAGE_NAME = pytorch-installer-$(BUILD_OS)-$(PT_VERSION) 5 | ifdef CUSTOM_PYTHON_VERSION 6 | IMAGE_NAME = pytorch-installer-$(BUILD_OS)-$(PT_VERSION)-$(PYTHON_SUFFIX) 7 | endif 8 | 9 | ifdef TORCH_TYPE 10 | ifeq ($(TORCH_TYPE),upstream) 11 | IMAGE_NAME = pytorch-upstream-installer-$(BUILD_OS)-$(PT_VERSION) 12 | endif 13 | endif 14 | 15 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) 16 | 17 | base: 18 | ifneq ($(shell $(DOCKER) image inspect $(BASE_IMAGE_URL):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) --format="image_exists" 2>/dev/null), image_exists) 19 | cd ../base; \ 20 | make build; \ 21 | cd ../pytorch 22 | endif 23 | 24 | init: base 25 | $(HIDE)mkdir -p $(BUILD_DIR) 26 | $(HIDE)cp $(CURDIR)/install_packages.sh $(BUILD_DIR)/ 27 | ifneq (,$(findstring ubuntu,$(BUILD_OS))) 28 | ifdef CUSTOM_PYTHON_VERSION 29 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS)-$(PYTHON_SUFFIX) $(BUILD_DIR)/Dockerfile 30 | else 31 | $(HIDE)cp $(CURDIR)/Dockerfile.ubuntu $(BUILD_DIR)/Dockerfile 32 | endif 33 | else 34 | ifdef CUSTOM_PYTHON_VERSION 35 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS)-${PYTHON_SUFFIX} $(BUILD_DIR)/Dockerfile 36 | else 37 | $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS) $(BUILD_DIR)/Dockerfile 38 | endif 39 | endif 40 | 41 | build: init 42 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.ubuntu22.04-py311: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd.
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for Ubuntu22.04 with python 3.11 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 19 | 20 | RUN apt-get update && apt-get install -y --no-install-recommends \ 21 | curl \ 22 | iproute2 \ 23 | jq \ 24 | libcairo2-dev \ 25 | libcurl4 \ 26 | libglib2.0-dev \ 27 | libgoogle-perftools-dev \ 28 | libhdf5-dev \ 29 | libjpeg-dev \ 30 | liblapack-dev \ 31 | libnuma-dev \ 32 | libopenblas-dev \ 33 | libpcre2-dev \ 34 | libselinux1-dev \ 35 | moreutils \ 36 | numactl \ 37 | pdsh && \ 38 | apt-get autoremove && rm -rf /var/lib/apt/lists/* 39 | 40 | COPY install_packages.sh . 41 | 42 | RUN PYTHON_SUFFIX=py311 PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 43 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 44 | 45 | ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 46 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_NODES="${NUM_NODES:-1}"; 4 | HOME_DIR="${HOME_DIR:-/tmp/ighs/intel_gaudi_health_screen}"; 5 | WORK_DIR="${WORK_DIR:-/tmp/ighs/intel_gaudi_health_screen/build/hccl_demo}"; 6 | 7 | NGPU_PER_NODE=8; 8 | N_CARDS=$((NUM_NODES*NGPU_PER_NODE)); 9 | 10 | cd ${WORK_DIR}; 11 | CMD="python ${WORK_DIR}/run_hccl_demo.py \ 12 | --test all_reduce \ 13 | --loop 1000 \ 14 | --size 32m \ 15 | -clean \ 16 | -mpi "; 17 | 18 | mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/; 19 | cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 20 | touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 21 | echo "Target Nodes: $TARGET_NODES" >> $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 22 | 23 | $CMD \ 24 | -np ${N_CARDS} \ 25 | --allow-run-as-root \ 26 | --bind-to core \ 27 | --map-by ppr:4:socket:PE=6 \ 28 | --rank-by core --report-bindings \ 29 | --tag-output \ 30 | --merge-stderr-to-stdout --prefix $MPI_ROOT \ 31 | -H ${TARGET_NODES//,/:48,}:48 \ 32 | --mca btl_tcp_if_include $TCP_INTERFACE \ 33 | -x MASTER_ADDR \ 34 | -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ 35 | -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ 36 | 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 37 | 38 | cd ${HOME_DIR}; 39 | python $HOME_DIR/screen.py --ighs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --round $ROUND; 40 | 41 | chmod 777 -R $HOME_DIR/$LOG_DIR 42 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.opencloudos9.2: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for OpenCloudOS 9.2 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | LABEL name="PyTorch Installer" 19 | LABEL summary="Habanalabs PyTorch installer layer for OpenCloudOS 9.2" 20 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 21 | 22 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 23 | 24 | RUN dnf update --nobest -y && dnf install --nobest --allowerasing -y \ 25 | cairo-devel \ 26 | curl \ 27 | gperftools-devel \ 28 | iproute \ 29 | jq \ 30 | lapack-devel \ 31 | numactl \ 32 | numactl-devel \ 33 | openblas-devel \ 34 | which \ 35 | zlib-devel && \ 36 | dnf clean all 37 | 38 | COPY install_packages.sh . 39 | 40 | RUN PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 41 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 42 | 43 | # Set LD_PRELOAD after all required installations to 44 | # avoid warnings during docker creation 45 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 46 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 47 | 48 | RUN rm -f /etc/yum.repos.d/habana.repo -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.navix9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for Navix 9.4 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | LABEL name="PyTorch Installer" 19 | LABEL summary="Habanalabs PyTorch installer layer for Navix 9.4" 20 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 21 | 22 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 23 | 24 | RUN dnf update --nobest -y && dnf install --nobest --nodocs --setopt=install_weak_deps=false --allowerasing -y \ 25 | cairo-devel \ 26 | gperftools-devel \ 27 | iproute \ 28 | jq \ 29 | lapack-devel \ 30 | numactl \ 31 | numactl-devel \ 32 | openblas-devel \ 33 | which \ 34 | zlib-devel && \ 35 | dnf clean all 36 | 37 | COPY install_packages.sh . 38 | 39 | RUN PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 40 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 41 | 42 | # Set LD_PRELOAD after all required installations to 43 | # avoid warnings during docker creation 44 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 45 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 46 | 47 | RUN rm -f /etc/yum.repos.d/mirror.repo -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.4 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | LABEL name="PyTorch Installer" 19 | LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4" 20 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 21 | 22 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth 23 | 24 | RUN dnf update --nobest -y && dnf install --nobest --nodocs --setopt=install_weak_deps=false --allowerasing -y \ 25 | cairo-devel \ 26 | gperftools-devel \ 27 | iproute \ 28 | jq \ 29 | lapack-devel \ 30 | numactl \ 31 | numactl-devel \ 32 | openblas-devel \ 33 | which \ 34 | zlib-devel && \ 35 | dnf clean all 36 | 37 | COPY install_packages.sh . 38 | 39 | RUN PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 40 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 41 | 42 | # Set LD_PRELOAD after all required installations to 43 | # avoid warnings during docker creation 44 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 45 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel9.6: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.6 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | LABEL name="PyTorch Installer" 19 | LABEL summary="Habanalabs PyTorch installer layer for RHEL9.6" 20 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 21 | 22 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth 23 | 24 | RUN dnf update --nobest -y && dnf install --nobest --nodocs --setopt=install_weak_deps=false --allowerasing -y \ 25 | cairo-devel \ 26 | gperftools-devel \ 27 | iproute \ 28 | jq \ 29 | lapack-devel \ 30 | numactl \ 31 | numactl-devel \ 32 | openblas-devel \ 33 | which \ 34 | zlib-devel && \ 35 | dnf clean all 36 | 37 | COPY install_packages.sh . 38 | 39 | RUN PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 40 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 41 | 42 | # Set LD_PRELOAD after all required installations to 43 | # avoid warnings during docker creation 44 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 45 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.rhel9.4-py312: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.4 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | LABEL name="PyTorch Installer" 19 | LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4" 20 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 21 | 22 | RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth 23 | 24 | RUN dnf update --nobest -y && dnf install --nobest --nodocs --setopt=install_weak_deps=false --allowerasing -y \ 25 | cairo-devel \ 26 | gperftools-devel \ 27 | iproute \ 28 | jq \ 29 | lapack-devel \ 30 | numactl \ 31 | numactl-devel \ 32 | openblas-devel \ 33 | which \ 34 | zlib-devel && \ 35 | dnf clean all 36 | 37 | COPY install_packages.sh . 38 | 39 | RUN PYTHON_SUFFIX=py312 PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 40 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 41 | 42 | # Set LD_PRELOAD after all required installations to 43 | # avoid warnings during docker creation 44 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 45 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.ubuntu: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for Ubuntu22.04 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 19 | 20 | RUN apt-get update && apt-get install -y --no-install-recommends \ 21 | curl \ 22 | iproute2 \ 23 | jq \ 24 | libcurl4 \ 25 | libgoogle-perftools-dev \ 26 | libhdf5-dev \ 27 | libjpeg-dev \ 28 | liblapack-dev \ 29 | libnuma-dev \ 30 | libopenblas-dev \ 31 | moreutils \ 32 | numactl \ 33 | pdsh && \ 34 | apt-get autoremove && rm -rf /var/lib/apt/lists/* 35 | 36 | RUN bash -c "\ 37 | case $BASE_NAME in \ 38 | *ubuntu22.04*) \ 39 | update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \ 40 | ;; \ 41 | *ubuntu24.04*) \ 42 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 \ 43 | ;; \ 44 | esac" 45 | 46 | COPY install_packages.sh . 47 | 48 | RUN PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 49 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 50 | 51 | ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 52 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /legal-disclaimer.md: -------------------------------------------------------------------------------- 1 | ## Legal Notice and Disclaimer 2 | 3 | No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document. 
4 | 5 | Habana Labs disclaims all warranties, including without limitation, the implied warranties of merchantability, fitness for a particular purpose, and non-infringement, as well as any warranty arising from course of performance, course of dealing, or usage in trade. 6 | 7 | All information provided here is subject to change without notice. Habana Labs may make changes to its test conditions and internal reliability goals at any time. Contact your Habana Labs representative to obtain the latest Habana Labs product specifications and roadmaps. Your costs and results may vary. 8 | 9 | The products described may contain design defects or errors known as errata which may cause the product to deviate from published specifications. Current characterized errata are available on request. 10 | 11 | Software and workloads used in performance tests may have been optimized for performance only on Habana Labs hardware. Performance tests, such as SYSmark and MobileMark, are measured using specific computer systems, components, software, operations and functions. Any change to any of those factors may cause the results to vary. You should consult other information and performance tests to assist you in fully evaluating your contemplated purchases, including the performance of that product when combined with other products. 12 | 13 | No product or component can be absolutely secure. 14 | 15 | Habana Labs, Gaudi and SynapseAI are trademarks of Habana Labs in the U.S. and/or other countries. 16 | 17 | *Other names and brands may be claimed as the property of others. 18 | 19 | © 2021 Habana Labs 20 | -------------------------------------------------------------------------------- /dockerfiles/common.mk: -------------------------------------------------------------------------------- 1 | VERBOSE ?= FALSE 2 | DOCKER ?= docker 3 | DOCKER_CACHE ?= FALSE 4 | BUILD_OS ?= ubuntu22.04 5 | BUILD_DIR ?= $(CURDIR)/dockerbuild 6 | CUSTOM_PYTHON_VERSION ?= 7 | PYTHON_SUFFIX=py$(subst .,,$(CUSTOM_PYTHON_VERSION)) 8 | TORCH_TYPE ?= fork 9 | 10 | REPO_SERVER ?= vault.habana.ai 11 | PT_VERSION ?= 2.7.1 12 | RELEASE_VERSION ?= 1.22.2 13 | RELEASE_BUILD_ID ?= 32 14 | PYPI_URL ?= https://pypi.org/simple/ 15 | 16 | BASE_IMAGE_URL ?= base-installer-$(BUILD_OS) 17 | ifdef CUSTOM_PYTHON_VERSION 18 | BASE_IMAGE_URL = base-installer-$(BUILD_OS)-${PYTHON_SUFFIX} 19 | endif 20 | 21 | IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) 22 | 23 | DOCKER_BUILD_ARGS := --build-arg ARTIFACTORY_URL=$(REPO_SERVER) --build-arg VERSION=$(RELEASE_VERSION) --build-arg REVISION=$(RELEASE_BUILD_ID) --build-arg BASE_NAME=$(BASE_IMAGE_URL) --build-arg PYPI_URL=$(PYPI_URL) 24 | 25 | ifdef TORCH_TYPE 26 | DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg TORCH_TYPE=$(TORCH_TYPE) 27 | endif 28 | 29 | # Hide or not the calls depending of VERBOSE 30 | ifeq ($(VERBOSE),TRUE) 31 | HIDE = 32 | else 33 | HIDE = @ 34 | endif 35 | 36 | # Use cache for build depending of DOCKER_CACHE 37 | ifeq ($(DOCKER_CACHE),TRUE) 38 | CACH_FLAG = 39 | else 40 | CACH_FLAG = --no-cache 41 | endif 42 | 43 | .PHONY: help build clean 44 | 45 | help: ## Prints this help. 
46 | @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) 47 | 48 | .DEFAULT_GOAL := help 49 | 50 | clean: ## clean the build dir 51 | $(HIDE)rm -rf $(BUILD_DIR) 52 | 53 | build: ## build docker image 54 | @echo Building image - $(IMAGE_NAME) 55 | $(HIDE)$(DOCKER) build --network=host $(CACH_FLAG) --tag $(IMAGE_URL) $(DOCKER_BUILD_ARGS) $(BUILD_DIR) 56 | @echo -n $(IMAGE_URL) | tee $(BUILD_DIR)/image_name 57 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 
28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | # The usage of device is deferred to the vLLM engine 31 | instance_group [ 32 | { 33 | count: 1 34 | kind: KIND_MODEL 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /dockerfiles/pytorch/Dockerfile.tencentos3.1: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile PyTorch installer layer for TencentOS 3.1 6 | ARG BASE_NAME 7 | ARG VERSION 8 | ARG REVISION 9 | FROM ${BASE_NAME}:${VERSION}-${REVISION} 10 | ARG PT_VERSION 11 | ARG VERSION 12 | ARG REVISION 13 | ARG BASE_NAME 14 | ARG ARTIFACTORY_URL 15 | ARG TORCH_TYPE 16 | ARG PYPI_URL 17 | 18 | LABEL name="PyTorch Installer" 19 | LABEL summary="Habanalabs PyTorch installer layer for Tencentos 3.1" 20 | LABEL description="Image with pre installed Habanalabs packages for PyTorch" 21 | 22 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 23 | 24 | RUN dnf versionlock add openmpi* perftest* 25 | 26 | RUN dnf install -y \ 27 | cairo-devel \ 28 | curl \ 29 | gcc-toolset-11 \ 30 | gperftools-devel \ 31 | iproute \ 32 | jq \ 33 | lapack-devel \ 34 | numactl \ 35 | numactl-devel \ 36 | openblas-devel \ 37 | libevent \ 38 | pdsh \ 39 | which \ 40 | zlib-devel && \ 41 | dnf clean all 42 | 43 | # Configure GCC 11 44 | ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:${PATH} 45 | ENV MANPATH=/opt/rh/gcc-toolset-11/root/usr/share/man:${MANPATH} 46 | ENV INFOPATH=/opt/rh/gcc-toolset-11/root/usr/share/info:${INFOPATH} 47 | ENV PCP_DIR=/opt/rh/gcc-toolset-11/root 48 | ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.5a1/lib64:/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:/opt/rh/gcc-toolset-11/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH} 49 | ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:/usr/mpi/gcc/openmpi-4.1.5a1/lib64/pkgconfig:${PKG_CONFIG_PATH} 50 | ENV CMAKE_PREFIX_PATH=/usr/mpi/gcc/openmpi-4.1.5a1/include:${CMAKE_PREFIX_PATH} 51 | 52 | COPY install_packages.sh . 53 | 54 | RUN PYPI_URL=${PYPI_URL} ./install_packages.sh && rm -f install_packages.sh && \ 55 | /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc 56 | 57 | ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 58 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 59 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: template-metadata-name 5 | namespace: default 6 | labels: 7 | app: ighs 8 | spec: 9 | template: 10 | metadata: 11 | labels: 12 | app: ighs 13 | spec: 14 | restartPolicy: "Never" 15 | affinity: 16 | nodeAffinity: 17 | requiredDuringSchedulingIgnoredDuringExecution: 18 | nodeSelectorTerms: 19 | - matchExpressions: 20 | - key: kubernetes.io/hostname 21 | operator: In 22 | values: 23 | - IGHS-DUMMY-VAL 24 | volumes: 25 | - name: mydir 26 | emptyDir: {} 27 | tolerations: 28 | - key: "" 29 | operator: "Exists" 30 | effect: "NoSchedule" 31 | containers: 32 | - name: template-container-name 33 | image: template-container-image 34 | imagePullPolicy: IfNotPresent 35 | workingDir: /workdir 36 | command: ["/bin/bash", "-c"] 37 | args: 38 | - >- 39 | ssh-keygen -A; 40 | service ssh start; 41 | 42 | while [ !
-d /workdir/intel_gaudi_health_screen ]; do 43 | sleep 2s; 44 | done; 45 | sleep 10s; 46 | 47 | cd /workdir/intel_gaudi_health_screen; 48 | python /workdir/intel_gaudi_health_screen/screen.py --ighs-check node --logs-dir $LOG_DIR; 49 | volumeMounts: 50 | - name: mydir 51 | mountPath: /workdir 52 | securityContext: 53 | capabilities: 54 | add: 55 | - SYSLOG 56 | env: 57 | - name: IGHS_LEVEL 58 | value: "1" 59 | - name: MY_POD_IP 60 | valueFrom: 61 | fieldRef: 62 | fieldPath: status.podIP 63 | - name: MY_NODE_NAME 64 | valueFrom: 65 | fieldRef: 66 | fieldPath: spec.nodeName 67 | - name: MY_POD_NAMESPACE 68 | valueFrom: 69 | fieldRef: 70 | fieldPath: metadata.namespace 71 | resources: 72 | limits: 73 | habana.ai/gaudi: 8 74 | hugepages-2Mi: 29000Mi 75 | memory: 200Gi 76 | cpu: 95 77 | requests: 78 | habana.ai/gaudi: 8 79 | hugepages-2Mi: 29000Mi 80 | memory: 200Gi 81 | cpu: 95 82 | -------------------------------------------------------------------------------- /utils/check_framework_env.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (C) 2022 Habana Labs, Ltd. an Intel Company 3 | # All Rights Reserved. 4 | # 5 | # Unauthorized copying of this file or any element(s) within it, via any medium 6 | # is strictly prohibited. 7 | # This file contains Habana Labs, Ltd. proprietary and confidential information 8 | # and is subject to the confidentiality and license agreements under which it 9 | # was provided. 10 | # 11 | ############################################################################### 12 | 13 | import argparse 14 | import os 15 | import concurrent.futures 16 | 17 | def parse_arguments(): 18 | parser = argparse.ArgumentParser(description="Check health of Intel Gaudi for PyTorch") 19 | 20 | parser.add_argument("--cards", 21 | default=1, 22 | type=int, 23 | required=False, 24 | help="Set number of cards to test (default: 1)") 25 | 26 | args = parser.parse_args() 27 | print(f"Configuration: {args}") 28 | 29 | return args 30 | 31 | def pytorch_test(device_id=0): 32 | """ Checks health of Intel Gaudi through running a basic 33 | PyTorch example on Intel Gaudi 34 | 35 | Args: 36 | device_id (int, optional): ID of Intel Gaudi. Defaults to 0. 
37 | """ 38 | 39 | os.environ["HLS_MODULE_ID"] = str(device_id) 40 | os.environ["HABANA_VISIBLE_MODULES"] = str(device_id) 41 | 42 | try: 43 | import torch 44 | import habana_frameworks.torch.core 45 | except Exception as e: 46 | print(f"Card {device_id} Failed to initialize Intel Gaudi PyTorch: {str(e)}") 47 | raise 48 | 49 | try: 50 | x = torch.tensor([2]).to('hpu') 51 | y = x + x 52 | 53 | assert y == 4, 'Sanity check failed: Wrong Add output' 54 | assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card' 55 | except (RuntimeError, AssertionError) as e: 56 | print(f"Card Module ID {device_id} Failure: {e}") 57 | raise 58 | 59 | return device_id 60 | 61 | if __name__ == '__main__': 62 | args = parse_arguments() 63 | passed_cards = set() 64 | 65 | with concurrent.futures.ProcessPoolExecutor() as executor: 66 | futures = [executor.submit(pytorch_test, device_id) for device_id in range(args.cards)] 67 | for future in concurrent.futures.as_completed(futures): 68 | try: 69 | dev_id = future.result() 70 | passed_cards.add(dev_id) 71 | print(f"Card module_id {dev_id} PASSED") 72 | 73 | except Exception as e: 74 | print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") 75 | 76 | failed_cards = set(range(args.cards)) - passed_cards 77 | 78 | print(f"Failed cards Module ID: {failed_cards}") 79 | print(f"Passed cards Module ID: {passed_cards}") -------------------------------------------------------------------------------- /dockerfiles/base/install-python310.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | _BASE_NAME=${1:-"ubuntu22.04"} 5 | _SSL_LIB="" 6 | 7 | # preinstall dependencies and define variables 8 | case "${_BASE_NAME}" in 9 | *ubuntu22.04* | *ubuntu24.04*) 10 | echo "Skip installation of Python 3.10 from sources on Ubuntu 22.04 and Ubuntu 24.04" 11 | exit 0; 12 | ;; 13 | *rhel*) 14 | dnf install -y sqlite-devel readline-devel xz-devel 15 | ;; 16 | *tencentos3.1*) 17 | dnf install -y sqlite-devel readline-devel zlib-devel xz-devel bzip2-devel libffi-devel 18 | wget -nv -O /opt/openssl-1.1.1w.tar.gz https://github.com/openssl/openssl/releases/download/OpenSSL_1_1_1w/openssl-1.1.1w.tar.gz && \ 19 | cd /opt/ && \ 20 | tar xzf openssl-1.1.1w.tar.gz && \ 21 | rm -rf openssl-1.1.1w.tar.gz && \ 22 | cd openssl-1.1.1w && \ 23 | ./config --prefix=/usr/local/openssl-1.1.1w shared zlib && \ 24 | make && make install 25 | ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem 26 | 27 | PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin 28 | LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH 29 | _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w" 30 | ;; 31 | esac 32 | 33 | # install Python 34 | wget -nv -O /opt/Python-3.10.18.tgz https://www.python.org/ftp/python/3.10.18/Python-3.10.18.tgz 35 | cd /opt/ 36 | tar xzf Python-3.10.18.tgz 37 | rm -f Python-3.10.18.tgz 38 | cd Python-3.10.18 39 | ./configure --enable-optimizations --enable-loadable-sqlite-extensions --enable-shared $_SSL_LIB --with-ensurepip=no 40 | make -j && make altinstall 41 | 42 | # post install 43 | case "${_BASE_NAME}" in 44 | *rhel9*) 45 | alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 2 && \ 46 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 47 | alternatives --set python3 /usr/local/bin/python3.10 48 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 49 | ;; 50 | *tencentos3.1*) 51 | alternatives --install 
/usr/bin/python3 python3 /usr/local/bin/python3.10 4 && \ 52 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 3 && \ 53 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 54 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 55 | alternatives --install /usr/bin/unversioned-python unversioned-python /usr/bin/python3 10 && \ 56 | alternatives --install /usr/bin/python3-config python3-config /usr/local/bin/python3.10-config 1 && \ 57 | alternatives --set python3 /usr/local/bin/python3.10 && \ 58 | alternatives --set python3-config /usr/local/bin/python3.10-config && \ 59 | alternatives --set unversioned-python /usr/bin/python3 60 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 61 | PATH="/usr/local/bin:$PATH" 62 | ;; 63 | esac 64 | -------------------------------------------------------------------------------- /dockerfiles/base/install_efa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | DEFAULT_EFA_INSTALLER_VER=1.34.0 4 | efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER} 5 | 6 | tmp_dir=$(mktemp -d) 7 | wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-$efa_installer_version.tar.gz -P $tmp_dir 8 | tar -xf $tmp_dir/aws-efa-installer-$efa_installer_version.tar.gz -C $tmp_dir 9 | RUN_EFA_INSTALLER="./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify" 10 | pushd $tmp_dir/aws-efa-installer 11 | . /etc/os-release 12 | case $ID in 13 | navix) 14 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 15 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 16 | dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm 17 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on RHEL'" 18 | ;; 19 | opencloudos) 20 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 21 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 22 | rm -rf RPMS/ROCKYLINUX9/x86_64/rdma-core/python3-pyverbs*.rpm 23 | dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm 24 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on opencloudos'" 25 | ;; 26 | rhel) 27 | # we cannot install dkms packages on RHEL images due to OCP rules 28 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 29 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 30 | case $VERSION_ID in 31 | 9*) 32 | dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm 33 | ;; 34 | *) 35 | echo "Unsupported RHEL version: $VERSION_ID" 36 | exit 1 37 | ;; 38 | esac 39 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on RHEL'" 40 | ;; 41 | tencentos) 42 | # dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm 43 | find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; 44 | find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; 45 | rm -rf RPMS/ROCKYLINUX8/x86_64/rdma-core/rdma* 46 | patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch 47 | tmp_dir_ofed=$(mktemp -d) 48 | wget -O $tmp_dir_ofed/MLNX_OFED.tgz https://${ARTIFACTORY_URL}/artifactory/gaudi-installer/deps/MLNX_OFED_LINUX-5.8-3.0.7.0-rhel8.4-x86_64.tgz 49 | pushd $tmp_dir_ofed 50 | tar xf MLNX_OFED.tgz 51 | ofed_packages_path="mlnx-ofed" 52 | pushd mlnx-ofed 53 | yum install pciutils-libs tcsh tk python36 gcc-gfortran kernel-modules fuse-libs numactl-libs -y 54 | ./mlnxofedinstall --distro RHEL8.4 --skip-distro-check --user-space-only --skip-repo --force 55 | popd 56 | popd 57 | rm -rf $tmp_dir_ofed 58 | RUN_EFA_INSTALLER="echo 'Skipping EFA installer on tencentos'" 59 | ;; 60 | ubuntu) 61 | apt-get update 62 | ;; 63 | esac 64 | 65 
| eval $RUN_EFA_INSTALLER 66 | 67 | case $ID in 68 | ubuntu) 69 | apt-get autoremove && rm -rf /var/lib/apt/lists/* 70 | ;; 71 | esac 72 | 73 | popd 74 | rm -rf $tmp_dir 75 | -------------------------------------------------------------------------------- /dockerfiles/README.md: -------------------------------------------------------------------------------- 1 | # Gaudi Docker Images Builder 2 | 3 | ## Table of Contents 4 | - [Overview](#overview) 5 | - [Support matrix](#support-matrix) 6 | - [Build docker](#build-docker) 7 | 8 |
9 | 10 | --- 11 | 12 |
13 | 14 | ## Overview 15 | 16 | This folder contains the Gaudi Dockerfiles and Makefiles that can be used to build Habana Labs Docker images for Intel Gaudi. 17 | 18 |
19 | 20 | --- 21 | 22 |
23 | 24 | ## Support Matrix 25 | 26 | | BUILD_OS | Internal torch | Upstream torch | Custom python | 27 | |----------------|:--------------:|:--------------:|:-------------:| 28 | | ubuntu22.04 | Yes | Yes | 3.11 | 29 | | ubuntu24.04 | Yes | | | 30 | | rhel9.4 | Yes | Yes | 3.12 | 31 | | rhel9.6 | Yes | | | 32 | | tencentos3.1 | Yes | | | 33 | | opencloudos9.2 | Yes | | | 34 | | navix9.4 | Yes | | | 35 | 36 |
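The "Upstream torch" and "Custom python" columns correspond to the TORCH_TYPE and CUSTOM_PYTHON_VERSION build variables described in the [Build Docker](#build-docker) section below. As a sketch (based only on the flags documented in this README), the rhel9.4 row maps to:

```
cd pytorch
make build BUILD_OS=rhel9.4 TORCH_TYPE=upstream
make build BUILD_OS=rhel9.4 CUSTOM_PYTHON_VERSION=3.12
```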
37 | You can also build the triton-installer image, which is based on the ubuntu22.04 OS. 38 | 39 |
40 | 41 | --- 42 | 43 |
44 | 45 | ## Build Docker 46 | 47 | The following steps can be used as a reference to build Docker images for Gaudi. 48 | 49 | ### How to Build Docker Images from Habana Dockerfiles 50 | 51 | 1. Go into the folder of the image type you would like to build: 52 | * base 53 | * pytorch 54 | * triton 55 | 56 | 2. Run the build command to generate the Docker image 57 | ``` 58 | make build 59 | ``` 60 | Examples: 61 | #### Build pytorch image for rhel9.4: 62 | ``` 63 | cd pytorch 64 | make build BUILD_OS=rhel9.4 65 | ``` 66 | 67 | #### Build pytorch image for rhel9.4 with python3.12: 68 | ``` 69 | cd pytorch 70 | make build BUILD_OS=rhel9.4 CUSTOM_PYTHON_VERSION=3.12 71 | ``` 72 | 73 | #### Build pytorch image for ubuntu22.04 with upstream pytorch: 74 | ``` 75 | cd pytorch 76 | make build BUILD_OS=ubuntu22.04 TORCH_TYPE=upstream 77 | ``` 78 | 79 | #### Build triton image (default OS - ubuntu22.04): 80 | ``` 81 | cd triton 82 | make build 83 | ``` 84 | 85 | #### Build triton vllm backend (default OS - ubuntu22.04): 86 | ``` 87 | cd triton_vllm_backend 88 | make build BUILD_OS=ubuntu22.04 89 | ``` 90 | 91 | 3. Build command variables 92 | 93 | #### Optional Parameters 94 | * BUILD_OS - set the OS to build (default ubuntu22.04) 95 | * CUSTOM_PYTHON_VERSION - build the OS with a python version other than the default - available for ubuntu22.04 with python3.11 and rhel9.4 with python3.12 96 | * TORCH_TYPE - build the pytorch docker image with the upstream or fork (internal) torch version (default fork) 97 | * BUILD_DIR - the folder the build will be executed from (default: dockerbuild in the image folder) 98 | * VERBOSE - set to TRUE to echo the commands (default FALSE) 99 | * DOCKER_CACHE - set to TRUE to use the cache when building the docker image (default FALSE) 100 | 101 | 4. Instructions for the triton-vllm-backend server 102 | 103 | * Run the backend container as described in [habana docs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Triton_Inference.html?highlight=triton%20inference#run-the-backend-container) 104 | * Start the triton server 105 | ```bash 106 | tritonserver --model-repository samples/model_repository 107 | ``` 108 | The current samples/model_repository/vllm_model contains Llama 2 7B (1x). Sample model files for Llama 2 7B/70B and Qwen2-7B are also available under the samples/test_models folder. To use them, copy the corresponding model.json and config.pbtxt into the vllm_model folder structure. 109 | * To test with a client, follow the instructions [here](https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#sending-your-first-inference) -------------------------------------------------------------------------------- /dockerfiles/triton/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd.
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile triton installer layer for Ubuntu 22.04 6 | FROM nvcr.io/nvidia/tritonserver:23.12-py3 7 | ARG ARTIFACTORY_URL 8 | ARG PT_VERSION 9 | ARG VERSION 10 | ARG REVISION 11 | ARG PYPI_URL 12 | ARG HABANA_PIP_VERSION="22.3" 13 | ARG PT_BUILD_REPO=gaudi-pt-modules 14 | ARG PT_PACKAGE_NAME="pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz" 15 | ARG PT_ARTIFACT_PATH="https://"${ARTIFACTORY_URL}"/artifactory/${PT_BUILD_REPO}/"${VERSION}"/"${REVISION}"/pytorch/ubuntu2204" 16 | ARG PT_EXTRACT_PATH="/root/habanalabs/pytorch_temp" 17 | 18 | ENV DEBIAN_FRONTEND=noninteractive 19 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 20 | ENV HABANA_LOGS=/var/log/habana_logs/ 21 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 22 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 23 | ENV PIP_NO_CACHE_DIR=on 24 | ENV PIP_DEFAULT_TIMEOUT=1000 25 | ENV MPI_ROOT=/opt/hpcx/ompi 26 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 27 | ENV PATH=${MPI_ROOT}/bin:$PATH 28 | ENV OPAL_PREFIX=${MPI_ROOT} 29 | ENV MPICC=${MPI_ROOT}/bin/mpicc 30 | ENV RDMAV_FORK_SAFE=1 31 | ENV PYTHONPATH=/root:/usr/lib/habanalabs 32 | RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 33 | wget "https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public" && \ 34 | apt-key add public && rm public && apt-get update && \ 35 | apt-get install -y --no-install-recommends habanalabs-rdma-core="$VERSION"-"$REVISION" \ 36 | habanalabs-thunk="$VERSION"-"$REVISION" \ 37 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 38 | habanalabs-graph="$VERSION"-"$REVISION" && \ 39 | apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ 40 | sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 41 | 42 | RUN apt-get update && apt-get install -y --no-install-recommends \ 43 | libjemalloc2 \ 44 | libcairo2-dev \ 45 | libglib2.0-dev \ 46 | libhdf5-dev \ 47 | libnuma-dev \ 48 | libpcre2-dev \ 49 | libjpeg-dev \ 50 | liblapack-dev \ 51 | libopenblas-dev \ 52 | numactl \ 53 | libgoogle-perftools-dev && \ 54 | apt-get clean && rm -rf /var/lib/apt/lists/* 55 | 56 | RUN python3 -m pip install pip==24.2 --disable-pip-version-check && \ 57 | python3 -m pip install setuptools==75.1.0 --disable-pip-version-check && \ 58 | python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} --disable-pip-version-check 59 | 60 | RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ 61 | mkdir -p /root/habanalabs/pytorch_temp && \ 62 | tar -xf pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz -C ${PT_EXTRACT_PATH}/. 
&& \ 63 | python3 -m pip install pip=="${HABANA_PIP_VERSION}" && \ 64 | pip install mpi4py==3.1.4 --disable-pip-version-check && \ 65 | grep -ivE "#|lightning" ${PT_EXTRACT_PATH}/requirements-pytorch.txt > ${PT_EXTRACT_PATH}/requirements-pytorch-nolightning.txt && \ 66 | pip install -r ${PT_EXTRACT_PATH}/requirements-pytorch-nolightning.txt --no-warn-script-location --disable-pip-version-check && \ 67 | pip install ${PT_EXTRACT_PATH}/*.whl --disable-pip-version-check && \ 68 | grep "lightning" ${PT_EXTRACT_PATH}/requirements-pytorch.txt > ${PT_EXTRACT_PATH}/requirements-pytorch-lightning.txt && \ 69 | pip install -r ${PT_EXTRACT_PATH}/requirements-pytorch-lightning.txt --disable-pip-version-check && \ 70 | echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ 71 | pip uninstall -y pillow && \ 72 | pip uninstall -y pillow-simd && \ 73 | pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ 74 | rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* 75 | 76 | ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 77 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.ubuntu22.04: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Ubuntu 22.04 6 | FROM ubuntu:jammy 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 14 | ENV HABANA_LOGS=/var/log/habana_logs/ 15 | ENV OS_NUMBER=2204 16 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 17 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 18 | 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | apt-transport-https \ 21 | apt-utils \ 22 | bc \ 23 | build-essential \ 24 | ca-certificates \ 25 | dkms \ 26 | ethtool \ 27 | gcc \ 28 | git \ 29 | gnupg \ 30 | gpg-agent \ 31 | graphviz \ 32 | libgl1 \ 33 | libgnutls30 \ 34 | libgoogle-glog0v5 \ 35 | libjemalloc2 \ 36 | libjpeg-dev \ 37 | libkrb5-3 \ 38 | libpq-dev \ 39 | lsof \ 40 | make \ 41 | openssh-client \ 42 | openssh-server \ 43 | protobuf-compiler \ 44 | python3 \ 45 | python3-dev \ 46 | unzip \ 47 | vim \ 48 | wget && \ 49 | apt-get upgrade -y && \ 50 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 51 | rm -f /etc/ssh/ssh_host_*_key* 52 | 53 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 54 | ENV PIP_NO_CACHE_DIR=on 55 | 56 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 57 | python3 get-pip.py && \ 58 | rm -f get-pip.py && \ 59 | python3 -m pip install setuptools==79.0.1 wheel && \ 60 | python3 -m pip install --upgrade Jinja2 protobuf urllib3 requests 61 | 62 | COPY install_efa.sh . 
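# install_efa.sh (from this folder) installs the AWS EFA userspace stack; the Open MPI it places under /opt/amazon/openmpi is exported as MPI_ROOT below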
63 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 64 | 65 | ENV MPI_ROOT=/opt/amazon/openmpi 66 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 67 | ENV PATH=${MPI_ROOT}/bin:$PATH 68 | ENV OPAL_PREFIX=${MPI_ROOT} 69 | ENV MPICC=${MPI_ROOT}/bin/mpicc 70 | ENV RDMAV_FORK_SAFE=1 71 | ENV FI_EFA_USE_DEVICE_RDMA=1 72 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 73 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 74 | 75 | RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ 76 | chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ 77 | chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ 78 | echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 79 | cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ 80 | sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ 81 | apt-get update && apt-get install -y --no-install-recommends \ 82 | habanalabs-rdma-core="$VERSION"-"$REVISION" \ 83 | habanalabs-thunk="$VERSION"-"$REVISION" \ 84 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 85 | habanalabs-graph="$VERSION"-"$REVISION" && \ 86 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 87 | mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ 88 | sed -i "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 89 | 90 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 91 | 92 | # SSH configuration necessary to support mpi-operator v2 93 | RUN mkdir -p /var/run/sshd && \ 94 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 95 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 96 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 97 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 98 | echo "/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ 99 | sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile triton installer layer for Ubuntu 22.04 6 | FROM nvcr.io/nvidia/tritonserver:24.06-py3 7 | ARG ARTIFACTORY_URL 8 | ARG PT_VERSION 9 | ARG VERSION 10 | ARG REVISION 11 | ARG HABANA_PIP_VERSION="22.3" 12 | ARG PT_BUILD_REPO=gaudi-pt-modules 13 | ARG PT_PACKAGE_NAME="pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz" 14 | ARG PT_ARTIFACT_PATH="https://"${ARTIFACTORY_URL}"/artifactory/${PT_BUILD_REPO}/"${VERSION}"/"${REVISION}"/pytorch/ubuntu2204" 15 | ENV DEBIAN_FRONTEND=noninteractive 16 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 17 | ENV HABANA_LOGS=/var/log/habana_logs/ 18 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 19 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 20 | ENV PIP_NO_CACHE_DIR=on 21 | ENV PIP_DEFAULT_TIMEOUT=1000 22 | ENV MPI_ROOT=/opt/hpcx/ompi 23 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 24 | ENV PATH=${MPI_ROOT}/bin:$PATH 25 | ENV OPAL_PREFIX=${MPI_ROOT} 26 | ENV MPICC=${MPI_ROOT}/bin/mpicc 27 | ENV RDMAV_FORK_SAFE=1 28 | ENV PYTHONPATH=/root:/usr/lib/habanalabs/ 29 | 30 | ADD model.py . 31 | RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 32 | wget "https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public" && \ 33 | apt-key add public && rm public && apt-get update && \ 34 | apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ 35 | habanalabs-thunk="$VERSION"-"$REVISION" \ 36 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 37 | habanalabs-graph="$VERSION"-"$REVISION" && \ 38 | apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ 39 | sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 40 | 41 | RUN apt-get update && apt-get install -y \ 42 | libjemalloc2 \ 43 | libcairo2-dev \ 44 | libglib2.0-dev \ 45 | libhdf5-dev \ 46 | libnuma-dev \ 47 | libpcre2-dev \ 48 | libjpeg-dev \ 49 | liblapack-dev \ 50 | libopenblas-dev \ 51 | numactl \ 52 | libgoogle-perftools-dev && \ 53 | apt-get clean && rm -rf /var/lib/apt/lists/* 54 | 55 | RUN python3 -m pip install pip==23.3.1 --disable-pip-version-check && \ 56 | python3 -m pip install setuptools==67.3.3 --disable-pip-version-check && \ 57 | python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check 58 | 59 | RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ 60 | mkdir -p /root/habanalabs/pytorch_temp && \ 61 | tar -xf pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz -C /root/habanalabs/pytorch_temp/. && \ 62 | python3 -m pip install pip=="${HABANA_PIP_VERSION}" && \ 63 | pip install mpi4py==3.1.4 --disable-pip-version-check && \ 64 | #pip install $(grep -ivE "#|lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt | grep .) 
--no-warn-script-location --disable-pip-version-check && \ 65 | pip install /root/habanalabs/pytorch_temp/*.whl --disable-pip-version-check && \ 66 | pip install $(grep "lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt) --disable-pip-version-check && \ 67 | echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ 68 | pip uninstall -y pillow && \ 69 | pip uninstall -y pillow-simd && \ 70 | pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ 71 | rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* 72 | #RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@v0.4.2-Gaudi-1.16.0 73 | RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@275e3250ba6ed8cc13b2d6e4928db73df420e64b 74 | 75 | RUN mkdir -p /opt/tritonserver/backends/vllm 76 | COPY model.py /opt/tritonserver/backends/vllm/ 77 | 78 | ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 79 | ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 80 | -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.ubuntu24.04: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 HabanaLabs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Ubuntu 24.04 6 | FROM ubuntu:noble 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | ARG OPENMPI_VER=4.1.6 12 | 13 | ENV DEBIAN_FRONTEND=noninteractive 14 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 15 | ENV HABANA_LOGS=/var/log/habana_logs/ 16 | ENV OS_NUMBER=2404 17 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 18 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 19 | 20 | RUN apt-get update && apt-get install -y --no-install-recommends \ 21 | apt-transport-https \ 22 | apt-utils \ 23 | bc \ 24 | build-essential \ 25 | ca-certificates \ 26 | dkms \ 27 | ethtool \ 28 | gcc \ 29 | git \ 30 | gnupg \ 31 | gpg-agent \ 32 | graphviz \ 33 | libgl1 \ 34 | libgnutls30 \ 35 | libgoogle-glog0v6t64 \ 36 | libjemalloc2 \ 37 | libjpeg-dev \ 38 | libkrb5-3 \ 39 | libopenmpi-dev \ 40 | libpq-dev \ 41 | lsof \ 42 | make \ 43 | openmpi-bin=${OPENMPI_VER}-* \ 44 | openmpi-common=${OPENMPI_VER}-* \ 45 | openssh-client \ 46 | openssh-server \ 47 | protobuf-compiler \ 48 | python3 \ 49 | python3-dev \ 50 | unzip \ 51 | vim \ 52 | wget && \ 53 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 54 | rm -f /etc/ssh/ssh_host_*_key* 55 | 56 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 57 | ENV PIP_NO_CACHE_DIR=on 58 | 59 | RUN mv /usr/lib/python3.12/EXTERNALLY-MANAGED /usr/lib/python3.12/EXTERNALLY-MANAGED.old && \ 60 | wget https://bootstrap.pypa.io/get-pip.py && \ 61 | python3 get-pip.py && \ 62 | rm -f get-pip.py && \ 63 | python3 -m pip install setuptools==79.0.1 wheel && \ 64 | python3 -m pip install --upgrade Jinja2 protobuf urllib3 requests 65 | 66 | COPY install_efa.sh . 
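# EFA userspace install; unlike Dockerfile.ubuntu22.04, this image keeps the distro openmpi-bin/openmpi-common packages installed above and does not export an EFA-provided MPI_ROOT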
67 | RUN ./install_efa.sh && rm -f install_efa.sh /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 68 | 69 | ENV LD_LIBRARY_PATH=/usr/lib/habanalabs:$LD_LIBRARY_PATH 70 | ENV RDMAV_FORK_SAFE=1 71 | ENV FI_EFA_USE_DEVICE_RDMA=1 72 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 73 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 74 | 75 | RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ 76 | chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ 77 | chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ 78 | echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian noble main" | tee -a /etc/apt/sources.list && \ 79 | cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ 80 | sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ 81 | apt-get update && apt-get upgrade -y && \ 82 | apt-get install -y --no-install-recommends \ 83 | habanalabs-rdma-core="$VERSION"-"$REVISION" \ 84 | habanalabs-thunk="$VERSION"-"$REVISION" \ 85 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 86 | habanalabs-graph="$VERSION"-"$REVISION" && \ 87 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 88 | mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ 89 | sed -i "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 90 | 91 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 92 | 93 | # SSH configuration necessary to support mpi-operator v2 94 | RUN mkdir -p /var/run/sshd && \ 95 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 96 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 97 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 98 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 99 | echo "/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ 100 | sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.ubuntu22.04-py311: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 HabanaLabs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Ubuntu 22.04 with python 3.11 6 | FROM ubuntu:jammy 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 14 | ENV HABANA_LOGS=/var/log/habana_logs/ 15 | ENV OS_NUMBER=2204 16 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 17 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins 18 | 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | apt-transport-https \ 21 | apt-utils \ 22 | bc \ 23 | build-essential \ 24 | ca-certificates \ 25 | dkms \ 26 | ethtool \ 27 | gcc \ 28 | git \ 29 | gnupg \ 30 | gpg-agent \ 31 | graphviz \ 32 | libcairo2-dev \ 33 | libgl1 \ 34 | libgnutls30 \ 35 | libgoogle-glog0v5 \ 36 | libjemalloc2 \ 37 | libjpeg-dev \ 38 | libkrb5-3 \ 39 | libpq-dev \ 40 | lsof \ 41 | make \ 42 | openssh-client \ 43 | openssh-server \ 44 | protobuf-compiler \ 45 | python3 \ 46 | python3-dev \ 47 | unzip \ 48 | vim \ 49 | wget && \ 50 | apt-get upgrade -y libc6 && \ 51 | apt-get install -y --no-install-recommends software-properties-common && \ 52 | add-apt-repository -y ppa:deadsnakes/ppa && \ 53 | apt-get update && apt-get install -y --no-install-recommends \ 54 | python3.11 \ 55 | python3.11-dev && \ 56 | apt-get remove -y software-properties-common && \ 57 | apt-get autoremove -y && rm -rf /var/lib/apt/lists/* && \ 58 | rm -f /etc/ssh/ssh_host_*_key* 59 | 60 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 61 | update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \ 62 | update-alternatives --install /usr/bin/python python /usr/bin/python3.11 2 && \ 63 | update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 64 | 65 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 66 | ENV PIP_NO_CACHE_DIR=on 67 | 68 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 69 | python3 get-pip.py && \ 70 | rm -f get-pip.py && \ 71 | python3 -m pip install setuptools==79.0.1 wheel && \ 72 | python3 -m pip install --upgrade cryptography Jinja2 meson oauthlib pycairo PyJWT zipp protobuf urllib3 requests 73 | 74 | COPY install_efa.sh . 
75 | RUN ./install_efa.sh && rm -f install_efa.sh /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 76 | 77 | ENV MPI_ROOT=/opt/amazon/openmpi 78 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 79 | ENV PATH=${MPI_ROOT}/bin:$PATH 80 | ENV OPAL_PREFIX=${MPI_ROOT} 81 | ENV MPICC=${MPI_ROOT}/bin/mpicc 82 | ENV RDMAV_FORK_SAFE=1 83 | ENV FI_EFA_USE_DEVICE_RDMA=1 84 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 85 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 86 | 87 | RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ 88 | chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ 89 | chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ 90 | echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ 91 | cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ 92 | sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ 93 | apt-get update && apt-get install -y --no-install-recommends \ 94 | habanalabs-rdma-core="$VERSION"-"$REVISION" \ 95 | habanalabs-thunk="$VERSION"-"$REVISION" \ 96 | habanalabs-firmware-tools="$VERSION"-"$REVISION" \ 97 | habanalabs-graph="$VERSION"-"$REVISION" && \ 98 | apt-get autoremove && rm -rf /var/lib/apt/lists/* && \ 99 | mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ 100 | sed -i "/$ARTIFACTORY_URL/d" /etc/apt/sources.list 101 | 102 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 103 | 104 | # SSH configuration necessary to support mpi-operator v2 105 | RUN mkdir -p /var/run/sshd && \ 106 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 107 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 108 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 109 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 110 | echo "/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ 111 | sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc -------------------------------------------------------------------------------- /dockerfiles/base/tencentos_efa_patch.txt: -------------------------------------------------------------------------------- 1 | diff --git a/common.sh b/common.sh 2 | index 3c3a0e4..b463f42 100755 3 | --- a/common.sh 4 | +++ b/common.sh 5 | @@ -50,6 +50,15 @@ has_substring() { 6 | fi 7 | } 8 | 9 | +is_tencentos_3() { 10 | + . /etc/os-release 11 | + if [ "$NAME" = "TencentOS Server" ] && [ "$VERSION_ID" = "3.1" ]; then 12 | + return 0 13 | + else 14 | + return 1 15 | + fi 16 | +} 17 | + 18 | is_amazon_linux_2() { 19 | . 
/etc/os-release 20 | if [ "$NAME" = "Amazon Linux" ] && [ "$VERSION_ID" = "2" ]; then 21 | @@ -164,7 +173,7 @@ is_suse_15() { 22 | } 23 | 24 | install_cmd() { 25 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 26 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 27 | if [ $1 == "localinstall" ]; then 28 | shift 29 | yum -y localinstall $@ 30 | @@ -181,7 +190,7 @@ install_cmd() { 31 | fi 32 | } 33 | search_cmd() { 34 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 35 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 36 | yum list installed $@ 37 | elif is_suse_15; then 38 | zypper search --installed-only --match-exact $@ 39 | @@ -194,7 +203,7 @@ search_cmd() { 40 | } 41 | remove_cmd() { 42 | # we don't remove the dependencies of the efa packages as it may have reverse dependencies on other system packages 43 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 44 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 45 | rpm --erase --nodeps $@ 46 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 47 | # purge is identical to remove except that packages are removed and purged 48 | @@ -207,7 +216,7 @@ remove_cmd() { 49 | } 50 | # Get the list of file installed by the package name 51 | query_file_list_cmd() { 52 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 53 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 54 | rpm -ql $@ 55 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 56 | dpkg -L $@ 57 | @@ -220,7 +229,7 @@ query_file_list_cmd() { 58 | # reverse dependencies (some other installed packages depend on them) 59 | # this command will return non-zero 60 | remove_dryrun_cmd() { 61 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then 62 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then 63 | rpm --erase --test $@ 64 | elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then 65 | dpkg -r --dry-run $@ 66 | diff --git a/efa_installer.sh b/efa_installer.sh 67 | index 544673f..faf3369 100755 68 | --- a/efa_installer.sh 69 | +++ b/efa_installer.sh 70 | @@ -97,7 +97,7 @@ select_mpi() { 71 | } 72 | 73 | detect_os() { 74 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 75 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 76 | PACKAGE_TYPE="rpm" 77 | KERNEL_SEARCH_STRING=kernel 78 | INSTALL_ARGS="--setopt=skip_missing_names_on_install=False" 79 | @@ -209,7 +209,7 @@ setup_install_package_paths() { 80 | local kmod_path 81 | 82 | if [ "${PACKAGE_TYPE}" = "rpm" ]; then 83 | - if 
is_rhel_8 || is_rockylinux_8; then 84 | + if is_rhel_8 || is_rockylinux_8|| is_tencentos_3; then 85 | base_dir="RPMS/ROCKYLINUX8/${arch}" 86 | debug_dir="RPMS/ROCKYLINUX8/${arch}/debug" 87 | elif is_rockylinux_9 || is_rhel_9; then 88 | @@ -465,7 +465,7 @@ install_apt_package() { 89 | install_dependencies() { 90 | local packages 91 | 92 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 93 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 94 | packages="pciutils rpmdevtools" 95 | if [ ${SKIP_KMOD} -eq 0 ]; then 96 | for kernel in ${INSTALLED_KERNELS[@]}; do 97 | @@ -785,7 +785,7 @@ uninstall_efa() { 98 | 99 | uninstall_old_efa_packages() { 100 | # Uninstall 'openmpi' and 'libfabric' if packaged by AWS. 101 | - if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then 102 | + if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then 103 | for pkg in openmpi libfabric libfabric-debuginfo; do 104 | rpm -ql $pkg | grep -q /opt/amazon 105 | if [ $? -eq 0 ]; then 106 | -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.opencloudos9.2: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for OpenCloudOS 9.2 6 | FROM opencloudos/opencloudos9-minimal:9.2-v20250327 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | LABEL vendor="Habanalabs Ltd." 
13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | 17 | RUN dnf install -y python3-dnf-plugin-versionlock && \ 18 | dnf versionlock add opencloudos-release* opencloudos-repos* && \ 19 | dnf clean all 20 | 21 | RUN dnf update -y && dnf install -y --setopt=install_weak_deps=false \ 22 | bzip2 \ 23 | bzip2-devel \ 24 | clang \ 25 | cmake3 \ 26 | cpp \ 27 | ffmpeg \ 28 | gcc \ 29 | gcc-c++ \ 30 | git \ 31 | glibc \ 32 | glibc-devel \ 33 | glibc-headers \ 34 | iproute \ 35 | jemalloc \ 36 | libarchive \ 37 | libffi-devel \ 38 | libjpeg-devel \ 39 | libksba \ 40 | llvm \ 41 | gawk \ 42 | lsb_release \ 43 | lsof \ 44 | mesa-libGL \ 45 | openssh-clients \ 46 | openssh-server \ 47 | openssl \ 48 | openssl-devel \ 49 | python3-devel \ 50 | unzip \ 51 | wget \ 52 | zlib-devel && \ 53 | dnf clean all && \ 54 | rm -f /etc/ssh/ssh_host_*_key* 55 | 56 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 57 | echo "name=Habana OpenCloudOS repo" >> /etc/yum.repos.d/habanalabs.repo && \ 58 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/opencloudos/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \ 59 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/opencloudos/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 60 | echo 'gpgcheck=1' >> /etc/yum.repos.d/habanalabs.repo && \ 61 | echo 'enable=1' >> /etc/yum.repos.d/habanalabs.repo 62 | 63 | RUN wget -q -O "/tmp/habana_pubkey" "https://${ARTIFACTORY_URL}/artifactory/gaudi-general/keyPairs/primary/public" && rpm --import "/tmp/habana_pubkey" && rm -f /tmp/habana_pubkey 64 | 65 | 66 | RUN ln -sf /usr/bin/python3 /usr/bin/python 67 | 68 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 69 | ENV PIP_NO_CACHE_DIR=on 70 | 71 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 72 | python3 get-pip.py && \ 73 | rm -f get-pip.py && \ 74 | python3 -m pip install setuptools==79.0.1 wheel && \ 75 | python3 -m pip install --upgrade Jinja2 urllib3 requests 76 | 77 | COPY install_efa.sh . 
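# the explicit 1.41.0 argument overrides DEFAULT_EFA_INSTALLER_VER (1.34.0) defined in install_efa.sh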
78 | RUN ./install_efa.sh 1.41.0 && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 79 | 80 | ENV OPENMPI_VERSION=4.1.6 81 | ENV MPI_ROOT=/opt/habanalabs/openmpi 82 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 83 | ENV PATH=${MPI_ROOT}/bin:$PATH 84 | ENV OPAL_PREFIX=${MPI_ROOT} 85 | ENV MPICC=${MPI_ROOT}/bin/mpicc 86 | ENV RDMAV_FORK_SAFE=1 87 | ENV FI_EFA_USE_DEVICE_RDMA=0 88 | ENV OMPI_MCA_btl=^openib 89 | 90 | RUN dnf install -y \ 91 | habanalabs-rdma-core-"$VERSION"-"$REVISION".oc9 \ 92 | habanalabs-thunk-"$VERSION"-"$REVISION".oc9 \ 93 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".oc9 \ 94 | habanalabs-graph-"$VERSION"-"$REVISION".oc9 && \ 95 | dnf clean all && \ 96 | chmod +t /var/log/habana_logs && \ 97 | rm -f /etc/yum.repos.d/habana.repo 98 | 99 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 100 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 101 | 102 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 103 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 104 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 105 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 106 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 107 | 108 | RUN mkdir -p /tmp/pyverbs && \ 109 | cd /tmp/pyverbs && \ 110 | python3 -m pip download pyverbs==59.0 && \ 111 | tar zxf pyverbs-59.0.tar.gz && \ 112 | cd pyverbs-59.0 && \ 113 | sed -i 's/cython>=3.0.12/cython>=3.0.12,<3.2.0/' pyproject.toml && \ 114 | python3 -m pip install . && \ 115 | cd && \ 116 | rm -rf /tmp/pyverbs 117 | 118 | RUN python3 -m pip install --upgrade protobuf 119 | 120 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 121 | 122 | # SSH configuration necessary to support mpi-operator v2 123 | RUN mkdir -p /var/run/sshd && \ 124 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 125 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 126 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 127 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 128 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 129 | 130 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 131 | ENV HABANA_LOGS=/var/log/habana_logs/ 132 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 133 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.navix9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Navix 9.4 6 | FROM navix-container-base-9.4-20241121.0.x86_64:latest 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | RUN sed -i -E 's/linux.navercorp.com\/naver\/\$releasever/dlnavix.navercorp.com\/9.4/g' /etc/yum.repos.d/navix.repo 13 | 14 | LABEL vendor="Habanalabs Ltd." 
15 | LABEL release="${VERSION}-${REVISION}" 16 | 17 | COPY LICENSE /licenses/ 18 | 19 | RUN dnf install -y \ 20 | python3-dnf-plugin-versionlock && \ 21 | dnf versionlock add navix-release* navix-repos* && \ 22 | dnf clean all 23 | 24 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 25 | dnf clean all 26 | 27 | RUN dnf update -y && dnf install -y \ 28 | bzip2 \ 29 | bzip2-devel \ 30 | clang \ 31 | cmake3 \ 32 | cpp \ 33 | ffmpeg-free \ 34 | gcc \ 35 | gcc-c++ \ 36 | git \ 37 | glibc \ 38 | glibc-devel \ 39 | glibc-headers \ 40 | iproute \ 41 | jemalloc \ 42 | libarchive \ 43 | libffi-devel \ 44 | libjpeg-devel \ 45 | libksba \ 46 | llvm \ 47 | lsb_release \ 48 | lsof \ 49 | mesa-libGL \ 50 | openssh-clients \ 51 | openssh-server \ 52 | openssl \ 53 | openssl-devel \ 54 | perl-Net-SSLeay \ 55 | python3-devel \ 56 | python3.12 \ 57 | python3.12-devel \ 58 | unzip \ 59 | wget \ 60 | zlib-devel && \ 61 | dnf clean all && \ 62 | rm -f /etc/ssh/ssh_host_*_key* 63 | 64 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 && \ 65 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 66 | alternatives --set python3 /usr/bin/python3.12 67 | 68 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 69 | ENV PIP_NO_CACHE_DIR=on 70 | 71 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 72 | python3 get-pip.py && \ 73 | rm -f get-pip.py && \ 74 | python3 -m pip install setuptools==79.0.1 wheel && \ 75 | python3 -m pip install --upgrade Jinja2 urllib3 requests 76 | 77 | COPY install_efa.sh . 78 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 79 | 80 | ENV OPENMPI_VERSION=4.1.6 81 | ENV MPI_ROOT=/opt/habanalabs/openmpi 82 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 83 | ENV PATH=${MPI_ROOT}/bin:$PATH 84 | ENV OPAL_PREFIX=${MPI_ROOT} 85 | ENV MPICC=${MPI_ROOT}/bin/mpicc 86 | ENV RDMAV_FORK_SAFE=1 87 | ENV FI_EFA_USE_DEVICE_RDMA=0 88 | ENV OMPI_MCA_btl=^openib 89 | 90 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 91 | echo "name=Habana Navix Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 92 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/navix/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ 93 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/navix/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 94 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 95 | 96 | RUN wget -q -O "/tmp/habana_pubkey" "https://${ARTIFACTORY_URL}/artifactory/gaudi-general/keyPairs/primary/public" && rpm --import "/tmp/habana_pubkey" && rm -f /tmp/habana_pubkey 97 | 98 | RUN dnf install -y \ 99 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 100 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 101 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 102 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 103 | dnf clean all && \ 104 | chmod +t /var/log/habana_logs && \ 105 | rm -f /etc/yum.repos.d/habanalabs.repo 106 | 107 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 108 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 109 | 110 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 111 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 112 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 113 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 114 | make -j$(nproc) && make install && cd / && rm -rf 
/tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 115 | 116 | RUN ln -s /usr/bin/python3 /usr/bin/python 117 | 118 | RUN python3 -m pip install --upgrade protobuf 119 | 120 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 121 | 122 | # SSH configuration necessary to support mpi-operator v2 123 | RUN mkdir -p /var/run/sshd && \ 124 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 125 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 126 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 127 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 128 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 129 | 130 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 131 | ENV HABANA_LOGS=/var/log/habana_logs/ 132 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 133 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.tencentos3.1: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for Tencentos 3.1 6 | FROM tencentos/tencentos_server31_mini:20230630 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | LABEL vendor="Habanalabs Ltd." 13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ 17 | dnf clean all && rm -rf /var/cache/yum 18 | 19 | RUN dnf install -y \ 20 | python3-dnf-plugin-versionlock && \ 21 | dnf versionlock add tencentos-release* && \ 22 | dnf clean all 23 | 24 | RUN dnf update -y && dnf install -y \ 25 | clang \ 26 | cmake3 \ 27 | cpp \ 28 | gcc \ 29 | gcc-c++ \ 30 | git \ 31 | glibc \ 32 | glibc-devel \ 33 | glibc-headers \ 34 | iproute \ 35 | jemalloc \ 36 | libarchive \ 37 | libjpeg-devel \ 38 | libksba \ 39 | llvm \ 40 | lsof \ 41 | mesa-libGL \ 42 | openssh-clients \ 43 | openssh-server \ 44 | redhat-lsb-core \ 45 | unzip \ 46 | wget && \ 47 | dnf clean all && \ 48 | rm -f /etc/ssh/ssh_host_*_key* 49 | 50 | COPY install-python310.sh . 51 | RUN ./install-python310.sh tencentos3.1 && rm -f install-python310.sh 52 | RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig 53 | ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 54 | 55 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 56 | ENV PIP_NO_CACHE_DIR=on 57 | 58 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 59 | python3 get-pip.py && \ 60 | rm -f get-pip.py && \ 61 | python3 -m pip install setuptools==79.0.1 wheel && \ 62 | python3 -m pip install --upgrade Jinja2 protobuf requests urllib3 63 | 64 | COPY install_efa.sh . 
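# tencentos_efa_patch.txt is applied inside install_efa.sh to add TencentOS 3.1 detection to the EFA installer's OS checks (see dockerfiles/base/tencentos_efa_patch.txt)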
65 | COPY tencentos_efa_patch.txt /tmp/tencentos_efa_patch.txt 66 | RUN ./install_efa.sh && rm -f install_efa.sh /tmp/tencentos_efa_patch.txt /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 67 | 68 | ENV MPI_ROOT=/usr/mpi/gcc/openmpi-4.1.5a1 69 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib64:/usr/lib/habanalabs:$LD_LIBRARY_PATH 70 | ENV PATH=${MPI_ROOT}/bin:$PATH 71 | ENV OPAL_PREFIX=${MPI_ROOT} 72 | ENV MPICC=${MPI_ROOT}/bin/mpicc 73 | ENV RDMAV_FORK_SAFE=1 74 | ENV FI_EFA_USE_DEVICE_RDMA=1 75 | 76 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 77 | echo "name=Habana TC31 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 78 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1" >> /etc/yum.repos.d/habanalabs.repo && \ 79 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo 80 | 81 | RUN wget -q -O "/tmp/habana_pubkey" "https://${ARTIFACTORY_URL}/artifactory/gaudi-general/keyPairs/primary/public" && rpm --import "/tmp/habana_pubkey" && rm -f /tmp/habana_pubkey 82 | 83 | RUN dnf install -y \ 84 | habanalabs-rdma-core-"$VERSION"-"$REVISION".tl3 \ 85 | habanalabs-thunk-"$VERSION"-"$REVISION".tl3 \ 86 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".tl3 \ 87 | habanalabs-graph-"$VERSION"-"$REVISION".tl3 && \ 88 | rm -f /etc/yum.repos.d/habanalabs.repo && \ 89 | dnf clean all 90 | 91 | RUN alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 4 && \ 92 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 3 && \ 93 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 94 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ 95 | alternatives --install /usr/bin/unversioned-python unversioned-python /usr/bin/python3 10 && \ 96 | alternatives --install /usr/bin/python3-config python3-config /usr/local/bin/python3.10-config 1 && \ 97 | alternatives --set python3 /usr/local/bin/python3.10 && \ 98 | alternatives --set python3-config /usr/local/bin/python3.10-config && \ 99 | alternatives --set unversioned-python /usr/bin/python3 && \ 100 | export PATH="/usr/local/bin:$PATH" && \ 101 | if [ ! 
-L /usr/bin/python ] || [ "$(readlink /usr/bin/python)" != "/etc/alternatives/unversioned-python" ]; then \ 102 | rm -f /usr/bin/python; \ 103 | ln -s /etc/alternatives/unversioned-python /usr/bin/python; \ 104 | fi 105 | 106 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 107 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 108 | 109 | RUN alternatives --set python /usr/bin/python3 110 | 111 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 112 | 113 | # SSH configuration necessary to support mpi-operator v2 114 | RUN mkdir -p /var/run/sshd && \ 115 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 116 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 117 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 118 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 119 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 120 | 121 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 122 | ENV HABANA_LOGS=/var/log/habana_logs/ 123 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 124 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /utils/README.md: -------------------------------------------------------------------------------- 1 | # Gaudi Utils 2 | 3 | By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). 4 | 5 | ## Table of Contents 6 | 7 | - [Gaudi Utils](#gaudi-utils) 8 | - [Table of Contents](#table-of-contents) 9 | - [Overview](#overview) 10 | - [manage\_network\_ifs](#manage_network_ifs) 11 | - [Operations](#operations) 12 | - [Up](#up) 13 | - [Down](#down) 14 | - [Status](#status) 15 | - [Set IP](#set-ip) 16 | - [Unset IP](#unset-ip) 17 | - [check\_framework\_env](#check_framework_env) 18 | - [Intel Gaudi Health Screen (IGHS)](#intel-gaudi-health-screen-ighs) 19 | 20 | ## Overview 21 | 22 | Welcome to Intel Gaudi's Util Scripts! 23 | 24 | This folder contains some Intel Gaudi utility scripts that users can access as reference. 25 | 26 | ## manage_network_ifs 27 | 28 | Moved to habanalabs-qual Example: (/opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh or /opt/habanalabs/qual/gaudi3/bin/manage_network_ifs.sh). 29 | 30 | This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Intel Gaudi network interfaces. 31 | 32 | The following is the usage of the script: 33 | 34 | ``` 35 | usage: ./manage_network_ifs.sh [options] 36 | 37 | options: 38 | --up toggle up all Intel Gaudi network interfaces 39 | --down toggle down all Intel Gaudi network interfaces 40 | --status print status of all Intel Gaudi network interfaces 41 | --set-pfc set PFC (enabled=0,1,2,3) 42 | --unset-pfc unset PFC (enabled=none) 43 | --check-pfc dump PFC configuration 44 | --no-progbar do not show progress bar 45 | -v, --verbose print more logs 46 | -h, --help print this help 47 | 48 | Note: Please run this script with one operation at a time 49 | ``` 50 | ## Operations 51 | 52 | Before executing any operation, this script finds all the Intel Gaudi network interfaces available on the system and stores the Intel Gaudi interface information into a list. 53 | The list will be used for the operations. 
If no Intel Gaudi network interface is found, the script will exit. 54 | 55 | ### Up 56 | 57 | Use the following command to bring all Intel Gaudi network interfaces online: 58 | ``` 59 | sudo manage_network_ifs.sh --up 60 | ``` 61 | ### Down 62 | 63 | Use the following command to bring all Intel Gaudi network interfaces offline: 64 | ``` 65 | sudo manage_network_ifs.sh --down 66 | ``` 67 | ### Status 68 | 69 | Print the current operational state of all Intel Gaudi network interfaces, such as how many ports are up/down: 70 | ``` 71 | sudo manage_network_ifs.sh --status 72 | ``` 73 | ### Set PFC 74 | 75 | Use the following command to set PFC for all Intel Gaudi network interfaces: 76 | ``` 77 | sudo manage_network_ifs.sh --set-pfc 78 | ``` 79 | ### Unset PFC 80 | 81 | Use the following command to unset PFC for all Intel Gaudi network interfaces: 82 | ``` 83 | sudo manage_network_ifs.sh --unset-pfc 84 | ``` 85 | 86 | ### Check current PFC configuration 87 | 88 | Use the following command to check the current PFC status for all Intel Gaudi network interfaces: 89 | ``` 90 | sudo manage_network_ifs.sh --check-pfc 91 | ``` 92 | 93 | ## check_framework_env 94 | 95 | This script can be used as a reference to check the environment for running PyTorch on Intel Gaudi. 96 | 97 | The following is the usage of the script: 98 | 99 | ``` 100 | usage: check_framework_env.py [-h] [--cards CARDS] 101 | 102 | Check health of Intel Gaudi for PyTorch 103 | 104 | optional arguments: 105 | -h, --help show this help message and exit 106 | --cards CARDS Set number of cards to test (default: 1) 107 | ``` 108 | 109 | ## Intel Gaudi Health Screen (IGHS) 110 | 111 | The **Intel Gaudi Health Screen** (IGHS) tool has been developed to verify cluster network health through a suite of diagnostic tests. The tests 112 | include checking Gaudi port status, running small workloads, and running standard collective operations across multiple systems. 113 | 114 | ``` bash 115 | usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] 116 | [--job-id JOB_ID] [--round ROUND] [--config CONFIG] 117 | [--ighs-check [{node,hccl-demo,none}]] [--node-write-report] 118 | [--node-name NODE_NAME] [--logs-dir LOGS_DIR] 119 | 120 | optional arguments: 121 | -h, --help show this help message and exit 122 | --initialize Downloads Necessary Repos and Creates Report Template 123 | --screen Starts Health Screen for Cluster 124 | --target-nodes TARGET_NODES 125 | List of target nodes 126 | --job-id JOB_ID Needed to identify hccl-demo running log 127 | --round ROUND Needed to identify hccl-demo running round log 128 | --config CONFIG Configuration file for Health Screener 129 | --ighs-check [{node,hccl-demo,none}] 130 | Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce 131 | (HCCL_DEMO between pairs of nodes) 132 | --node-write-report Write Individual Node Health Report 133 | --node-name NODE_NAME Name of Node 134 | --logs-dir LOGS_DIR Output directory of health screen results 135 | ``` 136 | 137 | To run a full IGHS test, run the command below: 138 | 139 | ``` bash 140 | # Creates IGHS Report and screens clusters for any infected nodes. 141 | # Will check Level 1 and 2 by default 142 | python screen.py --initialize --screen 143 | ``` 144 | 145 | IGHS can alternatively be run through the script below: 146 | 147 | ``` bash 148 | # Creates IGHS Report and screens clusters for any infected nodes.
149 | # Will check Level 1 and 2 by default 150 | ./run_ighs.sh 151 | ``` 152 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v2beta1 2 | kind: MPIJob 3 | metadata: 4 | name: template-metadata-name 5 | namespace: default 6 | labels: 7 | app: ighs-hccl 8 | spec: 9 | slotsPerWorker: 8 10 | runPolicy: 11 | cleanPodPolicy: Running 12 | mpiReplicaSpecs: 13 | Launcher: 14 | replicas: 1 15 | template: 16 | metadata: 17 | labels: 18 | app: ighs-hccl 19 | spec: 20 | volumes: 21 | - name: mydir 22 | emptyDir: {} 23 | containers: 24 | - image: template-container-image 25 | name: ighs-launcher 26 | imagePullPolicy: IfNotPresent 27 | workingDir: /workdir 28 | volumeMounts: 29 | - name: mydir 30 | mountPath: /workdir 31 | securityContext: 32 | capabilities: 33 | add: 34 | - SYSLOG 35 | env: 36 | - name: JOB_ID 37 | valueFrom: 38 | fieldRef: 39 | fieldPath: metadata.labels['name'] 40 | - name: MY_NODE_NAME 41 | valueFrom: 42 | fieldRef: 43 | fieldPath: spec.nodeName 44 | - name: HOME_DIR 45 | value: "/workdir/intel_gaudi_health_screen" 46 | - name: IGHS_LEVEL 47 | value: "2" 48 | command: ["/bin/bash", "-c"] 49 | args: 50 | - >- 51 | set -eo pipefail; 52 | echo "Target Nodes: $TARGET_NODES"; 53 | ssh-keygen -A; 54 | service ssh start; 55 | 56 | while [ ! -d /workdir/intel_gaudi_health_screen ]; do 57 | sleep 2s; 58 | done; 59 | sleep 10s; 60 | 61 | declare -xr HOSTSFILE=$OMPI_MCA_orte_default_hostfile; 62 | 63 | declare -xr NUM_NODES=$(wc -l < $HOSTSFILE); 64 | declare -xr NGPU_PER_NODE=8; 65 | declare -xr N_CARDS=$((NUM_NODES*NGPU_PER_NODE)); 66 | 67 | cd ${HOME_DIR}/build/hccl_demo; 68 | declare -xr CMD="python ${HOME_DIR}/build/hccl_demo/run_hccl_demo.py \ 69 | --test all_reduce \ 70 | --loop 1000 \ 71 | --size 32m \ 72 | -mpi "; 73 | 74 | mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/; 75 | cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 76 | touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 77 | echo "Target Nodes: $TARGET_NODES" > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 78 | 79 | $CMD \ 80 | -np ${N_CARDS} \ 81 | --allow-run-as-root \ 82 | --bind-to core \ 83 | --map-by ppr:4:socket:PE=6 \ 84 | --rank-by core --report-bindings \ 85 | --tag-output \ 86 | --merge-stderr-to-stdout --prefix $MPI_ROOT \ 87 | --mca btl_tcp_if_include eth0 \ 88 | -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ 89 | -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ 90 | -x MAX_TIMEOUT=60 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; 91 | 92 | cd ${HOME_DIR}; 93 | python ${HOME_DIR}/screen.py --ighs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND; 94 | 95 | Worker: 96 | replicas: template-num-nodes 97 | template: 98 | metadata: 99 | labels: 100 | app: ighs-hccl 101 | spec: 102 | affinity: 103 | nodeAffinity: 104 | requiredDuringSchedulingIgnoredDuringExecution: 105 | nodeSelectorTerms: 106 | - matchExpressions: 107 | - key: kubernetes.io/hostname 108 | operator: In 109 | values: 110 | - IGHS-DUMMY-VAL 111 | volumes: 112 | - name: mydir 113 | emptyDir: {} 114 | tolerations: 115 | - key: "" 116 | operator: "Exists" 117 | effect: "NoSchedule" 118 | - key: "" 119 | operator: "Exists" 120 | effect: "NoExecute" 121 | containers: 122 | - image: template-container-image 123 | name: ighs-worker 124 | 
imagePullPolicy: IfNotPresent 125 | securityContext: 126 | capabilities: 127 | add: 128 | - SYSLOG 129 | resources: 130 | limits: 131 | habana.ai/gaudi: 8 132 | hugepages-2Mi: 29000Mi 133 | cpu: 95 134 | memory: 200Gi 135 | requests: 136 | habana.ai/gaudi: 8 137 | hugepages-2Mi: 29000Mi 138 | memory: 200Gi 139 | cpu: 95 140 | volumeMounts: 141 | - name: mydir 142 | mountPath: /workdir 143 | env: 144 | - name: IGHS_LEVEL 145 | value: "2" 146 | - name: MY_POD_IP 147 | valueFrom: 148 | fieldRef: 149 | fieldPath: status.podIP 150 | - name: MY_NODE_NAME 151 | valueFrom: 152 | fieldRef: 153 | fieldPath: spec.nodeName 154 | - name: MY_POD_NAMESPACE 155 | valueFrom: 156 | fieldRef: 157 | fieldPath: metadata.namespace 158 | command: ["/bin/bash", "-c"] 159 | args: 160 | - >- 161 | printenv | grep "MY" >> /etc/environment; 162 | ssh-keygen -A; 163 | service ssh start; 164 | sleep 365d; 165 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/utilities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, time, sys 14 | import subprocess, shlex 15 | from datetime import datetime 16 | 17 | import logging 18 | from logging import handlers 19 | 20 | _logger = logging.getLogger("health_screener") 21 | 22 | def get_logging_level(log_level): 23 | log_level = log_level.lower() 24 | num_level = logging.INFO 25 | 26 | if log_level == "info": 27 | num_level = logging.INFO 28 | elif log_level == "debug": 29 | num_level = logging.DEBUG 30 | elif log_level == "warn": 31 | num_level = logging.WARN 32 | elif log_level == "error": 33 | num_level = logging.ERROR 34 | elif log_level == "critical": 35 | num_level = logging.CRITICAL 36 | 37 | return num_level 38 | 39 | def create_logger(logger_name, logger_file_name, f_path="", level=logging.INFO, max_bytes=5e6, backup_count=10): 40 | """ Creates Logger that writes to logs directory 41 | 42 | Args: 43 | logger_name (str): Name of Logger File. Will be appended with logs/{current_time}/logger_name.log 44 | level (int, optional): Logging Level. Defaults to logging.INFO. 45 | max_bytes (int, optional): Max size of log file. Will rollover once maxed reach. Defaults to 5e6. 46 | backup_count (int, optional): Rollover Limit. Defaults to 10. 
47 | 48 | Returns: 49 | (logger, str): Logger object used to log details to the designated log file, and the directory that holds that file 50 | """ 51 | t_logger = logging.getLogger(logger_name) 52 | t_logger.setLevel(level) 53 | 54 | c_time = datetime.now() 55 | date_format = c_time.strftime("%m-%d-%Y") 56 | time_format = c_time.strftime("%H-%M") 57 | 58 | file_path = f"{f_path}/{logger_file_name}.log" if f_path != "" else f"logs/{date_format}/{date_format}_{time_format}/{logger_file_name}.log" 59 | d_path = os.path.dirname(file_path) 60 | _logger.debug(f"d_path: {d_path} file_path: {file_path}") 61 | 62 | if(not os.path.exists(d_path)): 63 | os.makedirs(d_path) 64 | 65 | formatter = logging.Formatter("[%(asctime)s] %(levelname)s %(message)s",datefmt='%Y-%m-%d %H:%M:%S') 66 | handler = logging.handlers.RotatingFileHandler(file_path, maxBytes=max_bytes, backupCount=backup_count) 67 | handler.setFormatter(formatter) 68 | 69 | stream_handler = logging.StreamHandler(sys.stdout) 70 | stream_handler.setFormatter(formatter) 71 | 72 | t_logger.addHandler(handler) 73 | t_logger.addHandler(stream_handler) 74 | 75 | return t_logger, d_path 76 | 77 | def run_cmd(cmd, timeout_s=900, verbose=False): 78 | """ Run Command through subprocess.run() 79 | 80 | Args: 81 | cmd (str): CMD to run 82 | timeout_s (int, optional): Timeout of CMD in seconds. Defaults to 900. 83 | verbose (bool, optional): Print results. Defaults to False 84 | 85 | Returns: 86 | str: Combined stdout/stderr output of the command (stderr is redirected to stdout) 87 | """ 88 | 89 | cmd = shlex.split(cmd) 90 | result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=timeout_s) 91 | 92 | if (verbose): 93 | _logger.debug(f"Running cmd: {cmd}") 94 | _logger.debug(result.stdout) 95 | 96 | return result.stdout 97 | 98 | def download_repos(): 99 | """ Download HCCL_DEMO Repo to assist in health checks 100 | """ 101 | if not os.path.exists("build"): 102 | os.makedirs("build") 103 | 104 | if not os.path.exists("build/hccl_demo"): 105 | _logger.info(f"Downloading hccl_demo into build/") 106 | cmd = "git clone https://github.com/HabanaAI/hccl_demo.git build/hccl_demo" 107 | run_cmd(cmd) 108 | 109 | os.environ["MPI"]="1" 110 | cmd = "make -C build/hccl_demo" 111 | run_cmd(cmd) 112 | 113 | def copy_files(src, dst, to_remote=True, hosts=[], exclude={}): 114 | """ Copies files through rsync from src to dst over the list of hosts 115 | 116 | Args: 117 | src (str): Source file/directory to copy 118 | dst (str): Destination to copy files/directory 119 | to_remote (bool, optional): rsync to remote destination (src -> host:dst). False will rsync to local destination (h:src -> dst). Defaults to True. 120 | hosts (list, optional): List of IP Addresses to copy to/from. Defaults to []. 121 | exclude (dict, optional): Files/Directory to ignore. Follow rsync rules for exclusions. Defaults to {}.
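Example (illustrative only; the destination path and host names are placeholders): 
            copy_files("intel_gaudi_health_screen", "/tmp/ighs", to_remote=True, hosts=["node-01", "node-02"])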
122 | """ 123 | rsync_cmd = f"rsync -ahzgop --exclude={exclude}" 124 | 125 | for h in hosts: 126 | if (to_remote): 127 | src_path = src 128 | dst_path = f"{h}:{dst}" 129 | else: 130 | src_path = f"{h}:{src}" 131 | dst_path = dst 132 | 133 | _logger.debug(f"Copying {src_path} to {dst_path}") 134 | cmd = f"{rsync_cmd} {src_path} {dst_path}" 135 | output = run_cmd(cmd) 136 | 137 | 138 | def clear_job(job): 139 | """ Clear MPIJobs based on Job Name 140 | 141 | Args: 142 | job (str): Job Name to delete 143 | """ 144 | _logger.info(f"Checking for existing MPIJobs {job}") 145 | cmd = f"kubectl get mpijobs -n default {job} -o=custom-columns='NAME:.metadata.name' --no-headers" 146 | output = run_cmd(cmd) 147 | 148 | if job in output: 149 | _logger.info(f"Found MPIJobs {job}. Will delete.") 150 | cmd = f"kubectl delete mpijobs -n default {job}" 151 | output = run_cmd(cmd) 152 | 153 | cmd = f"kubectl get pods -n default --selector=training.kubeflow.org/job-name={job} -o=custom-columns='NAME:.metadata.name' --no-headers" 154 | 155 | max_attempt = 15 156 | for attempts in range(max_attempt): 157 | output = run_cmd(cmd).strip() 158 | 159 | if(len(output) == 0): 160 | break 161 | 162 | _logger.info(f"Attempt {attempts} Pods are still up. Will wait 10 seconds to check again") 163 | time.sleep(10) 164 | -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel9.6: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 9.6 6 | FROM registry.access.redhat.com/ubi9/ubi:9.6 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | # for RHEL certification 13 | LABEL vendor="Habanalabs Ltd." 
14 | LABEL release="${VERSION}-${REVISION}" 15 | 16 | COPY LICENSE /licenses/ 17 | 18 | RUN dnf install -y \ 19 | python3-dnf-plugin-versionlock && \ 20 | dnf versionlock add redhat-release* && \ 21 | dnf clean all 22 | 23 | RUN rpm -e --nodeps openssl-fips-provider-so 24 | 25 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 26 | dnf clean all 27 | 28 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 29 | echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 30 | echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 31 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 32 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 33 | 34 | RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 35 | echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 36 | echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 37 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 38 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 39 | 40 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 41 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 42 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 43 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 44 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 45 | 46 | RUN dnf update -y && dnf install -y \ 47 | bzip2 \ 48 | bzip2-devel \ 49 | clang \ 50 | cmake3 \ 51 | cpp \ 52 | ffmpeg-free \ 53 | gcc \ 54 | gcc-c++ \ 55 | git \ 56 | glibc \ 57 | glibc-devel \ 58 | glibc-headers \ 59 | iproute \ 60 | jemalloc \ 61 | libarchive \ 62 | libffi-devel \ 63 | libjpeg-devel \ 64 | libksba \ 65 | llvm \ 66 | lsb_release \ 67 | lsof \ 68 | mesa-libGL \ 69 | openssh-clients \ 70 | openssh-server \ 71 | openssl \ 72 | openssl-devel \ 73 | perl-Net-SSLeay \ 74 | python3-devel \ 75 | python3.12 \ 76 | python3.12-devel \ 77 | unzip \ 78 | wget \ 79 | zlib-devel && \ 80 | dnf clean all && \ 81 | rm -f /etc/ssh/ssh_host_*_key* 82 | 83 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 && \ 84 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 85 | alternatives --set python3 /usr/bin/python3.12 86 | 87 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 88 | ENV PIP_NO_CACHE_DIR=on 89 | 90 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 91 | python3 get-pip.py && \ 92 | rm -f get-pip.py && \ 93 | python3 -m pip install setuptools==79.0.1 wheel && \ 94 | python3 -m pip install --upgrade Jinja2 protobuf urllib3 requests 95 | 96 | COPY install_efa.sh . 
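# install_efa.sh (copied above) is expected to set up the AWS EFA / libfabric user-space stack used for scale-out; the related runtime defaults (RDMAV_FORK_SAFE, FI_EFA_USE_DEVICE_RDMA) are exported further below.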
97 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 98 | 99 | ENV OPENMPI_VERSION=4.1.6 100 | ENV MPI_ROOT=/opt/habanalabs/openmpi 101 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 102 | ENV PATH=${MPI_ROOT}/bin:$PATH 103 | ENV OPAL_PREFIX=${MPI_ROOT} 104 | ENV MPICC=${MPI_ROOT}/bin/mpicc 105 | ENV RDMAV_FORK_SAFE=1 106 | ENV FI_EFA_USE_DEVICE_RDMA=0 107 | ENV OMPI_MCA_btl=^openib 108 | 109 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 110 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 111 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.6" >> /etc/yum.repos.d/habanalabs.repo && \ 112 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.6/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 113 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 114 | 115 | RUN wget -q -O "/tmp/habana_pubkey" "https://${ARTIFACTORY_URL}/artifactory/gaudi-general/keyPairs/primary/public" && rpm --import "/tmp/habana_pubkey" && rm -f /tmp/habana_pubkey 116 | 117 | RUN dnf install -y \ 118 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 119 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 120 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 121 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 122 | dnf clean all && \ 123 | chmod +t /var/log/habana_logs && \ 124 | rm -f /etc/yum.repos.d/habanalabs.repo 125 | 126 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 127 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 128 | 129 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 130 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 131 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 132 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 133 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 134 | 135 | RUN ln -s /usr/bin/python3 /usr/bin/python 136 | 137 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 138 | 139 | # SSH configuration necessary to support mpi-operator v2 140 | RUN mkdir -p /var/run/sshd && \ 141 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 142 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 143 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 144 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 145 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 146 | 147 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 148 | ENV HABANA_LOGS=/var/log/habana_logs/ 149 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 150 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel9.4-py312: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 9.4 with python 3.12 6 | FROM registry.access.redhat.com/ubi9/ubi:9.4 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | LABEL vendor="Habanalabs Ltd." 
13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | 17 | RUN dnf install -y \ 18 | python3-dnf-plugin-versionlock && \ 19 | dnf versionlock add redhat-release* && \ 20 | dnf clean all 21 | 22 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 23 | dnf clean all 24 | 25 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 26 | echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 27 | echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 28 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 29 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 30 | 31 | RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 32 | echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 33 | echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 34 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 35 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 36 | 37 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 38 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 39 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 40 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 41 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 42 | 43 | RUN dnf update -y && dnf install -y \ 44 | bzip2 \ 45 | bzip2-devel \ 46 | clang \ 47 | cmake3 \ 48 | cpp \ 49 | ffmpeg-free \ 50 | gcc \ 51 | gcc-c++ \ 52 | git \ 53 | glibc \ 54 | glibc-devel \ 55 | glibc-headers \ 56 | iproute \ 57 | jemalloc \ 58 | libarchive \ 59 | libffi-devel \ 60 | libjpeg-devel \ 61 | libksba \ 62 | llvm \ 63 | lsb_release \ 64 | lsof \ 65 | mesa-libGL \ 66 | openssh-clients \ 67 | openssh-server \ 68 | openssl \ 69 | openssl-devel \ 70 | python3.12 \ 71 | python3.12-devel \ 72 | python3.12-libs \ 73 | python3.12-rpm \ 74 | unzip \ 75 | wget \ 76 | zlib-devel && \ 77 | dnf versionlock add \ 78 | rpm* && \ 79 | dnf clean all && \ 80 | rm -f /etc/ssh/ssh_host_*_key* 81 | 82 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 && \ 83 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 84 | alternatives --set python3 /usr/bin/python3.12 85 | 86 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 87 | ENV PIP_NO_CACHE_DIR=on 88 | 89 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 90 | python3 get-pip.py && \ 91 | rm -f get-pip.py && \ 92 | python3 -m pip install setuptools==79.0.1 wheel && \ 93 | python3 -m pip install --upgrade Jinja2 protobuf urllib3 94 | 95 | COPY install_efa.sh . 
96 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 97 | 98 | ENV OPENMPI_VERSION=4.1.6 99 | ENV MPI_ROOT=/opt/habanalabs/openmpi 100 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 101 | ENV PATH=${MPI_ROOT}/bin:$PATH 102 | ENV OPAL_PREFIX=${MPI_ROOT} 103 | ENV MPICC=${MPI_ROOT}/bin/mpicc 104 | ENV RDMAV_FORK_SAFE=1 105 | ENV FI_EFA_USE_DEVICE_RDMA=0 106 | ENV OMPI_MCA_btl=^openib 107 | 108 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 109 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 110 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ 111 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 112 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 113 | 114 | RUN wget -q -O "/tmp/habana_pubkey" "https://${ARTIFACTORY_URL}/artifactory/gaudi-general/keyPairs/primary/public" && rpm --import "/tmp/habana_pubkey" && rm -f /tmp/habana_pubkey 115 | 116 | # for Habana GPG key with SHA-1 signature 117 | RUN update-crypto-policies --set DEFAULT:SHA1 118 | 119 | RUN dnf install -y \ 120 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 121 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 122 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 123 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 124 | dnf clean all && \ 125 | chmod +t /var/log/habana_logs && \ 126 | rm -f /etc/yum.repos.d/habanalabs.repo 127 | 128 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 129 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 130 | 131 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 132 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 133 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 134 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 135 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 136 | 137 | RUN ln -s /usr/bin/python3 /usr/bin/python 138 | 139 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 140 | 141 | # SSH configuration necessary to support mpi-operator v2 142 | RUN mkdir -p /var/run/sshd && \ 143 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 144 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 145 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 146 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 147 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 148 | 149 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 150 | ENV HABANA_LOGS=/var/log/habana_logs/ 151 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 152 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /dockerfiles/base/Dockerfile.rhel9.4: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 
2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # HabanaLabs Dockerfile base installer layer for RedHat 9.4 6 | FROM registry.access.redhat.com/ubi9/ubi:9.4 7 | ARG ARTIFACTORY_URL 8 | ARG VERSION 9 | ARG REVISION 10 | ARG PYPI_URL 11 | 12 | LABEL vendor="Habanalabs Ltd." 13 | LABEL release="${VERSION}-${REVISION}" 14 | 15 | COPY LICENSE /licenses/ 16 | 17 | RUN dnf install -y \ 18 | python3-dnf-plugin-versionlock && \ 19 | dnf versionlock add redhat-release* && \ 20 | dnf clean all 21 | 22 | RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ 23 | dnf clean all 24 | 25 | RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 26 | echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 27 | echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 28 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ 29 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo 30 | 31 | RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 32 | echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 33 | echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 34 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ 35 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo 36 | 37 | RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 38 | echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 39 | echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 40 | echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ 41 | echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo 42 | 43 | RUN dnf update -y && dnf install -y \ 44 | bzip2 \ 45 | bzip2-devel \ 46 | clang \ 47 | cmake3 \ 48 | cpp \ 49 | ffmpeg-free \ 50 | gcc \ 51 | gcc-c++ \ 52 | git \ 53 | glibc \ 54 | glibc-devel \ 55 | glibc-headers \ 56 | iproute \ 57 | jemalloc \ 58 | libarchive \ 59 | libffi-devel \ 60 | libjpeg-devel \ 61 | libksba \ 62 | llvm \ 63 | lsb_release \ 64 | lsof \ 65 | mesa-libGL \ 66 | openssh-clients \ 67 | openssh-server \ 68 | openssl \ 69 | openssl-devel \ 70 | python3-devel \ 71 | python3.11 \ 72 | python3.11-devel \ 73 | python3.11-rpm \ 74 | unzip \ 75 | wget \ 76 | zlib-devel && \ 77 | dnf versionlock add \ 78 | python3-rpm \ 79 | rpm* && \ 80 | dnf clean all && \ 81 | rm -f /etc/ssh/ssh_host_*_key* 82 | 83 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ 84 | alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ 85 | alternatives --install /usr/bin/python3-config python3-config /usr/bin/python3.11-config 1 && \ 86 | alternatives --set python3 /usr/bin/python3.11 87 | 88 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 89 | ENV PIP_NO_CACHE_DIR=on 90 | 91 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 92 | python3 get-pip.py && \ 93 | rm -f get-pip.py && \ 94 | python3 -m pip install setuptools==79.0.1 wheel && \ 95 | python3 -m pip install --upgrade Jinja2 protobuf urllib3 requests 96 | 97 | COPY install_efa.sh . 
98 | RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh 99 | 100 | ENV OPENMPI_VERSION=4.1.6 101 | ENV MPI_ROOT=/opt/habanalabs/openmpi 102 | ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH 103 | ENV PATH=${MPI_ROOT}/bin:$PATH 104 | ENV OPAL_PREFIX=${MPI_ROOT} 105 | ENV MPICC=${MPI_ROOT}/bin/mpicc 106 | ENV RDMAV_FORK_SAFE=1 107 | ENV FI_EFA_USE_DEVICE_RDMA=0 108 | ENV OMPI_MCA_btl=^openib 109 | 110 | RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ 111 | echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ 112 | echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ 113 | echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ 114 | echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo 115 | 116 | RUN wget -q -O "/tmp/habana_pubkey" "https://${ARTIFACTORY_URL}/artifactory/gaudi-general/keyPairs/primary/public" && rpm --import "/tmp/habana_pubkey" && rm -f /tmp/habana_pubkey 117 | 118 | # for Habana GPG key with SHA-1 signature 119 | RUN update-crypto-policies --set DEFAULT:SHA1 120 | 121 | RUN dnf install -y \ 122 | habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ 123 | habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ 124 | habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ 125 | habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ 126 | dnf clean all && \ 127 | chmod +t /var/log/habana_logs && \ 128 | rm -f /etc/yum.repos.d/habanalabs.repo 129 | 130 | ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src 131 | ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib 132 | 133 | RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ 134 | tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ 135 | cd /tmp/openmpi-${OPENMPI_VERSION} && \ 136 | ./configure --prefix=${MPI_ROOT} --with-verbs && \ 137 | make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} 138 | 139 | RUN ln -s /usr/bin/python3 /usr/bin/python 140 | 141 | RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --extra-index-url ${PYPI_URL} 142 | 143 | # SSH configuration necessary to support mpi-operator v2 144 | RUN mkdir -p /var/run/sshd && \ 145 | sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 146 | sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ 147 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 148 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 149 | mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc 150 | 151 | ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so 152 | ENV HABANA_LOGS=/var/log/habana_logs/ 153 | ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw 154 | ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/hccl_demo_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import random, math, os, yaml, glob, json 14 | 15 | import logging 16 | _logger = logging.getLogger("health_screener") 17 | 18 | def find_groups(healthy_nodes, watch_nodes, groups_tracker): 19 | """ Find a list of node groups to run hccl_demo all reduce test 20 | 21 | Args: 22 | healthy_nodes ([str]): Nodes that previously passed a pair testing of hccl_demo 23 | watch_nodes ([str]): Nodes that haven't has a passing round of hccl_demo 24 | groups_tracker ([str]): History of used groups. A group has to be unique 25 | 26 | Returns: 27 | ([str],[str]): Unique list of groups of nodes, History of used groups 28 | """ 29 | random.shuffle(healthy_nodes) 30 | random.shuffle(watch_nodes) 31 | 32 | found_unique = True 33 | num_nodes = len(healthy_nodes) + len(watch_nodes) 34 | node_groups = list() 35 | max_num_groups = num_nodes // 2 36 | max_combinations = (math.factorial(num_nodes)) / (math.factorial(num_nodes-2) * 2) 37 | max_attempts = 10 38 | groups_tracker = set(groups_tracker) 39 | 40 | if num_nodes == 1: 41 | _logger.warning(f"Need more than 1 Node to test pair all_reduce") 42 | return node_groups, list(groups_tracker) 43 | 44 | while len(node_groups) < max_num_groups and found_unique: 45 | i = 0 46 | h_i, w_i = 0,0 47 | 48 | if len(groups_tracker) >= max_combinations: 49 | _logger.info(f"Reached maximum combinations {max_combinations} for {num_nodes} Nodes") 50 | break 51 | 52 | node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i) 53 | i += 1 54 | if len(node_group) < 2 or node_group[0] == node_group[1]: 55 | _logger.info(f"Found invalid node_group {node_group}. Exiting group id search") 56 | found_unique = False 57 | break 58 | 59 | while group_id in groups_tracker: 60 | if i >= max_attempts: 61 | _logger.warning(f"Max attempt {max_attempts} reached for finding unique pair combination.") 62 | found_unique = False 63 | break 64 | 65 | node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i) 66 | i += 1 67 | if len(node_group) < 2 or node_group[0] == node_group[1]: 68 | _logger.info(f"Internal while Found invalid node_group {node_group}. 
Exiting group id search") 69 | found_unique = False 70 | break 71 | 72 | if found_unique: 73 | groups_tracker.add(group_id) 74 | node_groups.append(node_group) 75 | 76 | for n in node_group: 77 | if n in healthy_nodes: 78 | healthy_nodes.remove(n) 79 | if n in watch_nodes: 80 | watch_nodes.remove(n) 81 | 82 | if len(watch_nodes) == 0: 83 | break 84 | 85 | return node_groups, list(groups_tracker) 86 | 87 | def find_group_id(healthy_nodes, watch_nodes, h_i=0, w_i=0): 88 | """ Finds a group of nodes and combines to form a group id 89 | 90 | Args: 91 | healthy_nodes ([str]): Nodes that previously passed a pair testing of hccl_demo 92 | watch_nodes ([str]): Nodes that haven't has a passing round of hccl_demo 93 | h_i (int): Index of next potential node id for healthy_nodes 94 | w_i (int): Index of next potential node id for watch_nodes 95 | 96 | Returns: 97 | ([str], str): Potential nodes and their group id 98 | """ 99 | group_id = "" 100 | node_group = [] 101 | max_attempt = 10 102 | 103 | # Goal of testing is to test watch_nodes and pair it with a healhty_node if available 104 | if len(watch_nodes) == 0 or (len(watch_nodes) == 1 and len(healthy_nodes)==0): 105 | return node_group, group_id, (h_i, w_i) 106 | 107 | for i in range(max_attempt): 108 | if len(watch_nodes) and w_i < len(watch_nodes): 109 | node_group.append(watch_nodes[w_i]) 110 | w_i += 1 111 | if len(healthy_nodes) and h_i < len(healthy_nodes): 112 | node_group.append(healthy_nodes[h_i]) 113 | h_i += 1 114 | 115 | if h_i >= len(healthy_nodes): 116 | random.shuffle(healthy_nodes) 117 | h_i = 0 118 | if w_i >= len(watch_nodes): 119 | random.shuffle(watch_nodes) 120 | w_i = 0 121 | 122 | if len(node_group) >= 2: 123 | break 124 | 125 | if len(node_group) > 1: 126 | node_group.sort() 127 | group_id = "-".join(node_group) 128 | 129 | return node_group, group_id, (h_i, w_i) 130 | 131 | def gather_hccl_logs(job_path, round, log_dir, health_report): 132 | """ Retrieve hccl_demo log files based on the job yamls executed 133 | 134 | Args: 135 | job_path (str): Base directory of job yamls executed 136 | round (int): Round to retrieve HCCL_Demo logs 137 | log_dir (str): Base directory of HCCL_Demo logs 138 | health_report (HealthReport): Tracks and reports health of hccl_demo 139 | """ 140 | path = f"{job_path}/**/r{round}/*.yaml" 141 | job_files = glob.glob(path, recursive=True) 142 | hccl_results = dict() 143 | 144 | for f_name in job_files: 145 | with open(f_name, 'r', newline='') as f: 146 | job_data = yaml.safe_load(f) 147 | 148 | launcher_template = job_data["spec"]["mpiReplicaSpecs"]["Launcher"]["template"] 149 | 150 | job_id = launcher_template["metadata"]["labels"]["name"] 151 | target_nodes = launcher_template["spec"]["containers"][0]["env"][4]["value"] 152 | target_nodes = target_nodes.split(',') 153 | 154 | hccl_results[f"{target_nodes}"] = hccl_demo_check(job_id=f"{log_dir}/L2/r{round}/{job_id}", 155 | target_nodes=target_nodes, health_report=health_report, write=False) 156 | 157 | multi_node_fail = set() 158 | qpc_fail = set() 159 | missing_nodes = set() 160 | 161 | for results in hccl_results.values(): 162 | if results["multi_node_fail"]: 163 | multi_node_fail.add(f"{results['node_ids']}") 164 | 165 | if results["qpc_fail"]: 166 | qpc_fail.add(f"{results['node_ids']}") 167 | 168 | if results["missing"]: 169 | missing_nodes.add(f"{results['node_ids']}") 170 | 171 | health_report.update_hccl_demo_health_report(round=round, all_node_pairs=hccl_results, multi_node_fail=multi_node_fail, qpc_fail=qpc_fail, 
missing_nodes=missing_nodes) 172 | 173 | def hccl_demo_check(job_id, health_report, target_nodes=[], hccl_log=[], write=True): 174 | """ Check on HCCL Demo Status. Reads the output log, if it 175 | has "Exiting HCCL demo with code: 1" then it is treated as a 176 | failure 177 | 178 | Args: 179 | job_id (str): Metadata name of the Job 180 | health_report (HealthReport): Tracks and reports health of hccl_demo 181 | target_nodes ([str], optional): Nodes that are used in hccl_demo testing 182 | hccl_log ([str]): Log of HCCL_DEMO run 183 | write (bool, optional): Writes to Report. Used to collect hccl results and update Base Health Report. Default to True 184 | 185 | Returns: 186 | dict: HCCL Demo Health Report result data. 187 | """ 188 | f_name_log = f"{job_id}.log" 189 | round = os.path.basename(job_id).split("-")[2][1:] 190 | group_id = os.path.basename(job_id).split("-")[3] 191 | hccl_demo_fail = True 192 | missing = False 193 | qpc_fail = False 194 | 195 | if len(hccl_log) == 0: 196 | if not os.path.exists(f_name_log): 197 | _logger.error(f"{f_name_log} can't be found or has no data") 198 | hccl_demo_fail = True 199 | missing = True 200 | else: 201 | with open(f_name_log, "r", newline='') as f: 202 | lines = f.readlines() 203 | hccl_demo_fail, qpc_fail, missing, _ = analyze_hccl_log(lines) 204 | else: 205 | hccl_demo_fail, qpc_fail, missing, target_nodes = analyze_hccl_log(hccl_log) 206 | 207 | target_nodes.sort() 208 | data = { 209 | "round": round, 210 | "group_id": group_id, 211 | "node_ids": target_nodes, 212 | "num_nodes": len(target_nodes), 213 | "multi_node_fail": hccl_demo_fail, 214 | "missing": missing, 215 | "qpc_fail": qpc_fail 216 | } 217 | 218 | if write: 219 | _logger.info("***** START of Node Report *****") 220 | _logger.info(json.dumps(data)) 221 | _logger.info("***** END of Node Report *****") 222 | health_report.write_rows(data=[data], level=2) 223 | 224 | return data 225 | 226 | def analyze_hccl_log(data): 227 | err_phrase = "Exiting HCCL demo with code: 1" 228 | err_phrase_other = "During handling of the above exception, another exception occurred:" 229 | err_phrase_ssh = "ssh: Could not resolve hostname" 230 | err_phrase_qpc = "Source: QPC, error" 231 | pass_phrase = "Bandwidth" 232 | 233 | target_phrase = "Target Nodes: " 234 | 235 | hccl_demo_fail = True 236 | missing = False 237 | qpc_fail = False 238 | target_nodes = [] 239 | 240 | for l in data: 241 | if l.find(err_phrase_ssh) != -1: 242 | hccl_demo_fail = True 243 | missing = True 244 | elif l.find(err_phrase_qpc) != -1: 245 | hccl_demo_fail = True 246 | qpc_fail = True 247 | elif l.find(err_phrase) != -1 or l.find(err_phrase_other) != -1: 248 | hccl_demo_fail = True 249 | elif l.find(pass_phrase) != -1: 250 | hccl_demo_fail = False 251 | elif l.find(target_phrase) != -1: 252 | colon_index = l.index(":") 253 | target_nodes = l[colon_index+2:].split(",") 254 | 255 | return hccl_demo_fail, qpc_fail, missing, target_nodes 256 | -------------------------------------------------------------------------------- /dockerfiles/triton_vllm_backend/samples/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | import argparse 30 | import asyncio 31 | import json 32 | import sys 33 | 34 | import numpy as np 35 | import tritonclient.grpc.aio as grpcclient 36 | from tritonclient.utils import * 37 | 38 | 39 | class LLMClient: 40 | def __init__(self, flags: argparse.Namespace): 41 | self._client = grpcclient.InferenceServerClient( 42 | url=flags.url, verbose=flags.verbose 43 | ) 44 | self._flags = flags 45 | self._loop = asyncio.get_event_loop() 46 | self._results_dict = {} 47 | 48 | async def async_request_iterator( 49 | self, prompts, sampling_parameters, exclude_input_in_output 50 | ): 51 | try: 52 | for iter in range(self._flags.iterations): 53 | for i, prompt in enumerate(prompts): 54 | prompt_id = self._flags.offset + (len(prompts) * iter) + i 55 | self._results_dict[str(prompt_id)] = [] 56 | yield self.create_request( 57 | prompt, 58 | self._flags.streaming_mode, 59 | prompt_id, 60 | sampling_parameters, 61 | exclude_input_in_output, 62 | ) 63 | except Exception as error: 64 | print(f"Caught an error in the request iterator: {error}") 65 | 66 | async def stream_infer(self, prompts, sampling_parameters, exclude_input_in_output): 67 | try: 68 | # Start streaming 69 | response_iterator = self._client.stream_infer( 70 | inputs_iterator=self.async_request_iterator( 71 | prompts, sampling_parameters, exclude_input_in_output 72 | ), 73 | stream_timeout=self._flags.stream_timeout, 74 | ) 75 | async for response in response_iterator: 76 | yield response 77 | except InferenceServerException as error: 78 | print(error) 79 | sys.exit(1) 80 | 81 | async def process_stream( 82 | self, prompts, sampling_parameters, exclude_input_in_output 83 | ): 84 | # Clear results in between process_stream calls 85 | self.results_dict = [] 86 | success = True 87 | # Read response from the stream 88 | async for response in self.stream_infer( 89 | prompts, sampling_parameters, exclude_input_in_output 90 | ): 91 | result, error = response 92 | if error: 93 | print(f"Encountered 
error while processing: {error}") 94 | success = False 95 | else: 96 | output = result.as_numpy("text_output") 97 | for i in output: 98 | self._results_dict[result.get_response().id].append(i) 99 | return success 100 | 101 | async def run(self): 102 | # Sampling parameters for text generation 103 | # including `temperature`, `top_p`, top_k`, `max_tokens`, `early_stopping`. 104 | # Full list available at: 105 | # https://github.com/vllmproject/vllm/blob/5255d99dc595f9ae7647842242d6542aa4145a4f/vllm/sampling_params.py#L23 106 | sampling_parameters = { 107 | "temperature": "0.1", 108 | "top_p": "0.95", 109 | "max_tokens": "100", 110 | } 111 | exclude_input_in_output = self._flags.exclude_inputs_in_outputs 112 | if self._flags.lora_name is not None: 113 | sampling_parameters["lora_name"] = self._flags.lora_name 114 | with open(self._flags.input_prompts, "r") as file: 115 | print(f"Loading inputs from `{self._flags.input_prompts}`...") 116 | prompts = file.readlines() 117 | 118 | success = await self.process_stream( 119 | prompts, sampling_parameters, exclude_input_in_output 120 | ) 121 | 122 | with open(self._flags.results_file, "w") as file: 123 | for id in self._results_dict.keys(): 124 | for result in self._results_dict[id]: 125 | file.write(result.decode("utf-8")) 126 | 127 | file.write("\n") 128 | file.write("\n=========\n\n") 129 | print(f"Storing results into `{self._flags.results_file}`...") 130 | 131 | if self._flags.verbose: 132 | with open(self._flags.results_file, "r") as file: 133 | print(f"\nContents of `{self._flags.results_file}` ===>") 134 | print(file.read()) 135 | if success: 136 | print("PASS: vLLM example") 137 | else: 138 | print("FAIL: vLLM example") 139 | 140 | def run_async(self): 141 | self._loop.run_until_complete(self.run()) 142 | 143 | def create_request( 144 | self, 145 | prompt, 146 | stream, 147 | request_id, 148 | sampling_parameters, 149 | exclude_input_in_output, 150 | send_parameters_as_tensor=True, 151 | ): 152 | inputs = [] 153 | prompt_data = np.array([prompt.encode("utf-8")], dtype=np.object_) 154 | try: 155 | inputs.append(grpcclient.InferInput("text_input", [1], "BYTES")) 156 | inputs[-1].set_data_from_numpy(prompt_data) 157 | except Exception as error: 158 | print(f"Encountered an error during request creation: {error}") 159 | 160 | stream_data = np.array([stream], dtype=bool) 161 | inputs.append(grpcclient.InferInput("stream", [1], "BOOL")) 162 | inputs[-1].set_data_from_numpy(stream_data) 163 | 164 | # Request parameters are not yet supported via BLS. Provide an 165 | # optional mechanism to send serialized parameters as an input 166 | # tensor until support is added 167 | 168 | if send_parameters_as_tensor: 169 | sampling_parameters_data = np.array( 170 | [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_ 171 | ) 172 | inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES")) 173 | inputs[-1].set_data_from_numpy(sampling_parameters_data) 174 | 175 | inputs.append(grpcclient.InferInput("exclude_input_in_output", [1], "BOOL")) 176 | inputs[-1].set_data_from_numpy(np.array([exclude_input_in_output], dtype=bool)) 177 | 178 | # Add requested outputs 179 | outputs = [] 180 | outputs.append(grpcclient.InferRequestedOutput("text_output")) 181 | 182 | # Issue the asynchronous sequence inference. 
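        # The request dict built below is yielded by async_request_iterator() and consumed by
        # stream_infer(); its keys name the target model, the prepared input tensors, the
        # requested outputs, the request id, and the sampling parameters (which are also
        # serialized into the "sampling_parameters" input tensor above when
        # send_parameters_as_tensor is True).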
183 | return { 184 | "model_name": self._flags.model, 185 | "inputs": inputs, 186 | "outputs": outputs, 187 | "request_id": str(request_id), 188 | "parameters": sampling_parameters, 189 | } 190 | 191 | 192 | if __name__ == "__main__": 193 | parser = argparse.ArgumentParser() 194 | parser.add_argument( 195 | "-m", 196 | "--model", 197 | type=str, 198 | required=False, 199 | default="vllm_model", 200 | help="Model name", 201 | ) 202 | parser.add_argument( 203 | "-v", 204 | "--verbose", 205 | action="store_true", 206 | required=False, 207 | default=False, 208 | help="Enable verbose output", 209 | ) 210 | parser.add_argument( 211 | "-u", 212 | "--url", 213 | type=str, 214 | required=False, 215 | default="localhost:8001", 216 | help="Inference server URL and its gRPC port. Default is localhost:8001.", 217 | ) 218 | parser.add_argument( 219 | "-t", 220 | "--stream-timeout", 221 | type=float, 222 | required=False, 223 | default=None, 224 | help="Stream timeout in seconds. Default is None.", 225 | ) 226 | parser.add_argument( 227 | "--offset", 228 | type=int, 229 | required=False, 230 | default=0, 231 | help="Add offset to request IDs used", 232 | ) 233 | parser.add_argument( 234 | "--input-prompts", 235 | type=str, 236 | required=False, 237 | default="prompts.txt", 238 | help="Text file with input prompts", 239 | ) 240 | parser.add_argument( 241 | "--results-file", 242 | type=str, 243 | required=False, 244 | default="results.txt", 245 | help="The file with output results", 246 | ) 247 | parser.add_argument( 248 | "--iterations", 249 | type=int, 250 | required=False, 251 | default=1, 252 | help="Number of iterations through the prompts file", 253 | ) 254 | parser.add_argument( 255 | "-s", 256 | "--streaming-mode", 257 | action="store_true", 258 | required=False, 259 | default=False, 260 | help="Enable streaming mode", 261 | ) 262 | parser.add_argument( 263 | "--exclude-inputs-in-outputs", 264 | action="store_true", 265 | required=False, 266 | default=False, 267 | help="Exclude prompt from outputs", 268 | ) 269 | parser.add_argument( 270 | "-l", 271 | "--lora-name", 272 | type=str, 273 | required=False, 274 | default=None, 275 | help="The querying LoRA name", 276 | ) 277 | FLAGS = parser.parse_args() 278 | 279 | client = LLMClient(FLAGS) 280 | client.run_async() 281 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 
23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /dockerfiles/base/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/IGNodes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, time, csv, json 14 | import logging 15 | import multiprocessing 16 | 17 | from HealthReport import HealthReport 18 | from utilities import run_cmd, create_logger 19 | 20 | _logger = logging.getLogger("health_screener") 21 | 22 | 23 | class IGNodes(): 24 | 25 | def __init__(self, health_report=HealthReport()): 26 | """ Keeps Track of Nodes and their current states 27 | 28 | Args: 29 | health_report (HealthReport, optional): IGHS Health Report. Defaults to creating a new HealthReport(). 30 | """ 31 | self.all_nodes = list() 32 | self.launcher_nodes = list() 33 | self.worker_nodes = list() 34 | self.healthy_nodes = set() 35 | self.watch_nodes = set() 36 | self.infected_nodes = set() 37 | self.missing_nodes = set() 38 | 39 | self.groups_tracker = list() 40 | self.current_node_groups = list() 41 | 42 | self.health_report = health_report 43 | self.log_dir = health_report.f_dir 44 | 45 | def update_node_status(self, healthy_nodes, infected_nodes, missing_nodes, undetected_nodes=[]): 46 | """Update the node lists status based on current node groups. If a node 47 | paring fails with known healthy node, then the other node is considered 48 | infected. 
Otherwise it will be moved to the healthy node list 49 | 50 | Args: 51 | healthy_nodes ([str]): List of Healthy nodes that pass IGHS testing 52 | infected_nodes ([str]): List of nodes that failed to pass IGHS testing 53 | missing_nodes ([str]): List of nodes that IGHS did not run testing on 54 | undetected_nodes ([str]): List of nodes that IGHS did not run testing on b/c it wasn't scheduled on 55 | """ 56 | watch_nodes = self.watch_nodes.copy() 57 | 58 | # Remove Nodes that haven't been tested yet from the healthy list 59 | for n in undetected_nodes: 60 | if n in watch_nodes and n in healthy_nodes: 61 | healthy_nodes.remove(n) 62 | 63 | self.healthy_nodes.update(healthy_nodes) 64 | 65 | for group in self.current_node_groups: 66 | n1, n2 = group 67 | self.determine_node_health(infected_nodes, missing_nodes, n1, n2) 68 | self.determine_node_health(infected_nodes, missing_nodes, n2, n1) 69 | 70 | self.watch_nodes = self.watch_nodes.difference(self.healthy_nodes) 71 | 72 | def determine_node_health(self, infected_nodes, missing_nodes, n1, n2): 73 | """Determine whether a node is healthy . 74 | 75 | Args: 76 | infected_nodes ([str]): List of nodes that failed to pass IGHS testing 77 | missing_nodes ([str]): List of nodes that IGHS did not run testing on 78 | n1 (str): Node name to investigate if it passes the IGHS test 79 | n2 (str): Node name that should be considered healthy. This assist in verifying status of N1 80 | """ 81 | if n2 in self.healthy_nodes: 82 | remove_from_watch = False 83 | 84 | if n1 in infected_nodes: 85 | self.infected_nodes.add(n1) 86 | remove_from_watch = True 87 | if n1 in missing_nodes: 88 | self.missing_nodes.add(n1) 89 | remove_from_watch = True 90 | 91 | if remove_from_watch and n1 in self.watch_nodes: 92 | self.watch_nodes.remove(n1) 93 | 94 | class IGNode(): 95 | 96 | def __init__(self, name="", health_report=HealthReport(), num_checks_link_state=10, log_level=logging.INFO, write_dir="/tmp/ighs"): 97 | self.name = name 98 | if name == "" and "MY_NODE_NAME" in os.environ: 99 | self.name = os.environ["MY_NODE_NAME"] 100 | 101 | self.cards = dict() 102 | self.num_checks_link_state = num_checks_link_state 103 | self.write_dir = write_dir 104 | if(not os.path.exists(self.write_dir)): 105 | os.makedirs(self.write_dir) 106 | 107 | self.health_report = health_report 108 | if not self.health_report.exist(): 109 | self.health_report.create() 110 | 111 | self.logger, _ = create_logger(logger_name=self.name, logger_file_name=self.name, f_path=f"{write_dir}", level=log_level) 112 | 113 | 114 | def scan_cards(self): 115 | self.logger.info(f"Scanning cards info on Node: {self.name}") 116 | 117 | cmd = "hl-smi -Q index,module_id,bus_id,memory.used,temperature.aip,name -f csv,noheader" 118 | output = run_cmd(cmd) 119 | 120 | reader = csv.reader(output.split('\n'), delimiter=',') 121 | for row in reader: 122 | if len(row) == 0: 123 | continue 124 | elif len(row) < 6: 125 | _logger.error(f"hl-smi output is not correct: Recieved output: {row}") 126 | continue 127 | 128 | i = row[0] 129 | module_id = row[1].strip() 130 | pci_address = row[2] 131 | memory_used = int(row[3].split()[0]) 132 | temperature_C = int(row[4].split()[0]) 133 | system_name = row[5] 134 | 135 | card = IGCard(system_name=system_name, index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger) 136 | self.cards[i] = card 137 | 138 | self.cards = dict(sorted(self.cards.items())) 139 | 140 | def record_dmesg(self): 141 | cmd = f"dmesg -T" 142 | output 
= run_cmd(cmd) 143 | 144 | self.logger.info("***** START of DMESG *****") 145 | self.logger.info(output) 146 | self.logger.info("***** END of DMESG *****") 147 | 148 | def health_check(self, target_cards=[], write_report=False): 149 | checked_cards = list() 150 | processes = list() 151 | card_queue = multiprocessing.Queue() 152 | 153 | if len(target_cards) == 0: 154 | target_cards = self.cards.keys() 155 | 156 | for i in target_cards: 157 | card = self.cards[str(i)] 158 | p = multiprocessing.Process(target=card.check_health, args=(self.num_checks_link_state,card_queue)) 159 | 160 | p.start() 161 | processes.append((card,p)) 162 | 163 | for card,p in processes: 164 | p.join() 165 | card_queue.put(None) 166 | 167 | for card in iter(card_queue.get, None): 168 | card.node_id = self.name 169 | checked_cards.append(card) 170 | self.logger.info(card) 171 | 172 | self.record_dmesg() 173 | checked_cards_dict = self.write_json(checked_cards) 174 | if(write_report): 175 | self.health_report.write_rows(data=checked_cards_dict) 176 | 177 | def write_json(self, cards): 178 | node_status = dict() 179 | node_status["name"] = self.name 180 | node_status["is_infected"] = False 181 | node_status["cards"] = list() 182 | 183 | for c in cards: 184 | c_status = c.__dict__ 185 | del c_status["logger"] 186 | node_status["cards"].append(c.__dict__) 187 | 188 | if c.is_infected: 189 | node_status["is_infected"] = True 190 | 191 | self.logger.info("***** START of Node Report *****") 192 | self.logger.info(json.dumps(node_status)) 193 | self.logger.info("***** END of Node Report *****") 194 | 195 | return node_status["cards"] 196 | 197 | class IGCard(): 198 | 199 | def __init__(self, system_name="", index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None): 200 | self.system_name = system_name 201 | self.node_id = "" 202 | self.logger = logger 203 | self.index = index 204 | self.module_id = module_id 205 | self.pci_address = pci_address 206 | self.memory_used = memory_used 207 | self.temperature_C = temperature 208 | self.temperature_state_C = "" 209 | 210 | self.framework = framework 211 | self.down_links = list() 212 | self.device_acquire_fail = False 213 | self.multi_node_fail = False 214 | self.is_infected = False 215 | 216 | self.internal_ports = list() 217 | self.external_ports = list() 218 | 219 | def check_health(self,num_checks_link_state=10, checked_cards=[]): 220 | self.check_port_type() 221 | self.check_link_state(attempts=num_checks_link_state, sleep_sec=0.2) 222 | self.check_device_acquire_fail() 223 | self.check_temperature_state() 224 | 225 | checked_cards.put(self) 226 | 227 | def check_link_state(self, attempts=10, sleep_sec=0.5): 228 | self.logger.debug(f"Checking {self.pci_address} Link State. 
Will check {attempts} times") 229 | all_ports = self.internal_ports + self.external_ports 230 | all_ports_txt = ",".join(all_ports) 231 | 232 | cmd = f"hl-smi -n link -i {self.pci_address} -P {all_ports_txt}" 233 | down_links = set() 234 | 235 | for a in range(attempts): 236 | output = run_cmd(cmd) 237 | links_state = output.strip().split("\n") 238 | 239 | for i, status in enumerate(links_state): 240 | if ("DOWN" in status): 241 | down_links.add(i) 242 | self.logger.debug(f"Attempt: {a} Port: {i} DOWN") 243 | self.is_infected = True 244 | 245 | time.sleep(sleep_sec) 246 | 247 | self.down_links = list(down_links) 248 | 249 | return self.down_links 250 | 251 | 252 | def check_port_type(self): 253 | self.logger.debug(f"Checking {self.pci_address} Port Types (Internal|External)") 254 | 255 | cmd = f"hl-smi -n ports -i {self.pci_address}" 256 | output = run_cmd(cmd) 257 | output_list = output.strip().split("\n") 258 | 259 | for output in output_list: 260 | port_txt, port_type = output.split(":") 261 | port = port_txt.split(" ")[1] 262 | 263 | if "external" in port_type: 264 | self.external_ports.append(port) 265 | else: 266 | self.internal_ports.append(port) 267 | 268 | def check_device_acquire_fail(self): 269 | self.logger.debug(f"Checking {self.pci_address} for Device Acquire Issues") 270 | self.device_acquire_fail = False 271 | 272 | os.environ["ID"] = str(self.module_id) 273 | os.environ["HABANA_VISIBLE_MODULES"] = str(self.module_id) 274 | 275 | try: 276 | import torch 277 | import habana_frameworks.torch.core 278 | except Exception as e: 279 | self.logger.error(f"Card {self.module_id} {self.pci_address} Failed to initialize Intel Gaudi PyTorch: {str(e)}") 280 | self.device_acquire_fail = True 281 | self.is_infected = True 282 | 283 | try: 284 | x = torch.tensor([2]).to('hpu') 285 | y = x + x 286 | 287 | assert y == 4, 'Sanity check failed: Wrong Add output' 288 | assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Habana Device' 289 | except (RuntimeError, AssertionError, Exception) as e: 290 | self.logger.error(f"{self.pci_address} Device Acquire Failure: {e}") 291 | self.device_acquire_fail = True 292 | self.is_infected = True 293 | 294 | return self.device_acquire_fail 295 | 296 | def check_temperature_state(self): 297 | if "HL-325" in self.system_name: 298 | # Gaudi-3 System 299 | max_good_temperature = 200 300 | base_temperature = 45 301 | max_delta = 80 302 | else: 303 | # Gaudi-2 System 304 | max_good_temperature = 83 305 | base_temperature = 25 306 | max_delta = 25 307 | 308 | 309 | if self.temperature_C >= max_good_temperature: 310 | self.temperature_state_C = "CRITICAL" 311 | self.is_infected = True 312 | elif abs(self.temperature_C - base_temperature) >= max_delta: 313 | self.temperature_state_C = "WARN" 314 | self.is_infected = True 315 | else: 316 | self.temperature_state_C = "NORMAL" 317 | 318 | def __str__(self): 319 | report_str = f""" Index: {self.index} 320 | Module Id: {self.module_id} 321 | PCI Address: {self.pci_address} 322 | Temperature: {self.temperature_C} C 323 | Temperature State: {self.temperature_state_C} 324 | Down Links: {self.down_links} 325 | Device Acquire Fail: {self.device_acquire_fail}""" 326 | 327 | return report_str 328 | 329 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/screen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. 
an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os, datetime, yaml, sys, time, json 14 | import argparse 15 | import logging 16 | 17 | from utilities import download_repos, create_logger, get_logging_level 18 | from hccl_demo_helper import hccl_demo_check 19 | from system_utils import KubeUtils, BareMetalUtils 20 | 21 | from HealthReport import HealthReport 22 | from IGNodes import IGNodes, IGNode 23 | 24 | 25 | _logger = None 26 | 27 | def monitor_ighs_status(system_mode, level, nodes, timeout_s=240, round=0): 28 | sleep_time_s = 2 29 | max_attempts = (timeout_s // sleep_time_s) + min(timeout_s % sleep_time_s, 1) 30 | current_run_status = dict() 31 | lvl_check_msg = f"Checking IGHS Level {level}" 32 | 33 | num_nodes = len(nodes.all_nodes) 34 | if level == 2: 35 | num_nodes = len(nodes.current_node_groups) * 2 36 | lvl_check_msg += f" Round {round}" 37 | 38 | _logger.info(f"{lvl_check_msg} Status") 39 | 40 | for attempt in range(max_attempts): 41 | num_found_nodes = system_mode.check_screen_complete(current_run_status=current_run_status, health_report=nodes.health_report, level=level, round=round) 42 | 43 | if num_found_nodes == num_nodes: 44 | _logger.info(f"Found {num_found_nodes}/{num_nodes} Nodes during Health Screen") 45 | break 46 | 47 | _logger.info(f"Attempt {attempt}/{max_attempts}: Found {num_found_nodes}/{num_nodes} Nodes - Will Check again in {sleep_time_s} seconds") 48 | time.sleep(sleep_time_s) 49 | num_found_nodes = system_mode.check_screen_complete(current_run_status=current_run_status, health_report=nodes.health_report, level=level, round=round, final_check=True) 50 | 51 | if level == 1: 52 | detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_node_info() 53 | missing_nodes.update(set(nodes.all_nodes).difference(detected_nodes)) 54 | undetected_nodes = [] 55 | 56 | nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, missing_nodes=missing_nodes) 57 | elif level == 2: 58 | detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_hccl_demo_info() 59 | undetected_nodes = set(nodes.all_nodes).difference(detected_nodes) 60 | 61 | nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, missing_nodes=missing_nodes) 62 | 63 | detected_nodes_l1, infected_nodes_l1, missing_nodes = nodes.health_report.extract_node_info() 64 | detected_nodes.update(detected_nodes_l1) 65 | infected_nodes.update(infected_nodes_l1) 66 | 67 | healthy_nodes = detected_nodes.difference(infected_nodes).difference(missing_nodes) 68 | 69 | healthy_nodes = sorted(list(healthy_nodes)) 70 | missing_nodes = sorted(list(missing_nodes)) 71 | infected_nodes = sorted(list(infected_nodes)) 72 | nodes.update_node_status(healthy_nodes, infected_nodes, missing_nodes, undetected_nodes=undetected_nodes) 73 | 74 | watch_nodes = sorted(list(nodes.watch_nodes)) 75 | detected_nodes = sorted(list(detected_nodes)) 76 | 77 | if level == 
1: 78 | nodes.healthy_nodes = set(healthy_nodes) 79 | 80 | _logger.info(f"Detected {len(detected_nodes)} Node: {detected_nodes}") 81 | _logger.info(f" Healthy {len(healthy_nodes)} Node: {healthy_nodes}") 82 | _logger.info(f" Infected {len(infected_nodes)} Node: {infected_nodes}") 83 | _logger.info(f"Missing {len(missing_nodes)} Node: {missing_nodes}") 84 | _logger.info(f"Unverified {len(watch_nodes)} Node: {watch_nodes}") 85 | 86 | return healthy_nodes, infected_nodes, missing_nodes 87 | 88 | 89 | def main(args): 90 | global _logger 91 | 92 | if args.logs_dir == "": 93 | c_time = datetime.datetime.now() 94 | date_year_format = c_time.strftime("%m-%Y") 95 | date_format = c_time.strftime("%m-%d-%Y") 96 | time_format = c_time.strftime("%H-%M") 97 | args.logs_dir = f"logs/{date_year_format}/{date_format}/{date_format}_{time_format}" 98 | 99 | 100 | ighs_report_name = "health_report.csv" 101 | ighs_log_dir = args.logs_dir 102 | 103 | if args.node_name: 104 | ighs_level = os.environ["IGHS_LEVEL"] if "IGHS_LEVEL" in os.environ else 1 105 | ighs_report_name = f"health_report_{args.node_name}.csv" 106 | ighs_log_dir = f"{args.logs_dir}/L{ighs_level}" 107 | 108 | health_report = HealthReport(f_dir=ighs_log_dir, report_name=ighs_report_name) 109 | job_path = "tmp/jobs" 110 | 111 | with open(args.config, 'r') as f: 112 | config_data = yaml.safe_load(f) 113 | 114 | hostfile = "" 115 | if "hostfile" in config_data["system-info"]: 116 | hostfile = config_data["system-info"]["hostfile"] 117 | 118 | log_level = get_logging_level(config_data["log-level"]) 119 | _logger, _ = create_logger(logger_name="health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level) 120 | 121 | if config_data["system-info"]["type"] == "k8s": 122 | system_mode = KubeUtils(image=config_data["image"], 123 | hostfile=hostfile, 124 | namespace=config_data["system-info"]["namespace"], 125 | log_dir=args.logs_dir) 126 | elif config_data["system-info"]["type"] == "bare-metal": 127 | system_mode = BareMetalUtils(image=config_data["image"], 128 | hostfile=hostfile, 129 | ssh_path=config_data["system-info"]["ssh-path"], 130 | tcp_interface=config_data["system-info"]["tcp-interface"], 131 | log_dir=args.logs_dir) 132 | else: 133 | _logger.error(f"system_mode: {system_mode} in {args.config} is not set correctly. 
system_mode has to be set to k8s or bare-metal") 134 | sys.exit(1) 135 | 136 | 137 | if args.initialize: 138 | _logger.info(f"Loaded Configuration File: {args.config}") 139 | _logger.info(f"{config_data}") 140 | 141 | health_report.create(create_base=True, create_hccl_demo=True) 142 | download_repos() 143 | 144 | system_mode.initialize_system() 145 | 146 | if args.screen: 147 | start_time = datetime.datetime.now() 148 | 149 | intel_gaudi_nodes = IGNodes(health_report=health_report) 150 | intel_gaudi_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"]) 151 | healthy_nodes, infected_nodes, missing_nodes = list(), list(), list() 152 | occupied_nodes, missing_cards_nodes, misc_nodes = list(), list(), list() 153 | 154 | if config_data["level-1"]["run"]: 155 | _logger.info("Running Level 1 Checks: Card Diagnostics") 156 | if not os.path.exists(f"{health_report.f_dir}/L1"): 157 | os.makedirs(f"{health_report.f_dir}/L1") 158 | 159 | nodes_initialized = system_mode.initialize_node_jobs(level=1, 160 | nodes=intel_gaudi_nodes, 161 | job_base_path=job_path) 162 | if nodes_initialized: 163 | healthy_nodes, infected_nodes, missing_nodes = monitor_ighs_status(system_mode=system_mode, 164 | level=1, 165 | nodes=intel_gaudi_nodes, 166 | timeout_s=config_data["level-1"]["timeout_s"]) 167 | occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) 168 | system_mode.clear_ighs_pods() 169 | 170 | summary = { 171 | "level": 1, 172 | "infected": infected_nodes, 173 | "missing": missing_nodes, 174 | "occupied": occupied_nodes, 175 | "missing_cards": missing_cards_nodes, 176 | "untested": misc_nodes, 177 | "healthy": healthy_nodes 178 | } 179 | 180 | with open(f"{args.logs_dir}/ighs_L1_summary.json", 'w', encoding ='utf8') as f: 181 | json.dump(summary, f, indent=4) 182 | 183 | if config_data["level-2"]["run"]: 184 | _logger.info("Running Level 2 Checks: Pair HCCL_DEMO All Reduce") 185 | if not os.path.exists(f"{health_report.f_dir}/L2"): 186 | os.makedirs(f"{health_report.f_dir}/L2") 187 | 188 | intel_gaudi_nodes.healthy_nodes = set() 189 | intel_gaudi_nodes.watch_nodes = set(intel_gaudi_nodes.all_nodes).difference(set(missing_nodes)) 190 | intel_gaudi_nodes.missing_nodes = set(missing_nodes) 191 | 192 | for i in range(config_data["level-2"]["num-rounds"]): 193 | nodes_initialized = system_mode.initialize_node_jobs(level=2, 194 | nodes=intel_gaudi_nodes, 195 | job_base_path=job_path, 196 | round=i) 197 | if not nodes_initialized: 198 | _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No other Nodes to screen. Exit screening early.") 199 | break 200 | 201 | healthy_nodes, infected_nodes, missing_nodes = monitor_ighs_status(system_mode=system_mode, 202 | level=2, 203 | nodes=intel_gaudi_nodes, 204 | timeout_s=config_data["level-2"]["timeout_s"], 205 | round=i) 206 | occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) 207 | system_mode.clear_ighs_pods(job_type="mpijobs") 208 | 209 | if len(intel_gaudi_nodes.watch_nodes) == 0: 210 | _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No other Nodes to screen. 
Exit screening early.") 211 | break 212 | 213 | summary = { 214 | "level": 2, 215 | "infected": infected_nodes, 216 | "missing": missing_nodes, 217 | "occupied": occupied_nodes, 218 | "missing_cards": missing_cards_nodes, 219 | "untested": misc_nodes, 220 | "healthy": healthy_nodes 221 | } 222 | 223 | with open(f"{args.logs_dir}/ighs_L2_summary.json", 'w', encoding ='utf8') as f: 224 | json.dump(summary, f, indent=4) 225 | 226 | end_time = datetime.datetime.now() 227 | diff_time = (end_time - start_time) 228 | _logger.info(f"Total Run Time: {diff_time}") 229 | 230 | if args.ighs_check == "node": 231 | node = IGNode(health_report=health_report, 232 | num_checks_link_state=config_data["level-1"]["num-checks-link-state"], 233 | log_level=log_level, 234 | name=args.node_name) 235 | node.scan_cards() 236 | node.health_check(write_report=args.node_write_report) 237 | elif args.ighs_check == "hccl-demo": 238 | health_report.create(create_base=False, create_hccl_demo=True) 239 | 240 | target_nodes = args.target_nodes.strip("[']").replace("'","").split(',') 241 | hccl_demo_check(job_id=f"{health_report.f_dir}/L2/{args.round}/{args.job_id}", 242 | target_nodes=target_nodes, health_report=health_report) 243 | 244 | if __name__=="__main__": 245 | parser = argparse.ArgumentParser() 246 | 247 | parser.add_argument("--initialize", action="store_true", help="Downloads Necessary Repos and Creates Report Template") 248 | parser.add_argument("--screen", action="store_true", help="Starts Health Screen for Cluster") 249 | parser.add_argument("--target-nodes", type=str, default="", help="List of target nodes") 250 | parser.add_argument("--job-id", type=str, default="", help="Needed to identify hccl-demo running log") 251 | parser.add_argument("--round", type=str, default="", help="Needed to identify hccl-demo running round log") 252 | parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file for Health Screener") 253 | parser.add_argument("--ighs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"], 254 | help="Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce (HCCL_DEMO between paris of nodes)") 255 | 256 | parser.add_argument("--node-write-report", action="store_true", help="Write Individual Node Health Report") 257 | parser.add_argument("--node-name", type=str, default="", help="Name of Node") 258 | parser.add_argument("--logs-dir", type=str, default="", help="Output directory of health screen results") 259 | 260 | args = parser.parse_args() 261 | 262 | 263 | main(args) 264 | -------------------------------------------------------------------------------- /utils/intel_gaudi_health_screen/HealthReport.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | import os, csv, time, shutil, fcntl, glob, copy 14 | from collections import defaultdict 15 | from tempfile import NamedTemporaryFile 16 | 17 | from utilities import copy_files 18 | 19 | import logging 20 | 21 | _logger = logging.getLogger("health_screener") 22 | 23 | class HealthReport(): 24 | 25 | def __init__(self, f_dir="tmp", report_name="health_report.csv"): 26 | """ Initialize Health Report Class 27 | 28 | Args: 29 | f_dir (str, optional): File Directory to store Health Report logs and results. Defaults to "tmp". 30 | report_name (str, optional): File name of Health Report csv. Defaults to "health_report.csv". 31 | """ 32 | self.header = ["node_id", "index", "module_id", "pci_address", "temperature_C", "temperature_state_C", "device_acquire_fail", "down_links", "multi_node_fail", "missing"] 33 | 34 | self.f_dir = f_dir 35 | self.report_name = report_name 36 | self.f_path = f"{self.f_dir}/{self.report_name}" 37 | 38 | self.header_hccl_demo = ["round","group_id", "node_ids", "num_nodes", "multi_node_fail", "missing", "qpc_fail"] 39 | self.f_path_hccl_demo = f"{self.f_dir}/{os.path.splitext(self.report_name)[0]}_hccl_demo.csv" 40 | 41 | 42 | def create(self, create_base=True, create_hccl_demo=False): 43 | """Create CSV Health Report Files. One for Base Health Checks and HCCL Demo Checks 44 | 45 | Args: 46 | create_base (bool, optional): Create Base Health_Report CSV file. Defaults to True. 47 | create_hccl_demo (bool, optional): Create HCCL_DEMO_Health_Report if it doesn't exist. Defaults to False. 48 | """ 49 | 50 | dir_name = os.path.dirname(self.f_path) 51 | if not os.path.exists(dir_name): 52 | os.makedirs(dir_name) 53 | 54 | if create_base: 55 | with open(self.f_path, "w+", newline='') as f: 56 | writer = csv.DictWriter(f, fieldnames=self.header, extrasaction='ignore') 57 | writer.writeheader() 58 | _logger.info(f"Created {self.f_path} with header: {self.header}") 59 | 60 | if create_hccl_demo and not self.exist(level=2): 61 | with open(self.f_path_hccl_demo, "w+", newline='') as f: 62 | writer = csv.DictWriter(f, fieldnames=self.header_hccl_demo, extrasaction='ignore') 63 | writer.writeheader() 64 | _logger.info(f"Created {self.f_path_hccl_demo} with header: {self.header_hccl_demo}") 65 | 66 | def exist(self, level=1): 67 | """Checks to see if Base Health Report exist 68 | 69 | Args: 70 | level (int, optional): Health Screen level report csv to check. Defaults to 1. 71 | 72 | Returns: 73 | bool: Returns True if the Base Health Report (self.f_path) or HCCL_DEMO Health Report (self.f_path_hccl_demo) exist 74 | """ 75 | f_path = self.f_path 76 | 77 | if level == 2: 78 | f_path = self.f_path_hccl_demo 79 | 80 | return os.path.exists(f_path) 81 | 82 | def write_rows(self, data=list(), level=1): 83 | """ Write health check results to Health Report CSV. Can write multiple rows at once 84 | 85 | Args: 86 | data (_type_, optional): Health Report CSV Row data. Defaults to list(). 87 | level (int, optional): Health Screen Level. Defaults to 1. 
88 | """ 89 | 90 | if level == 1: 91 | f_path = self.f_path 92 | header = self.header 93 | 94 | 95 | elif level == 2: 96 | f_path = self.f_path_hccl_demo 97 | header = self.header_hccl_demo 98 | 99 | with open(f_path, "a", newline='') as f: 100 | fcntl.flock(f, fcntl.LOCK_EX) 101 | writer = csv.DictWriter(f, fieldnames=header, extrasaction='ignore') 102 | writer.writerows(data) 103 | time.sleep(0.1) 104 | fcntl.flock(f, fcntl.LOCK_UN) 105 | 106 | def update_health_report(self, detected_nodes, infected_nodes, missing_nodes): 107 | """ Update health_report with hccl_demo results 108 | 109 | Args: 110 | detected_nodes (list[str]): List of detected node_ids 111 | infected_nodes (list[str]): List of infected node_ids 112 | missing_nodes (list[str]): List of missing node_ids 113 | """ 114 | temp_file = NamedTemporaryFile(mode='w', delete=False) 115 | detected_nodes_cp = detected_nodes.copy() 116 | 117 | with open(self.f_path, 'r', newline='') as csv_file, temp_file: 118 | reader = csv.DictReader(csv_file) 119 | writer = csv.DictWriter(temp_file, fieldnames=self.header) 120 | 121 | writer.writeheader() 122 | for row in reader: 123 | if row["node_id"] in infected_nodes or row["node_id"] in missing_nodes: 124 | row["multi_node_fail"] = True 125 | elif row["node_id"] in detected_nodes_cp: 126 | row["multi_node_fail"] = False 127 | row["missing"] = False 128 | 129 | writer.writerow(row) 130 | 131 | missing_nodes.discard(row["node_id"]) 132 | detected_nodes_cp.discard(row["node_id"]) 133 | 134 | # These are unreported Detected Nodes. Add to Report 135 | if len(detected_nodes_cp): 136 | for n in detected_nodes_cp: 137 | writer.writerow({"node_id": n, "multi_node_fail": False, "missing": False}) 138 | 139 | # These are unreported Missing Nodes. Add to Report 140 | if len(missing_nodes): 141 | for n in missing_nodes: 142 | writer.writerow({"node_id": n, "multi_node_fail": True, "missing": True}) 143 | 144 | shutil.move(temp_file.name, self.f_path) 145 | 146 | def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail, qpc_fail, missing_nodes): 147 | """ Update health_report with hccl_demo results, based on infected_nodes. 148 | 149 | Args: 150 | all_node_pairs (list[str]): List of all Node Pairs reported by Level 2 round 151 | multi_node_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test 152 | qpc_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test due to QPC error 153 | missing_nodes (list[str]): List of Node Pairs that couldn't run HCCL_Demo 154 | """ 155 | temp_file = NamedTemporaryFile(mode='w', delete=False) 156 | 157 | with open(self.f_path_hccl_demo, 'r', newline='') as csv_file, temp_file: 158 | reader = csv.DictReader(csv_file) 159 | writer = csv.DictWriter(temp_file, fieldnames=self.header_hccl_demo, extrasaction='ignore') 160 | 161 | writer.writeheader() 162 | for row in reader: 163 | if(row["round"] == round): 164 | row["multi_node_fail"] = (row["node_ids"] in multi_node_fail) 165 | row["qpc_fail"] = (row["node_ids"] in qpc_fail) 166 | row["missing"] = (row["node_ids"] in missing_nodes) 167 | 168 | if row["node_ids"] in all_node_pairs: 169 | del all_node_pairs[row["node_ids"]] 170 | 171 | writer.writerow(row) 172 | 173 | # These are unreported node_pairs. 
Add remaining node pairs 174 | if len(all_node_pairs): 175 | writer.writerows(list(all_node_pairs.values())) 176 | 177 | shutil.move(temp_file.name, self.f_path_hccl_demo) 178 | 179 | def check_screen_complete(self, num_nodes, hccl_demo=False, round=0): 180 | """ Check on status of Health Screen Check. 181 | Screen considered done if all nodes health checks are done 182 | 183 | Args: 184 | num_nodes (int): Number of Nodes screened 185 | hccl_demo (bool, optional): Status of HCCL_DEMO all reduce test. Defaults to False. 186 | round (int, optional): Level 2 Round. This will only check Level 2 round results. This is ignored for Level 1 runs. 187 | 188 | Returns: 189 | bool: Status of Screen. If all nodes are found, screening is done 190 | """ 191 | f_path = self.f_path if (not hccl_demo) else self.f_path_hccl_demo 192 | n_cards_per_node = 8 193 | 194 | with open(f_path, "r", newline='') as f: 195 | reader = csv.DictReader(f) 196 | 197 | if hccl_demo: 198 | n_cards = 0 199 | for row in reader: 200 | if(int(row["round"]) == round): 201 | n_cards += (int(row["num_nodes"]) * n_cards_per_node) 202 | else: 203 | n_cards = len(list(reader)) 204 | 205 | total_cards = n_cards_per_node * num_nodes 206 | has_all_nodes_info = (n_cards == total_cards) 207 | num_found_nodes = n_cards // n_cards_per_node 208 | 209 | return has_all_nodes_info, num_found_nodes 210 | 211 | def extract_node_info(self): 212 | """ Extracts Detected, Infected, and Missing Nodes from Health Report. 213 | 214 | Returns: 215 | (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes) 216 | """ 217 | detected_nodes = set() 218 | missing_nodes = set() 219 | device_acquire_fail_set = set() 220 | down_links_set = set() 221 | temperature_fail_set = set() 222 | temperature_warn_set = set() 223 | 224 | with open(self.f_path, "r", newline='') as f: 225 | reader = csv.DictReader(f) 226 | for row in reader: 227 | detected_nodes.add(row["node_id"]) 228 | 229 | if row["device_acquire_fail"] == "True": 230 | device_acquire_fail_set.add(row["node_id"]) 231 | if row["down_links"] != "[]" and row["down_links"] != "": 232 | down_links_set.add(row["node_id"]) 233 | if row["missing"] == "True": 234 | missing_nodes.add(row["node_id"]) 235 | if row["temperature_state_C"] == "CRITICAL": 236 | temperature_fail_set.add(row["node_id"]) 237 | if row["temperature_state_C"] == "WARN": 238 | temperature_warn_set.add(row["node_id"]) 239 | 240 | if(len(device_acquire_fail_set)): 241 | _logger.info(f"{len(device_acquire_fail_set)} Infected (Device Acquire fail): {sorted(list(device_acquire_fail_set))}") 242 | if(len(down_links_set)): 243 | _logger.info(f"{len(down_links_set)} Infected (Down Links): {sorted(list(down_links_set))}") 244 | if(len(temperature_warn_set)): 245 | _logger.info(f"{len(temperature_warn_set)} Infected (Temperature WARN): {sorted(list(temperature_warn_set))}") 246 | if(len(temperature_fail_set)): 247 | _logger.info(f"{len(temperature_fail_set)} Infected (Temperature CRITICAL): {sorted(list(temperature_fail_set))}") 248 | 249 | infected_nodes = set() 250 | infected_nodes.update(device_acquire_fail_set) 251 | infected_nodes.update(down_links_set) 252 | infected_nodes.update(temperature_fail_set) 253 | infected_nodes.update(temperature_warn_set) 254 | 255 | return detected_nodes, infected_nodes, missing_nodes 256 | 257 | 258 | def extract_hccl_demo_info(self): 259 | """ Extracts Detected, Infected, and Missing Nodes from HCCL DEMO Health Report 260 | 261 | Returns: 262 | (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes) 263 
| """ 264 | detected_nodes = set() 265 | infected_nodes = set() 266 | missing_nodes = set() 267 | fail_checks = defaultdict(list) 268 | missing_checks = defaultdict(list) 269 | 270 | with open(self.f_path_hccl_demo, "r", newline='') as f: 271 | reader = csv.DictReader(f) 272 | for row in reader: 273 | node_ids = row["node_ids"].strip("[']").replace("'","").split(', ') 274 | detected_nodes.update(node_ids) 275 | 276 | for n in node_ids: 277 | fail_status = int(row["multi_node_fail"] == "True") 278 | fail_checks[n].append(fail_status) 279 | 280 | missing_status = int(row["missing"] == "True") 281 | missing_checks[n].append(missing_status) 282 | 283 | for n, v in fail_checks.items(): 284 | if sum(v) == len(v): 285 | infected_nodes.add(n) 286 | 287 | for n, v in missing_checks.items(): 288 | if sum(v) == len(v): 289 | missing_nodes.add(n) 290 | 291 | detected_nodes -= missing_nodes 292 | infected_nodes -= missing_nodes 293 | 294 | _logger.info(f"{len(infected_nodes)} Infected (HCCL): {sorted(list(infected_nodes))}") 295 | 296 | return detected_nodes, infected_nodes, missing_nodes 297 | 298 | def gather_health_report(self, level, remote_path, hosts): 299 | """ Gathers Health Report from all hosts 300 | 301 | Args: 302 | level (str): IGHS Level 303 | remote_path (str): Remote Destintation of IGHS Report 304 | hosts (list, optional): List of IP Addresses to gather IGHS Reports 305 | """ 306 | copy_files(src=f"{remote_path}/intel_gaudi_health_screen/{self.f_dir}/L{level}", 307 | dst=f"{self.f_dir}", 308 | hosts=hosts, 309 | to_remote=False) 310 | 311 | def consolidate_health_report(self, level, report_dir): 312 | """ Consolidates the health_report_*.csv from worker pods into a single master csv file 313 | 314 | Args: 315 | level (str): IGHS Level 316 | report_dir (str): Directory of CSV files to merge 317 | """ 318 | data = list() 319 | path = f"{report_dir}/L{level}/health_report_*.csv" 320 | csv_files = glob.glob(path) 321 | 322 | for f in csv_files: 323 | with open(f, 'r', newline='') as csv_file: 324 | reader = csv.DictReader(csv_file) 325 | for row in reader: 326 | data.append(row) 327 | 328 | self.write_rows(data=data, level=level) 329 | 330 | --------------------------------------------------------------------------------