├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── SECURITY.md ├── cmake └── Modules │ └── FindoneCCL.cmake ├── demo ├── README.md └── demo.py ├── oneccl_bindings_for_pytorch ├── __init__.py └── csrc │ ├── _C.cpp │ ├── init.cpp │ └── init.h ├── requirements.txt ├── setup.py ├── src ├── CMakeLists.txt ├── ProcessGroupCCL.cpp ├── ProcessGroupCCL.hpp ├── ccl_comm_collector.cpp ├── ccl_comm_collector.h ├── cpu │ └── cpu_ccl.cpp ├── dispatch_stub.cpp ├── dispatch_stub.h ├── env.cpp ├── env.h ├── gpu │ ├── CMakeLists.txt │ ├── Makefile │ ├── README.md │ ├── allreduce.cpp │ ├── allreduce.h │ ├── allreduce_small.h │ ├── cxxopts.hpp │ ├── dpcpp_ccl.cpp │ ├── runtime.hpp │ ├── sycl_misc.hpp │ └── ze_exception.hpp ├── test │ ├── remotesync │ │ ├── Makefile │ │ ├── simple_test.cpp │ │ ├── sycl_misc.hpp │ │ ├── test.sh │ │ └── ze_exception.hpp │ ├── segfault │ │ ├── Makefile │ │ ├── simple_test.cpp │ │ ├── sycl_misc.hpp │ │ ├── test.sh │ │ └── ze_exception.hpp │ └── writeremote │ │ ├── Makefile │ │ ├── simple_test.cpp │ │ ├── sycl_misc.hpp │ │ ├── test.sh │ │ └── ze_exception.hpp ├── utils.cpp └── utils.h ├── tests ├── DeepSpeed_test │ ├── DeepSpeed.csv │ ├── Example.csv │ ├── testccl_cpu.py │ ├── testccl_gpu.py │ └── testccl_gpu_mpi.py ├── README.md ├── ddp_allreduce.py ├── ds_p2p_crossnodes.py ├── ds_subgroup_allreduce.py ├── run_ds_llm.sh ├── test_allreduce.py ├── test_barrier.py ├── test_c10d_ccl.py ├── test_c10d_p2p.py ├── test_fsdp.py ├── test_llm_allreduce.py └── test_p2p_crossnodes.py ├── third-party-programs.txt ├── tools ├── __init__.py └── setup │ ├── __init__.py │ ├── cmake.py │ └── env.py └── version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # setup.py uses the list of patterns in this file to decide 2 | # what to delete when clean up 3 | 4 | .coverage 5 | .hypothesis 6 | .mypy_cache 7 | */*.pyc 8 | */*.so* 9 | */**/__pycache__ 10 | */**/*.dylib* 11 | */**/*.pyc 12 | */**/*.pyd 13 | */**/*.so* 14 | */**/**/*.pyc 15 | */**/**/**/*.pyc 16 | */**/**/**/**/*.pyc 17 | 18 | oneccl_bindings_for_pytorch/include/ 19 | oneccl_bindings_for_pytorch/lib/ 20 | oneccl_bindings_for_pytorch/bin/ 21 | oneccl_bindings_for_pytorch/etc/ 22 | oneccl_bindings_for_pytorch/env/ 23 | oneccl_bindings_for_pytorch/examples/ 24 | oneccl_bindings_for_pytorch/licensing/ 25 | oneccl_bindings_for_pytorch/modulefiles/ 26 | oneccl_bindings_for_pytorch/version.py 27 | 28 | ## General 29 | 30 | # Debug Shell Script 31 | *.sh 32 | 33 | # Compiled Object files 34 | *.slo 35 | *.lo 36 | *.o 37 | *.cuo 38 | *.obj 39 | 40 | # Compiled Dynamic libraries 41 | *.so 42 | *.dylib 43 | *.dll 44 | 45 | # Compiled Static libraries 46 | *.lai 47 | *.la 48 | *.a 49 | *.lib 50 | 51 | # Compiled protocol buffers 52 | *.pb.h 53 | *.pb.cc 54 | *_pb2.py 55 | 56 | # Compiled python 57 | *.pyc 58 | *.pyd 59 | 60 | # Compiled MATLAB 61 | *.mex* 62 | 63 | # IPython notebook checkpoints 64 | .ipynb_checkpoints 65 | 66 | # Editor temporaries 67 | *.swn 68 | *.swo 69 | *.swp 70 | *~ 71 | 72 | # Sublime Text settings 73 | *.sublime-workspace 74 | *.sublime-project 75 | 76 | # Eclipse Project settings 77 | *.*project 78 | .settings 79 | 80 | # Files generated by CLion 81 | cmake-build-debug 82 | 83 | # QtCreator files 84 | *.user 85 | 86 | # OSX dir files 87 | .DS_Store 88 | 89 | # GDB history 90 | .gdb_history 91 | 92 | ## Caffe2 93 | 94 | # build, distribute, and bins (+ python proto bindings) 95 | build 96 | /build_* 97 | .build_debug/* 98 | .build_release/* 99 | 
distribute/* 100 | dist/ 101 | *.testbin 102 | *.bin 103 | cmake_build 104 | .cmake_build 105 | gen 106 | .setuptools-cmake-build 107 | 108 | # setup.py intermediates 109 | .eggs 110 | oneccl_bindings_for_pytorch.egg-info 111 | oneccl_bind_pt.egg-info 112 | 113 | # Files generated by ctags 114 | CTAGS 115 | tags 116 | TAGS 117 | 118 | # BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.) 119 | # 120 | # Below files are not deleted by "setup.py clean". 121 | 122 | # Visual Studio Code files 123 | .vscode 124 | .vs 125 | .idea 126 | 127 | # Files generated when a patch is rejected 128 | *.orig 129 | *.rej 130 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/oneCCL"] 2 | path = third_party/oneCCL 3 | url = https://github.com/oneapi-src/oneCCL.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 2 | set(CMAKE_CXX_STANDARD 17) 3 | 4 | project(oneccl_bindings_for_pytorch C CXX) 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat") 6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=cpp") 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat-security") 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector") 9 | 10 | set(LINUX TRUE) 11 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 12 | set(CMAKE_INSTALL_MESSAGE NEVER) 13 | 14 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules) 15 | 16 | set(RPATH_VALUE) 17 | list(APPEND RPATH_VALUE "$ORIGIN") 18 | 19 | set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) 20 | 21 | option(USE_SYSTEM_ONECCL "Use oneCCL library in system" OFF) 22 | 23 | option(BUILD_NO_ONECCL_PACKAGE "Build with oneCCL excluded" OFF) 24 | 25 | set(DEPENDS_LIB) 26 | 27 | # Find the Torch lib 28 | find_package(Torch REQUIRED) 29 | list(APPEND DEPENDS_LIB torch) 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") 31 | 32 | # Find OneCCL Lib 33 | IF (USE_SYSTEM_ONECCL) 34 | # Find and link MPI lib 35 | find_package(MPI REQUIRED) 36 | list(APPEND DEPENDS_LIB ${MPI_LIBRARIES}) 37 | 38 | # Link CCL lib 39 | set(CCL_ROOT $ENV{CCL_ROOT}) 40 | set(CCL_CONFIGURATION_PATH $ENV{CCL_CONFIGURATION_PATH}) 41 | include_directories(${CCL_ROOT}/include) 42 | list(APPEND DEPENDS_LIB "${CCL_ROOT}/lib/${CCL_CONFIGURATION_PATH}/libccl.so") 43 | list(APPEND RPATH_VALUE "$ORIGIN/../../../../") 44 | ELSE() 45 | # Find OneCCL Lib 46 | find_package(oneCCL REQUIRED) 47 | link_directories(${MPI_LIB_DIR}) 48 | list(APPEND DEPENDS_LIB oneCCL mpi) 49 | ENDIF() 50 | 51 | if(COMPUTE_BACKEND STREQUAL "dpcpp") 52 | list(APPEND DEPENDS_LIB ze_loader) 53 | endif() 54 | 55 | set(CMAKE_SKIP_BUILD_RPATH FALSE) 56 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) 57 | set(CMAKE_INSTALL_RPATH "${RPATH_VALUE}") 58 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) 59 | 60 | SET(LIB_NAME "oneccl_bindings_for_pytorch") 61 | 62 | add_subdirectory(./src) 63 | 64 | function (print_configuration_summary) 65 | get_directory_property(CMAKE_COMPILE_DEFINITIONS DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) 66 | 67 | message(STATUS "") 68 | message(STATUS "******** Summary ********") 69 | message(STATUS "General:") 70 | message(STATUS " CMake version : ${CMAKE_VERSION}") 71 | message(STATUS " CMake command : ${CMAKE_COMMAND}") 
72 | message(STATUS " System : ${CMAKE_SYSTEM_NAME}") 73 | message(STATUS " Target name : ${LIB_NAME}") 74 | message(STATUS " Install path : ${CMAKE_INSTALL_PREFIX}") 75 | message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") 76 | message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") 77 | message(STATUS " C++ compiler id : ${CMAKE_CXX_COMPILER_ID}") 78 | message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") 79 | message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") 80 | message(STATUS " Compile flags : ${IPEX_COMPILE_FLAGS}") 81 | message(STATUS " Compile definitions : ${CMAKE_COMPILE_DEFINITIONS}") 82 | message(STATUS " Linker options : ${CMAKE_SHARED_LINKER_FLAGS}") 83 | get_target_property(LINK_LIBRARIES oneccl_bindings_for_pytorch LINK_LIBRARIES) 84 | message(STATUS " Linker libraries : ${LINK_LIBRARIES}") 85 | get_target_property(LINK_DIRECTORS oneccl_bindings_for_pytorch LINK_DIRECTORIES) 86 | message(STATUS " Linker directors : ${LINK_DIRECTORS}") 87 | 88 | message(STATUS "") 89 | endfunction() 90 | 91 | print_configuration_summary() 92 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020-2021, Intel Corporation 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the Intel Corporation nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel® oneCCL Bindings for PyTorch (formerly known as torch_ccl) 2 | 3 | This repository holds PyTorch bindings maintained by Intel® for the Intel® oneAPI Collective Communications Library (oneCCL). 4 | 5 | ## Introduction 6 | 7 | [PyTorch](https://github.com/pytorch/pytorch) is an open-source machine learning framework. 
8 | 9 | [Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training, implementing collectives like `allreduce`, `allgather`, `alltoall`. For more information on oneCCL, please refer to the [oneCCL documentation](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/oneccl/source/). 10 | 11 | `oneccl_bindings_for_pytorch` module implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now. 12 | 13 | ## Capability 14 | 15 | The table below shows which functions are available for use with CPU / Intel dGPU tensors. 16 | 17 | | | CPU | GPU | 18 | | :--------------- | :---: | :---: | 19 | | `send` | × | √ | 20 | | `recv` | × | √ | 21 | | `broadcast` | √ | √ | 22 | | `all_reduce` | √ | √ | 23 | | `reduce` | √ | √ | 24 | | `all_gather` | √ | √ | 25 | | `gather` | √ | √ | 26 | | `scatter` | √ | √ | 27 | | `reduce_scatter` | √ | √ | 28 | | `all_to_all` | √ | √ | 29 | | `barrier` | √ | √ | 30 | 31 | 32 | ## PyTorch API Align 33 | 34 | We recommend using Anaconda as Python package management system. The followings are the corresponding branches (tags) of `oneccl_bindings_for_pytorch` and supported PyTorch. 35 | 36 | | `torch` | `oneccl_bindings_for_pytorch` | 37 | | :-------------------------------------------------------------: | :-----------------------------------------------------------------------: | 38 | | `master` | `master` | 39 | | [v2.3.1](https://github.com/pytorch/pytorch/tree/v2.3.1) | [ccl_torch2.3.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.3.100+xpu) | 40 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.400](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.400+xpu) | 41 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.300](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.300+xpu) | 42 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.200](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.200+xpu) | 43 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.100+xpu) | 44 | | [v2.0.1](https://github.com/pytorch/pytorch/tree/v2.0.1) | [ccl_torch2.0.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.0.100) | 45 | | [v1.13](https://github.com/pytorch/pytorch/tree/v1.13) | [ccl_torch1.13](https://github.com/intel/torch-ccl/tree/ccl_torch1.13) | 46 | | [v1.12.1](https://github.com/pytorch/pytorch/tree/v1.12.1) | [ccl_torch1.12.100](https://github.com/intel/torch-ccl/tree/ccl_torch1.12.100) | 47 | | [v1.12.0](https://github.com/pytorch/pytorch/tree/v1.12.0) | [ccl_torch1.12](https://github.com/intel/torch-ccl/tree/ccl_torch1.12) | 48 | | [v1.11.0](https://github.com/pytorch/pytorch/tree/v1.11.0) | [ccl_torch1.11](https://github.com/intel/torch-ccl/tree/ccl_torch1.11) | 49 | | [v1.10.0](https://github.com/pytorch/pytorch/tree/v1.10.0) | [ccl_torch1.10](https://github.com/intel/torch-ccl/tree/ccl_torch1.10) | 50 | | [v1.9.0](https://github.com/pytorch/pytorch/tree/v1.9.0) | [ccl_torch1.9](https://github.com/intel/torch-ccl/tree/ccl_torch1.9) | 51 | | [v1.8.1](https://github.com/pytorch/pytorch/tree/v1.8.1) | [ccl_torch1.8](https://github.com/intel/torch-ccl/tree/ccl_torch1.8) | 52 | | [v1.7.1](https://github.com/pytorch/pytorch/tree/v1.7.1) | [ccl_torch1.7](https://github.com/intel/torch-ccl/tree/ccl_torch1.7) | 53 | | 
[v1.6.0](https://github.com/pytorch/pytorch/tree/v1.6.0) | [ccl_torch1.6](https://github.com/intel/torch-ccl/tree/ccl_torch1.6) | 54 | | [v1.5-rc3](https://github.com/pytorch/pytorch/tree/v1.5.0-rc3) | [beta09](https://github.com/intel/torch-ccl/tree/beta09) | 55 | 56 | The usage details can be found in the README of corresponding branch. 57 | 58 | ## Requirements 59 | 60 | - Python 3.8 or later and a C++17 compiler 61 | 62 | - PyTorch v2.3.1 63 | 64 | ## Build Option List 65 | 66 | The following build options are supported in Intel® oneCCL Bindings for PyTorch*. 67 | 68 | | Build Option | Default Value | Description | 69 | | :---------------------------------- | :------------- | :-------------------------------------------------------------------------------------------------- | 70 | | COMPUTE_BACKEND | N/A | Set oneCCL `COMPUTE_BACKEND`, set to `dpcpp` and use DPC++ compiler to enable support for Intel XPU | 71 | | USE_SYSTEM_ONECCL | OFF | Use oneCCL library in system | 72 | | CCL_PACKAGE_NAME | oneccl-bind-pt | Set wheel name | 73 | | ONECCL_BINDINGS_FOR_PYTORCH_BACKEND | cpu | Set backend | 74 | | CCL_SHA_VERSION | False | Add git head sha version into wheel name | 75 | 76 | ## Launch Option List 77 | 78 | The following launch options are supported in Intel® oneCCL Bindings for PyTorch*. 79 | 80 | | Launch Option | Default Value | Description | 81 | | :--------------------------------------- | :------------ | :-------------------------------------------------------------------- | 82 | | ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE | 0 | Set verbose level in oneccl_bindings_for_pytorch | 83 | | ONECCL_BINDINGS_FOR_PYTORCH_ENV_WAIT_GDB | 0 | Set 1 to force the oneccl_bindings_for_pytorch wait for GDB attaching | 84 | | TORCH_LLM_ALLREDUCE | 0 | Set 1 to enable this prototype feature for better scale-up performance. This is a prototype feature to provide better scale-up performance by enabling optimized collective algorithms in oneCCL and asynchronous execution in torch-ccl. This feature requires XeLink enabled for cross-cards communication.| 85 | | CCL_BLOCKING_WAIT | 0 | Set 1 to enable this prototype feature, which is to control whether collectives execution on XPU is host blocking or non-blocking. | 86 | | CCL_SAME_STREAM | 0 | Set 1 to enable this prototype feature, which is to allow using a computation stream as communication stream to minimize overhead for streams synchronization. | 87 | 88 | ## Installation 89 | 90 | ### Install from Source 91 | 92 | 1. clone the `oneccl_bindings_for_pytorch`. 93 | 94 | ```bash 95 | git clone https://github.com/intel/torch-ccl.git && cd torch-ccl 96 | git submodule sync 97 | git submodule update --init --recursive 98 | ``` 99 | 100 | 2. Install `oneccl_bindings_for_pytorch` 101 | 102 | ```bash 103 | # for CPU Backend Only 104 | python setup.py install 105 | # for XPU Backend: use DPC++ Compiler to enable support for Intel XPU 106 | # build with oneCCL from third party 107 | COMPUTE_BACKEND=dpcpp python setup.py install 108 | # build with oneCCL from basekit 109 | export INTELONEAPIROOT=${HOME}/intel/oneapi 110 | USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install 111 | ``` 112 | 113 | ### Install Prebuilt Wheel 114 | 115 | Wheel files are available for the following Python versions. Please always use the latest release to get started. 
116 | 117 | | Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 | 118 | | :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | :---------: | 119 | | 2.3.100 | | | √ | √ | √ | √ | 120 | | 2.1.400 | | | √ | √ | √ | √ | 121 | | 2.1.300 | | | √ | √ | √ | √ | 122 | | 2.1.200 | | | √ | √ | √ | √ | 123 | | 2.1.100 | | | √ | √ | √ | √ | 124 | | 2.0.100 | | | √ | √ | √ | √ | 125 | | 1.13 | | √ | √ | √ | √ | | 126 | | 1.12.100 | | √ | √ | √ | √ | | 127 | | 1.12.0 | | √ | √ | √ | √ | | 128 | | 1.11.0 | | √ | √ | √ | √ | | 129 | | 1.10.0 | √ | √ | √ | √ | | | 130 | 131 | ```bash 132 | python -m pip install oneccl_bind_pt==2.3.100 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ 133 | ``` 134 | 135 | **Note:** Please set proxy or update URL address to https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ if you meet connection issue. 136 | 137 | ### Runtime Dynamic Linking 138 | 139 | - If oneccl_bindings_for_pytorch is built without oneCCL and use oneCCL in system, dynamic link oneCCl from oneAPI basekit (recommended usage): 140 | 141 | ```bash 142 | source $basekit_root/ccl/latest/env/vars.sh 143 | ``` 144 | 145 | Note: Make sure you have installed [basekit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html#base-kit) when using Intel® oneCCL Bindings for Pytorch\* on Intel® GPUs. 146 | 147 | - If oneccl_bindings_for_pytorch is built with oneCCL from third party or installed from prebuilt wheel: 148 | Dynamic link oneCCL and Intel MPI libraries: 149 | 150 | ```bash 151 | source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/setvars.sh 152 | ``` 153 | 154 | Dynamic link oneCCL only (not including Intel MPI): 155 | 156 | ```bash 157 | source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/vars.sh 158 | ``` 159 | 160 | ## Usage 161 | 162 | **Note:** Please `import torch` and `import intel_extension_for_pytorch`, prior to `import oneccl_bindings_for_pytorch`. 163 | 164 | example.py 165 | 166 | ```python 167 | 168 | import torch 169 | import intel_extension_for_pytorch 170 | import oneccl_bindings_for_pytorch 171 | import torch.nn.parallel 172 | import torch.distributed as dist 173 | 174 | ... 175 | 176 | os.environ['MASTER_ADDR'] = '127.0.0.1' 177 | os.environ['MASTER_PORT'] = '29500' 178 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 179 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 180 | 181 | backend = 'ccl' 182 | dist.init_process_group(backend, ...) 183 | my_rank = dist.get_rank() 184 | my_size = dist.get_world_size() 185 | print("my rank = %d my size = %d" % (my_rank, my_size)) 186 | 187 | ... 188 | 189 | model = torch.nn.parallel.DistributedDataParallel(model, ...) 190 | 191 | ... 192 | ``` 193 | 194 | (oneccl_bindings_for_pytorch is built without oneCCL, use oneCCL and MPI(if needed) in system) 195 | 196 | ```bash 197 | source $basekit_root/ccl/latest/env/vars.sh 198 | source $basekit_root/mpi/latest/env/vars.sh 199 | 200 | mpirun -n -ppn -f python example.py 201 | ``` 202 | 203 | ## Performance Debugging 204 | 205 | For debugging performance of communication primitives PyTorch's [Autograd profiler](https://pytorch.org/docs/stable/autograd.html#profiler) 206 | can be used to inspect time spent inside oneCCL calls. 
207 | 208 | Example: 209 | 210 | profiling.py 211 | 212 | ```python 213 | 214 | import torch.nn.parallel 215 | import torch.distributed as dist 216 | import oneccl_bindings_for_pytorch 217 | import os 218 | 219 | os.environ['MASTER_ADDR'] = '127.0.0.1' 220 | os.environ['MASTER_PORT'] = '29500' 221 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 222 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 223 | 224 | backend = 'ccl' 225 | dist.init_process_group(backend) 226 | my_rank = dist.get_rank() 227 | my_size = dist.get_world_size() 228 | print("my rank = %d my size = %d" % (my_rank, my_size)) 229 | 230 | x = torch.ones([2, 2]) 231 | y = torch.ones([4, 4]) 232 | with torch.autograd.profiler.profile(record_shapes=True) as prof: 233 | for _ in range(10): 234 | dist.all_reduce(x) 235 | dist.all_reduce(y) 236 | dist.barrier() 237 | print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cpu_time_total")) 238 | 239 | ``` 240 | 241 | ```bash 242 | mpirun -n 2 -l python profiling.py 243 | ``` 244 | 245 | ```bash 246 | [0] my rank = 0 my size = 2 247 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 248 | [0] Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes 249 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 250 | [0] oneccl_bindings_for_pytorch::allreduce 91.41% 297.900ms 91.41% 297.900ms 29.790ms 10 [[2, 2]] 251 | [0] oneccl_bindings_for_pytorch::wait::cpu::allreduce 8.24% 26.845ms 8.24% 26.845ms 2.684ms 10 [[2, 2], [2, 2]] 252 | [0] oneccl_bindings_for_pytorch::wait::cpu::allreduce 0.30% 973.651us 0.30% 973.651us 97.365us 10 [[4, 4], [4, 4]] 253 | [0] oneccl_bindings_for_pytorch::allreduce 0.06% 190.254us 0.06% 190.254us 19.025us 10 [[4, 4]] 254 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 255 | [0] Self CPU time total: 325.909ms 256 | [0] 257 | [1] my rank = 1 my size = 2 258 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 259 | [1] Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes 260 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 261 | [1] oneccl_bindings_for_pytorch::allreduce 96.03% 318.551ms 96.03% 318.551ms 31.855ms 10 [[2, 2]] 262 | [1] oneccl_bindings_for_pytorch::wait::cpu::allreduce 3.62% 12.019ms 3.62% 12.019ms 1.202ms 10 [[2, 2], [2, 2]] 263 | [1] oneccl_bindings_for_pytorch::allreduce 0.33% 1.082ms 0.33% 1.082ms 108.157us 10 [[4, 4]] 264 | [1] oneccl_bindings_for_pytorch::wait::cpu::allreduce 0.02% 56.505us 0.02% 56.505us 5.651us 10 [[4, 4], [4, 4]] 265 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 266 | [1] Self CPU time total: 331.708ms 267 | [1] 268 | 269 | ``` 270 | 271 | ## Known Issues 272 | 273 | For Point-to-point communication, directly call dist.send/recv after initializing the process group in launch script will trigger runtime error. 
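As a workaround, run one collective over the whole process group before the first point-to-point call. The sketch below is illustrative only: the two-rank layout, the tensor shape, the `xpu:{rank}` device strings, the script name, and the choice of `all_reduce` as the preceding collective are assumptions for the example rather than requirements of the API (per the capability table above, `send`/`recv` are supported for GPU tensors only):

```python
import os
import torch
import intel_extension_for_pytorch  # noqa: F401
import oneccl_bindings_for_pytorch  # noqa: F401
import torch.distributed as dist

os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0))
os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1))

dist.init_process_group('ccl')
rank = dist.get_rank()

# Run a collective first so that every rank of the group participates
# in creating the communicators.
warmup = torch.zeros(1, device=f"xpu:{rank}")
dist.all_reduce(warmup)

# Point-to-point calls are safe afterwards.
t = torch.ones(4, device=f"xpu:{rank}")
if rank == 0:
    dist.send(t, dst=1)
elif rank == 1:
    dist.recv(t, src=0)

dist.destroy_process_group()
```

Launched, for example, with `mpirun -n 2 -l python workaround.py` on a node with at least two Intel GPUs.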
The error occurs because, in the current implementation, all ranks of the group are expected to participate in this call to create the communicators, while dist.send/recv involves only a pair of ranks. As a result, dist.send/recv should be used after a collective call, which guarantees that all ranks have participated (as in the sketch above). A solution that supports calling dist.send/recv directly after initializing the process group is still under investigation. 274 | 275 | ## License 276 | 277 | [BSD License](https://github.com/intel/torch-ccl/blob/master/LICENSE) 278 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /cmake/Modules/FindoneCCL.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find oneCCL 2 | # 3 | # The following are set after configuration is done: 4 | # ONECCL_FOUND : set to true if oneCCL is found. 5 | # ONECCL_INCLUDE_DIRS : path to oneCCL include dir. 6 | # ONECCL_LIBRARIES : list of libraries for oneCCL 7 | # 8 | # and the following imported targets: 9 | # 10 | # oneCCL 11 | 12 | IF (NOT ONECCL_FOUND) 13 | SET(ONECCL_FOUND OFF) 14 | SET(ONECCL_LIBRARIES) 15 | SET(ONECCL_INCLUDE_DIRS) 16 | 17 | SET(ONECCL_ROOT "${PROJECT_SOURCE_DIR}/third_party/oneCCL") 18 | 19 | IF(BUILD_NO_ONECCL_PACKAGE) 20 | ADD_SUBDIRECTORY(${ONECCL_ROOT} oneCCL EXCLUDE_FROM_ALL) 21 | ELSE() 22 | ADD_SUBDIRECTORY(${ONECCL_ROOT}) 23 | ENDIF() 24 | 25 | IF(NOT TARGET ccl) 26 | MESSAGE(FATAL_ERROR "Failed to find oneCCL target") 27 | ENDIF() 28 | add_library(oneCCL ALIAS ccl) 29 | 30 | GET_TARGET_PROPERTY(INCLUDE_DIRS oneCCL INCLUDE_DIRECTORIES) 31 | SET(ONECCL_INCLUDE_DIRS ${INCLUDE_DIRS}) 32 | SET(ONECCL_LIBRARIES oneCCL) 33 | 34 | find_package_handle_standard_args(oneCCL FOUND_VAR ONECCL_FOUND REQUIRED_VARS ONECCL_LIBRARIES ONECCL_INCLUDE_DIRS) 35 | 36 | set(MPI_INCLUDE_DIR "${ONECCL_ROOT}/deps/mpi/include/") 37 | set(MPI_LIB_DIR "${ONECCL_ROOT}/deps/mpi/lib/") 38 | 39 | ENDIF(NOT ONECCL_FOUND) 40 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Simple Demo for Intel® oneCCL Bindings for PyTorch* 2 | 3 | This simple demo showcases the collective communication primitives provided by Intel® oneCCL Bindings for PyTorch*. 4 | 5 | ## Single Node Run 6 | To run the simple demo on a single node with 2 instances, run: 7 | 8 | ```bash 9 | mpirun -n 2 -l python demo.py 10 | 11 | ``` 12 | The demo can also be run on XPU by passing the `--device xpu` argument. 13 | 14 | ```bash 15 | mpirun -n 2 -l python demo.py --device xpu 16 | ``` 17 | 18 | ## Multiple Nodes Run 19 | To run the simple demo on multiple nodes, follow the instructions below: 20 | 21 | ### Ethernet 22 | 1. Identify the network interface name for collective communication. ex: eth0 23 | 2. Identify the IPs of all nodes. ex: 10.0.0.1,10.0.0.2 24 | 3.
Identify the master node IP. ex: 10.0.0.1 25 | 4. Set the value of np to the total number of instances. ex: 2 26 | 5. Set the value of ppn to the number of instances per node. ex: 1 27 | 28 | Here is an example run command for CPU, following the steps above: 29 | 30 | ```bash 31 | FI_TCP_IFACE=eth0 I_MPI_OFI_PROVIDER=tcp I_MPI_HYDRA_IFACE=eth0 I_MPI_DEBUG=121 mpirun -host 10.0.0.1,10.0.0.2 -np 2 -ppn 1 --map-by node python demo.py --device cpu --dist_url 10.0.0.1 --dist_port 29500 32 | ``` 33 | The demo can also be run on XPU by changing `--device cpu` to `--device xpu`. 34 | 35 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn.parallel import DistributedDataParallel as DDP 5 | import torch.distributed as dist 6 | try: 7 | import intel_extension_for_pytorch 8 | except Exception: 9 | print("can't import ipex") 10 | 11 | import oneccl_bindings_for_pytorch 12 | import argparse 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--device', '-dev', type=str, default='cpu', help='Device type to use: cpu, xpu') 15 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 16 | parser.add_argument('--dist_port', default='29800', type=str, help='url port used to set up distributed training') 17 | args = parser.parse_args() 18 | 19 | 20 | class Model(nn.Module): 21 | def __init__(self): 22 | super(Model, self).__init__() 23 | self.linear = nn.Linear(4, 5) 24 | 25 | def forward(self, input): 26 | return self.linear(input) 27 | 28 | 29 | if __name__ == "__main__": 30 | 31 | mpi_world_size = int(os.environ.get('PMI_SIZE', -1)) 32 | mpi_rank = int(os.environ.get('PMI_RANK', -1)) 33 | if mpi_world_size > 0: 34 | os.environ['RANK'] = str(mpi_rank) 35 | os.environ['WORLD_SIZE'] = str(mpi_world_size) 36 | else: 37 | # set the default rank and world size to 0 and 1 38 | os.environ['RANK'] = str(os.environ.get('RANK', 0)) 39 | os.environ['WORLD_SIZE'] = str(os.environ.get('WORLD_SIZE', 1)) 40 | os.environ['MASTER_ADDR'] = '127.0.0.1' # your master address 41 | os.environ['MASTER_PORT'] = '29500' # your master port 42 | rank = int(os.environ.get('PMI_RANK', -1)) # global rank 43 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 44 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 45 | 46 | # Initialize the process group with the ccl backend 47 | dist.init_process_group(backend='ccl', init_method=init_method, world_size=world_size, rank=rank) 48 | 49 | local_rank = os.environ['MPI_LOCALRANKID'] 50 | if args.device == 'xpu': 51 | device = "xpu:{}".format(local_rank) 52 | else: 53 | device = 'cpu' 54 | 55 | model = Model().to(device) 56 | if dist.get_world_size() > 1: 57 | model = DDP(model, device_ids=[device] if (device != 'cpu') else None) 58 | 59 | optimizer = torch.optim.SGD(model.parameters(), lr=0.001) 60 | loss_fn = nn.MSELoss().to(device) 61 | for i in range(3): 62 | print("Running Iteration: {} on device {}".format(i, device)) 63 | input = torch.randn(2, 4).to(device) 64 | labels = torch.randn(2, 5).to(device) 65 | # forward 66 | print("Running forward: {} on device {}".format(i, device)) 67 | res = model(input) 68 | # loss 69 | print("Running loss: {} on device {}".format(i, device)) 70 | L = loss_fn(res, labels) 71 | # backward 72 | print("Running backward: {} on device {}".format(i, device)) 73 | with 
torch.autograd.profiler_legacy.profile(enabled=True) as prof: 74 | L.backward() 75 | #print(prof) 76 | # update 77 | print("Runing optim: {} on device {}".format(i, device)) 78 | optimizer.step() 79 | print("Finish") -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | import torch 5 | 6 | 7 | cwd = os.path.dirname(os.path.abspath(__file__)) 8 | if not os.path.exists(os.path.join(cwd, "version.py")): 9 | raise RuntimeError("oneccl_bindings_for_pytorch is not installed!") 10 | 11 | 12 | def set_env_default(env, key, value): 13 | new_value = env.get(key, value) 14 | env[key] = new_value 15 | 16 | from .version import __version__, git_version 17 | from . import _C as ccl_lib 18 | 19 | if hasattr(torch, 'xpu'): 20 | try: 21 | # load the CCL/XPU library 22 | import ctypes 23 | my_c_library = ctypes.cdll.LoadLibrary(os.path.join(cwd, "lib/liboneccl_bindings_for_pytorch_xpu.so")) 24 | except OSError as e: 25 | print(f"Warning: Cannot load xpu CCL. CCL doesn't work for XPU device due to {e}") 26 | 27 | __all__ = [] 28 | __all__ += [name for name in dir(ccl_lib) 29 | if name[0] != '_' and 30 | not name.endswith('Base')] 31 | 32 | 33 | def is_available(tensors): 34 | devices = set() 35 | for tensor in tensors: 36 | if not tensor.is_contiguous(): 37 | return False 38 | device = tensor.get_device() 39 | if device in devices: 40 | return False 41 | devices.add(device) 42 | 43 | return True 44 | 45 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/_C.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #include "init.h" 33 | 34 | PYBIND11_MODULE(_C, m) { 35 | torch_ccl_python_init(m); 36 | } -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "init.h" 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include 46 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 47 | #if TORCH_VERSION_MAJOR > 1 48 | #include 49 | #else 50 | #include 51 | #endif 52 | #include 53 | #include 54 | #include 55 | #else 56 | #include 57 | #include 58 | #include 59 | #include 60 | #endif 61 | 62 | #include 63 | 64 | namespace py = pybind11; 65 | 66 | 67 | namespace { 68 | 69 | // This is a intrusive helper from pytorch. 
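// It wraps a c10::intrusive_ptr<T> and, if the Python GIL happens to be held when
// the wrapper is destroyed, releases the GIL while the wrapped object is freed, so
// dropping the last Python reference to the process group does not block other
// Python threads during the C++ teardown.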
70 | template 71 | class IntrusivePtrNoGilDestructor { 72 | c10::intrusive_ptr impl_; 73 | 74 | public: 75 | IntrusivePtrNoGilDestructor() = default; 76 | IntrusivePtrNoGilDestructor(const IntrusivePtrNoGilDestructor&) = default; 77 | IntrusivePtrNoGilDestructor(IntrusivePtrNoGilDestructor&&) = default; 78 | IntrusivePtrNoGilDestructor& operator=(const IntrusivePtrNoGilDestructor&) = 79 | default; 80 | IntrusivePtrNoGilDestructor& operator=(IntrusivePtrNoGilDestructor&&) = 81 | default; 82 | /* implicit */ IntrusivePtrNoGilDestructor(c10::intrusive_ptr impl) 83 | : impl_(std::move(impl)) {} 84 | // This ctor is very important; see 85 | // https://github.com/pybind/pybind11/issues/2957 86 | explicit IntrusivePtrNoGilDestructor(T* impl) 87 | : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} 88 | ~IntrusivePtrNoGilDestructor() { 89 | if (impl_) { 90 | if (PyGILState_Check()) { 91 | pybind11::gil_scoped_release release; 92 | impl_.reset(); 93 | } else { 94 | impl_.reset(); 95 | } 96 | } 97 | } 98 | T& operator*() const noexcept { 99 | return *impl_; 100 | } 101 | T* operator->() const noexcept { 102 | return impl_.get(); 103 | } 104 | C10_NODISCARD T* get() const noexcept { 105 | return impl_.get(); 106 | } 107 | void reset() noexcept { 108 | impl_.reset(); 109 | } 110 | operator bool() const noexcept { 111 | return impl_; 112 | } 113 | }; 114 | 115 | } // anonymous namespace 116 | 117 | PYBIND11_DECLARE_HOLDER_TYPE(T, IntrusivePtrNoGilDestructor, true); 118 | 119 | template 120 | using intrusive_ptr_no_gil_destructor_class_ = 121 | py::class_>; 122 | 123 | TORCH_CCL_CPP_API void torch_ccl_python_init(pybind11::module &m) { 124 | c10d::ProcessGroupCCL::cclInitOnce(); 125 | py::object module = py::module::import("torch.distributed"); 126 | py::object register_backend = module.attr("Backend").attr("register_backend"); 127 | #if TORCH_VERSION_MAJOR > 1 128 | auto backend = py::module::import("torch._C._distributed_c10d").attr("Backend"); 129 | #else 130 | auto backend = module.attr("ProcessGroup"); 131 | #endif 132 | register_backend("ccl", py::cpp_function(&c10d::ProcessGroupCCL::createProcessGroupCCL, 133 | py::arg("store"), 134 | py::arg("rank"), 135 | py::arg("size"), 136 | py::arg("timeout") = std::chrono::milliseconds( 137 | ::c10d::ProcessGroupCCL::OP_TIMEOUT_MILLIS)), 138 | false, std::vector{"xpu", "cpu"}); 139 | 140 | auto processGroupCCL = intrusive_ptr_no_gil_destructor_class_<::c10d::ProcessGroupCCL>( 141 | module, "ProcessGroupCCL", backend); 142 | 143 | processGroupCCL.def( 144 | py::init([](const c10::intrusive_ptr<::c10d::Store>& store, 145 | int rank, 146 | int size, 147 | std::chrono::milliseconds timeout) { 148 | return c10::make_intrusive<::c10d::ProcessGroupCCL>(store, rank, size, timeout); 149 | }), 150 | py::arg("store"), 151 | py::arg("rank"), 152 | py::arg("size"), 153 | py::arg("timeout") = std::chrono::milliseconds(10 * 1000)); 154 | 155 | } 156 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/init.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. 
Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #pragma once 33 | 34 | #include 35 | 36 | #define TORCH_CCL_CPP_API __attribute__ ((visibility ("default"))) 37 | 38 | void torch_ccl_python_init(pybind11::module &m); 39 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.10.0 2 | setuptools 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # DEBUG build with debug 2 | # 3 | # USE_SYSTEM_ONECCL=0 4 | # disables use of system-wide oneCCL (we will use our submoduled 5 | # copy in third_party/oneCCL) 6 | 7 | import os 8 | import sys 9 | import pathlib 10 | import shutil 11 | from subprocess import check_call, check_output 12 | 13 | import torch 14 | from torch.utils.cpp_extension import BuildExtension, CppExtension, library_paths 15 | from setuptools import setup 16 | from distutils.command.clean import clean 17 | from tools.setup.cmake import CMakeExtension 18 | from tools.setup.env import get_compiler 19 | 20 | # Constant known variables used throughout this file 21 | CWD = os.path.dirname(os.path.abspath(__file__)) 22 | ONECCL_BINDINGS_FOR_PYTORCH_PATH = os.path.join(CWD, "oneccl_bindings_for_pytorch") 23 | 24 | 25 | def _check_env_flag(name, default=''): 26 | return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] 27 | 28 | 29 | def check_file(f): 30 | if not os.path.exists(f): 31 | print("Could not find {}".format(f)) 32 | print("Did you run 'git submodule update --init --recursive'?") 33 | sys.exit(1) 34 | 35 | 36 | # all the work we need to do _before_ setup runs 37 | def create_version(): 38 | """Create the version string for torch-ccl""" 39 | package_name = os.getenv('CCL_PACKAGE_NAME', 'oneccl-bind-pt') 40 | version = open('version.txt', 'r').read().strip() 41 | sha = 'Unknown' 42 | 43 | try: 44 | sha = check_output(['git', 'rev-parse', 'HEAD'], cwd=CWD).decode('ascii').strip() 45 | except Exception: 46 | pass 47 | 48 | if os.getenv('CCL_SHA_VERSION', False): 49 | if sha != 'Unknown': 50 | version += '+' + sha[:7] 51 | 52 | if os.environ.get("COMPUTE_BACKEND") == "dpcpp": 53 
| backend = "gpu" 54 | else: 55 | backend = os.environ.get("ONECCL_BINDINGS_FOR_PYTORCH_BACKEND", "cpu") 56 | 57 | if "+" not in version: 58 | version += '+' + backend 59 | 60 | print("Building {}-{}".format(package_name, version)) 61 | 62 | version_path = os.path.join(CWD, 'oneccl_bindings_for_pytorch', 'version.py') 63 | with open(version_path, 'w') as f: 64 | f.write("__version__ = '{}'\n".format(version)) 65 | f.write("git_version = {}\n".format(repr(sha))) 66 | 67 | return version, package_name 68 | 69 | 70 | class BuildCMakeExt(BuildExtension): 71 | """ 72 | Builds using cmake instead of the python setuptools implicit build 73 | """ 74 | 75 | def run(self): 76 | """ 77 | Perform build_cmake before doing the 'normal' stuff 78 | """ 79 | cmake_extensions = [ext for ext in self.extensions if isinstance(ext, CMakeExtension)] 80 | for ext in cmake_extensions: 81 | self.build_cmake(ext) 82 | 83 | self.extensions = [ext for ext in self.extensions if not isinstance(ext, CMakeExtension)] 84 | super(BuildCMakeExt, self).run() 85 | build_py = self.get_finalized_command('build_py') 86 | build_py.data_files = build_py._get_data_files() 87 | build_py.run() 88 | 89 | def build_cmake(self, extension: CMakeExtension): 90 | """ 91 | The steps required to build the extension 92 | """ 93 | build_dir = pathlib.Path('.'.join([self.build_temp, extension.name])) 94 | 95 | build_dir.mkdir(parents=True, exist_ok=True) 96 | install_dir = ONECCL_BINDINGS_FOR_PYTORCH_PATH 97 | 98 | # Now that the necessary directories are created, build 99 | my_env = os.environ.copy() 100 | my_env["CMAKE_DISABLE_FIND_PACKAGE_MKL"] = "TRUE" 101 | build_type = 'Release' 102 | 103 | if _check_env_flag('DEBUG'): 104 | build_type = 'Debug' 105 | 106 | build_options = { 107 | 'CMAKE_BUILD_TYPE': build_type, 108 | # The value cannot be easily obtained in CMakeLists.txt. 
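            # PyTorch exposes it in Python as torch.utils.cmake_prefix_path, so pass it through from setup.py.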
109 | 'CMAKE_PREFIX_PATH': torch.utils.cmake_prefix_path, 110 | # skip the example and test code in oneCCL 111 | 'BUILD_EXAMPLES': 'OFF', 112 | 'BUILD_CONFIG': 'OFF', 113 | 'BUILD_FT': 'OFF' 114 | } 115 | 116 | compute_backend = os.getenv('COMPUTE_BACKEND', 'n/a') 117 | runtime = 'gcc' 118 | if compute_backend == 'dpcpp': 119 | runtime = 'dpcpp' 120 | build_options['COMPUTE_BACKEND'] = compute_backend 121 | import intel_extension_for_pytorch 122 | build_options['CMAKE_PREFIX_PATH'] += ";" + intel_extension_for_pytorch.cmake_prefix_path 123 | if "DPCPP_GCC_INSTALL_DIR" in my_env: 124 | exist_cflags = "CFLAGS" in my_env 125 | cflags = "" 126 | if exist_cflags: 127 | cflags = my_env["CFLAGS"] 128 | my_env["CFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} {cflags}" 129 | exist_cxxflags = "CXXFLAGS" in my_env 130 | cxxflags = "" 131 | if exist_cxxflags: 132 | cxxflags = my_env["CXXFLAGS"] 133 | my_env["CXXFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} {cxxflags}" 134 | exist_ldflags = "LDFLAGS" in my_env 135 | ldflags = "" 136 | if exist_ldflags: 137 | ldflags = my_env["LDFLAGS"] 138 | my_env["LDFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} -fuse-ld=lld -lrt -lpthread {ldflags}" 139 | 140 | cc, cxx = get_compiler(runtime) 141 | build_options['CMAKE_C_COMPILER'] = cc 142 | build_options['CMAKE_CXX_COMPILER'] = cxx 143 | 144 | extension.generate(build_options, my_env, build_dir, install_dir) 145 | 146 | if compute_backend == 'dpcpp': 147 | if "DPCPP_GCC_INSTALL_DIR" in my_env: 148 | if exist_cflags: 149 | my_env["CFLAGS"] = cflags 150 | else: 151 | del my_env["CFLAGS"] 152 | if exist_cxxflags: 153 | my_env["CXXFLAGS"] = cxxflags 154 | else: 155 | del my_env["CXXFLAGS"] 156 | if exist_ldflags: 157 | my_env["LDFLAGS"] = ldflags 158 | else: 159 | del my_env["LDFLAGS"] 160 | 161 | build_args = ['-j', str(os.cpu_count())] 162 | check_call(['make', 'oneccl_bindings_for_pytorch'] + build_args, cwd=str(build_dir)) 163 | if compute_backend == 'dpcpp': 164 | check_call(['make', 'oneccl_bindings_for_pytorch_xpu'] + build_args, cwd=str(build_dir)) 165 | check_call(['make', 'install'], cwd=str(build_dir)) 166 | 167 | 168 | class Clean(clean): 169 | def run(self): 170 | import glob 171 | import re 172 | 173 | with open('.gitignore', 'r') as f: 174 | ignores = f.read() 175 | pat = re.compile(r'^#( BEGIN NOT-CLEAN-FILES )?') 176 | for wildcard in filter(None, ignores.split('\n')): 177 | match = pat.match(wildcard) 178 | if match: 179 | if match.group(1): 180 | # Marker is found and stop reading .gitignore. 181 | break 182 | # Ignore lines which begin with '#'. 
183 | else: 184 | for filename in glob.glob(wildcard): 185 | try: 186 | os.remove(filename) 187 | except OSError: 188 | shutil.rmtree(filename, ignore_errors=True) 189 | 190 | clean.run(self) 191 | 192 | 193 | def get_python_c_module(): 194 | main_compile_args = [] 195 | main_libraries = ['oneccl_bindings_for_pytorch'] 196 | main_link_args = [] 197 | main_sources = ["oneccl_bindings_for_pytorch/csrc/_C.cpp", "oneccl_bindings_for_pytorch/csrc/init.cpp"] 198 | lib_path = os.path.join(ONECCL_BINDINGS_FOR_PYTORCH_PATH, "lib") 199 | library_dirs = [lib_path] 200 | include_path = os.path.join(CWD, "src") 201 | include_dirs = [include_path] 202 | extra_link_args = [] 203 | extra_compile_args = [ 204 | '-Wall', 205 | '-Wextra', 206 | '-Wno-strict-overflow', 207 | '-Wno-unused-parameter', 208 | '-Wno-missing-field-initializers', 209 | '-Wno-write-strings', 210 | '-Wno-unknown-pragmas', 211 | # This is required for Python 2 declarations that are deprecated in 3. 212 | '-Wno-deprecated-declarations', 213 | # Python 2.6 requires -fno-strict-aliasing, see 214 | # http://legacy.python.org/dev/peps/pep-3123/ 215 | # We also depend on it in our code (even Python 3). 216 | '-fno-strict-aliasing', 217 | # Clang has an unfixed bug leading to spurious missing 218 | # braces warnings, see 219 | # https://bugs.llvm.org/show_bug.cgi?id=21629 220 | '-Wno-missing-braces', 221 | ] 222 | 223 | def make_relative_rpath(path): 224 | ret = [] 225 | ret.append('-Wl,-rpath,$ORIGIN/' + path) 226 | if os.getenv('COMPUTE_BACKEND', 'n/a') == 'dpcpp': 227 | ret.append('-Wl,-rpath,$ORIGIN/../../../') 228 | ret.append('-Wl,--disable-new-dtags') 229 | return ret 230 | 231 | _c_module = CppExtension("oneccl_bindings_for_pytorch._C", 232 | libraries=main_libraries, 233 | sources=main_sources, 234 | language='c', 235 | extra_compile_args=main_compile_args + extra_compile_args, 236 | include_dirs=include_dirs, 237 | library_dirs=library_dirs, 238 | extra_link_args=extra_link_args + main_link_args + make_relative_rpath('lib')) 239 | 240 | return _c_module 241 | 242 | 243 | if __name__ == '__main__': 244 | version, package_name = create_version() 245 | c_module = get_python_c_module() 246 | cmake_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "CMakeLists.txt") 247 | modules = [CMakeExtension("liboneccl_bindings_for_pytorch", cmake_file), c_module] 248 | setup( 249 | name=package_name, 250 | version=version, 251 | ext_modules=modules, 252 | packages=['oneccl_bindings_for_pytorch'], 253 | package_data={ 254 | 'oneccl_bindings_for_pytorch': [ 255 | '*.py', 256 | '*/*.h', 257 | '*/*.hpp', 258 | 'lib/*.so*', 259 | 'opt/mpi/lib/*.so*', 260 | 'bin/*', 261 | 'opt/mpi/bin/*', 262 | 'env/*', 263 | 'etc/*', 264 | 'opt/mpi/etc/*', 265 | 'examples/*', 266 | 'include/native_device_api/*.h*', 267 | 'include/native_device_api/l0/*.h*', 268 | 'include/*.h*', 269 | 'opt/mpi/include/*.h*', 270 | 'lib/lib*', 271 | 'opt/mpi/libfabric/lib/lib*', 272 | 'lib/prov/lib*', 273 | 'lib/ccl/kernels/*', 274 | 'opt/mpi/libfabric/lib/prov/lib*', 275 | 'licensing/*', 276 | 'modulefiles/*', 277 | ]}, 278 | cmdclass={ 279 | 'build_ext': BuildCMakeExt, 280 | 'clean': Clean, 281 | } 282 | ) 283 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CCL_SRCS ProcessGroupCCL.cpp dispatch_stub.cpp utils.cpp ccl_comm_collector.cpp env.cpp) 2 | set(CCL_CPU_SRCS cpu/cpu_ccl.cpp) 3 | add_library(oneccl_bindings_for_pytorch 
SHARED ${CCL_SRCS} ${CCL_CPU_SRCS}) 4 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES OUTPUT_NAME ${LIB_NAME}) 5 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES POSITION_INDEPENDENT_CODE ON) 6 | 7 | target_compile_options(oneccl_bindings_for_pytorch PUBLIC -Wall 8 | -Wno-sign-compare 9 | -Wno-unused-function) 10 | 11 | if(COMPUTE_BACKEND STREQUAL "dpcpp") 12 | add_subdirectory(./gpu) 13 | add_definitions (-DUSE_GPU) 14 | target_compile_options(oneccl_bindings_for_pytorch PUBLIC -fsycl) 15 | target_link_options(oneccl_bindings_for_pytorch PUBLIC -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -options -vc-codegen") 16 | endif() 17 | 18 | target_include_directories(oneccl_bindings_for_pytorch PUBLIC ./) 19 | 20 | target_link_libraries(oneccl_bindings_for_pytorch PUBLIC ${DEPENDS_LIB}) 21 | 22 | foreach(RPATH ${CMAKE_INSTALL_RPATH}) 23 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}") 24 | endforeach() 25 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES LINK_FLAGS "-Wl,--disable-new-dtags") 26 | 27 | install(TARGETS oneccl_bindings_for_pytorch LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") 28 | -------------------------------------------------------------------------------- /src/ProcessGroupCCL.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #pragma once 33 | 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include 41 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 42 | #if TORCH_VERSION_MAJOR > 1 43 | #include 44 | #include 45 | #else 46 | #include 47 | #endif 48 | #include 49 | #include 50 | #include 51 | #include 52 | #else 53 | #include 54 | #include 55 | #include 56 | #include 57 | #endif 58 | 59 | 60 | namespace oneccl_bindings_for_pytorch { 61 | struct CCLCommCollector; 62 | 63 | static inline void format_tensors_param(std::vector& param, const at::Tensor& tensor) { 64 | param.emplace_back(tensor); 65 | } 66 | 67 | template 68 | static inline void format_tensors_param(std::vector& param, const std::vector& vec) { 69 | for (const auto& elem : vec) { 70 | format_tensors_param(param, elem); 71 | } 72 | } 73 | } 74 | 75 | namespace c10d { 76 | 77 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 78 | using C10D_Work = c10d::Work; 79 | #else 80 | using C10D_Work = c10d::ProcessGroup::Work; 81 | #endif 82 | 83 | // WorkCCL is the state associated with a CCL operarion. 84 | // 85 | // ProcessGroupCCL implements CCL bindings for c10d. 86 | // 87 | // All functions on this class are expected to be called in the same 88 | // order across processes in the group. 89 | // 90 | // All collective functions provided by this class are scheduled 91 | // for asynchronous execution by CCL. 92 | constexpr const char* CCL_BACKEND_NAME = "ccl"; 93 | 94 | // Environment variable which controls whether wait() and synchronize() are blocking or 95 | // non-blocking. 96 | constexpr const char* CCL_BLOCKING_WAIT = "CCL_BLOCKING_WAIT"; 97 | 98 | // Environment variable which controls whether or not use default stream as 99 | // communication stream for collectives 100 | constexpr const char* CCL_SAME_STREAM = "CCL_SAME_STREAM"; 101 | 102 | constexpr const char* TORCH_LLM_ALLREDUCE = "TORCH_LLM_ALLREDUCE"; 103 | 104 | #if TORCH_VERSION_MAJOR > 1 105 | using Baseclass = Backend; 106 | #else 107 | using Baseclass = ProcessGroup; 108 | #endif 109 | class ProcessGroupCCL : public Baseclass 110 | { 111 | public: 112 | class AsyncWorkCCL : public C10D_Work { 113 | public: 114 | AsyncWorkCCL(std::vector> outputTensors, 115 | int rank = -1, 116 | c10d::OpType opType = OpType::UNKNOWN, 117 | const char* profilingTitle = nullptr, 118 | const c10::optional>& inputTensors = c10::nullopt); 119 | 120 | virtual void run() = 0; 121 | 122 | c10::intrusive_ptr getFuture() override; 123 | 124 | std::vector result() override; 125 | 126 | virtual void finishAsyncWorkCCL(); 127 | 128 | void finishAsyncWorkCCLError(std::exception_ptr eptr); 129 | 130 | public: 131 | std::string debugName; 132 | // Clone of blockingWait_ from ProcessGroupCCL. 133 | bool blockingWait_ = true; 134 | // Clone of useSameStream_ from ProcessGroupCCL. 135 | bool useSameStream_ = false; 136 | 137 | protected: 138 | friend class ProcessGroupCCL; 139 | const std::vector> outputTensors_; 140 | // The future returned by getFuture. 
141 | c10::intrusive_ptr future_; 142 | }; 143 | 144 | explicit ProcessGroupCCL(const c10::intrusive_ptr& store, 145 | int rank, 146 | int size, 147 | std::chrono::milliseconds); 148 | virtual ~ProcessGroupCCL(); 149 | 150 | #if TORCH_VERSION_MINOR >= 11 151 | const std::string getBackendName() const override { 152 | return std::string(CCL_BACKEND_NAME); 153 | } 154 | #endif 155 | 156 | void startCoalescing() override; 157 | 158 | c10::intrusive_ptr endCoalescing() override; 159 | 160 | c10::intrusive_ptr broadcast( 161 | std::vector& data, 162 | const BroadcastOptions& opts = BroadcastOptions()) override; 163 | 164 | c10::intrusive_ptr allreduce( 165 | std::vector& tensors, 166 | const AllreduceOptions& opts = AllreduceOptions()) override; 167 | 168 | c10::intrusive_ptr allreduce_coalesced( 169 | std::vector& tensors, 170 | const AllreduceCoalescedOptions& opts = 171 | AllreduceCoalescedOptions()) override; 172 | 173 | c10::intrusive_ptr reduce( 174 | std::vector& tensors, 175 | const ReduceOptions& opts = ReduceOptions()) override; 176 | 177 | c10::intrusive_ptr allgather( 178 | std::vector>& outputTensors, 179 | std::vector& inputTensors, 180 | const AllgatherOptions& opts = AllgatherOptions()) override; 181 | 182 | c10::intrusive_ptr _allgather_base( 183 | at::Tensor& outputBuffer, 184 | at::Tensor& inputBuffer, 185 | const AllgatherOptions& opts = AllgatherOptions()) override; 186 | 187 | c10::intrusive_ptr allgather_coalesced( 188 | std::vector>& outputTensorLists, 189 | std::vector& inputTensors, 190 | const AllgatherOptions& opts = AllgatherOptions()) override; 191 | 192 | c10::intrusive_ptr allgather_into_tensor_coalesced( 193 | std::vector& outputTensors, 194 | std::vector& inputTensors, 195 | const AllgatherOptions& opts = AllgatherOptions()) override; 196 | 197 | c10::intrusive_ptr gather( 198 | std::vector>& outputTensors, 199 | std::vector& inputTensors, 200 | const GatherOptions& opts = GatherOptions()) override; 201 | 202 | c10::intrusive_ptr scatter( 203 | std::vector& outputTensors, 204 | std::vector>& inputTensors, 205 | const ScatterOptions& opts = ScatterOptions()) override; 206 | 207 | c10::intrusive_ptr reduce_scatter( 208 | std::vector& outputTensors, 209 | std::vector>& inputTensors, 210 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 211 | 212 | c10::intrusive_ptr _reduce_scatter_base( 213 | at::Tensor& outputBuffer, 214 | at::Tensor& inputBuffer, 215 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 216 | 217 | c10::intrusive_ptr reduce_scatter_tensor_coalesced( 218 | std::vector& outputs, 219 | std::vector& inputs, 220 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 221 | 222 | c10::intrusive_ptr alltoall_base( 223 | at::Tensor& outputTensor, 224 | at::Tensor& inputTensor, 225 | std::vector& outputSplitSizes, 226 | std::vector& inputSplitSizes, 227 | const AllToAllOptions& opts = AllToAllOptions()) override; 228 | 229 | c10::intrusive_ptr alltoall( 230 | std::vector& outputTensors, 231 | std::vector& inputTensors, 232 | const AllToAllOptions& opts = AllToAllOptions()) override; 233 | 234 | c10::intrusive_ptr send( 235 | std::vector& tensors, 236 | int dstRank, 237 | int tag) override; 238 | 239 | c10::intrusive_ptr recv( 240 | std::vector& tensors, 241 | int srcRank, 242 | int tag) override; 243 | 244 | c10::intrusive_ptr recvAnysource( 245 | std::vector& tensor, 246 | int tag) override; 247 | 248 | c10::intrusive_ptr barrier( 249 | const BarrierOptions& opts = BarrierOptions()) override; 250 | 
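  // Illustrative usage sketch (added for clarity, not part of the upstream header):
  // every collective declared above follows the same c10d pattern -- the call
  // schedules the CCL operation asynchronously and hands back a Work handle that
  // the caller waits on, e.g.
  //
  //   std::vector<at::Tensor> tensors = {t};   // t: an at::Tensor on CPU or XPU
  //   auto work = pg->allreduce(tensors);      // returns a c10::intrusive_ptr to a Work
  //   work->wait();                            // blocking vs. non-blocking behavior is
  //                                            // governed by CCL_BLOCKING_WAIT (see above)
  //
  // The concrete Work object is an AsyncWorkCCL, whose result() and getFuture()
  // expose the collective's output tensors.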
251 | // create a new ProcessGroupCCL and initialize CCL if not initialized 252 | #if TORCH_VERSION_MAJOR > 1 253 | static c10::intrusive_ptr createProcessGroupCCL( 254 | #else 255 | static c10::intrusive_ptr createProcessGroupCCL( 256 | #endif 257 | const c10::intrusive_ptr& store, 258 | int rank = -1, 259 | int size = -1, 260 | std::chrono::milliseconds op_time_out = kNoTimeout); 261 | static const int64_t OP_TIMEOUT_MILLIS; 262 | public: 263 | 264 | static void cclInitOnce(); 265 | static void cclFini(); 266 | 267 | // Store that is used to exchange information between processes. 268 | c10::intrusive_ptr store_; 269 | 270 | std::chrono::milliseconds timeout; 271 | 272 | std::unique_ptr ccl_member_; 273 | 274 | static std::mutex globalMutex; 275 | 276 | // Whether or not wait() and synchronize() are blocking operations that wait 277 | // for the operation to complete. 278 | bool blockingWait_ = true; 279 | 280 | // Environment variable which controls whether to keep same stream 281 | // for collectives and compute 282 | bool useSameStream_ = false; 283 | 284 | bool torch_llm_allreduce_ = false; 285 | 286 | // Flag to denote if a coalescing groupStart/groupEnd block is active 287 | bool is_coalescing_ = false; 288 | 289 | // Stores device indexes for all collectives run inside a coalescing block 290 | std::vector coalescedDevices_; 291 | }; 292 | 293 | } // namespace c10d 294 | -------------------------------------------------------------------------------- /src/ccl_comm_collector.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "ccl_comm_collector.h" 5 | #include "utils.h" 6 | 7 | 8 | namespace oneccl_bindings_for_pytorch { 9 | 10 | ccl::shared_ptr_class CCLCommCollector::get_kvs(int rank, c10d::Store& store) { 11 | if (kvs) 12 | return kvs; 13 | // Each process group is with different store, so we use the unique key for 14 | // broadcast the bootstrap network information. 15 | std::string storeKey = "ccl_kvs"; 16 | 17 | // Rank 0 broadcast the bootstrap network information to other ranks 18 | if (rank == 0) { 19 | call_with_lock(c10d::ProcessGroupCCL::globalMutex, [&]() { 20 | kvs = ccl::create_main_kvs(); 21 | }); 22 | ccl::kvs::address_type main_addr = kvs->get_address(); 23 | auto ccl_kvs_addr = std::vector(main_addr.begin(), main_addr.end()); 24 | store.set(storeKey, ccl_kvs_addr); 25 | } 26 | else { 27 | auto ccl_kvs_addr = store.get(storeKey); 28 | if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { 29 | throw std::runtime_error( 30 | "Unexpected ccl kvs addr from the store\n"); 31 | } 32 | ccl::kvs::address_type main_addr; 33 | std::copy_n(std::make_move_iterator(ccl_kvs_addr.begin()), 34 | ccl::kvs::address_max_size, 35 | main_addr.begin()); 36 | call_with_lock(c10d::ProcessGroupCCL::globalMutex, [&]() { 37 | kvs = ccl::create_kvs(main_addr); 38 | }); 39 | } 40 | 41 | return kvs; 42 | } 43 | 44 | std::shared_ptr CCLCommCollector::get_comms(const std::string& devices_key) { 45 | if (ccl_comms.find(devices_key) != ccl_comms.end()) { 46 | // Reuse the cached communicator if there is one. 
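    // (Added note: the cache is keyed by the device string described in
    // ccl_comm_collector.h, so repeated collectives over the same device set
    // reuse the communicators created for the first call.)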
47 | return ccl_comms[devices_key]; 48 | } 49 | return {nullptr}; 50 | } 51 | 52 | void CCLCommCollector::add_comms(const std::string& devices_key, 53 | std::shared_ptr comms) { 54 | if (ccl_comms.find(devices_key) != ccl_comms.end()) { 55 | // Replace the cached comms 56 | ccl_comms[devices_key] = comms; 57 | } else { 58 | ccl_comms.emplace(devices_key, comms); 59 | } 60 | } 61 | 62 | } -------------------------------------------------------------------------------- /src/ccl_comm_collector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #pragma once 33 | 34 | #include 35 | #include 36 | #include 37 | #include "ProcessGroupCCL.hpp" 38 | 39 | namespace oneccl_bindings_for_pytorch { 40 | 41 | class Comms { 42 | public: 43 | // for cpu case 44 | explicit Comms(ccl::vector_class &comms) : 45 | comms(std::move(comms)), streams{} {} 46 | 47 | // for comms with streams 48 | explicit Comms(ccl::vector_class &comms, ccl::vector_class &streams, std::vector &torch_streams) : 49 | comms(std::move(comms)), streams(std::move(streams)), torch_streams(std::move(torch_streams)) {} 50 | 51 | ~Comms() noexcept(false) {} 52 | 53 | Comms() = delete; 54 | 55 | // Must not be copyable 56 | Comms(const Comms &) = delete; 57 | 58 | Comms &operator=(const Comms &) = delete; 59 | 60 | // Move constructable 61 | Comms(Comms &&other) : comms(std::move(other.comms)), streams(std::move(other.streams)), 62 | torch_streams(std::move(other.torch_streams)) {} 63 | 64 | // Move assignable 65 | Comms &operator=(Comms &&other) { 66 | std::swap(comms, other.comms); 67 | std::swap(streams, other.streams); 68 | std::swap(torch_streams, other.torch_streams); 69 | return *this; 70 | } 71 | 72 | public: 73 | // The Communicators used by CCL 74 | ccl::vector_class comms; 75 | // The streams used by CCL 76 | ccl::vector_class streams; 77 | // one to one mapping the torch streams to the ccl::stream. 78 | std::vector torch_streams; 79 | }; 80 | 81 | struct CCLCommCollector { 82 | 83 | CCLCommCollector() : kvs(nullptr) {}; 84 | 85 | ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store); 86 | 87 | std::shared_ptr get_comms(const std::string& devices_key); 88 | void add_comms(const std::string& devices_key, std::shared_ptr comms); 89 | 90 | // ccl kvs to identify the community. 91 | ccl::shared_ptr_class kvs; 92 | 93 | // Collects the ccl communicator that the process group has used. 94 | // The key is a list of devices that an operation is operating on 95 | // The devices are stored in a device sequence and the cache CCL 96 | // communicator is associated with this device sequence 97 | // 98 | // e.g. If the process group op only uses device 0, then the value of 99 | // the used device string stored (value of the hashmap) would be "0". 100 | // 101 | // If the process group op uses device 0 - 7 and the each tensor of the 102 | // input tensor list is on device, 0, 1, 2, 3, 4, 5, 6, 7 separately, 103 | // then the value of the used device string (key) stored would be 104 | // "0,1,2,3,4,5,6,7" 105 | // 106 | // If the process group op uses device 0 - 7 and the each tensor of the 107 | // input tensor list is on device, 0, 4, 5, 6, 7, 1, 2, 3 separately, 108 | // then the value of the used device string stored would be 109 | // "0,4,5,6,7,1,2,3" 110 | // 111 | // Note that the order of the device for the tensor list matters. 
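  // (Added note: the companion helper get_key_from_devs() in utils.cpp builds
  // such keys by prefixing the device type and appending a trailing comma per
  // index, e.g. "xpu:0,1," for xpu devices 0 and 1.)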
112 | std::unordered_map> ccl_comms; 113 | 114 | }; 115 | 116 | } 117 | -------------------------------------------------------------------------------- /src/env.cpp: -------------------------------------------------------------------------------- 1 | #include "env.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * All available launch options for ONECCL_BINDINGS_FOR_PYTORCH 7 | * ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE: Default = 0, Set verbose level in ONECCL_BINDINGS_FOR_PYTORCH 8 | * ONECCL_BINDINGS_FOR_PYTORCH_ENV_WAIT_GDB: Default = 0, Set 1 to force the oneccl_bindings_for_pytorch wait for GDB attaching 9 | */ 10 | 11 | #define ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(var) \ 12 | int var = [&]() -> int { \ 13 | if (auto env = std::getenv("ONECCL_BINDINGS_FOR_PYTORCH_" #var)) \ 14 | return std::stoi(env, 0, 10); \ 15 | return 0; \ 16 | } () 17 | 18 | int oneccl_bindings_for_pytorch_env(int env_type) { 19 | 20 | static struct { 21 | ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(ENV_VERBOSE); 22 | ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(ENV_WAIT_GDB); 23 | } env; 24 | 25 | switch (env_type) { 26 | case ENV_VERBOSE: 27 | return env.ENV_VERBOSE; 28 | case ENV_WAIT_GDB: 29 | return env.ENV_WAIT_GDB; 30 | default: 31 | return 0; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/env.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | enum ONECCL_BINDINGS_FOR_PYTORCH_ENV { 4 | ENV_VERBOSE = 0, 5 | ENV_WAIT_GDB 6 | }; 7 | 8 | int oneccl_bindings_for_pytorch_env(int env); 9 | 10 | static inline int oneccl_bindings_for_pytorch_verbose() { 11 | return oneccl_bindings_for_pytorch_env(ENV_VERBOSE); 12 | } 13 | 14 | static inline int oneccl_bindings_for_pytorch_wait_gdb() { 15 | return oneccl_bindings_for_pytorch_env(ENV_WAIT_GDB); 16 | } -------------------------------------------------------------------------------- /src/gpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(IPEX REQUIRED) 2 | 3 | set(CCL_DPCPP_SRCS dpcpp_ccl.cpp ze_exception.hpp allreduce.h sycl_misc.hpp runtime.hpp cxxopts.hpp) 4 | 5 | set_source_files_properties(${CCL_DPCPP_SRCS} PROPERTIES COMPILE_DEFINITIONS "USE_DPCPP;__STRICT_ANSI__") 6 | set_source_files_properties(${CCL_DPCPP_SRCS} PROPERTIES COMPILE_FLAGS -fsycl) 7 | 8 | add_library(oneccl_bindings_for_pytorch_xpu SHARED ${CCL_DPCPP_SRCS}) 9 | 10 | target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC ${DEPENDS_LIB}) 11 | target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC oneccl_bindings_for_pytorch) 12 | target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC intel-ext-pt-gpu) 13 | 14 | foreach(RPATH ${CMAKE_INSTALL_RPATH}) 15 | set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}") 16 | endforeach() 17 | set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,--disable-new-dtags") 18 | 19 | install(TARGETS oneccl_bindings_for_pytorch_xpu LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") 20 | 21 | -------------------------------------------------------------------------------- /src/gpu/Makefile: -------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 
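# Added note: the spir64_gen target together with "-device pvc" ahead-of-time
# compiles the SYCL kernels for Intel Data Center GPU Max (PVC); other GPU
# families would need a different -device value.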
7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : allreduce 15 | 16 | clean: 17 | rm -f allreduce 18 | -------------------------------------------------------------------------------- /src/gpu/README.md: -------------------------------------------------------------------------------- 1 | Dependencies: 2 | 1. MPI 3 | 2. Level-Zero 4 | 3. SYCL enabled compiler 5 | 6 | Build: 7 | make 8 | 9 | Run: 10 | ```mpirun -np allreduce -c 1024 -t ``` 11 | -------------------------------------------------------------------------------- /src/gpu/allreduce.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "allreduce.h" 12 | #include 13 | 14 | #define REPEAT 10 15 | 16 | int work_only = -1; 17 | int sync_only = -1; 18 | 19 | int get_work_only(int init_value = 0) { 20 | int tmp_work_only = init_value; 21 | char *tmp_str = getenv("TORCH_CCL_WORK_ONLY"); 22 | if (tmp_str) { 23 | tmp_work_only = atoi(tmp_str); 24 | } 25 | work_only = tmp_work_only; 26 | return tmp_work_only; 27 | } 28 | 29 | int get_sync_only(int init_value = 0) { 30 | int tmp_sync_only = init_value; 31 | char *tmp_str = getenv("TORCH_CCL_SYNC_ONLY"); 32 | if (tmp_str) { 33 | tmp_sync_only = atoi(tmp_str); 34 | } 35 | sync_only = tmp_sync_only; 36 | return tmp_sync_only; 37 | } 38 | 39 | void act(allreducer& ar, sycl::queue& queue, void* inout_buffer, uint32_t size); 40 | 41 | int main(int argc, char* argv[]) { 42 | // init section 43 | auto ret = MPI_Init(&argc, &argv); 44 | if (ret == MPI_ERR_OTHER) { 45 | std::cout<<"MPI init error"< ar; 68 | ar.init(queue, rank, world); 69 | 70 | sycl::half* small_buffer = (sycl::half*)sycl::malloc_device(14336 * sizeof(sycl::half), queue); 71 | sycl::half* large_buffer = (sycl::half*)sycl::malloc_device(14336 * 32 * sizeof(sycl::half), queue); 72 | 73 | for (int i = 0; i < 140; i++) { 74 | act(ar, queue, large_buffer, 14336 * 32); 75 | } 76 | for (int i = 0; i < 31; i++) { 77 | for (int j = 0; j < 140; j++) { 78 | act(ar, queue, small_buffer, 14336); 79 | } 80 | } 81 | queue.wait(); 82 | 83 | uint64_t host_time[REPEAT]; 84 | uint64_t full_time[REPEAT]; 85 | 86 | for (int k = 0; k < REPEAT; k++) { 87 | MPI_Barrier(MPI_COMM_WORLD); 88 | uint64_t start = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 89 | 90 | for (int i = 0; i < 140; i++) { 91 | act(ar, queue, large_buffer, 14336 * 32); 92 | } 93 | for (int i = 0; i < 31; i++) { 94 | for (int j = 0; j < 140; j++) { 95 | act(ar, queue, small_buffer, 14336); 96 | } 97 | } 98 | uint64_t host_end = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 99 | queue.wait(); 100 | uint64_t full_end = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 101 | host_time[k] = host_end - start; 102 | full_time[k] = full_end - start; 103 | } 104 | 105 | uint64_t total_host_time = 0; 106 | uint64_t total_full_time = 0; 107 | for (int k = 0; k < REPEAT; k++) { 108 | total_host_time += host_time[k]; 109 | total_full_time += full_time[k]; 110 | } 111 | 112 | total_host_time /= REPEAT; 113 | total_full_time /= REPEAT; 114 | 115 | MPI_Barrier(MPI_COMM_WORLD); 116 
| MPI_Finalize(); 117 | 118 | std::cout << "Average full time: " << total_full_time << std::endl; 119 | std::cout << "Average host time (for reference): " << total_host_time << std::endl; 120 | for (int k = 0; k < REPEAT; k++) { 121 | std::cout << " Full time on round " << k << ": " << full_time[k] << std::endl; 122 | std::cout << " Host time on round " << k << " (for reference): " << host_time[k] << std::endl; 123 | } 124 | } 125 | 126 | void act(allreducer& ar, sycl::queue& queue, void* inout_buffer, uint32_t size) { 127 | if (work_only != 0) { 128 | ar.work_only(queue, inout_buffer, size); 129 | return; 130 | } 131 | if (sync_only != 0) { 132 | ar.sync_only(queue, inout_buffer, size); 133 | return; 134 | } 135 | ar.allreduce(queue, inout_buffer, size); 136 | } 137 | -------------------------------------------------------------------------------- /src/gpu/runtime.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | sycl::queue currentQueue(int ndev, int nsub) { 32 | switch(ndev) { 33 | case 0: 34 | if (nsub == 0) 35 | return getQueue<0,0>(); 36 | else 37 | return getQueue<0,1>(); 38 | break; 39 | case 1: 40 | if (nsub == 0) 41 | return getQueue<1,0>(); 42 | else 43 | return getQueue<1,1>(); 44 | break; 45 | } 46 | throw std::exception(); 47 | } 48 | 49 | sycl::device currentSubDevice(int ndev, int nsub) { 50 | switch(ndev) { 51 | case 0: 52 | if (nsub == 0) 53 | return getSubDevice<0,0>(); 54 | else 55 | return getSubDevice<0,1>(); 56 | break; 57 | case 1: 58 | if (nsub == 0) 59 | return getSubDevice<1,0>(); 60 | else 61 | return getSubDevice<1,1>(); 62 | break; 63 | } 64 | throw std::exception(); 65 | } 66 | 67 | static uint32_t g_dev_num = 1; 68 | static uint32_t g_part_num = 0; 69 | 70 | sycl::device currentSubDevice() { 71 | return currentSubDevice(g_dev_num, g_part_num); 72 | } 73 | 74 | sycl::queue currentQueue() { 75 | return currentQueue(g_dev_num, g_part_num); 76 | } 77 | -------------------------------------------------------------------------------- /src/gpu/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub 
== 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/gpu/ze_exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | 
std::cout<<"Throw "< 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 24 | void* buffer[MAX_RANK]; 25 | void* sync_buffer[MAX_RANK]; 26 | void* ready_buffer[MAX_RANK]; 27 | 28 | void exchange_mem(sycl::queue& queue, void* ptr); 29 | void atomic_write_check_remote(sycl::queue& queue, uint32_t* ptr, int good); 30 | 31 | struct exchange_contents { 32 | union { 33 | ze_ipc_mem_handle_t ipc_handle; 34 | int fd = -1; 35 | }; 36 | size_t offset = 0; 37 | int pid = -1; 38 | }; 39 | 40 | #define sysCheck(x) \ 41 | if (x == -1) { \ 42 | throw std::system_error( \ 43 | std::make_error_code(std::errc(errno))); \ 44 | } 45 | 46 | int main(int argc, char* argv[]) { 47 | size_t buffer_size = buffer_base_size + 1024; 48 | 49 | auto ret = MPI_Init(&argc, &argv); 50 | if (ret == MPI_ERR_OTHER) { 51 | std::cout<<"MPI init error"< index) { 75 | ptr[index] = (uint32_t)0; 76 | })); 77 | }); 78 | queue.wait(); 79 | 80 | exchange_mem(queue, operate_buffer); 81 | 82 | atomic_write_check_remote(queue, ptr, (argc > 1)); 83 | 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | std::cout << "Host MPI barrier completed" << std::endl; 86 | 87 | MPI_Finalize(); 88 | } 89 | 90 | void atomic_write_check_remote(sycl::queue& queue, uint32_t* ptr, int good) { 91 | uint32_t temp_world = world; 92 | uint32_t temp_rank = rank; 93 | 94 | int *temp_sync_buffer[MAX_RANK]; 95 | for (int index = 0; index < temp_world; index++) { 96 | temp_sync_buffer[index] = (int *)sync_buffer[index]; 97 | } 98 | 99 | for (int index = 0; index < temp_world; index++) { 100 | if (index != temp_rank) { 101 | std::cout << "Setting " << temp_sync_buffer[index] << " (remote) to 1" << std::endl; 102 | } 103 | } 104 | for (int index = 0; index < temp_world; index++) { 105 | if (index != temp_rank) { 106 | std::cout << "Checking " << (int*)((int *)ptr + index * 32) << " (local) for 1" << std::endl; 107 | } 108 | } 109 | 110 | sycl::event e = queue.submit([&](sycl::handler& cgh) { 111 | if (good != 0) { 112 | sycl::stream str(8192, 1024, cgh); 113 | } 114 | cgh.parallel_for(sycl::range { temp_world * 2 }, ([=](sycl::id<1> index) { 115 | if (index < temp_world && index != temp_rank) { 116 | int * peer_sync_ptr = (int*)temp_sync_buffer[index]; 117 | auto v = 118 | sycl::atomic_ref(peer_sync_ptr[0]); 121 | v.store(1); 122 | } 123 | if (index >= temp_world && index - temp_world != temp_rank) { 124 | int * local_sync_ptr = (int*)(ptr + (index - temp_world) * 32); 125 | auto v = 126 | sycl::atomic_ref(local_sync_ptr[0]); 129 | int count = v.load(); 130 | while (count < 1) { 131 | count = v.load(); 132 | } 133 | } 134 | })); 135 | }); 136 | e.wait(); 137 | 138 | std::cout << "Kernel done" << std::endl; 139 | } 140 | 141 | void exchange_mem(sycl::queue& queue, void* ptr) { 142 | // Step 1: Get base address of the pointer 143 | sycl::context ctx = queue.get_context(); 144 | auto l0_ctx = sycl::get_native(ctx); 145 | 146 | void *base_addr; 147 | size_t base_size; 148 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 149 | 150 | std::cout << "Memory range size: " << base_size << std::endl; 151 | std::cout << "Buffer base size: " << buffer_base_size << std::endl; 152 | std::cout << "Actual buffer size: " << 
(buffer_base_size + 1024) << std::endl; 153 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 154 | 155 | // Step 2: Get IPC mem handle from base address 156 | alignas(64) exchange_contents send_buf; 157 | alignas(64) exchange_contents recv_buf[world]; 158 | 159 | // fill in the exchange info 160 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 161 | send_buf.offset = (char*)ptr - (char*)base_addr; 162 | send_buf.pid = getpid(); 163 | 164 | // Step 3: Exchange the handles and offsets 165 | memset(recv_buf, 0, sizeof(recv_buf)); 166 | // Overkill if we don't really needs all peer's handles 167 | MPI_Allgather( 168 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 169 | 170 | 171 | for (uint32_t i = 0; i < world; i++){ 172 | // Step 4: Prepare pid file descriptor of next process 173 | auto* peer = recv_buf + i; 174 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 175 | sysCheck(pid_fd); 176 | // 177 | // Step 5: Duplicate GEM object handle to local process 178 | // and overwrite original file descriptor number 179 | // 180 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 181 | sysCheck(peer->fd); 182 | 183 | // Step 6: Open IPC handle of remote peer 184 | auto l0_device 185 | = sycl::get_native(queue.get_device()); 186 | void* peer_base; 187 | 188 | zeCheck(zeMemOpenIpcHandle( 189 | l0_ctx, l0_device, peer->ipc_handle, 0, &peer_base)); 190 | // l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); 191 | buffer[i] = (char*)peer_base + peer->offset; 192 | sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 193 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 194 | 195 | char* end = (char*)peer_base + peer->offset + base_size; 196 | 197 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 198 | } 199 | } 200 | 201 | -------------------------------------------------------------------------------- /src/test/remotesync/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, 
int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/remotesync/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/test/remotesync/ze_exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | 
#define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 24 | int use_tmp_buffer; 25 | 26 | void* buffer[MAX_RANK]; 27 | void* sync_buffer[MAX_RANK]; 28 | void* ready_buffer[MAX_RANK]; 29 | 30 | void exchange_mem(sycl::queue& queue, void* ptr); 31 | 32 | struct exchange_contents { 33 | union { 34 | ze_ipc_mem_handle_t ipc_handle; 35 | int fd = -1; 36 | }; 37 | size_t offset = 0; 38 | int pid = -1; 39 | }; 40 | 41 | #define sysCheck(x) \ 42 | if (x == -1) { \ 43 | throw std::system_error( \ 44 | std::make_error_code(std::errc(errno))); \ 45 | } 46 | 47 | int main(int argc, char* argv[]) { 48 | if (argc > 1) { 49 | use_tmp_buffer = 1; 50 | } 51 | 52 | size_t buffer_size = buffer_base_size + 1024 * 32768; 53 | 54 | auto ret = MPI_Init(&argc, &argv); 55 | if (ret == MPI_ERR_OTHER) { 56 | std::cout<<"MPI init error"< index) { 81 | ptr[index] = (uint32_t)temp_rank; 82 | })); 83 | }); 84 | queue.wait(); 85 | 86 | exchange_mem(queue, operate_buffer); 87 | 88 | MPI_Finalize(); 89 | } 90 | 91 | void exchange_mem(sycl::queue& queue, void* ptr) { 92 | // Step 1: Get base address of the pointer 93 | sycl::context ctx = queue.get_context(); 94 | auto l0_ctx = sycl::get_native(ctx); 95 | 96 | void *base_addr; 97 | size_t base_size; 98 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 99 | 100 | std::cout << "Base size: " << base_size << std::endl; 101 | std::cout << "Buffer base size: " << buffer_base_size << std::endl; 102 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 103 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 104 | 105 | // Step 2: Get IPC mem handle from base address 106 | alignas(64) exchange_contents send_buf; 107 | alignas(64) exchange_contents recv_buf[world]; 108 | 109 | // fill in the exchange info 110 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 111 | send_buf.offset = (char*)ptr - (char*)base_addr; 112 | send_buf.pid = getpid(); 113 | 114 | int* host_buffer = (int *)(malloc(1024)); 115 | void* tmp_buffer = sycl::malloc_device(1024, queue); 116 | 117 | void* sync_addr = NULL; 118 | sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 119 | std::cout << "Sync buffer content at " << sync_addr << ": "; 120 | queue.memcpy(host_buffer, sync_addr, 1024); 121 | queue.wait(); 122 | for (int i = 0; i < 256; i += 16) { 123 | std::cout << &host_buffer[i] << ": " << host_buffer[i] << std::endl; 124 | } 125 | 126 | // Step 3: Exchange the handles and offsets 127 | memset(recv_buf, 0, sizeof(recv_buf)); 128 | // Overkill if we don't really needs all peer's handles 129 | MPI_Allgather( 130 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 131 | 132 | 133 | for (uint32_t i = 0; i < world; i++){ 134 | // Step 4: Prepare pid file descriptor of next process 135 | auto* peer = recv_buf + i; 136 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 137 | sysCheck(pid_fd); 138 | // 139 | // Step 5: 
Duplicate GEM object handle to local process 140 | // and overwrite original file descriptor number 141 | // 142 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 143 | sysCheck(peer->fd); 144 | 145 | // Step 6: Open IPC handle of remote peer 146 | auto l0_device 147 | = sycl::get_native(queue.get_device()); 148 | void* peer_base; 149 | 150 | zeCheck(zeMemOpenIpcHandle( 151 | l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); 152 | buffer[i] = (char*)peer_base + peer->offset; 153 | sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 154 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 155 | 156 | char* end = (char*)peer_base + peer->offset + base_size; 157 | 158 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 159 | 160 | sync_addr = (void *)((char*)peer_base + peer->offset + buffer_base_size); 161 | //sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 162 | 163 | if (use_tmp_buffer == 0) { 164 | std::cout << "Copy sync buffer (mapped from rank " << i << ") at " << sync_addr << " to host" << std::endl; 165 | queue.memcpy(host_buffer, sync_addr, 1024); 166 | } else { 167 | std::cout << "Copy sync buffer (mapped from rank " << i << ") at " << sync_addr << " to temp buffer & then to host" << std::endl; 168 | queue.memcpy(tmp_buffer, sync_addr, 1024); 169 | queue.memcpy(host_buffer, tmp_buffer, 1024); 170 | } 171 | queue.wait(); 172 | 173 | std::cout << "Sync buffer content at " << sync_addr << std::endl; 174 | for (int i = 0; i < 256; i += 16) { 175 | std::cout << &host_buffer[i] << ": " << host_buffer[i] << std::endl; 176 | } 177 | } 178 | } 179 | 180 | -------------------------------------------------------------------------------- /src/test/segfault/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static 
uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/segfault/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/test/segfault/ze_exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | 
#define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 24 | void* buffer[MAX_RANK]; 25 | void* sync_buffer[MAX_RANK]; 26 | void* ready_buffer[MAX_RANK]; 27 | 28 | void exchange_mem(sycl::queue& queue, void* ptr); 29 | void dump_buffer(sycl::queue& queue, void* gpu_addr); 30 | void atomic_write_remote(sycl::queue& queue, int good); 31 | 32 | struct exchange_contents { 33 | union { 34 | ze_ipc_mem_handle_t ipc_handle; 35 | int fd = -1; 36 | }; 37 | size_t offset = 0; 38 | int pid = -1; 39 | }; 40 | 41 | #define sysCheck(x) \ 42 | if (x == -1) { \ 43 | throw std::system_error( \ 44 | std::make_error_code(std::errc(errno))); \ 45 | } 46 | 47 | int main(int argc, char* argv[]) { 48 | size_t buffer_size = buffer_base_size + 1024; 49 | 50 | auto ret = MPI_Init(&argc, &argv); 51 | if (ret == MPI_ERR_OTHER) { 52 | std::cout<<"MPI init error"< index) { 75 | ptr[index] = (uint32_t)0; 76 | })); 77 | }); 78 | queue.wait(); 79 | 80 | exchange_mem(queue, operate_buffer); 81 | 82 | atomic_write_remote(queue, (argc > 1)); 83 | 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | std::cout << "Host MPI barrier completed" << std::endl; 86 | 87 | dump_buffer(queue, ptr); 88 | 89 | MPI_Barrier(MPI_COMM_WORLD); 90 | std::cout << "Host MPI barrier completed" << std::endl; 91 | 92 | MPI_Finalize(); 93 | 94 | } 95 | 96 | void atomic_write_remote(sycl::queue& queue, int good) { 97 | uint32_t temp_world = world; 98 | uint32_t temp_rank = rank; 99 | 100 | int *temp_sync_buffer[MAX_RANK]; 101 | for (int index = 0; index < temp_world; index++) { 102 | temp_sync_buffer[index] = (int *)sync_buffer[index]; 103 | } 104 | 105 | for (int index = 0; index < temp_world; index++) { 106 | if (index != temp_rank) { 107 | std::cout << "Setting " << temp_sync_buffer[index] << " (remote) to 1" << std::endl; 108 | } else { 109 | std::cout << "Setting " << temp_sync_buffer[index] << " (local mapped) to 1" << std::endl; 110 | } 111 | } 112 | 113 | queue.submit([&](sycl::handler& cgh) { 114 | if (good != 0) { 115 | sycl::stream str(8192, 1024, cgh); 116 | } 117 | cgh.parallel_for(sycl::range { temp_world }, ([=](sycl::id<1> index) { 118 | //if (index != temp_rank) { 119 | int * peer_sync_ptr = (int*)temp_sync_buffer[index]; 120 | auto v = 121 | sycl::atomic_ref(peer_sync_ptr[0]); 124 | v.store(1); 125 | //} 126 | })); 127 | }); 128 | queue.wait(); 129 | 130 | std::cout << "Kernel done" << std::endl; 131 | } 132 | 133 | void exchange_mem(sycl::queue& queue, void* ptr) { 134 | // Step 1: Get base address of the pointer 135 | sycl::context ctx = queue.get_context(); 136 | auto l0_ctx = sycl::get_native(ctx); 137 | 138 | void *base_addr; 139 | size_t base_size; 140 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 141 | 142 | std::cout << "Memory range size: " << base_size << std::endl; 143 | std::cout << "Buffer base size: " << buffer_base_size << std::endl; 144 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 145 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 146 | 147 | // Step 2: Get IPC mem handle from base address 148 | alignas(64) exchange_contents send_buf; 149 | alignas(64) exchange_contents recv_buf[world]; 150 | 151 | // fill in the exchange info 152 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 153 | 
send_buf.offset = (char*)ptr - (char*)base_addr; 154 | send_buf.pid = getpid(); 155 | 156 | void * sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 157 | dump_buffer(queue, sync_addr); 158 | 159 | // Step 3: Exchange the handles and offsets 160 | memset(recv_buf, 0, sizeof(recv_buf)); 161 | // Overkill if we don't really needs all peer's handles 162 | MPI_Allgather( 163 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 164 | 165 | 166 | for (uint32_t i = 0; i < world; i++){ 167 | // Step 4: Prepare pid file descriptor of next process 168 | auto* peer = recv_buf + i; 169 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 170 | sysCheck(pid_fd); 171 | // 172 | // Step 5: Duplicate GEM object handle to local process 173 | // and overwrite original file descriptor number 174 | // 175 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 176 | sysCheck(peer->fd); 177 | 178 | // Step 6: Open IPC handle of remote peer 179 | auto l0_device 180 | = sycl::get_native(queue.get_device()); 181 | void* peer_base; 182 | 183 | zeCheck(zeMemOpenIpcHandle( 184 | l0_ctx, l0_device, peer->ipc_handle, 0, &peer_base)); 185 | // l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_UNCACHED, &peer_base)); 186 | buffer[i] = (char*)peer_base + peer->offset; 187 | sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 188 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 189 | 190 | char* end = (char*)peer_base + peer->offset + base_size; 191 | 192 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 193 | } 194 | } 195 | 196 | void dump_buffer(sycl::queue& queue, void* gpu_addr) { 197 | int* host_buffer = (int *)(malloc(1024)); 198 | queue.memcpy(host_buffer, gpu_addr, 1024); 199 | queue.wait(); 200 | std::cout << "Buffer copied from " << gpu_addr << " to host" << std::endl; 201 | std::cout << "Dump content of " << gpu_addr << ": " << std::endl; 202 | for (int i = 0; i < world; i++) { 203 | //if (i != rank) { 204 | std::cout << (int *)gpu_addr + i * 32 << ": " << host_buffer[i * 32] << std::endl; 205 | //} 206 | } 207 | free(host_buffer); 208 | } 209 | 210 | -------------------------------------------------------------------------------- /src/test/writeremote/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw 
std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/writeremote/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/test/writeremote/ze_exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 
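  // Illustrative note (added): call sites wrap each Level Zero API call in the
  // zeCheck macro defined below, which turns any non-ZE_RESULT_SUCCESS status
  // into a zeException whose what() returns the readable string above, e.g.
  //
  //   zeCheck(zeMemOpenIpcHandle(l0_ctx, l0_device, peer->ipc_handle, 0, &peer_base));
  //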
36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "<<e.what()<<std::endl; \ 50 | throw e; \ 51 | } 52 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 38 | std::map<ReduceOp, ccl::reduction> cclOps = 39 | { 40 | {ReduceOp::MIN, ccl::reduction::min}, 41 | {ReduceOp::MAX, ccl::reduction::max}, 42 | {ReduceOp::SUM, ccl::reduction::sum}, 43 | {ReduceOp::PRODUCT, ccl::reduction::prod}, 44 | }; 45 | 46 | std::map<at::ScalarType, ccl::datatype> cclDatatypes = 47 | { 48 | {at::kByte, ccl::datatype::uint8}, 49 | {at::kChar, ccl::datatype::int8}, 50 | {at::kShort, ccl::datatype::int16}, 51 | {at::kInt, ccl::datatype::int32}, 52 | {at::kLong, ccl::datatype::int64}, 53 | {at::kHalf, ccl::datatype::float16}, 54 | {at::kFloat, ccl::datatype::float32}, 55 | {at::kDouble, ccl::datatype::float64}, 56 | {at::kBFloat16, ccl::datatype::bfloat16}, 57 | {at::kBool, ccl::datatype::uint8}, 58 | }; 59 | 60 | // Get the key from the list of devices 61 | std::string get_key_from_devs(const std::vector<at::Device>& devices) { 62 | std::string key = DeviceTypeName(devices[0].type(), /* lower case */ true) + ":"; 63 | for (auto& device : devices) { 64 | key.append(std::to_string(device.index()) + ","); 65 | } 66 | return key; 67 | } 68 | 69 | // Get the list of devices from list of tensors 70 | std::vector<at::Device> get_device_list(const std::vector<at::Tensor>& tensors) { 71 | std::vector<at::Device> res; 72 | res.reserve(tensors.size()); 73 | for (auto& tensor : tensors) { 74 | // Tensors must all be on the same device, or all on distinct devices. 75 | if (res.size() == 0 || tensor.device() != res[0]) { 76 | res.push_back(tensor.device()); 77 | } 78 | } 79 | return res; 80 | } 81 | 82 | std::vector<at::Device> get_device_list(const std::vector<std::vector<at::Tensor>>& tensors) { 83 | std::vector<at::Device> res; 84 | res.reserve(tensors.size()); 85 | for (auto& tensor : tensors) { 86 | res.push_back(tensor[0].device()); 87 | } 88 | return res; 89 | } 90 | 91 | bool check_same_size(const std::vector<at::Tensor>& tensors) { 92 | for (const auto& tensor : tensors) { 93 | if (!tensors[0].is_same_size(tensor)) { 94 | return false; 95 | } 96 | } 97 | return true; 98 | } 99 | 100 | std::vector<at::Tensor> flatten_tensor_lists(std::vector<std::vector<at::Tensor>>& tensor_lists, std::vector<at::Tensor>& other, size_t world_size) { 101 | if (tensor_lists.size() != other.size()) { 102 | TORCH_CHECK( 103 | false, 104 | "Tensor list operands to scatter/gather must have the same length"); 105 | } 106 | const auto num_devices = tensor_lists.size(); 107 | 108 | std::vector<at::Tensor> flattened; 109 | flattened.resize(num_devices); 110 | 111 | for (const auto i : c10::irange(size_t{}, num_devices)) { 112 | if (tensor_lists[i].size() != world_size * num_devices) { 113 | TORCH_CHECK( 114 | false, 115 | c10::str( 116 | "Tensor list input to scatter/gather must match number of collective participants ", 117 | "but got ", 118 | tensor_lists[i].size(), 119 | " inputs", 120 | " with world_size ", 121 | world_size, 122 | " and ", 123 | num_devices, 124 | " devices.")); 125 | } 126 | 127 | // Only check device match for the first tensor in the list; the call to 128 | // newLikeFlat() below will check the rest. 
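// Descriptive note (assumption, not part of the original source): c10d::newLikeFlat(tensor_lists, i)
// is expected to allocate one contiguous tensor on the same device/options with a leading dimension
// equal to tensor_lists[i].size() (one slot per participating rank), which is what makes a single
// flat copy around the collective possible.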
129 | if (tensor_lists[i].front().get_device() != other[i].get_device()) { 130 | TORCH_CHECK( 131 | false, 132 | "Corresponding input/output tensors to scatter/gather must all reside" 133 | " on the same device"); 134 | } 135 | 136 | for (const auto& t : tensor_lists[i]) { 137 | if (t.numel() != other[i].numel()) { 138 | TORCH_CHECK( 139 | false, 140 | "All tensor operands to scatter/gather must have the same number of elements"); 141 | } 142 | } 143 | // Flatten the tensors (from all ranks) into a single big tensor. 144 | flattened[i] = c10d::newLikeFlat(tensor_lists, i); 145 | } 146 | return flattened; 147 | } 148 | 149 | std::string get_key_send_recv(int myRank, int peer) { 150 | int lowRank = myRank < peer ? myRank : peer; 151 | int highRank = myRank < peer ? peer : myRank; 152 | std::string sendRecvPair = 153 | std::to_string(lowRank) + ":" + std::to_string(highRank); 154 | return sendRecvPair; 155 | } 156 | 157 | FlatCheckResult computeLengthsAndCheckFlat( 158 | const std::vector<at::Tensor>& tensors, 159 | std::vector<size_t>& lengths) 160 | { 161 | int64_t groupSize = lengths.size(); 162 | auto firstTensor = tensors[0]; 163 | int64_t offset = 0; 164 | auto firstLength = firstTensor.numel(); 165 | auto storage = firstTensor.storage(); 166 | auto firstStorageOffset = firstTensor.storage_offset(); 167 | bool isFlat = true; 168 | 169 | for (int i = 0; i < groupSize; i++) 170 | { 171 | auto& curTensor = tensors[i]; 172 | int64_t length = curTensor.numel(); 173 | 174 | if (firstLength == 0 && length != 0) 175 | { 176 | firstLength = length; 177 | firstTensor = curTensor; 178 | storage = curTensor.storage(); 179 | firstStorageOffset = curTensor.storage_offset(); 180 | } 181 | 182 | lengths[i] = length; 183 | 184 | if (isFlat && (length != 0 || firstLength != 0) && 185 | (!storage.is_alias_of(curTensor.storage()) || 186 | curTensor.storage_offset() != firstStorageOffset + offset)) 187 | isFlat = false; 188 | 189 | offset += length; 190 | } 191 | 192 | return FlatCheckResult{isFlat, offset, firstTensor}; 193 | } 194 | 195 | bool computeLengthsAndCheckAndGetFlat( 196 | const std::vector<at::Tensor>& tensors, 197 | std::vector<size_t>& lengths, 198 | at::Tensor& flatTensor, 199 | int64_t& flatLength) 200 | { 201 | auto flatRes = computeLengthsAndCheckFlat(tensors, lengths); 202 | 203 | flatLength = flatRes.size; 204 | 205 | if (flatRes.isFlat) 206 | { 207 | flatTensor = flatRes.firstTensor; 208 | } 209 | else 210 | { 211 | flatTensor = at::empty({flatRes.size}, flatRes.firstTensor.options()); 212 | } 213 | 214 | return flatRes.isFlat; 215 | } 216 | 217 | void checkSingleTensorHelper(const at::Tensor& tensor) 218 | { 219 | TORCH_CHECK(tensor.is_sparse() || tensor.is_contiguous(), "input dense tensor has to be contiguous"); 220 | TORCH_CHECK(!tensor.is_cuda(), "CUDA tensor detected and CCL doesn't support CUDA buffers"); 221 | TORCH_CHECK(tensor.numel() >= 0, "input tensor numel should be non-negative"); 222 | } 223 | 224 | void checkSingleTensor(const std::vector<at::Tensor>& tensors) 225 | { 226 | TORCH_CHECK(tensors.size() == 1, 227 | "CCL process group does not support tensors count " + std::to_string(tensors.size())); 228 | 229 | checkSingleTensorHelper(tensors[0]); 230 | } 231 | 232 | 233 | void checkSameType(const at::Tensor& tensor, 234 | const std::vector<at::Tensor>& tensors) 235 | { 236 | for (size_t i = 0; i < tensors.size(); ++i) 237 | { 238 | TORCH_CHECK(tensors[i].scalar_type() == tensor.scalar_type(), 239 | "Tensors are not equal in data type"); 240 | TORCH_CHECK(tensors[i].device().type() == tensor.device().type(), 241 | "Tensors are not in 
same device type. Expect: ", tensor.device().type(), 242 | " But got: ", tensors[i].device().type()); 243 | 244 | checkSingleTensorHelper(tensors[i]); 245 | } 246 | } 247 | 248 | void checkSameType(const at::Tensor& tensor, 249 | const std::vector<std::vector<at::Tensor>>& tensors) 250 | { 251 | for (size_t i = 0; i < tensors.size(); ++i) 252 | { 253 | checkSameType(tensor, tensors[i]); 254 | } 255 | } 256 | 257 | } 258 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/DeepSpeed.csv: -------------------------------------------------------------------------------- 1 | allreduce,1,-1 2 | broadcast,154533888,0 3 | broadcast,6291456,0 4 | broadcast,3072,0 5 | broadcast,3072,0 6 | broadcast,28311552,0 7 | broadcast,9216,0 8 | broadcast,9437184,0 9 | broadcast,3072,0 10 | broadcast,3072,0 11 | broadcast,3072,0 12 | broadcast,37748736,0 13 | broadcast,12288,0 14 | broadcast,37748736,0 15 | broadcast,3072,0 16 | broadcast,3072,0 17 | broadcast,3072,0 18 | broadcast,28311552,0 19 | broadcast,9216,0 20 | broadcast,9437184,0 21 | broadcast,3072,0 22 | broadcast,3072,0 23 | broadcast,3072,0 24 | broadcast,37748736,0 25 | broadcast,12288,0 26 | broadcast,37748736,0 27 | broadcast,3072,0 28 | broadcast,3072,0 29 | broadcast,3072,0 30 | broadcast,28311552,0 31 | broadcast,9216,0 32 | broadcast,9437184,0 33 | broadcast,3072,0 34 | broadcast,3072,0 35 | broadcast,3072,0 36 | broadcast,37748736,0 37 | broadcast,12288,0 38 | broadcast,37748736,0 39 | broadcast,3072,0 40 | broadcast,3072,0 41 | broadcast,3072,0 42 | broadcast,28311552,0 43 | broadcast,9216,0 44 | broadcast,9437184,0 45 | broadcast,3072,0 46 | broadcast,3072,0 47 | broadcast,3072,0 48 | broadcast,37748736,0 49 | broadcast,12288,0 50 | broadcast,37748736,0 51 | broadcast,3072,0 52 | broadcast,3072,0 53 | broadcast,3072,0 54 | broadcast,28311552,0 55 | broadcast,9216,0 56 | broadcast,9437184,0 57 | broadcast,3072,0 58 | broadcast,3072,0 59 | broadcast,3072,0 60 | broadcast,37748736,0 61 | broadcast,12288,0 62 | broadcast,37748736,0 63 | broadcast,3072,0 64 | broadcast,3072,0 65 | broadcast,3072,0 66 | broadcast,28311552,0 67 | broadcast,9216,0 68 | broadcast,9437184,0 69 | broadcast,3072,0 70 | broadcast,3072,0 71 | broadcast,3072,0 72 | broadcast,37748736,0 73 | broadcast,12288,0 74 | broadcast,37748736,0 75 | broadcast,3072,0 76 | broadcast,3072,0 77 | broadcast,3072,0 78 | broadcast,28311552,0 79 | broadcast,9216,0 80 | broadcast,9437184,0 81 | broadcast,3072,0 82 | broadcast,3072,0 83 | broadcast,3072,0 84 | broadcast,37748736,0 85 | broadcast,12288,0 86 | broadcast,37748736,0 87 | broadcast,3072,0 88 | broadcast,3072,0 89 | broadcast,3072,0 90 | broadcast,28311552,0 91 | broadcast,9216,0 92 | broadcast,9437184,0 93 | broadcast,3072,0 94 | broadcast,3072,0 95 | broadcast,3072,0 96 | broadcast,37748736,0 97 | broadcast,12288,0 98 | broadcast,37748736,0 99 | broadcast,3072,0 100 | broadcast,3072,0 101 | broadcast,3072,0 102 | broadcast,28311552,0 103 | broadcast,9216,0 104 | broadcast,9437184,0 105 | broadcast,3072,0 106 | broadcast,3072,0 107 | broadcast,3072,0 108 | broadcast,37748736,0 109 | broadcast,12288,0 110 | broadcast,37748736,0 111 | broadcast,3072,0 112 | broadcast,3072,0 113 | broadcast,3072,0 114 | broadcast,28311552,0 115 | broadcast,9216,0 116 | broadcast,9437184,0 117 | broadcast,3072,0 118 | broadcast,3072,0 119 | broadcast,3072,0 120 | broadcast,37748736,0 121 | broadcast,12288,0 122 | broadcast,37748736,0 123 | broadcast,3072,0 124 | broadcast,3072,0 125 | broadcast,3072,0 126 | 
broadcast,28311552,0 127 | broadcast,9216,0 128 | broadcast,9437184,0 129 | broadcast,3072,0 130 | broadcast,3072,0 131 | broadcast,3072,0 132 | broadcast,37748736,0 133 | broadcast,12288,0 134 | broadcast,37748736,0 135 | broadcast,3072,0 136 | broadcast,3072,0 137 | broadcast,3072,0 138 | broadcast,28311552,0 139 | broadcast,9216,0 140 | broadcast,9437184,0 141 | broadcast,3072,0 142 | broadcast,3072,0 143 | broadcast,3072,0 144 | broadcast,37748736,0 145 | broadcast,12288,0 146 | broadcast,37748736,0 147 | broadcast,3072,0 148 | broadcast,3072,0 149 | broadcast,3072,0 150 | broadcast,28311552,0 151 | broadcast,9216,0 152 | broadcast,9437184,0 153 | broadcast,3072,0 154 | broadcast,3072,0 155 | broadcast,3072,0 156 | broadcast,37748736,0 157 | broadcast,12288,0 158 | broadcast,37748736,0 159 | broadcast,3072,0 160 | broadcast,3072,0 161 | broadcast,3072,0 162 | broadcast,28311552,0 163 | broadcast,9216,0 164 | broadcast,9437184,0 165 | broadcast,3072,0 166 | broadcast,3072,0 167 | broadcast,3072,0 168 | broadcast,37748736,0 169 | broadcast,12288,0 170 | broadcast,37748736,0 171 | broadcast,3072,0 172 | broadcast,3072,0 173 | broadcast,3072,0 174 | broadcast,28311552,0 175 | broadcast,9216,0 176 | broadcast,9437184,0 177 | broadcast,3072,0 178 | broadcast,3072,0 179 | broadcast,3072,0 180 | broadcast,37748736,0 181 | broadcast,12288,0 182 | broadcast,37748736,0 183 | broadcast,3072,0 184 | broadcast,3072,0 185 | broadcast,3072,0 186 | broadcast,28311552,0 187 | broadcast,9216,0 188 | broadcast,9437184,0 189 | broadcast,3072,0 190 | broadcast,3072,0 191 | broadcast,3072,0 192 | broadcast,37748736,0 193 | broadcast,12288,0 194 | broadcast,37748736,0 195 | broadcast,3072,0 196 | broadcast,3072,0 197 | broadcast,3072,0 198 | broadcast,28311552,0 199 | broadcast,9216,0 200 | broadcast,9437184,0 201 | broadcast,3072,0 202 | broadcast,3072,0 203 | broadcast,3072,0 204 | broadcast,37748736,0 205 | broadcast,12288,0 206 | broadcast,37748736,0 207 | broadcast,3072,0 208 | broadcast,3072,0 209 | broadcast,3072,0 210 | broadcast,28311552,0 211 | broadcast,9216,0 212 | broadcast,9437184,0 213 | broadcast,3072,0 214 | broadcast,3072,0 215 | broadcast,3072,0 216 | broadcast,37748736,0 217 | broadcast,12288,0 218 | broadcast,37748736,0 219 | broadcast,3072,0 220 | broadcast,3072,0 221 | broadcast,3072,0 222 | broadcast,28311552,0 223 | broadcast,9216,0 224 | broadcast,9437184,0 225 | broadcast,3072,0 226 | broadcast,3072,0 227 | broadcast,3072,0 228 | broadcast,37748736,0 229 | broadcast,12288,0 230 | broadcast,37748736,0 231 | broadcast,3072,0 232 | broadcast,3072,0 233 | broadcast,3072,0 234 | broadcast,28311552,0 235 | broadcast,9216,0 236 | broadcast,9437184,0 237 | broadcast,3072,0 238 | broadcast,3072,0 239 | broadcast,3072,0 240 | broadcast,37748736,0 241 | broadcast,12288,0 242 | broadcast,37748736,0 243 | broadcast,3072,0 244 | broadcast,3072,0 245 | broadcast,3072,0 246 | broadcast,28311552,0 247 | broadcast,9216,0 248 | broadcast,9437184,0 249 | broadcast,3072,0 250 | broadcast,3072,0 251 | broadcast,3072,0 252 | broadcast,37748736,0 253 | broadcast,12288,0 254 | broadcast,37748736,0 255 | broadcast,3072,0 256 | broadcast,3072,0 257 | broadcast,3072,0 258 | broadcast,28311552,0 259 | broadcast,9216,0 260 | broadcast,9437184,0 261 | broadcast,3072,0 262 | broadcast,3072,0 263 | broadcast,3072,0 264 | broadcast,37748736,0 265 | broadcast,12288,0 266 | broadcast,37748736,0 267 | broadcast,3072,0 268 | broadcast,3072,0 269 | broadcast,3072,0 270 | broadcast,28311552,0 271 | broadcast,9216,0 272 | 
broadcast,9437184,0 273 | broadcast,3072,0 274 | broadcast,3072,0 275 | broadcast,3072,0 276 | broadcast,37748736,0 277 | broadcast,12288,0 278 | broadcast,37748736,0 279 | broadcast,3072,0 280 | broadcast,3072,0 281 | broadcast,3072,0 282 | broadcast,28311552,0 283 | broadcast,9216,0 284 | broadcast,9437184,0 285 | broadcast,3072,0 286 | broadcast,3072,0 287 | broadcast,3072,0 288 | broadcast,37748736,0 289 | broadcast,12288,0 290 | broadcast,37748736,0 291 | broadcast,3072,0 292 | broadcast,3072,0 293 | broadcast,3072,0 294 | broadcast,28311552,0 295 | broadcast,9216,0 296 | broadcast,9437184,0 297 | broadcast,3072,0 298 | broadcast,3072,0 299 | broadcast,3072,0 300 | broadcast,37748736,0 301 | broadcast,12288,0 302 | broadcast,37748736,0 303 | broadcast,3072,0 304 | broadcast,3072,0 305 | broadcast,3072,0 306 | broadcast,28311552,0 307 | broadcast,9216,0 308 | broadcast,9437184,0 309 | broadcast,3072,0 310 | broadcast,3072,0 311 | broadcast,3072,0 312 | broadcast,37748736,0 313 | broadcast,12288,0 314 | broadcast,37748736,0 315 | broadcast,3072,0 316 | broadcast,3072,0 317 | broadcast,3072,0 318 | broadcast,28311552,0 319 | broadcast,9216,0 320 | broadcast,9437184,0 321 | broadcast,3072,0 322 | broadcast,3072,0 323 | broadcast,3072,0 324 | broadcast,37748736,0 325 | broadcast,12288,0 326 | broadcast,37748736,0 327 | broadcast,3072,0 328 | broadcast,3072,0 329 | broadcast,3072,0 330 | broadcast,28311552,0 331 | broadcast,9216,0 332 | broadcast,9437184,0 333 | broadcast,3072,0 334 | broadcast,3072,0 335 | broadcast,3072,0 336 | broadcast,37748736,0 337 | broadcast,12288,0 338 | broadcast,37748736,0 339 | broadcast,3072,0 340 | broadcast,3072,0 341 | broadcast,3072,0 342 | broadcast,28311552,0 343 | broadcast,9216,0 344 | broadcast,9437184,0 345 | broadcast,3072,0 346 | broadcast,3072,0 347 | broadcast,3072,0 348 | broadcast,37748736,0 349 | broadcast,12288,0 350 | broadcast,37748736,0 351 | broadcast,3072,0 352 | broadcast,3072,0 353 | broadcast,3072,0 354 | broadcast,28311552,0 355 | broadcast,9216,0 356 | broadcast,9437184,0 357 | broadcast,3072,0 358 | broadcast,3072,0 359 | broadcast,3072,0 360 | broadcast,37748736,0 361 | broadcast,12288,0 362 | broadcast,37748736,0 363 | broadcast,3072,0 364 | broadcast,3072,0 365 | broadcast,3072,0 366 | allreduce,1,-1 367 | allreduce,1,-1 368 | broadcast,3,0 369 | broadcast,5,0 370 | broadcast,16392,0 371 | allreduce,16384,-1 372 | allreduce,16384,-1 373 | allreduce,16384,-1 374 | allreduce,1,-1 375 | reduce,264330240,0 376 | reduce,1024,0 377 | reduce,11264,0 378 | reduce,5472256,0 379 | reduce,32276480,0 380 | reduce,188823552,0 381 | reduce,75515904,0 382 | reduce,5120,0 383 | reduce,1024,0 384 | reduce,10947584,0 385 | reduce,26804224,0 386 | reduce,264333312,0 387 | reduce,9216,0 388 | reduce,9437184,0 389 | reduce,16425984,0 390 | reduce,11894784,0 391 | reduce,75528192,0 392 | reduce,151044096,0 393 | reduce,4096,0 394 | reduce,2048,0 395 | reduce,3072,0 396 | reduce,37748736,0 397 | reduce,21901312,0 398 | reduce,15859712,0 399 | reduce,226572288,0 400 | reduce,5120,0 401 | reduce,1024,0 402 | reduce,3072,0 403 | reduce,9437184,0 404 | reduce,9216,0 405 | reduce,28311552,0 406 | reduce,9216,0 407 | reduce,27361280,0 408 | reduce,10387456,0 409 | reduce,188814336,0 410 | reduce,3072,0 411 | reduce,3072,0 412 | reduce,3072,0 413 | reduce,37748736,0 414 | reduce,12288,0 415 | reduce,37748736,0 416 | reduce,4531200,0 417 | reduce,4915200,0 418 | reduce,179377152,0 419 | reduce,37748736,0 420 | reduce,4096,0 421 | reduce,2048,0 422 | 
reduce,3072,0 423 | reduce,9437184,0 424 | reduce,9216,0 425 | reduce,28311552,0 426 | reduce,566272,0 427 | reduce,37191680,0 428 | reduce,188814336,0 429 | reduce,2048,0 430 | reduce,4096,0 431 | reduce,3072,0 432 | reduce,37748736,0 433 | reduce,12288,0 434 | reduce,37748736,0 435 | reduce,9216,0 436 | reduce,9437184,0 437 | reduce,6038528,0 438 | reduce,22282240,0 439 | reduce,75528192,0 440 | reduce,113276928,0 441 | reduce,3072,0 442 | reduce,3072,0 443 | reduce,3072,0 444 | reduce,9437184,0 445 | reduce,9216,0 446 | reduce,28311552,0 447 | reduce,9216,0 448 | reduce,37748736,0 449 | reduce,11513856,0 450 | reduce,26247168,0 451 | reduce,151053312,0 452 | reduce,1024,0 453 | reduce,5120,0 454 | reduce,3072,0 455 | reduce,37748736,0 456 | reduce,12288,0 457 | reduce,37748736,0 458 | reduce,9216,0 459 | reduce,9437184,0 460 | reduce,9216,0 461 | reduce,28311552,0 462 | reduce,9216,0 463 | reduce,16973824,0 464 | reduce,20774912,0 465 | reduce,151047168,0 466 | reduce,2048,0 467 | reduce,4096,0 468 | reduce,3072,0 469 | reduce,9437184,0 470 | reduce,9216,0 471 | reduce,28311552,0 472 | reduce,9216,0 473 | reduce,37748736,0 474 | reduce,12288,0 475 | reduce,37748736,0 476 | reduce,9216,0 477 | reduce,9437184,0 478 | reduce,22455296,0 479 | reduce,5865472,0 480 | reduce,119583744,0 481 | reduce,154533888,0 482 | allreduce,1,-1 483 | allreduce,1,-1 484 | allreduce,1,-1 485 | allgather,42359660,-1 486 | allgather,42359660,-1 487 | allgather,42359660,-1 488 | allgather,42359660,-1 489 | allgather,42359660,-1 490 | allgather,42359660,-1 491 | allgather,42359672,-1 492 | allgather,100352,-1 493 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/Example.csv: -------------------------------------------------------------------------------- 1 | allreduce,1,-1 2 | broadcast,3072,0 3 | allreduce,1,-1 4 | broadcast,3,0 5 | broadcast,5,0 6 | reduce,1024,0 7 | reduce,11264,0 8 | allgather,1024,-1 9 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | 6 | world_size = 12 7 | rounds = 100 8 | # input_file = "Example.csv" 9 | input_file = "DeepSpeed.csv" 10 | 11 | type = torch.float16 12 | 13 | 14 | def worker(given_rank): 15 | os.environ['MASTER_ADDR'] = 'localhost' 16 | os.environ['MASTER_PORT'] = '6789' 17 | os.environ['WORLD_SIZE'] = str(world_size) 18 | os.environ['RANK'] = str(given_rank) 19 | 20 | dist.init_process_group(backend = 'gloo') 21 | rank = int(dist.get_rank()) 22 | 23 | device = "cpu" 24 | 25 | ops, sizes, roots = read_file(input_file) 26 | test_ccl(ops, sizes, roots, device, rank, rounds) 27 | 28 | 29 | def main(): 30 | 31 | process_list = [] 32 | for i in range(world_size): 33 | p = Process(target=worker, args=(i,)) 34 | p.start() 35 | process_list.append(p) 36 | 37 | for p in process_list: 38 | p.join() 39 | 40 | def read_file(filename): 41 | ops = [] 42 | sizes = [] 43 | roots = [] 44 | f = open(filename, "r") 45 | for line in f: 46 | op, size, root = line.strip().split(",") 47 | size = int(size) 48 | root = int(root) 49 | if root >= world_size: 50 | print("Invalid root {}".format(root)) 51 | exit() 52 | ops.append(op) 53 | sizes.append(size) 54 | roots.append(root) 55 | f.close() 56 | return ops, sizes, roots 57 | 58 | def test_ccl(ops, sizes, roots, device, rank, 
rounds): 59 | input = [] 60 | output = [] 61 | print("Rank {}: starting to initialize tensors ...".format(rank)) 62 | for i in range(0, len(sizes)): 63 | data = torch.randn(sizes[i], dtype = type) 64 | data.to(device) 65 | input.append(data) 66 | if ops[i] == 'allgather': 67 | tmp_output = [] 68 | for j in range(0, world_size): 69 | data = torch.randn(sizes[i], dtype = type) 70 | data.to(device) 71 | tmp_output.append(data) 72 | output.append(tmp_output) 73 | else: 74 | output.append(data) 75 | print("Rank {}: tensors initialization finished!".format(rank)) 76 | for k in range(0, rounds): 77 | for i in range(0, len(ops)): 78 | if ops[i] == 'reduce': 79 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i]))) 80 | dist.reduce(input[i], roots[i], async_op=False) 81 | if ops[i] == 'allreduce': 82 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i]))) 83 | dist.all_reduce(input[i], async_op=False) 84 | if ops[i] == 'allgather': 85 | print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i]))) 86 | dist.all_gather(output[i], input[i], async_op=False) 87 | if ops[i] == 'broadcast': 88 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i]))) 89 | dist.broadcast(input[i], roots[i], async_op=False) 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | import intel_extension_for_pytorch 6 | import argparse 7 | import sys 8 | 9 | datatype_map = { 10 | 'bf16': torch.bfloat16, 11 | 'fp32': torch.float32 12 | } 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--datatype', '-d', type=str, default='bf16', help='Data type') 16 | parser.add_argument('--world_size', default=12, type=int, help='Number of gpu for distributed training') 17 | args = parser.parse_args() 18 | type = datatype_map.get(args.datatype) 19 | if type is None: 20 | print(f'Unknown datatype: {args.datatype}') 21 | sys.exit(1) 22 | 23 | world_size = args.world_size 24 | rounds = 100 25 | # input_file = "Example.csv" 26 | input_file = "DeepSpeed.csv" 27 | 28 | type = torch.bfloat16 29 | 30 | def worker(given_rank): 31 | os.environ['MASTER_ADDR'] = '127.0.0.1' # xpu 32 | os.environ['MASTER_PORT'] = '29500' # xpu 33 | os.environ['WORLD_SIZE'] = str(world_size) 34 | os.environ['RANK'] = str(given_rank) 35 | 36 | try: 37 | import oneccl_bindings_for_pytorch 38 | except ImportError: 39 | print("oneccl_bindings_for_pytorch not available!") 40 | dist.init_process_group(backend='ccl') 41 | 42 | rank = int(dist.get_rank()) 43 | torch.xpu.set_device(rank) 44 | device = "xpu:{}".format(rank) 45 | 46 | ops, sizes, roots = read_file(input_file) 47 | test_ccl(ops, sizes, roots, device, rank, rounds) 48 | 49 | 50 | def main(): 51 | 52 | process_list = [] 53 | for i in range(world_size): 54 | p = Process(target=worker, args=(i,)) 55 | p.start() 56 | process_list.append(p) 57 | 58 | for p in process_list: 59 | p.join() 60 | 61 | def read_file(filename): 62 | ops = [] 63 | sizes = [] 64 | roots = [] 65 | f = open(filename, "r") 66 | for line in f: 67 | op, size, root = line.strip().split(",") 68 | size = int(size) 69 | root = int(root) 70 | if root >= world_size: 71 | print("Invalid root {}".format(root)) 72 | 
exit() 73 | ops.append(op) 74 | sizes.append(size) 75 | roots.append(root) 76 | f.close() 77 | return ops, sizes, roots 78 | 79 | def test_ccl(ops, sizes, roots, device, rank, rounds): 80 | input = [] 81 | output = [] 82 | print("Rank {}: starting to initialize tensors ...".format(rank)) 83 | for i in range(0, len(sizes)): 84 | data = torch.randn(sizes[i], dtype = type) 85 | data = data.to(device) 86 | input.append(data) 87 | if ops[i] == 'allgather': 88 | tmp_output = [] 89 | for j in range(0, world_size): 90 | data = torch.randn(sizes[i], dtype = type) 91 | data = data.to(device) 92 | tmp_output.append(data) 93 | output.append(tmp_output) 94 | else: 95 | output.append(data) 96 | print("Rank {}: tensors initialization finished!".format(rank)) 97 | for k in range(0, rounds): 98 | print("test round: ", k) 99 | for i in range(0, len(ops)): 100 | if ops[i] == 'reduce': 101 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i]))) 102 | dist.reduce(input[i], roots[i], async_op=False) 103 | if ops[i] == 'allreduce': 104 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i]))) 105 | dist.all_reduce(input[i], async_op=False) 106 | if ops[i] == 'allgather': 107 | print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i]))) 108 | dist.all_gather(output[i], input[i], async_op=False) 109 | if ops[i] == 'broadcast': 110 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i]))) 111 | dist.broadcast(input[i], roots[i], async_op=False) 112 | 113 | torch.xpu.synchronize() 114 | 115 | if __name__ == '__main__': 116 | main() 117 | print("All tests finished!") 118 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_gpu_mpi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | import intel_extension_for_pytorch 6 | import oneccl_bindings_for_pytorch 7 | import argparse 8 | import sys 9 | 10 | rounds = 40 11 | # input_file = "Example.csv" 12 | input_file = "DeepSpeed.csv" 13 | 14 | data_type = torch.bfloat16 15 | 16 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 17 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 18 | os.environ['MASTER_ADDR'] = '127.0.0.1' 19 | os.environ['MASTER_PORT'] = '29500' 20 | dist.init_process_group("ccl") 21 | rank = dist.get_rank() 22 | world_size = dist.get_world_size() 23 | 24 | def main(): 25 | torch.xpu.set_device(rank) 26 | device = "xpu:{}".format(rank) 27 | ops, sizes, roots = read_file(input_file) 28 | test_ccl(ops, sizes, roots, device, rank, rounds) 29 | 30 | def read_file(filename): 31 | ops = [] 32 | sizes = [] 33 | roots = [] 34 | f = open(filename, "r") 35 | for line in f: 36 | op, size, root = line.strip().split(",") 37 | size = int(size) 38 | root = int(root) 39 | if root >= world_size: 40 | print("Invalid root {}".format(root)) 41 | exit() 42 | ops.append(op) 43 | sizes.append(size) 44 | roots.append(root) 45 | f.close() 46 | return ops, sizes, roots 47 | 48 | def test_ccl(ops, sizes, roots, device, rank, rounds): 49 | input = [] 50 | output = [] 51 | print("Rank {}: starting to initialize tensors ...".format(rank)) 52 | for i in range(0, len(sizes)): 53 | data = torch.randn(sizes[i], dtype = data_type) 54 | data = data.to(device) 55 | input.append(data) 56 | if ops[i] == 'allgather': 57 | tmp_output = [] 58 | for j in range(0, 
world_size): 59 | data = torch.randn(sizes[i], dtype = data_type) 60 | data = data.to(device) 61 | tmp_output.append(data) 62 | output.append(tmp_output) 63 | else: 64 | output.append(data) 65 | print("Rank {}: tensors initialization finished!".format(rank), flush=True) 66 | for k in range(0, rounds): 67 | print("test round: ", k) 68 | for i in range(0, len(ops)): 69 | if ops[i] == 'reduce': 70 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i])), flush=True) 71 | dist.reduce(input[i], roots[i], async_op=False) 72 | if ops[i] == 'allreduce': 73 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i])), flush=True) 74 | dist.all_reduce(input[i], async_op=False) 75 | if ops[i] == 'allgather': 76 | print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i])), flush=True) 77 | dist.all_gather(output[i], input[i], async_op=False) 78 | if ops[i] == 'broadcast': 79 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i])), flush=True) 80 | dist.broadcast(input[i], roots[i], async_op=False) 81 | 82 | torch.xpu.synchronize() 83 | 84 | if __name__ == '__main__': 85 | main() 86 | print("All tests finished!") 87 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Intel® oneCCL Bindings for PyTorch* unit tests 2 | 3 | These tests validate the functionality and performance of the collective communication primitives in Intel® oneCCL Bindings for PyTorch*. 4 | 5 | ## functionality validation of collective communication primitives 6 | To start the test_c10d_ccl.py test, run: 7 | 8 | ```bash 9 | python test_c10d_ccl.py 10 | ``` 11 | 12 | ## functionality validation of point-to-point communication primitives 13 | For the within-card and cross-card p2p tests, run: 14 | 15 | ```bash 16 | python test_c10d_p2p.py 17 | ``` 18 | 19 | For the cross-node p2p test, run: 20 | 21 | ```bash 22 | # Mpich 23 | mpiexec -host nodeA,nodeB -np 24 -ppn 12 python -u test_p2p_crossnodes.py --dist_url $NODE_IP --world_size 24 24 | ``` 25 | 26 | ## functionality validation of barrier 27 | For the cpu barrier, run: 28 | 29 | ```bash 30 | mpirun -np 2 python test_barrier.py 31 | ``` 32 | 33 | For the xpu barrier (built with "COMPUTE_BACKEND=dpcpp"), run: 34 | 35 | ```bash 36 | mpirun -np 2 python test_barrier.py --device xpu 37 | ``` 38 | 39 | ## broadcast/allreduce profiling 40 | To start the ddp_allreduce.py profiling test, run: 41 | 42 | ```bash 43 | mpirun -np 12 -ppn 12 python ddp_allreduce.py --warm 10 --iter 20 --fixed 44 | ``` 45 | 46 | ## DeepSpeed test 47 | cpu test: 48 | ```bash 49 | python testccl_cpu.py 50 | ``` 51 | 52 | gpu test (runs on 1 node, 6 cards, 12 tiles): 53 | ```bash 54 | python testccl_gpu.py --world_size 12 55 | ``` 56 | gpu test for scale-out (runs on 2 nodes with 24 ranks): 57 | ```bash 58 | mpirun -np 24 -ppn 12 python testccl_gpu_mpi.py 59 | ``` 60 | 61 | Note that this unit test is a stress test and takes a long time to start. You may need to wait ~5 minutes before the log "starting to initialize tensors ..." appears. 62 | 63 | ## allreduce of LLM path 64 | This test case takes a special allreduce path on the xpu device when the number of launched ranks (-np) is <= 8. Run: 65 | ```bash 66 | mpirun -np 2 python test_llm_allreduce.py 67 | ``` 68 | If you want to disable this path and use the oneCCL allreduce instead, set TORCH_CCL_GPU_ALLREDUCE to 0. 
Run: 69 | ```bash 70 | TORCH_CCL_GPU_ALLREDUCE=0 mpirun -np 2 python test_llm_allreduce.py ``` 71 | ## Test Functionality of FSDP 72 | ```bash 73 | export CCL_ZE_IPC_EXCHANGE=sockets # for pytorch multiprocessing launch 74 | python test_fsdp.py 75 | ``` 76 | 77 | ## subgroup tests ds_subgroup_allreduce.py 78 | For OAM (sub_group=2/4): 79 | ```bash 80 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=2 81 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=4 82 | ``` 83 | For Aurora System (TP=2/3/4/6): 84 | ```bash 85 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=2 86 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=3 87 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=4 88 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=6 89 | ``` 90 | 91 | ## deep speed scale-out tests 92 | The ds_p2p_crossnodes.py test case should be run on 3 nodes. 93 | ```bash 94 | mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 python -u ds_p2p_crossnodes.py --dist_url 10.0.1.141 --world_size 36 95 | ``` 96 | -host lists the names of the 3 nodes. 97 | --dist_url is the IP of your node; you can use `hostname -I` to get it. -------------------------------------------------------------------------------- /tests/ddp_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import time 4 | import os 5 | import argparse 6 | import torch.distributed as dist 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--ptrace', 10 | action='store_true', 11 | default=False, 12 | help='pytorch trace') 13 | parser.add_argument('--warm', type=int, default=10, help='#warmup') 14 | parser.add_argument('--iter', type=int, default=10, help='#iteration') 15 | parser.add_argument('--size', type=int, default=25557032, help='number of f32/bf16 elements') 16 | parser.add_argument('--no-cuda', action='store_true', default=False) 17 | parser.add_argument('--broadcast', action='store_true', default=False) 18 | parser.add_argument('--bf16', action='store_true', default=False) 19 | parser.add_argument('--fixed', 20 | action='store_true', 21 | default=False, 22 | help='fixed size') 23 | args = parser.parse_args() 24 | args.cuda = not args.no_cuda and torch.cuda.is_available() 25 | 26 | if 'PMI_RANK' in os.environ.keys() and 'PMI_SIZE' in os.environ.keys(): 27 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 28 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) # mpich set 29 | elif 'PMIX_RANK' in os.environ.keys() and 'PALS_LOCAL_SIZE' in os.environ.keys(): 30 | os.environ['RANK'] = os.environ.get('PMIX_RANK') 31 | os.environ['WORLD_SIZE'] = str(os.environ.get('PALS_LOCAL_SIZE', -1)) 32 | os.environ['MASTER_ADDR'] = '127.0.0.1' # your master address 33 | os.environ['MASTER_PORT'] = '29500' # your master port 34 | 35 | if 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ.keys(): 36 | local_rank = os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] 37 | elif 'MPI_LOCALRANKID' in os.environ.keys(): 38 | local_rank = os.environ['MPI_LOCALRANKID'] 39 | if 'MPI_LOCALNRANKS' in os.environ.keys(): 40 | os.environ['LOCAL_WORLD_SIZE'] = str(os.environ.get('MPI_LOCALNRANKS',-1)) 41 | else: 42 | local_rank = os.environ['PALS_LOCAL_RANKID'] 43 | 44 | local_rank = int(local_rank) 45 | devid = local_rank 46 | 47 | if not args.cuda: 48 | import intel_extension_for_pytorch 49 | try: 50 | import oneccl_bindings_for_pytorch 51 | 
except: 52 | import torch_ccl 53 | torch.xpu.set_device(devid) 54 | device = "xpu:{}".format(devid) 55 | dist.init_process_group(backend='ccl') 56 | else: 57 | torch.cuda.set_device(devid) 58 | device = "cuda" 59 | dist.init_process_group(backend='nccl') 60 | 61 | try: 62 | from horovod.torch import mpi_lib_v2 as mpi_lib 63 | if mpi_lib.ctenabled(): 64 | mpi_lib = mpi_lib 65 | except: 66 | mpi_lib = None 67 | 68 | print(f'DDP local rank: {devid}') 69 | 70 | if devid == 0: 71 | print(f'PyTorch DDP {"Broadcast" if args.broadcast else "AllReduce"} on {os.environ["WORLD_SIZE"]} {device} devices: ') 72 | 73 | def _time(): 74 | if args.cuda: 75 | torch.cuda.synchronize() 76 | else: 77 | torch.xpu.synchronize() 78 | return time.time() 79 | 80 | if args.fixed: 81 | N = args.size 82 | else: 83 | N = 1 84 | 85 | 86 | with torch.autograd.profiler.profile(enabled=args.ptrace) as prof: 87 | while N <= args.size: 88 | for i in range(args.warm): 89 | data = torch.randn(N, dtype=torch.bfloat16 if args.bf16 else torch.float32).to(device) 90 | with torch.no_grad(): 91 | if not args.broadcast: 92 | dist.all_reduce(data) 93 | else: 94 | dist.broadcast(data, 0) 95 | elapsed = [] 96 | for i in range(args.iter): 97 | data = torch.randn(N, dtype=torch.bfloat16 if args.bf16 else torch.float32).to(device) 98 | t = _time() 99 | if mpi_lib: 100 | mpi_lib.ctpush("IPEX_ALLREDUCE") 101 | with torch.no_grad(): 102 | if not args.broadcast: 103 | dist.all_reduce(data) 104 | else: 105 | dist.broadcast(data, 0) 106 | elapsed.append((_time() - t) * 1e6) 107 | if mpi_lib and mpi_lib.ctenabled(): 108 | mpi_lib.ctpop() 109 | if devid == 0: 110 | print( 111 | f'{N*(2 if args.bf16 else 4):<10}{np.mean(elapsed):>10.1f}us ({np.min(elapsed):.1f}-{np.max(elapsed):.1f}) +-{1.96 * np.std(elapsed):.1f}' 112 | ) 113 | if N == args.size: 114 | break 115 | N = 2 * N 116 | if N != args.size and N > args.size: 117 | N = args.size 118 | 119 | if args.ptrace: 120 | prof.export_chrome_trace('rank' + str(dist.get_rank()) + '_timeline.json') 121 | dist.destroy_process_group() 122 | 123 | -------------------------------------------------------------------------------- /tests/ds_p2p_crossnodes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--world_size', default=-1, type=int, help='number of gpu for distributed training') 11 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 12 | parser.add_argument('--dist_port', default='29600', type=str, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | os.environ['RANK'] = str(os.environ.get('PMIX_RANK',0)) 16 | os.environ['WORLD_SIZE'] = str(args.world_size) 17 | os.environ['MASTER_ADDR'] = '127.0.0.1' 18 | os.environ['MASTER_PORT'] = '29600' 19 | 20 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 21 | dist.init_process_group(backend='ccl', init_method=init_method, 22 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 23 | 24 | rank = dist.get_rank() 25 | print("-----global rank: ", rank) 26 | size = dist.get_world_size() 27 | local_rank = os.environ['PALS_LOCAL_RANKID'] 28 | device = "xpu:{}".format(local_rank) 29 | print('world_size:{}, global rank:{}, local_rank:{}'.format(size, rank, 
local_rank)) 30 | 31 | def send_tensor(buffer, recv_stage): 32 | if isinstance(buffer, torch.Tensor): 33 | type_tensor = torch.LongTensor(data=[0]).to(device) 34 | dist.send(type_tensor, recv_stage) 35 | send_shape = torch.LongTensor(data=buffer.size()).to(device) 36 | send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(device) 37 | dist.send(send_ndims, recv_stage) 38 | dist.send(send_shape, recv_stage) 39 | 40 | def recv_tensor(send_stage): 41 | type_tensor = torch.LongTensor(data=[0]).to(device) 42 | dist.recv(type_tensor, send_stage) 43 | recv_type = type_tensor.item() 44 | 45 | if recv_type == 0: 46 | recv_ndims = torch.LongTensor(data=[0]).to(device) 47 | dist.recv(recv_ndims, send_stage) 48 | recv_ndims = recv_ndims.item() 49 | recv_shape = torch.LongTensor([1] * recv_ndims).to(device) 50 | dist.recv(recv_shape, send_stage) 51 | print("recv_ndims", recv_ndims) 52 | print("recv_shape", recv_shape) 53 | else: 54 | print("----------------error-------------------") 55 | 56 | size = dist.get_world_size() 57 | device = "xpu:{}".format(local_rank) 58 | 59 | data = torch.randn(1, dtype=torch.float32).to(device) 60 | dist.all_reduce(data) 61 | 62 | # send/recv(rank0 -> rank12 -> rank24) 63 | if rank <= 11: 64 | tensor = torch.ones(2048,3,256).xpu(device) 65 | send_tensor(tensor, rank+12) 66 | elif rank >= 24 : 67 | recv_tensor(rank-12) 68 | else: 69 | recv_tensor(rank-12) 70 | tensor = torch.ones(2048,3,256).xpu(device) 71 | send_tensor(tensor, rank+12) 72 | print("-----finished send/recv-----") 73 | 74 | # all_gather_base after p2p 75 | torch.distributed.barrier() 76 | world_size=36 77 | device = "xpu:{}".format(local_rank) 78 | rank_name_to_time = torch.zeros((world_size, 2), 79 | dtype=torch.float, 80 | device=device) 81 | 82 | torch.distributed._all_gather_base(rank_name_to_time.view(-1), 83 | rank_name_to_time[rank, :].view(-1)) 84 | print("all_gather is done") 85 | -------------------------------------------------------------------------------- /tests/ds_subgroup_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | import time 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 11 | parser.add_argument('--dist_port', default='29500', type=str, help='url port used to set up distributed training') 12 | parser.add_argument('--sub_group', default=4, type=int, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | if 'PMI_RANK' in os.environ.keys() and 'PMI_SIZE' in os.environ.keys(): 16 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 17 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) # mpich set 18 | elif 'PMIX_RANK' in os.environ.keys() and 'PALS_LOCAL_SIZE' in os.environ.keys(): 19 | os.environ['RANK'] = os.environ.get('PMIX_RANK') 20 | os.environ['WORLD_SIZE'] = str(os.environ.get('PALS_LOCAL_SIZE', -1)) 21 | 22 | os.environ['MASTER_ADDR'] = '127.0.0.1' 23 | os.environ['MASTER_PORT'] = '29500' 24 | 25 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 26 | dist.init_process_group(backend='ccl', init_method=init_method, 27 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 28 | 29 | rank = dist.get_rank() 30 | size = dist.get_world_size() 31 | device = 
"xpu:{}".format(rank) 32 | print('world_size:{}, global rank:{}'.format(size, rank)) 33 | 34 | shape = int(2048) 35 | warm_shape = int(1) 36 | warm = torch.ones(warm_shape).bfloat16().to(device) 37 | 38 | input_shape = shape 39 | input = torch.ones(input_shape).bfloat16().to(device) 40 | 41 | #warm_up 42 | dist.all_reduce(warm) 43 | 44 | #sub_group=1(TP=12) 45 | group1 = dist.new_group([0]) 46 | if rank ==0: 47 | dist.all_reduce(input, group=group1) 48 | 49 | group_size = [[i+(size // args.sub_group)*j for j in range(args.sub_group)] for i in range(size // args.sub_group)] 50 | sub_group = [] 51 | 52 | #construct sub group 53 | for i in range(len(group_size)): 54 | sub_group.append(dist.new_group(group_size[i])) 55 | 56 | for i in range(len(group_size)): 57 | if dist.get_rank() in group_size[i]: 58 | dist.all_reduce(input, group=sub_group[i]) 59 | -------------------------------------------------------------------------------- /tests/run_ds_llm.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | # run the ds_subgroup_allreduce.py 3 | # for OAM (sub_group=2/4) 4 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=2 5 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=4 6 | # for Aurora System(TP=2/3/4/6) 7 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=2 8 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=3 9 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=4 10 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=6 11 | 12 | # should run the ds_p2p_crossnodes.py on 3 nodes 13 | # -host is the name for this 3 nodes 14 | # --dist_url is the IP on your node, you can use (hostname -I) to get. 
15 | mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 python -u ds_p2p_crossnodes.py --dist_url 10.0.1.141 --world_size 36 16 | -------------------------------------------------------------------------------- /tests/test_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | import time 7 | 8 | tokens = 16 9 | rounds = 70 * 2 * tokens 10 | 11 | count = 14336 12 | 13 | total = 1024 * 1024 * 72 14 | repeat = 4 15 | 16 | # profiling = False 17 | # profiling = True 18 | 19 | datatype = torch.float16 20 | # datatype = torch.float32 21 | 22 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 23 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 24 | os.environ['MASTER_ADDR'] = '127.0.0.1' 25 | os.environ['MASTER_PORT'] = '29500' 26 | 27 | dist.init_process_group("ccl") 28 | rank = dist.get_rank() 29 | size = dist.get_world_size() 30 | 31 | device = "xpu:{}".format(rank) 32 | # allreduce data 33 | data = (torch.ones(count, dtype=datatype) * 0.1).to(device) 34 | 35 | a = (torch.zeros((int(total / count), count), dtype=datatype)).to(device) 36 | 37 | # warm up 38 | for i in range(5): 39 | a[0] += (data * 0.1) 40 | for j in range(repeat): 41 | a += 0.01 42 | dist.all_reduce(data) 43 | data /= size 44 | sync = data.cpu() 45 | 46 | #start_events = [] 47 | #end_events = [] 48 | 49 | dist.barrier() 50 | start = time.time() 51 | for i in range(rounds): 52 | # start_event = None 53 | # end_event = None 54 | # if profiling: 55 | # start_event = torch.xpu.Event(enable_timing=True) 56 | # end_event = torch.xpu.Event(enable_timing=True) 57 | a[0] += (data * 0.1) 58 | for j in range(repeat): 59 | a += 0.01 60 | #print("XPU: {} {}".format(i, a[0][0])) 61 | # if profiling: 62 | # start_event.record() 63 | dist.all_reduce(data) 64 | # if profiling: 65 | # end_event.record() 66 | data /= size 67 | sync = data.cpu() 68 | # if profiling: 69 | # start_events.append(start_event) 70 | # end_events.append(end_event) 71 | 72 | # print(data[0]) 73 | data = data.cpu() 74 | # torch.xpu.synchronize('xpu:{}'.format(rank)) 75 | span = time.time() - start 76 | print('{} rounds on reducing {} elements. 
Time used {}'.format(rounds, count, span)) 77 | 78 | tmp_a = torch.zeros(1, dtype=datatype) 79 | tmp_data = torch.ones(1, dtype=datatype) * 0.1 80 | for i in range(5): 81 | tmp_a += (tmp_data * 0.1) 82 | for j in range(repeat): 83 | tmp_a += 0.01 84 | tmp_data *= size 85 | tmp_data /= size 86 | 87 | for i in range(rounds): 88 | tmp_a += (tmp_data * 0.1) 89 | for j in range(repeat): 90 | tmp_a += 0.01 91 | #print("CPU: {} {}".format(i, tmp_a[0])) 92 | tmp_data *= size 93 | tmp_data /= size 94 | 95 | a = a.cpu() 96 | 97 | error = False 98 | for i in range(count): 99 | if tmp_a[0] != a[0][i]: 100 | if not error: 101 | print("Error on {}: {} vs {}".format(i, tmp_a[0], a[0][i])) 102 | error = True 103 | else: 104 | if error: 105 | print("No error on {}".format(i)) 106 | error = False 107 | 108 | #if profiling: 109 | # for i in range(len(start_events)): 110 | # allreduce_time = start_events[i].elapsed_time(end_events[i]) 111 | # print('Round %d allreduce time %.3fms' % (i, allreduce_time)) 112 | # if i != len(start_events) - 1: 113 | # compute_time = end_events[i].elapsed_time(start_events[i + 1]) 114 | # print('Round %d compute time %.3fms' % (i + 1, compute_time)) 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /tests/test_barrier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--device', '-dev', type=str, default='cpu', help='Device type to use: cpu, xpu') 10 | args = parser.parse_args() 11 | 12 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 13 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 14 | os.environ['MASTER_ADDR'] = '127.0.0.1' 15 | os.environ['MASTER_PORT'] = '29500' 16 | 17 | dist.init_process_group("ccl") 18 | rank = dist.get_rank() 19 | size = dist.get_world_size() 20 | 21 | if args.device == 'xpu': 22 | device = "xpu:{}".format(rank) 23 | else: 24 | device = 'cpu' 25 | 26 | print("Barrier using device: ", args.device) 27 | dist.barrier() 28 | print("Finish") 29 | -------------------------------------------------------------------------------- /tests/test_c10d_p2p.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import torch 4 | 5 | try: 6 | import intel_extension_for_pytorch 7 | xpu_is_available = torch.xpu.is_available() if hasattr(torch, 'xpu') else False 8 | except ImportError: 9 | # ignore the ipex 10 | xpu_is_available = False 11 | pass 12 | 13 | import oneccl_bindings_for_pytorch 14 | from torch.testing._internal.common_utils import run_tests 15 | from torch.testing._internal.common_distributed import MultiProcessTestCase 16 | 17 | import torch.distributed as dist 18 | 19 | class ProcessGroupCCLTest(MultiProcessTestCase): 20 | 21 | def setUp(self): 22 | super(ProcessGroupCCLTest, self).setUp() 23 | self._spawn_processes() 24 | 25 | @property 26 | def world_size(self): 27 | return 6 28 | 29 | def _build_tensor(self, size, value=None, dtype=torch.float, device=None): 30 | if value is None: 31 | value = size 32 | if device is None: 33 | return torch.empty(size, size, size, dtype=dtype).fill_(value) 34 | else: 35 | return torch.empty(size, size, size, dtype=dtype).fill_(value).to(device) 36 | 37 | def _test_send_recv_withincard(self): 38 | store = 
dist.FileStore(self.file_name, self.world_size) 39 | dist.init_process_group( 40 | "ccl", 41 | world_size=self.world_size, 42 | rank=self.rank, 43 | store=store, 44 | ) 45 | device = "xpu:{}".format(self.rank) 46 | 47 | # WA: allreduce 48 | # Ensure the process group has been fully initialized 49 | data = torch.zeros(1).to(device) 50 | dist.all_reduce(data) 51 | 52 | torch.xpu.set_device(device) 53 | tensor = self._build_tensor(self.rank + 1, device=device) 54 | 55 | # rank0 -> rank1 56 | src = 0 57 | dst = 1 58 | if self.rank == src: 59 | # Send 60 | dist.send(tensor, dst) 61 | elif self.rank == dst: 62 | # Recv 63 | expected_tensor = self._build_tensor(src + 1) 64 | output_tensor = self._build_tensor( 65 | src + 1, value=-1, device=device 66 | ) 67 | dist.recv(output_tensor, src) 68 | self.assertEqual(output_tensor, expected_tensor) 69 | 70 | def test_send_recv_withincard(self): 71 | self._test_send_recv_withincard() 72 | 73 | def _test_send_recv_3rank(self): 74 | # cross-cards p2p: rank1 -> rank3 -> rank5 75 | store = dist.FileStore(self.file_name, self.world_size) 76 | dist.init_process_group( 77 | "ccl", 78 | world_size=self.world_size, 79 | rank=self.rank, 80 | store=store, 81 | ) 82 | device = "xpu:{}".format(self.rank) 83 | 84 | # WA: allreduce 85 | # Ensure the process group has been fully initialized 86 | data = torch.zeros(1).to(device) 87 | dist.all_reduce(data) 88 | 89 | torch.xpu.set_device(device) 90 | tensor = self._build_tensor(self.rank + 1, device=device) 91 | 92 | if self.rank == 1: 93 | dist.send(tensor, 3) 94 | if self.rank == 3: 95 | expected_tensor1 = self._build_tensor(1 + 1) 96 | output_tensor1 = self._build_tensor( 97 | 1 + 1, value=-1, device=device 98 | ) 99 | dist.recv(output_tensor1, 1) 100 | self.assertEqual(output_tensor1, expected_tensor1) 101 | 102 | # rank3 -> rank5 103 | dist.send(tensor, 5) 104 | if self.rank == 5: 105 | expected_tensor2 = self._build_tensor(3 + 1) 106 | output_tensor2 = self._build_tensor( 107 | 3 + 1, value=-1, device=device 108 | ) 109 | dist.recv(output_tensor2, 3) 110 | self.assertEqual(output_tensor2, expected_tensor2) 111 | 112 | def test_send_recv_3rank(self): 113 | self._test_send_recv_3rank() 114 | 115 | def _test_send_recv_crosscard(self): 116 | store = dist.FileStore(self.file_name, self.world_size) 117 | dist.init_process_group( 118 | "ccl", 119 | world_size=self.world_size, 120 | rank=self.rank, 121 | store=store, 122 | ) 123 | device = "xpu:{}".format(self.rank) 124 | 125 | # WA: allreduce 126 | # Ensure the process group has been fully initialized 127 | data = torch.zeros(1).to(device) 128 | dist.all_reduce(data) 129 | 130 | torch.xpu.set_device(device) 131 | tensor = self._build_tensor(self.rank + 1, device=device) 132 | 133 | for src in range(0, self.world_size): 134 | if src == self.rank: 135 | # Send mode 136 | for dst in range(0, self.world_size): 137 | if dst == self.rank: 138 | continue 139 | dist.send(tensor, dst) 140 | else: 141 | # Recv mode 142 | expected_tensor = self._build_tensor(src + 1) 143 | output_tensor = self._build_tensor( 144 | src + 1, value=-1, device=device 145 | ) 146 | dist.recv(output_tensor, src) 147 | self.assertEqual(output_tensor, expected_tensor) 148 | 149 | def test_send_recv_crosscard(self): 150 | self._test_send_recv_crosscard() 151 | 152 | def _test_send_recv_with_tag(self): 153 | store = dist.FileStore(self.file_name, self.world_size) 154 | dist.init_process_group( 155 | "ccl", 156 | world_size=self.world_size, 157 | rank=self.rank, 158 | store=store, 159 | ) 160 | device = 
"xpu:{}".format(self.rank) 161 | 162 | # WA: allreduce 163 | # Ensure the process group has been fully initialized 164 | data = torch.zeros(1).to(device) 165 | dist.all_reduce(data) 166 | 167 | torch.xpu.set_device(device) 168 | tensor = self._build_tensor(10, value=self.rank, device=device) 169 | 170 | for dst in range(0, self.world_size): 171 | if dst == self.rank: 172 | # Recv mode 173 | for src in range(0, self.world_size): 174 | if src == self.rank: 175 | continue 176 | output_tensor = self._build_tensor(10, value=-1, device=device) 177 | dist.recv(output_tensor, src, tag=src) 178 | self.assertTrue(output_tensor.eq(src).all()) 179 | else: 180 | # Send mode 181 | dist.send(tensor, dst, tag=self.rank) 182 | 183 | def test_send_recv_with_tag(self): 184 | self._test_send_recv_with_tag() 185 | 186 | if __name__ == '__main__': 187 | run_tests() 188 | -------------------------------------------------------------------------------- /tests/test_fsdp.py: -------------------------------------------------------------------------------- 1 | # Reference: https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html 2 | 3 | import os 4 | import argparse 5 | import functools 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torchvision import datasets, transforms 11 | import time 12 | 13 | from torch.optim.lr_scheduler import StepLR 14 | 15 | import torch.distributed as dist 16 | import torch.multiprocessing as mp 17 | from torch.nn.parallel import DistributedDataParallel as DDP 18 | from torch.utils.data.distributed import DistributedSampler 19 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP 20 | from torch.distributed.fsdp.fully_sharded_data_parallel import ( 21 | CPUOffload, 22 | BackwardPrefetch, 23 | ) 24 | from torch.distributed.fsdp.wrap import ( 25 | size_based_auto_wrap_policy, 26 | enable_wrap, 27 | wrap, 28 | ) 29 | 30 | import intel_extension_for_pytorch 31 | import oneccl_bindings_for_pytorch 32 | 33 | def setup(rank, world_size): 34 | os.environ['MASTER_ADDR'] = 'localhost' 35 | os.environ['MASTER_PORT'] = '12355' 36 | 37 | # initialize the process group 38 | dist.init_process_group("ccl", rank=rank, world_size=world_size) 39 | 40 | def cleanup(): 41 | dist.destroy_process_group() 42 | 43 | 44 | class Net(nn.Module): 45 | def __init__(self): 46 | super(Net, self).__init__() 47 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 48 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 49 | self.dropout1 = nn.Dropout(0.25) 50 | self.dropout2 = nn.Dropout(0.5) 51 | self.fc1 = nn.Linear(9216, 128) 52 | self.fc2 = nn.Linear(128, 10) 53 | 54 | def forward(self, x): 55 | 56 | x = self.conv1(x) 57 | x = F.relu(x) 58 | x = self.conv2(x) 59 | x = F.relu(x) 60 | x = F.max_pool2d(x, 2) 61 | x = self.dropout1(x) 62 | x = torch.flatten(x, 1) 63 | x = self.fc1(x) 64 | x = F.relu(x) 65 | x = self.dropout2(x) 66 | x = self.fc2(x) 67 | output = F.log_softmax(x, dim=1) 68 | return output 69 | 70 | def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): 71 | model.train() 72 | ddp_loss = torch.zeros(2).to("xpu:{}".format(rank)) 73 | if sampler: 74 | sampler.set_epoch(epoch) 75 | for batch_idx, (data, target) in enumerate(train_loader): 76 | if batch_idx < 3: 77 | data, target = data.to("xpu:{}".format(rank)), target.to("xpu:{}".format(rank)) 78 | optimizer.zero_grad() 79 | output = model(data) 80 | loss = F.nll_loss(output, target, reduction='sum') 81 | loss.backward() 82 | optimizer.step() 83 | ddp_loss[0] += 
loss.item() 84 | ddp_loss[1] += len(data) 85 | 86 | dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) 87 | if rank == 0: 88 | print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) 89 | 90 | 91 | def test(model, rank, world_size, test_loader): 92 | model.eval() 93 | correct = 0 94 | ddp_loss = torch.zeros(3).to("xpu:{}".format(rank)) 95 | with torch.no_grad(): 96 | for data, target in test_loader: 97 | data, target = data.to("xpu:{}".format(rank)), target.to("xpu:{}".format(rank)) 98 | output = model(data) 99 | ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 100 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 101 | ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() 102 | ddp_loss[2] += len(data) 103 | 104 | dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) 105 | 106 | if rank == 0: 107 | test_loss = ddp_loss[0] / ddp_loss[2] 108 | print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( 109 | test_loss, int(ddp_loss[1]), int(ddp_loss[2]), 110 | 100. * ddp_loss[1] / ddp_loss[2])) 111 | 112 | 113 | def fsdp_main(rank, world_size, args): 114 | torch.manual_seed(123) 115 | torch.xpu.manual_seed(123) 116 | setup(rank, world_size) 117 | 118 | transform=transforms.Compose([ 119 | transforms.ToTensor(), 120 | transforms.Normalize((0.1307,), (0.3081,)) 121 | ]) 122 | 123 | dataset1 = datasets.MNIST('../data', train=True, download=True, 124 | transform=transform) 125 | dataset2 = datasets.MNIST('../data', train=False, 126 | transform=transform) 127 | 128 | sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) 129 | sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) 130 | 131 | train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1} 132 | test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2} 133 | cuda_kwargs = {'num_workers': 2, 134 | 'pin_memory': True, 135 | 'shuffle': False} 136 | train_kwargs.update(cuda_kwargs) 137 | test_kwargs.update(cuda_kwargs) 138 | 139 | train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) 140 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 141 | my_auto_wrap_policy = functools.partial( 142 | size_based_auto_wrap_policy, min_num_params=100 143 | ) 144 | 145 | xpu_device = "xpu:{}".format(rank) 146 | torch.xpu.set_device(xpu_device) 147 | 148 | #init_start_event = torch.Event(enable_timing=True) 149 | #init_end_event = torch.Event(enable_timing=True) 150 | 151 | model = Net().to("xpu:{}".format(rank)) 152 | 153 | model = FSDP(model, device_id="xpu:{}".format(rank)) 154 | 155 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 156 | 157 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 158 | #init_start_event.record() 159 | elapsed = time.time() 160 | for epoch in range(1): 161 | train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) 162 | test(model, rank, world_size, test_loader) 163 | scheduler.step() 164 | 165 | #init_end_event.record() 166 | elapsed = time.time() - elapsed 167 | if rank == 0: 168 | #print(f"XPU event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") 169 | print(f"XPU event elapsed time: {elapsed}sec") 170 | print(f"{model}") 171 | 172 | if args.save_model: 173 | # use a barrier to make sure training is done on all ranks 174 | dist.barrier() 175 | # state_dict for FSDP model is only available on Nightlies for now 176 | states = 
model.state_dict() 177 | if rank == 0: 178 | torch.save(states, "mnist_cnn.pt") 179 | 180 | cleanup() 181 | 182 | 183 | 184 | if __name__ == '__main__': 185 | # Training settings 186 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 187 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 188 | help='input batch size for training (default: 64)') 189 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 190 | help='input batch size for testing (default: 1000)') 191 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 192 | help='number of epochs to train (default: 14)') 193 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 194 | help='learning rate (default: 1.0)') 195 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 196 | help='Learning rate step gamma (default: 0.7)') 197 | parser.add_argument('--no-cuda', action='store_true', default=False, 198 | help='disables CUDA training') 199 | parser.add_argument('--seed', type=int, default=1, metavar='S', 200 | help='random seed (default: 1)') 201 | parser.add_argument('--save-model', action='store_true', default=False, 202 | help='For Saving the current Model') 203 | args = parser.parse_args() 204 | 205 | torch.manual_seed(args.seed) 206 | 207 | # WORLD_SIZE = torch.xpu.device_count() 208 | WORLD_SIZE = 2 209 | mp.spawn(fsdp_main, 210 | args=(WORLD_SIZE, args), 211 | nprocs=WORLD_SIZE, 212 | join=True) 213 | -------------------------------------------------------------------------------- /tests/test_llm_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 8 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 9 | os.environ['MASTER_ADDR'] = '127.0.0.1' 10 | os.environ['MASTER_PORT'] = '29500' 11 | dist.init_process_group("ccl") 12 | rank = dist.get_rank() 13 | size = dist.get_world_size() 14 | 15 | device = "xpu:{}".format(rank) 16 | llm_shapes = [ 17 | # GPT-J 6B 18 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 19 | # Llama 7B 20 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 21 | # Llama 13B 22 | (1, 32, 5120), (1, 1024, 5120), (1, 4, 5120), (1, 1, 5120), 23 | # Llama2 7B 24 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 25 | # Llama2 13B 26 | (1, 32, 5120), (1, 1024, 5120), (1, 4, 5120), (1, 1, 5120), 27 | # Llama2 70B 28 | (1, 32, 8192), (1, 1024, 8192), (1, 1, 8192), (1, 4, 8192), 29 | # OPT 6.7B 30 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 31 | # OPT 30B 32 | (1, 32, 7168), (1, 1, 7168), (1, 1024, 7168), (1, 4, 7168), 33 | # Bloom 7B 34 | (1, 33, 4096), (1, 1, 4096), (1, 4, 4096), (1, 1028, 4096), 35 | # Bloom 176B 36 | (1, 4, 14336), (1, 1028, 14336), (1, 33, 14336), (1, 1, 14336) 37 | ] 38 | 39 | os.environ['TORCH_LLM_ALLREDUCE_DEBUG'] = '1' 40 | for shape in llm_shapes: 41 | data = torch.rand(shape, dtype=torch.float16).to(device) 42 | # Expected value is identical to input for average allreduce. 43 | expect_result = data 44 | # Allreduce is an inplace op, data will represent output. 
45 | dist.all_reduce(data) 46 | assert torch.allclose(data, expect_result) 47 | -------------------------------------------------------------------------------- /tests/test_p2p_crossnodes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--world_size', default=-1, type=int, help='number of gpu for distributed training') 11 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 12 | parser.add_argument('--dist_port', default='29800', type=str, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | os.environ['RANK'] = str(os.environ.get('PMIX_RANK',0)) 16 | os.environ['WORLD_SIZE'] = str(args.world_size) 17 | os.environ['MASTER_ADDR'] = '127.0.0.1' 18 | os.environ['MASTER_PORT'] = '29500' 19 | 20 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 21 | dist.init_process_group(backend='ccl', init_method=init_method, 22 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 23 | 24 | rank = dist.get_rank() 25 | size = dist.get_world_size() 26 | local_rank = os.environ['PALS_LOCAL_RANKID'] 27 | device = "xpu:{}".format(local_rank) 28 | print('world_size:{}, global rank:{}, local_rank:{}'.format(size, rank, local_rank)) 29 | 30 | # allreduce is WA 31 | data = torch.randn(2, dtype=torch.float32).to(device) 32 | dist.all_reduce(data) 33 | 34 | def send_tensor(buffer, recv_stage): 35 | if isinstance(buffer, torch.Tensor): 36 | type_tensor = torch.LongTensor(data=[0]).to(device) 37 | dist.send(type_tensor, recv_stage) 38 | send_shape = torch.LongTensor(data=buffer.size()).to(device) 39 | send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(device) 40 | dist.send(send_ndims, recv_stage) 41 | dist.send(send_shape, recv_stage) 42 | 43 | def recv_tensor(send_stage): 44 | type_tensor = torch.LongTensor(data=[0]).to(device) 45 | dist.recv(type_tensor, send_stage) 46 | recv_type = type_tensor.item() 47 | 48 | if recv_type == 0: 49 | recv_ndims = torch.LongTensor(data=[0]).to(device) 50 | dist.recv(recv_ndims, send_stage) 51 | recv_ndims = recv_ndims.item() 52 | recv_shape = torch.LongTensor([1] * recv_ndims).to(device) 53 | dist.recv(recv_shape, send_stage) 54 | print("recv_ndims", recv_ndims) 55 | print("recv_shape", recv_shape) 56 | else: 57 | print("----------------error-------------------") 58 | size = dist.get_world_size() 59 | device = "xpu:{}".format(local_rank) 60 | 61 | data = torch.randn(1, dtype=torch.float32).to(device) 62 | dist.all_reduce(data) 63 | 64 | # rank1 -> rank3 -> rank15 -> rank23 -> rank8 65 | if rank == 1: 66 | tensor = torch.ones(2048,3,256).xpu(device) 67 | send_tensor(tensor, 3) 68 | if rank == 3: 69 | recv_tensor(1) 70 | tensor = torch.ones(2048,3,256).xpu(device) 71 | send_tensor(tensor, 15) 72 | if rank == 15: 73 | recv_tensor(3) 74 | tensor = torch.ones(2048,3,256).xpu(device) 75 | send_tensor(tensor, 23) 76 | if rank == 23: 77 | recv_tensor(15) 78 | tensor = torch.ones(2048,3,256).xpu(device) 79 | send_tensor(tensor, 8) 80 | if rank == 8: 81 | recv_tensor(23) 82 | -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | PyTorch 
binding for Intel(R) oneAPI Collective Communications Library (oneCCL) 2 | Third Party Programs File 3 | 4 | This file is the "third-party-programs.txt" file specified in the associated 5 | Intel end user license agreement for the Intel software you are licensing. The 6 | third party programs and their corresponding required notices and/or license 7 | terms are listed below. 8 | 9 | ------------------------------------------------------------------------------- 10 | 11 | 1. PyTorch 12 | 13 | From PyTorch: 14 | 15 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 16 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 17 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 18 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 19 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 20 | Copyright (c) 2011-2013 NYU (Clement Farabet) 21 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, 22 | Iain Melvin, Jason Weston) 23 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 24 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, 25 | Johnny Mariethoz) 26 | 27 | From Caffe2: 28 | 29 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 30 | 31 | All contributions by Facebook: 32 | Copyright (c) 2016 Facebook Inc. 33 | 34 | All contributions by Google: 35 | Copyright (c) 2015 Google Inc. 36 | All rights reserved. 37 | 38 | All contributions by Yangqing Jia: 39 | Copyright (c) 2015 Yangqing Jia 40 | All rights reserved. 41 | 42 | All contributions from Caffe: 43 | Copyright(c) 2013, 2014, 2015, the respective contributors 44 | All rights reserved. 45 | 46 | All other contributions: 47 | Copyright(c) 2015, 2016 the respective contributors 48 | All rights reserved. 49 | 50 | 51 | The -3-Clause BSD license 52 | 53 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 54 | copyright over their contributions to Caffe2. The project versioning records 55 | all such contribution and copyright details. If a contributor wants to further 56 | mark their specific copyright on a particular contribution, they should 57 | indicate their copyright solely in the commit message of the change when it is 58 | committed. 59 | 60 | All rights reserved. 61 | 62 | Redistribution and use in source and binary forms, with or without 63 | modification, are permitted provided that the following conditions are met: 64 | 65 | 1. Redistributions of source code must retain the above copyright 66 | notice, this list of conditions and the following disclaimer. 67 | 68 | 2. Redistributions in binary form must reproduce the above copyright 69 | notice, this list of conditions and the following disclaimer in the 70 | documentation and/or other materials provided with the distribution. 71 | 72 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories 73 | America and IDIAP Research Institute nor the names of its contributors may 74 | be used to endorse or promote products derived from this software without 75 | specific prior written permission. 76 | 77 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 78 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 79 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 80 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 81 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 82 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 83 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 84 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 85 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 86 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 87 | POSSIBILITY OF SUCH DAMAGE. 88 | 89 | ------------------------------------------------------------------------------- 90 | 91 | Other names and brands may be claimed as the property of others. 92 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-ccl/6acc2008785b7e0a859dfcd22377d6b891212351/tools/__init__.py -------------------------------------------------------------------------------- /tools/setup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-ccl/6acc2008785b7e0a859dfcd22377d6b891212351/tools/setup/__init__.py -------------------------------------------------------------------------------- /tools/setup/cmake.py: -------------------------------------------------------------------------------- 1 | "Manages CMake." 2 | import os 3 | import re 4 | import shutil 5 | from subprocess import check_call, check_output 6 | import sys 7 | import distutils 8 | import distutils.sysconfig 9 | from distutils.version import LooseVersion 10 | from setuptools import Extension 11 | from collections import defaultdict 12 | from .env import BUILD_DIR, check_env_flag 13 | # from .numpy_ import USE_NUMPY, NUMPY_INCLUDE_DIR 14 | 15 | 16 | def _mkdir_p(d): 17 | try: 18 | os.makedirs(d) 19 | except OSError: 20 | pass 21 | 22 | 23 | # Ninja 24 | # Use ninja if it is on the PATH. Previous version of PyTorch required the 25 | # ninja python package, but we no longer use it, so we do not have to import it 26 | # USE_NINJA = (not check_negative_env_flag('USE_NINJA') and 27 | # shutil.which('ninja') is not None) 28 | def convert_cmake_value_to_python_value(cmake_value, cmake_type): 29 | r"""Convert a CMake value in a string form to a Python value. 30 | 31 | Arguments: 32 | cmake_value (string): The CMake value in a string form (e.g., "ON", "OFF", "1"). 33 | cmake_type (string): The CMake type of :attr:`cmake_value`. 34 | 35 | Returns: 36 | A Python value corresponding to :attr:`cmake_value` with type :attr:`cmake_type`. 37 | """ 38 | 39 | cmake_type = cmake_type.upper() 40 | up_val = cmake_value.upper() 41 | if cmake_type == 'BOOL': 42 | # https://gitlab.kitware.com/cmake/community/wikis/doc/cmake/VariablesListsStrings#boolean-values-in-cmake 43 | return not (up_val in ('FALSE', 'OFF', 'N', 'NO', '0', '', 'NOTFOUND') or up_val.endswith('-NOTFOUND')) 44 | elif cmake_type == 'FILEPATH': 45 | if up_val.endswith('-NOTFOUND'): 46 | return None 47 | else: 48 | return cmake_value 49 | else: # Directly return the cmake_value. 50 | return cmake_value 51 | 52 | 53 | def get_cmake_cache_variables_from_file(cmake_cache_file): 54 | r"""Gets values in CMakeCache.txt into a dictionary. 55 | 56 | Arguments: 57 | cmake_cache_file: A CMakeCache.txt file object. 58 | Returns: 59 | dict: A ``dict`` containing the value of cached CMake variables. 
60 | """ 61 | 62 | results = dict() 63 | for i, line in enumerate(cmake_cache_file, 1): 64 | line = line.strip() 65 | if not line or line.startswith(('#', '//')): 66 | # Blank or comment line, skip 67 | continue 68 | 69 | # Almost any character can be part of variable name and value. As a practical matter, we assume the type must be 70 | # valid if it were a C variable name. It should match the following kinds of strings: 71 | # 72 | # USE_CUDA:BOOL=ON 73 | # "USE_CUDA":BOOL=ON 74 | # USE_CUDA=ON 75 | # USE_CUDA:=ON 76 | # Intel(R) MKL-DNN_SOURCE_DIR:STATIC=/path/to/pytorch/third_party/ideep/mkl-dnn 77 | # "OpenMP_COMPILE_RESULT_CXX_openmp:experimental":INTERNAL=FALSE 78 | matched = re.match(r'("?)(.+?)\1(?::\s*([a-zA-Z_-][a-zA-Z0-9_-]*)?)?\s*=\s*(.*)', line) 79 | if matched is None: # Illegal line 80 | raise ValueError('Unexpected line {} in {}: {}'.format(i, repr(cmake_cache_file), line)) 81 | _, variable, type_, value = matched.groups() 82 | if type_ is None: 83 | type_ = '' 84 | if type_.upper() in ('INTERNAL', 'STATIC'): 85 | # CMake internal variable, do not touch 86 | continue 87 | results[variable] = convert_cmake_value_to_python_value(value, type_) 88 | 89 | return results 90 | 91 | 92 | class CMakeExtension(Extension): 93 | """CMake extension""" 94 | def __init__(self, name, cmake_file): 95 | super().__init__(name, []) 96 | self.build_dir = BUILD_DIR 97 | self.cmake_file = cmake_file 98 | self._cmake_command = CMakeExtension._get_cmake_command() 99 | self.debug = True 100 | self.cmake_dir = os.path.dirname(cmake_file) 101 | 102 | @staticmethod 103 | def _get_version(cmd): 104 | """Returns cmake version.""" 105 | 106 | for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): 107 | if 'version' in line: 108 | return LooseVersion(line.strip().split(' ')[2]) 109 | raise RuntimeError('no version found') 110 | 111 | @staticmethod 112 | def _get_cmake_command(): 113 | """Returns cmake command.""" 114 | 115 | cmake_command = shutil.which('cmake') 116 | cmake3 = shutil.which('cmake3') 117 | if cmake3 is not None: 118 | cmake = shutil.which('cmake') 119 | if cmake is not None: 120 | bare_version = CMakeExtension._get_version(cmake) 121 | if (bare_version < LooseVersion("3.5.0") and 122 | CMakeExtension._get_version(cmake3) > bare_version): 123 | cmake_command = 'cmake3' 124 | return cmake_command 125 | 126 | @staticmethod 127 | def defines(args, **kwargs): 128 | "Adds definitions to a cmake argument list." 129 | for key, value in sorted(kwargs.items()): 130 | if value is not None: 131 | args.append('-D{}={}'.format(key, value)) 132 | 133 | @staticmethod 134 | def _cmake_value(value): 135 | if type(value) is str: 136 | if value.startswith(('OFF', '0', 'False', 'FALSE')): 137 | return False 138 | if value.startswith(('ON', '1', 'True', 'TRUE')): 139 | return True 140 | return value 141 | 142 | @staticmethod 143 | def extract(args): 144 | "Adds definitions to a cmake argument list." 
145 | build_options = {} 146 | pat = re.compile(r'^-D(.*)=(.*)') 147 | for arg in args: 148 | match = pat.match(arg) 149 | 150 | build_options[match.group(1)] = CMakeExtension._cmake_value(match.group(2)) 151 | 152 | return build_options 153 | 154 | @staticmethod 155 | def convert_cmake_dirs(paths): 156 | def converttostr(input_seq, seperator): 157 | # Join all the strings in list 158 | final_str = seperator.join(input_seq) 159 | return final_str 160 | try: 161 | return converttostr(paths, ";") 162 | except: 163 | return paths 164 | 165 | @property 166 | def _cmake_cache_file(self): 167 | r"""Returns the path to CMakeCache.txt. 168 | 169 | Returns: 170 | string: The path to CMakeCache.txt. 171 | """ 172 | return os.path.join(self.build_dir, 'CMakeCache.txt') 173 | 174 | def _get_cmake_cache_variables(self): 175 | r"""Gets values in CMakeCache.txt into a dictionary. 176 | Returns: 177 | dict: A ``dict`` containing the value of cached CMake variables. 178 | """ 179 | with open(self._cmake_cache_file) as f: 180 | return get_cmake_cache_variables_from_file(f) 181 | 182 | def _run(self, args, env): 183 | """Executes cmake with arguments and an environment.""" 184 | command = [self._cmake_command] + args + [self.cmake_dir] 185 | print(' '.join(command)) 186 | check_call(command, cwd=self.build_dir, env=env) 187 | 188 | def generate(self, build_options, env, build_dir, install_dir): 189 | """Runs cmake to generate native build files.""" 190 | 191 | self.build_dir = build_dir 192 | 193 | cmake_args = [] 194 | 195 | for var, val in env.items(): 196 | if var.startswith(('BUILD_', 'USE_', 'CMAKE_')): 197 | # TODO: DO NOT OVERWRITE CMAKE_PREFIX_PATH 198 | if var.strip() == "CMAKE_PREFIX_PATH": 199 | build_options[var] += ";" + val 200 | else: 201 | build_options[var] = val 202 | 203 | if 'CMAKE_BUILD_TYPE' not in env: 204 | if check_env_flag('DEBUG', env=env): 205 | build_options['CMAKE_BUILD_TYPE'] = 'Debug' 206 | elif check_env_flag('REL_WITH_DEB_INFO', env=env): 207 | build_options['CMAKE_BUILD_TYPE'] = 'RelWithDebInfo' 208 | else: 209 | build_options['CMAKE_BUILD_TYPE'] = 'Release' 210 | build_options['CMAKE_INSTALL_PREFIX'] = install_dir 211 | 212 | CMakeExtension.defines(cmake_args, **build_options) 213 | if os.path.exists(self._cmake_cache_file): 214 | try: 215 | cmake_cache_vars = defaultdict(lambda: False, self._get_cmake_cache_variables()) 216 | except FileNotFoundError: 217 | # CMakeCache.txt does not exist. Probably running "python setup.py clean" over a clean directory. 218 | cmake_cache_vars = defaultdict(lambda: False) 219 | 220 | cache_build_options = CMakeExtension.extract(cmake_args) 221 | if all(option in cmake_cache_vars and 222 | CMakeExtension._cmake_value(cache_build_options[option]) == CMakeExtension._cmake_value(cmake_cache_vars[option]) 223 | for option in cache_build_options): 224 | # Everything's in place. Do not rerun. 
225 |                 return 226 |         self._run(cmake_args, env=env) 227 | -------------------------------------------------------------------------------- /tools/setup/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import platform 4 | 5 | 6 | IS_LINUX = (platform.system() == 'Linux') 7 | 8 | BUILD_DIR = 'build' 9 | 10 | 11 | def get_compiler(runtime): 12 |     if runtime == 'dpcpp': 13 |         c_compiler = 'icx' 14 |         cpp_compiler = 'icpx' 15 |     else: 16 |         c_compiler = 'cc' 17 |         cpp_compiler = 'c++' 18 | 19 |     cc = shutil.which(c_compiler) 20 |     cpp = shutil.which(cpp_compiler) 21 |     if cpp is None or cc is None: 22 |         raise RuntimeError("couldn't find the compiler '{}' or '{}'".format(c_compiler, cpp_compiler)) 23 |     return cc, cpp 24 | 25 | 26 | def check_env_flag(name, env=os.environ, default=''): 27 |     return env.get(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y']  # look up in the caller-supplied env mapping (defaults to os.environ) so env= is honored 28 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 2.3.0 2 | --------------------------------------------------------------------------------