├── tools
│   ├── __init__.py
│   └── setup
│       ├── __init__.py
│       ├── env.py
│       └── cmake.py
├── version.txt
├── requirements.txt
├── .gitmodules
├── tests
│   ├── DeepSpeed_test
│   │   ├── Example.csv
│   │   ├── testccl_cpu.py
│   │   ├── testccl_gpu_mpi.py
│   │   ├── testccl_gpu.py
│   │   └── DeepSpeed.csv
│   ├── test_barrier.py
│   ├── run_ds_llm.sh
│   ├── test_llm_allreduce.py
│   ├── ds_subgroup_allreduce.py
│   ├── test_p2p_crossnodes.py
│   ├── README.md
│   ├── ds_p2p_crossnodes.py
│   ├── test_allreduce.py
│   ├── ddp_allreduce.py
│   ├── test_c10d_p2p.py
│   └── test_fsdp.py
├── src
│   ├── gpu
│   │   ├── README.md
│   │   ├── Makefile
│   │   ├── CMakeLists.txt
│   │   ├── runtime.hpp
│   │   ├── sycl_misc.hpp
│   │   ├── ze_exception.hpp
│   │   └── allreduce.cpp
│   ├── env.h
│   ├── test
│   │   ├── remotesync
│   │   │   ├── test.sh
│   │   │   ├── Makefile
│   │   │   ├── sycl_misc.hpp
│   │   │   ├── ze_exception.hpp
│   │   │   └── simple_test.cpp
│   │   ├── segfault
│   │   │   ├── test.sh
│   │   │   ├── Makefile
│   │   │   ├── sycl_misc.hpp
│   │   │   ├── ze_exception.hpp
│   │   │   └── simple_test.cpp
│   │   └── writeremote
│   │       ├── test.sh
│   │       ├── Makefile
│   │       ├── sycl_misc.hpp
│   │       ├── ze_exception.hpp
│   │       └── simple_test.cpp
│   ├── env.cpp
│   ├── CMakeLists.txt
│   ├── ccl_comm_collector.cpp
│   ├── ccl_comm_collector.h
│   ├── utils.cpp
│   └── ProcessGroupCCL.hpp
├── SECURITY.md
├── cmake
│   └── Modules
│       └── FindoneCCL.cmake
├── demo
│   ├── README.md
│   └── demo.py
├── oneccl_bindings_for_pytorch
│   ├── __init__.py
│   └── csrc
│       ├── _C.cpp
│       ├── init.h
│       └── init.cpp
├── LICENSE
├── .gitignore
├── CMakeLists.txt
├── third-party-programs.txt
├── setup.py
└── README.md
/tools/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/tools/setup/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/version.txt: -------------------------------------------------------------------------------- 1 | 2.8.0+xpu 2 | --------------------------------------------------------------------------------
/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.10.0 2 | setuptools 3 | --------------------------------------------------------------------------------
/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/oneCCL"] 2 | path = third_party/oneCCL 3 | url = https://github.com/uxlfoundation/oneCCL.git 4 | --------------------------------------------------------------------------------
/tests/DeepSpeed_test/Example.csv: -------------------------------------------------------------------------------- 1 | allreduce,1,-1 2 | broadcast,3072,0 3 | allreduce,1,-1 4 | broadcast,3,0 5 | broadcast,5,0 6 | reduce,1024,0 7 | reduce,11264,0 8 | allgather,1024,-1 9 | --------------------------------------------------------------------------------
/src/gpu/README.md: -------------------------------------------------------------------------------- 1 | Dependencies: 2 | 1. MPI 3 | 2. Level-Zero 4 | 3. SYCL-enabled compiler 5 | 6 | Build: 7 | make 8 | 9 | Run: 10 | ```mpirun -np allreduce -c 1024 -t ``` 11 | --------------------------------------------------------------------------------
/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation.
3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /src/env.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | enum ONECCL_BINDINGS_FOR_PYTORCH_ENV { 4 | ENV_VERBOSE = 0, 5 | ENV_WAIT_GDB 6 | }; 7 | 8 | int oneccl_bindings_for_pytorch_env(int env); 9 | 10 | static inline int oneccl_bindings_for_pytorch_verbose() { 11 | return oneccl_bindings_for_pytorch_env(ENV_VERBOSE); 12 | } 13 | 14 | static inline int oneccl_bindings_for_pytorch_wait_gdb() { 15 | return oneccl_bindings_for_pytorch_env(ENV_WAIT_GDB); 16 | } -------------------------------------------------------------------------------- /src/test/remotesync/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/test/segfault/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/test/writeremote/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/gpu/Makefile: -------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : allreduce 15 | 16 | clean: 17 | rm -f allreduce 18 | -------------------------------------------------------------------------------- /src/test/remotesync/Makefile: 
-------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : simple_test 15 | 16 | clean: 17 | rm -f simple_test 18 | -------------------------------------------------------------------------------- /src/test/segfault/Makefile: -------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : simple_test 15 | 16 | clean: 17 | rm -f simple_test 18 | -------------------------------------------------------------------------------- /src/test/writeremote/Makefile: -------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : simple_test 15 | 16 | clean: 17 | rm -f simple_test 18 | -------------------------------------------------------------------------------- /tools/setup/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import platform 4 | 5 | 6 | IS_LINUX = (platform.system() == 'Linux') 7 | 8 | BUILD_DIR = 'build' 9 | 10 | 11 | def get_compiler(runtime): 12 | if runtime == 'dpcpp': 13 | c_compiler = 'icx' 14 | cpp_compiler = 'icpx' 15 | else: 16 | c_compiler = os.environ.get('CC', 'cc') 17 | cpp_compiler = os.environ.get('CXX', 'c++') 18 | 19 | cc = shutil.which(c_compiler) 20 | cpp = shutil.which(cpp_compiler) 21 | if cpp is None or cc is None: 22 | raise RuntimeError("couldn't find the compiler '{}' or '{}'".format(c_compiler, cpp_compiler)) 23 | return cc, cpp 24 | 25 | 26 | def check_env_flag(name, env=os.environ, default=''): 27 | return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] 28 | -------------------------------------------------------------------------------- /tests/test_barrier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--device', '-dev', type=str, default='cpu', help='Device type to use: cpu, xpu') 10 | args = parser.parse_args() 11 | 12 | os.environ['RANK'] = 
str(os.environ.get('PMI_RANK', 0)) 13 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 14 | os.environ['MASTER_ADDR'] = '127.0.0.1' 15 | os.environ['MASTER_PORT'] = '29500' 16 | 17 | dist.init_process_group("ccl") 18 | rank = dist.get_rank() 19 | size = dist.get_world_size() 20 | 21 | if args.device == 'xpu': 22 | device = "xpu:{}".format(rank) 23 | else: 24 | device = 'cpu' 25 | 26 | print("Barrier using device: ", args.device) 27 | dist.barrier() 28 | print("Finish") 29 | -------------------------------------------------------------------------------- /tests/run_ds_llm.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | # run the ds_subgroup_allreduce.py 3 | # for OAM (sub_group=2/4) 4 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=2 5 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=4 6 | # for Aurora System(TP=2/3/4/6) 7 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=2 8 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=3 9 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=4 10 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=6 11 | 12 | # should run the ds_p2p_crossnodes.py on 3 nodes 13 | # -host is the name for this 3 nodes 14 | # --dist_url is the IP on your node, you can use (hostname -I) to get. 15 | mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 python -u ds_p2p_crossnodes.py --dist_url 10.0.1.141 --world_size 36 16 | -------------------------------------------------------------------------------- /src/gpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CCL_DPCPP_SRCS dpcpp_ccl.cpp ze_exception.hpp allreduce.h sycl_misc.hpp runtime.hpp cxxopts.hpp) 2 | 3 | set_source_files_properties(${CCL_DPCPP_SRCS} PROPERTIES COMPILE_DEFINITIONS "USE_DPCPP;__STRICT_ANSI__") 4 | set_source_files_properties(${CCL_DPCPP_SRCS} PROPERTIES COMPILE_FLAGS -fsycl) 5 | 6 | add_library(oneccl_bindings_for_pytorch_xpu SHARED ${CCL_DPCPP_SRCS}) 7 | 8 | target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC ${DEPENDS_LIB}) 9 | target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC oneccl_bindings_for_pytorch) 10 | 11 | foreach(RPATH ${CMAKE_INSTALL_RPATH}) 12 | set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}") 13 | endforeach() 14 | set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,--disable-new-dtags") 15 | 16 | install(TARGETS oneccl_bindings_for_pytorch_xpu LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") 17 | 18 | -------------------------------------------------------------------------------- /src/env.cpp: -------------------------------------------------------------------------------- 1 | #include "env.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * All available launch options for ONECCL_BINDINGS_FOR_PYTORCH 7 | * ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE: Default = 0, Set verbose level in ONECCL_BINDINGS_FOR_PYTORCH 8 | * ONECCL_BINDINGS_FOR_PYTORCH_ENV_WAIT_GDB: Default = 0, Set 1 to force the oneccl_bindings_for_pytorch wait for GDB attaching 9 | */ 10 | 11 | #define ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(var) \ 12 | int var = [&]() -> int { \ 13 | if (auto env = std::getenv("ONECCL_BINDINGS_FOR_PYTORCH_" #var)) \ 14 | return std::stoi(env, 0, 10); \ 15 | return 0; \ 16 | } () 17 | 18 | int oneccl_bindings_for_pytorch_env(int 
env_type) { 19 | 20 | static struct { 21 | ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(ENV_VERBOSE); 22 | ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(ENV_WAIT_GDB); 23 | } env; 24 | 25 | switch (env_type) { 26 | case ENV_VERBOSE: 27 | return env.ENV_VERBOSE; 28 | case ENV_WAIT_GDB: 29 | return env.ENV_WAIT_GDB; 30 | default: 31 | return 0; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /cmake/Modules/FindoneCCL.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find oneCCL 2 | # 3 | # The following are set after configuration is done: 4 | # ONECCL_FOUND : set to true if oneCCL is found. 5 | # ONECCL_INCLUDE_DIRS : path to oneCCL include dir. 6 | # ONECCL_LIBRARIES : list of libraries for oneCCL 7 | # 8 | # and the following imported targets: 9 | # 10 | # oneCCL 11 | 12 | IF (NOT ONECCL_FOUND) 13 | SET(ONECCL_FOUND OFF) 14 | SET(ONECCL_LIBRARIES) 15 | SET(ONECCL_INCLUDE_DIRS) 16 | 17 | SET(ONECCL_ROOT "${PROJECT_SOURCE_DIR}/third_party/oneCCL") 18 | 19 | IF(BUILD_NO_ONECCL_PACKAGE) 20 | ADD_SUBDIRECTORY(${ONECCL_ROOT} oneCCL EXCLUDE_FROM_ALL) 21 | ELSE() 22 | ADD_SUBDIRECTORY(${ONECCL_ROOT}) 23 | ENDIF() 24 | 25 | IF(NOT TARGET ccl) 26 | MESSAGE(FATAL_ERROR "Failed to find oneCCL target") 27 | ENDIF() 28 | add_library(oneCCL ALIAS ccl) 29 | 30 | GET_TARGET_PROPERTY(INCLUDE_DIRS oneCCL INCLUDE_DIRECTORIES) 31 | SET(ONECCL_INCLUDE_DIRS ${INCLUDE_DIRS}) 32 | SET(ONECCL_LIBRARIES oneCCL) 33 | 34 | find_package_handle_standard_args(oneCCL FOUND_VAR ONECCL_FOUND REQUIRED_VARS ONECCL_LIBRARIES ONECCL_INCLUDE_DIRS) 35 | 36 | set(MPI_INCLUDE_DIR "${ONECCL_ROOT}/deps/mpi/include/") 37 | set(MPI_LIB_DIR "${ONECCL_ROOT}/deps/mpi/lib/") 38 | 39 | ENDIF(NOT ONECCL_FOUND) 40 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Simple Demo for Intel® oneCCL Bindings for PyTorch* 2 | 3 | This simple demo show case the functionality for collective communication primitives in Intel® oneCCL Bindings for PyTorch*. 4 | 5 | ## Single Node Run 6 | To run the simple demo on a single node with 2 instances, run: 7 | 8 | ```bash 9 | mpirun -n 2 -l python demo.py 10 | 11 | ``` 12 | The demo could be also run on XPU with " --device xpu " argument. 13 | 14 | ```bash 15 | mpirun -n 2 -l python demo.py --device xpu 16 | ``` 17 | 18 | ## Multiple Nodes Run 19 | To run the simple demo on multiple nodes, please follow below instructions: 20 | 21 | ### Ethernet 22 | 1. Identify the network interface name for collective communication. ex: eth0 23 | 2. Identify the IPs of all nodes. ex: 10.0.0.1,10.0.0.2 24 | 3. Identify the master node IP. ex: 10.0.0.1 25 | 4. Set the value of np for the total number of instances. ex: 2 26 | 5. Set the value of ppn for the number of instance per node. ex: 1 27 | 28 | Here is a run command example for cpu according to above steps: 29 | 30 | ```bash 31 | FI_TCP_IFACE=eth0 I_MPI_OFI_PROVIDER=tcp I_MPI_HYDRA_IFACE=eth0 I_MPI_DEBUG=121 mpirun -host 10.0.0.1,10.0.0.2 -np 2 -ppn 1 --map-by node python demo.py --device cpu --dist_url 10.0.0.1 --dist_port 29500 32 | ``` 33 | The demo could be also run on XPU by changing " --device cpu " to " --device xpu " argument. 
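For orientation, here is a minimal sketch of how the `--dist_url`/`--dist_port` arguments above are typically consumed on each rank. The pattern mirrors the test scripts in this repository (e.g. `ds_subgroup_allreduce.py`); `demo.py` itself may differ in detail, and the address, port, and tensor size below are placeholder values, not taken from the demo:

```python
# Hedged sketch, not the literal demo.py: initialize the "ccl" backend from an
# MPI launch and run one allreduce. Address/port are example values.
import os
import torch
import torch.distributed as dist
import oneccl_bindings_for_pytorch  # noqa: F401  (registers the "ccl" backend)

dist_url, dist_port = "10.0.0.1", "29500"             # master node IP / port
init_method = "tcp://" + dist_url + ":" + dist_port

rank = int(os.environ.get("PMI_RANK", 0))             # set by mpirun
world_size = int(os.environ.get("PMI_SIZE", 1))
dist.init_process_group(backend="ccl", init_method=init_method,
                        world_size=world_size, rank=rank)

data = torch.ones(4)                                  # move to f"xpu:{rank}" for --device xpu
dist.all_reduce(data)                                 # every element becomes world_size
print(rank, data)
```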
34 | 35 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | import torch 5 | 6 | 7 | cwd = os.path.dirname(os.path.abspath(__file__)) 8 | if not os.path.exists(os.path.join(cwd, "version.py")): 9 | raise RuntimeError("oneccl_bindings_for_pytorch is not installed!") 10 | 11 | 12 | def set_env_default(env, key, value): 13 | new_value = env.get(key, value) 14 | env[key] = new_value 15 | 16 | from .version import __version__, git_version 17 | from . import _C as ccl_lib 18 | 19 | if hasattr(torch, 'xpu') and torch.xpu._is_compiled(): 20 | try: 21 | # load the CCL/XPU library 22 | import ctypes 23 | my_c_library = ctypes.cdll.LoadLibrary(os.path.join(cwd, "lib/liboneccl_bindings_for_pytorch_xpu.so")) 24 | except OSError as e: 25 | warnings.warn(f"Warning: Cannot load xpu CCL. CCL doesn't work for XPU device due to {e}") 26 | 27 | __all__ = [] 28 | __all__ += [name for name in dir(ccl_lib) 29 | if name[0] != '_' and 30 | not name.endswith('Base')] 31 | 32 | 33 | def is_available(tensors): 34 | devices = set() 35 | for tensor in tensors: 36 | if not tensor.is_contiguous(): 37 | return False 38 | device = tensor.get_device() 39 | if device in devices: 40 | return False 41 | devices.add(device) 42 | 43 | return True 44 | 45 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CCL_SRCS ProcessGroupCCL.cpp dispatch_stub.cpp utils.cpp ccl_comm_collector.cpp env.cpp) 2 | set(CCL_CPU_SRCS cpu/cpu_ccl.cpp) 3 | add_library(oneccl_bindings_for_pytorch SHARED ${CCL_SRCS} ${CCL_CPU_SRCS}) 4 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES OUTPUT_NAME ${LIB_NAME}) 5 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES POSITION_INDEPENDENT_CODE ON) 6 | 7 | target_compile_options(oneccl_bindings_for_pytorch PUBLIC -Wall 8 | -Wno-sign-compare 9 | -Wno-unused-function) 10 | 11 | if(COMPUTE_BACKEND STREQUAL "dpcpp") 12 | add_subdirectory(./gpu) 13 | add_definitions (-DUSE_GPU) 14 | target_compile_options(oneccl_bindings_for_pytorch PUBLIC -fsycl) 15 | target_link_options(oneccl_bindings_for_pytorch PUBLIC -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -options -vc-codegen") 16 | endif() 17 | 18 | target_include_directories(oneccl_bindings_for_pytorch PUBLIC ./) 19 | 20 | target_link_libraries(oneccl_bindings_for_pytorch PUBLIC ${DEPENDS_LIB}) 21 | 22 | foreach(RPATH ${CMAKE_INSTALL_RPATH}) 23 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}") 24 | endforeach() 25 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES LINK_FLAGS "-Wl,--disable-new-dtags") 26 | 27 | install(TARGETS oneccl_bindings_for_pytorch LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020-2021, Intel Corporation 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the Intel Corporation nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /tests/test_llm_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 8 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 9 | os.environ['MASTER_ADDR'] = '127.0.0.1' 10 | os.environ['MASTER_PORT'] = '29500' 11 | dist.init_process_group("ccl") 12 | rank = dist.get_rank() 13 | size = dist.get_world_size() 14 | 15 | device = "xpu:{}".format(rank) 16 | llm_shapes = [ 17 | # GPT-J 6B 18 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 19 | # Llama 7B 20 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 21 | # Llama 13B 22 | (1, 32, 5120), (1, 1024, 5120), (1, 4, 5120), (1, 1, 5120), 23 | # Llama2 7B 24 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 25 | # Llama2 13B 26 | (1, 32, 5120), (1, 1024, 5120), (1, 4, 5120), (1, 1, 5120), 27 | # Llama2 70B 28 | (1, 32, 8192), (1, 1024, 8192), (1, 1, 8192), (1, 4, 8192), 29 | # OPT 6.7B 30 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 31 | # OPT 30B 32 | (1, 32, 7168), (1, 1, 7168), (1, 1024, 7168), (1, 4, 7168), 33 | # Bloom 7B 34 | (1, 33, 4096), (1, 1, 4096), (1, 4, 4096), (1, 1028, 4096), 35 | # Bloom 176B 36 | (1, 4, 14336), (1, 1028, 14336), (1, 33, 14336), (1, 1, 14336) 37 | ] 38 | 39 | os.environ['TORCH_LLM_ALLREDUCE_DEBUG'] = '1' 40 | for shape in llm_shapes: 41 | data = torch.rand(shape, dtype=torch.float16).to(device) 42 | # Expected value is identical to input for average allreduce. 43 | expect_result = data 44 | # Allreduce is an inplace op, data will represent output. 
45 | dist.all_reduce(data) 46 | assert torch.allclose(data, expect_result) 47 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/_C.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "init.h" 33 | 34 | PYBIND11_MODULE(_C, m) { 35 | torch_ccl_python_init(m); 36 | } -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/init.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #pragma once 33 | 34 | #include 35 | 36 | #define TORCH_CCL_CPP_API __attribute__ ((visibility ("default"))) 37 | 38 | void torch_ccl_python_init(pybind11::module &m); 39 | -------------------------------------------------------------------------------- /src/gpu/runtime.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | sycl::queue currentQueue(int ndev, int nsub) { 32 | switch(ndev) { 33 | case 0: 34 | if (nsub == 0) 35 | return getQueue<0,0>(); 36 | else 37 | return getQueue<0,1>(); 38 | break; 39 | case 1: 40 | if (nsub == 0) 41 | return getQueue<1,0>(); 42 | else 43 | return getQueue<1,1>(); 44 | break; 45 | } 46 | throw std::exception(); 47 | } 48 | 49 | sycl::device currentSubDevice(int ndev, int nsub) { 50 | switch(ndev) { 51 | case 0: 52 | if (nsub == 0) 53 | return getSubDevice<0,0>(); 54 | else 55 | return getSubDevice<0,1>(); 56 | break; 57 | case 1: 58 | if (nsub == 0) 59 | return getSubDevice<1,0>(); 60 | else 61 | return getSubDevice<1,1>(); 62 | break; 63 | } 64 | throw std::exception(); 65 | } 66 | 67 | static uint32_t g_dev_num = 1; 68 | static uint32_t g_part_num = 0; 69 | 70 | sycl::device currentSubDevice() { 71 | return currentSubDevice(g_dev_num, g_part_num); 72 | } 73 | 74 | sycl::queue currentQueue() { 75 | return currentQueue(g_dev_num, g_part_num); 76 | } 77 | -------------------------------------------------------------------------------- /src/gpu/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return 
getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/remotesync/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/segfault/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | 
sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/writeremote/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | 
-------------------------------------------------------------------------------- /src/ccl_comm_collector.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "ccl_comm_collector.h" 5 | #include "utils.h" 6 | 7 | 8 | namespace oneccl_bindings_for_pytorch { 9 | 10 | ccl::shared_ptr_class CCLCommCollector::get_kvs(int rank, c10d::Store& store, 11 | bool singleP2POp = false, const std::string& p2pKey = "", int p2pRank = 0) { 12 | 13 | std::string storeKey; 14 | 15 | if (!singleP2POp) { 16 | storeKey = std::to_string(ccl_comms.size()); 17 | } else { 18 | storeKey = p2pKey; 19 | } 20 | // Rank 0 broadcast the bootstrap network information to other ranks 21 | if (rank == 0 || (singleP2POp && p2pRank == 0)) { 22 | call_with_lock(c10d::ProcessGroupCCL::globalMutex, [&]() { 23 | kvs = ccl::create_main_kvs(); 24 | }); 25 | ccl::kvs::address_type main_addr = kvs->get_address(); 26 | auto ccl_kvs_addr = std::vector(main_addr.begin(), main_addr.end()); 27 | store.set(storeKey, ccl_kvs_addr); 28 | } 29 | else { 30 | auto ccl_kvs_addr = store.get(storeKey); 31 | if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { 32 | throw std::runtime_error( 33 | "Unexpected ccl kvs addr from the store\n"); 34 | } 35 | ccl::kvs::address_type main_addr; 36 | std::copy_n(std::make_move_iterator(ccl_kvs_addr.begin()), 37 | ccl::kvs::address_max_size, 38 | main_addr.begin()); 39 | call_with_lock(c10d::ProcessGroupCCL::globalMutex, [&]() { 40 | kvs = ccl::create_kvs(main_addr); 41 | }); 42 | } 43 | 44 | return kvs; 45 | } 46 | 47 | std::shared_ptr CCLCommCollector::get_comms(const std::string& devices_key) { 48 | if (ccl_comms.find(devices_key) != ccl_comms.end()) { 49 | // Reuse the cached communicator if there is one. 
50 | return ccl_comms[devices_key]; 51 | } 52 | return {nullptr}; 53 | } 54 | 55 | void CCLCommCollector::add_comms(const std::string& devices_key, 56 | std::shared_ptr comms) { 57 | if (ccl_comms.find(devices_key) != ccl_comms.end()) { 58 | // Replace the cached comms 59 | ccl_comms[devices_key] = comms; 60 | } else { 61 | ccl_comms.emplace(devices_key, comms); 62 | } 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /tests/ds_subgroup_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | import time 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 11 | parser.add_argument('--dist_port', default='29500', type=str, help='url port used to set up distributed training') 12 | parser.add_argument('--sub_group', default=4, type=int, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | if 'PMI_RANK' in os.environ.keys() and 'PMI_SIZE' in os.environ.keys(): 16 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 17 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) # mpich set 18 | elif 'PMIX_RANK' in os.environ.keys() and 'PALS_LOCAL_SIZE' in os.environ.keys(): 19 | os.environ['RANK'] = os.environ.get('PMIX_RANK') 20 | os.environ['WORLD_SIZE'] = str(os.environ.get('PALS_LOCAL_SIZE', -1)) 21 | 22 | os.environ['MASTER_ADDR'] = '127.0.0.1' 23 | os.environ['MASTER_PORT'] = '29500' 24 | 25 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 26 | dist.init_process_group(backend='ccl', init_method=init_method, 27 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 28 | 29 | rank = dist.get_rank() 30 | size = dist.get_world_size() 31 | device = "xpu:{}".format(rank) 32 | print('world_size:{}, global rank:{}'.format(size, rank)) 33 | 34 | shape = int(2048) 35 | warm_shape = int(1) 36 | warm = torch.ones(warm_shape).bfloat16().to(device) 37 | 38 | input_shape = shape 39 | input = torch.ones(input_shape).bfloat16().to(device) 40 | 41 | #warm_up 42 | dist.all_reduce(warm) 43 | 44 | #sub_group=1(TP=12) 45 | group1 = dist.new_group([0]) 46 | if rank ==0: 47 | dist.all_reduce(input, group=group1) 48 | 49 | group_size = [[i+(size // args.sub_group)*j for j in range(args.sub_group)] for i in range(size // args.sub_group)] 50 | sub_group = [] 51 | 52 | #construct sub group 53 | for i in range(len(group_size)): 54 | sub_group.append(dist.new_group(group_size[i])) 55 | 56 | for i in range(len(group_size)): 57 | if dist.get_rank() in group_size[i]: 58 | dist.all_reduce(input, group=sub_group[i]) 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # setup.py uses the list of patterns in this file to decide 2 | # what to delete when clean up 3 | 4 | .coverage 5 | .hypothesis 6 | .mypy_cache 7 | */*.pyc 8 | */*.so* 9 | */**/__pycache__ 10 | */**/*.dylib* 11 | */**/*.pyc 12 | */**/*.pyd 13 | */**/*.so* 14 | */**/**/*.pyc 15 | */**/**/**/*.pyc 16 | */**/**/**/**/*.pyc 17 | 18 | oneccl_bindings_for_pytorch/include/ 19 | oneccl_bindings_for_pytorch/lib/ 20 | oneccl_bindings_for_pytorch/bin/ 21 | oneccl_bindings_for_pytorch/etc/ 22 | 
oneccl_bindings_for_pytorch/env/ 23 | oneccl_bindings_for_pytorch/examples/ 24 | oneccl_bindings_for_pytorch/licensing/ 25 | oneccl_bindings_for_pytorch/modulefiles/ 26 | oneccl_bindings_for_pytorch/version.py 27 | 28 | ## General 29 | 30 | # Debug Shell Script 31 | *.sh 32 | 33 | # Compiled Object files 34 | *.slo 35 | *.lo 36 | *.o 37 | *.cuo 38 | *.obj 39 | 40 | # Compiled Dynamic libraries 41 | *.so 42 | *.dylib 43 | *.dll 44 | 45 | # Compiled Static libraries 46 | *.lai 47 | *.la 48 | *.a 49 | *.lib 50 | 51 | # Compiled protocol buffers 52 | *.pb.h 53 | *.pb.cc 54 | *_pb2.py 55 | 56 | # Compiled python 57 | *.pyc 58 | *.pyd 59 | 60 | # Compiled MATLAB 61 | *.mex* 62 | 63 | # IPython notebook checkpoints 64 | .ipynb_checkpoints 65 | 66 | # Editor temporaries 67 | *.swn 68 | *.swo 69 | *.swp 70 | *~ 71 | 72 | # Sublime Text settings 73 | *.sublime-workspace 74 | *.sublime-project 75 | 76 | # Eclipse Project settings 77 | *.*project 78 | .settings 79 | 80 | # Files generated by CLion 81 | cmake-build-debug 82 | 83 | # QtCreator files 84 | *.user 85 | 86 | # OSX dir files 87 | .DS_Store 88 | 89 | # GDB history 90 | .gdb_history 91 | 92 | ## Caffe2 93 | 94 | # build, distribute, and bins (+ python proto bindings) 95 | build 96 | /build_* 97 | .build_debug/* 98 | .build_release/* 99 | distribute/* 100 | dist/ 101 | *.testbin 102 | *.bin 103 | cmake_build 104 | .cmake_build 105 | gen 106 | .setuptools-cmake-build 107 | 108 | # setup.py intermediates 109 | .eggs 110 | oneccl_bindings_for_pytorch.egg-info 111 | oneccl_bind_pt.egg-info 112 | 113 | # Files generated by ctags 114 | CTAGS 115 | tags 116 | TAGS 117 | 118 | # BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.) 119 | # 120 | # Below files are not deleted by "setup.py clean". 121 | 122 | # Visual Studio Code files 123 | .vscode 124 | .vs 125 | .idea 126 | 127 | # Files generated when a patch is rejected 128 | *.orig 129 | *.rej 130 | -------------------------------------------------------------------------------- /src/gpu/ze_exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | 
{ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not 
signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported 
features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 0: 34 | os.environ['RANK'] = str(mpi_rank) 35 | os.environ['WORLD_SIZE'] = str(mpi_world_size) 36 | else: 37 | # set the default rank and world size to 0 and 1 38 | os.environ['RANK'] = str(os.environ.get('RANK', 0)) 39 | os.environ['WORLD_SIZE'] = str(os.environ.get('WORLD_SIZE', 1)) 40 | os.environ['MASTER_ADDR'] = '127.0.0.1' # your master address 41 | os.environ['MASTER_PORT'] = '29500' # your master port 42 | rank = int(os.environ.get('PMI_RANK', -1)) # global rank 43 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 44 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 45 | 46 | # Initialize the process group with ccl backend 47 | dist.init_process_group(backend='ccl', init_method=init_method, world_size=world_size, rank=rank) 48 | 49 | local_rank = os.environ['MPI_LOCALRANKID'] 50 | if args.device == 'xpu': 51 | device = "xpu:{}".format(local_rank) 52 | else: 53 | device = 'cpu' 54 | 55 | model = Model().to(device) 56 | if dist.get_world_size() > 1: 57 | model = DDP(model, device_ids=[device] if (device != 'cpu') else None) 58 | 59 | optimizer = torch.optim.SGD(model.parameters(), lr=0.001) 60 | loss_fn = nn.MSELoss().to(device) 61 | for i in range(3): 62 | print("Runing Iteration: {} on device {}".format(i, device)) 63 | input = torch.randn(2, 4).to(device) 64 | labels = torch.randn(2, 5).to(device) 65 | # forward 66 | print("Runing forward: {} on device {}".format(i, device)) 67 | res = model(input) 68 | # loss 69 | print("Runing loss: {} on device {}".format(i, device)) 70 | L = loss_fn(res, labels) 71 | # backward 72 | print("Runing backward: {} on device {}".format(i, device)) 73 | with torch.autograd.profiler_legacy.profile(enabled=True) as prof: 74 | L.backward() 75 | #print(prof) 76 | # update 77 | print("Runing optim: {} on device {}".format(i, device)) 78 | optimizer.step() 79 | print("Finish") -------------------------------------------------------------------------------- /tests/test_p2p_crossnodes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--world_size', default=-1, type=int, help='number of gpu for distributed training') 11 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 12 | parser.add_argument('--dist_port', default='29800', type=str, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | os.environ['RANK'] = str(os.environ.get('PMIX_RANK',0)) 16 | os.environ['WORLD_SIZE'] = str(args.world_size) 17 | os.environ['MASTER_ADDR'] = '127.0.0.1' 18 | os.environ['MASTER_PORT'] = '29500' 19 | 20 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 21 | 
dist.init_process_group(backend='ccl', init_method=init_method, 22 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 23 | 24 | rank = dist.get_rank() 25 | size = dist.get_world_size() 26 | local_rank = os.environ['PALS_LOCAL_RANKID'] 27 | device = "xpu:{}".format(local_rank) 28 | print('world_size:{}, global rank:{}, local_rank:{}'.format(size, rank, local_rank)) 29 | 30 | # allreduce is WA 31 | data = torch.randn(2, dtype=torch.float32).to(device) 32 | dist.all_reduce(data) 33 | 34 | def send_tensor(buffer, recv_stage): 35 | if isinstance(buffer, torch.Tensor): 36 | type_tensor = torch.LongTensor(data=[0]).to(device) 37 | dist.send(type_tensor, recv_stage) 38 | send_shape = torch.LongTensor(data=buffer.size()).to(device) 39 | send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(device) 40 | dist.send(send_ndims, recv_stage) 41 | dist.send(send_shape, recv_stage) 42 | 43 | def recv_tensor(send_stage): 44 | type_tensor = torch.LongTensor(data=[0]).to(device) 45 | dist.recv(type_tensor, send_stage) 46 | recv_type = type_tensor.item() 47 | 48 | if recv_type == 0: 49 | recv_ndims = torch.LongTensor(data=[0]).to(device) 50 | dist.recv(recv_ndims, send_stage) 51 | recv_ndims = recv_ndims.item() 52 | recv_shape = torch.LongTensor([1] * recv_ndims).to(device) 53 | dist.recv(recv_shape, send_stage) 54 | print("recv_ndims", recv_ndims) 55 | print("recv_shape", recv_shape) 56 | else: 57 | print("----------------error-------------------") 58 | size = dist.get_world_size() 59 | device = "xpu:{}".format(local_rank) 60 | 61 | data = torch.randn(1, dtype=torch.float32).to(device) 62 | dist.all_reduce(data) 63 | 64 | # rank1 -> rank3 -> rank15 -> rank23 -> rank8 65 | if rank == 1: 66 | tensor = torch.ones(2048,3,256).xpu(device) 67 | send_tensor(tensor, 3) 68 | if rank == 3: 69 | recv_tensor(1) 70 | tensor = torch.ones(2048,3,256).xpu(device) 71 | send_tensor(tensor, 15) 72 | if rank == 15: 73 | recv_tensor(3) 74 | tensor = torch.ones(2048,3,256).xpu(device) 75 | send_tensor(tensor, 23) 76 | if rank == 23: 77 | recv_tensor(15) 78 | tensor = torch.ones(2048,3,256).xpu(device) 79 | send_tensor(tensor, 8) 80 | if rank == 8: 81 | recv_tensor(23) 82 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Intel® oneCCL Bindings for PyTorch* unit tests 2 | 3 | These tests validate the functionality and performance for collective communication primitives in Intel® oneCCL Bindings for PyTorch*. 
4 | 5 | ## functionality validation of collective communication primitives 6 | To start the test_c10d_ccl.py test, run: 7 | 8 | ```bash 9 | python test_c10d_ccl.py 10 | ``` 11 | 12 | ## functionality validation of point-to-point communication primitives 13 | For the within-card and cross-card p2p tests, run: 14 | 15 | ```bash 16 | python test_c10d_p2p.py 17 | ``` 18 | 19 | For the cross-node p2p test, run: 20 | 21 | ```bash 22 | # MPICH 23 | mpiexec -host nodeA,nodeB -np 24 -ppn 12 python -u test_p2p_crossnodes.py --dist_url $NODE_IP --world_size 24 24 | ``` 25 | 26 | ## functionality validation of barrier 27 | For the CPU barrier, run: 28 | 29 | ```bash 30 | mpirun -np 2 python test_barrier.py 31 | ``` 32 | 33 | For the XPU barrier (requires a build with "COMPUTE_BACKEND=dpcpp"), run: 34 | 35 | ```bash 36 | mpirun -np 2 python test_barrier.py --device xpu 37 | ``` 38 | 39 | ## broadcast/allreduce profiling 40 | To start the broadcast/allreduce profiling test, run: 41 | 42 | ```bash 43 | mpirun -np 12 -ppn 12 python ddp_allreduce.py --warm 10 --iter 20 --fixed 44 | ``` 45 | 46 | ## DeepSpeed test 47 | CPU test: 48 | ```bash 49 | python testccl_cpu.py 50 | ``` 51 | 52 | GPU test (runs on 1 node with 6 cards / 12 tiles): 53 | ```bash 54 | python testccl_gpu.py --world_size 12 55 | ``` 56 | GPU scale-out test (runs on 2 nodes with 24 ranks): 57 | ```bash 58 | mpirun -np 24 -ppn 12 python testccl_gpu_mpi.py 59 | ``` 60 | 61 | Note: this unit test is a stress test and takes a long time to start. You may need to wait ~5 minutes before the log line "starting to initialize tensors ..." appears. 62 | 63 | ## allreduce of LLM path 64 | This test case takes a special allreduce path on the XPU device when the number of launched ranks (-np) is <= 8. Run: 65 | ```bash 66 | mpirun -np 2 python test_llm_allreduce.py 67 | ``` 68 | If you want to disable this path and use oneCCL allreduce instead, set TORCH_CCL_GPU_ALLREDUCE to 0. Run: 69 | ```bash 70 | TORCH_CCL_GPU_ALLREDUCE=0 mpirun -np 2 python test_llm_allreduce.py ``` 71 | ## Test Functionality of FSDP 72 | ```bash 73 | export CCL_ZE_IPC_EXCHANGE=sockets # for pytorch multiprocessing launch 74 | python test_fsdp.py 75 | ``` 76 | 77 | ## subgroup tests (ds_subgroup_allreduce.py) 78 | For OAM (sub_group=2/4): 79 | ```bash 80 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=2 81 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=4 82 | ``` 83 | For the Aurora system (TP=2/3/4/6): 84 | ```bash 85 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=2 86 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=3 87 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=4 88 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=6 89 | ``` 90 | 91 | ## DeepSpeed scale-out tests 92 | The ds_p2p_crossnodes.py test case should be run on 3 nodes: 93 | ```bash 94 | mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 python -u ds_p2p_crossnodes.py --dist_url 10.0.1.141 --world_size 36 95 | ``` 96 | -host lists the hostnames of the 3 nodes. 97 | --dist_url is the IP of your node; you can get it with (hostname -I).
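A minimal launch sketch, assuming the same three sample hostnames as in the command above and that the first address printed by `hostname -I` is the one reachable from all nodes (adjust both for your cluster):

```bash
# Assumption: the first address reported by hostname -I is routable from every node.
NODE_IP=$(hostname -I | awk '{print $1}')
mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 \
    python -u ds_p2p_crossnodes.py --dist_url ${NODE_IP} --world_size 36
```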
-------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | 6 | world_size = 12 7 | rounds = 100 8 | # input_file = "Example.csv" 9 | input_file = "DeepSpeed.csv" 10 | 11 | type = torch.float16 12 | 13 | 14 | def worker(given_rank): 15 | os.environ['MASTER_ADDR'] = 'localhost' 16 | os.environ['MASTER_PORT'] = '6789' 17 | os.environ['WORLD_SIZE'] = str(world_size) 18 | os.environ['RANK'] = str(given_rank) 19 | 20 | dist.init_process_group(backend = 'gloo') 21 | rank = int(dist.get_rank()) 22 | 23 | device = "cpu" 24 | 25 | ops, sizes, roots = read_file(input_file) 26 | test_ccl(ops, sizes, roots, device, rank, rounds) 27 | 28 | 29 | def main(): 30 | 31 | process_list = [] 32 | for i in range(world_size): 33 | p = Process(target=worker, args=(i,)) 34 | p.start() 35 | process_list.append(p) 36 | 37 | for p in process_list: 38 | p.join() 39 | 40 | def read_file(filename): 41 | ops = [] 42 | sizes = [] 43 | roots = [] 44 | f = open(filename, "r") 45 | for line in f: 46 | op, size, root = line.strip().split(",") 47 | size = int(size) 48 | root = int(root) 49 | if root >= world_size: 50 | print("Invalid root {}".format(root)) 51 | exit() 52 | ops.append(op) 53 | sizes.append(size) 54 | roots.append(root) 55 | f.close() 56 | return ops, sizes, roots 57 | 58 | def test_ccl(ops, sizes, roots, device, rank, rounds): 59 | input = [] 60 | output = [] 61 | print("Rank {}: starting to initialize tensors ...".format(rank)) 62 | for i in range(0, len(sizes)): 63 | data = torch.randn(sizes[i], dtype = type) 64 | data.to(device) 65 | input.append(data) 66 | if ops[i] == 'allgather': 67 | tmp_output = [] 68 | for j in range(0, world_size): 69 | data = torch.randn(sizes[i], dtype = type) 70 | data.to(device) 71 | tmp_output.append(data) 72 | output.append(tmp_output) 73 | else: 74 | output.append(data) 75 | print("Rank {}: tensors initialization finished!".format(rank)) 76 | for k in range(0, rounds): 77 | for i in range(0, len(ops)): 78 | if ops[i] == 'reduce': 79 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i]))) 80 | dist.reduce(input[i], roots[i], async_op=False) 81 | if ops[i] == 'allreduce': 82 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i]))) 83 | dist.all_reduce(input[i], async_op=False) 84 | if ops[i] == 'allgather': 85 | print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i]))) 86 | dist.all_gather(output[i], input[i], async_op=False) 87 | if ops[i] == 'broadcast': 88 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i]))) 89 | dist.broadcast(input[i], roots[i], async_op=False) 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /tests/ds_p2p_crossnodes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--world_size', default=-1, type=int, help='number of gpu for distributed training') 11 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to 
set up distributed training') 12 | parser.add_argument('--dist_port', default='29600', type=str, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | os.environ['RANK'] = str(os.environ.get('PMIX_RANK',0)) 16 | os.environ['WORLD_SIZE'] = str(args.world_size) 17 | os.environ['MASTER_ADDR'] = '127.0.0.1' 18 | os.environ['MASTER_PORT'] = '29600' 19 | 20 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 21 | dist.init_process_group(backend='ccl', init_method=init_method, 22 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 23 | 24 | rank = dist.get_rank() 25 | print("-----global rank: ", rank) 26 | size = dist.get_world_size() 27 | local_rank = os.environ['PALS_LOCAL_RANKID'] 28 | device = "xpu:{}".format(local_rank) 29 | print('world_size:{}, global rank:{}, local_rank:{}'.format(size, rank, local_rank)) 30 | 31 | def send_tensor(buffer, recv_stage): 32 | if isinstance(buffer, torch.Tensor): 33 | type_tensor = torch.LongTensor(data=[0]).to(device) 34 | dist.send(type_tensor, recv_stage) 35 | send_shape = torch.LongTensor(data=buffer.size()).to(device) 36 | send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(device) 37 | dist.send(send_ndims, recv_stage) 38 | dist.send(send_shape, recv_stage) 39 | 40 | def recv_tensor(send_stage): 41 | type_tensor = torch.LongTensor(data=[0]).to(device) 42 | dist.recv(type_tensor, send_stage) 43 | recv_type = type_tensor.item() 44 | 45 | if recv_type == 0: 46 | recv_ndims = torch.LongTensor(data=[0]).to(device) 47 | dist.recv(recv_ndims, send_stage) 48 | recv_ndims = recv_ndims.item() 49 | recv_shape = torch.LongTensor([1] * recv_ndims).to(device) 50 | dist.recv(recv_shape, send_stage) 51 | print("recv_ndims", recv_ndims) 52 | print("recv_shape", recv_shape) 53 | else: 54 | print("----------------error-------------------") 55 | 56 | size = dist.get_world_size() 57 | device = "xpu:{}".format(local_rank) 58 | 59 | data = torch.randn(1, dtype=torch.float32).to(device) 60 | dist.all_reduce(data) 61 | 62 | # send/recv(rank0 -> rank12 -> rank24) 63 | if rank <= 11: 64 | tensor = torch.ones(2048,3,256).xpu(device) 65 | send_tensor(tensor, rank+12) 66 | elif rank >= 24 : 67 | recv_tensor(rank-12) 68 | else: 69 | recv_tensor(rank-12) 70 | tensor = torch.ones(2048,3,256).xpu(device) 71 | send_tensor(tensor, rank+12) 72 | print("-----finished send/recv-----") 73 | 74 | # all_gather_base after p2p 75 | torch.distributed.barrier() 76 | world_size=36 77 | device = "xpu:{}".format(local_rank) 78 | rank_name_to_time = torch.zeros((world_size, 2), 79 | dtype=torch.float, 80 | device=device) 81 | 82 | torch.distributed._all_gather_base(rank_name_to_time.view(-1), 83 | rank_name_to_time[rank, :].view(-1)) 84 | print("all_gather is done") 85 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_gpu_mpi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | import intel_extension_for_pytorch 6 | import oneccl_bindings_for_pytorch 7 | import argparse 8 | import sys 9 | 10 | rounds = 40 11 | # input_file = "Example.csv" 12 | input_file = "DeepSpeed.csv" 13 | 14 | data_type = torch.bfloat16 15 | 16 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 17 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 18 | os.environ['MASTER_ADDR'] = '127.0.0.1' 19 | 
os.environ['MASTER_PORT'] = '29500' 20 | dist.init_process_group("ccl") 21 | rank = dist.get_rank() 22 | world_size = dist.get_world_size() 23 | 24 | def main(): 25 | torch.xpu.set_device(rank) 26 | device = "xpu:{}".format(rank) 27 | ops, sizes, roots = read_file(input_file) 28 | test_ccl(ops, sizes, roots, device, rank, rounds) 29 | 30 | def read_file(filename): 31 | ops = [] 32 | sizes = [] 33 | roots = [] 34 | f = open(filename, "r") 35 | for line in f: 36 | op, size, root = line.strip().split(",") 37 | size = int(size) 38 | root = int(root) 39 | if root >= world_size: 40 | print("Invalid root {}".format(root)) 41 | exit() 42 | ops.append(op) 43 | sizes.append(size) 44 | roots.append(root) 45 | f.close() 46 | return ops, sizes, roots 47 | 48 | def test_ccl(ops, sizes, roots, device, rank, rounds): 49 | input = [] 50 | output = [] 51 | print("Rank {}: starting to initialize tensors ...".format(rank)) 52 | for i in range(0, len(sizes)): 53 | data = torch.randn(sizes[i], dtype = data_type) 54 | data = data.to(device) 55 | input.append(data) 56 | if ops[i] == 'allgather': 57 | tmp_output = [] 58 | for j in range(0, world_size): 59 | data = torch.randn(sizes[i], dtype = data_type) 60 | data = data.to(device) 61 | tmp_output.append(data) 62 | output.append(tmp_output) 63 | else: 64 | output.append(data) 65 | print("Rank {}: tensors initialization finished!".format(rank), flush=True) 66 | for k in range(0, rounds): 67 | print("test round: ", k) 68 | for i in range(0, len(ops)): 69 | if ops[i] == 'reduce': 70 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i])), flush=True) 71 | dist.reduce(input[i], roots[i], async_op=False) 72 | if ops[i] == 'allreduce': 73 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i])), flush=True) 74 | dist.all_reduce(input[i], async_op=False) 75 | if ops[i] == 'allgather': 76 | print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i])), flush=True) 77 | dist.all_gather(output[i], input[i], async_op=False) 78 | if ops[i] == 'broadcast': 79 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i])), flush=True) 80 | dist.broadcast(input[i], roots[i], async_op=False) 81 | 82 | torch.xpu.synchronize() 83 | 84 | if __name__ == '__main__': 85 | main() 86 | print("All tests finished!") 87 | -------------------------------------------------------------------------------- /tests/test_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | import time 7 | 8 | tokens = 16 9 | rounds = 70 * 2 * tokens 10 | 11 | count = 14336 12 | 13 | total = 1024 * 1024 * 72 14 | repeat = 4 15 | 16 | # profiling = False 17 | # profiling = True 18 | 19 | datatype = torch.float16 20 | # datatype = torch.float32 21 | 22 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 23 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 24 | os.environ['MASTER_ADDR'] = '127.0.0.1' 25 | os.environ['MASTER_PORT'] = '29500' 26 | 27 | dist.init_process_group("ccl") 28 | rank = dist.get_rank() 29 | size = dist.get_world_size() 30 | 31 | device = "xpu:{}".format(rank) 32 | # allreduce data 33 | data = (torch.ones(count, dtype=datatype) * 0.1).to(device) 34 | 35 | a = (torch.zeros((int(total / count), count), dtype=datatype)).to(device) 36 | 37 | # warm up 38 | for i in range(5): 39 | a[0] += (data * 0.1) 
40 | for j in range(repeat): 41 | a += 0.01 42 | dist.all_reduce(data) 43 | data /= size 44 | sync = data.cpu() 45 | 46 | #start_events = [] 47 | #end_events = [] 48 | 49 | dist.barrier() 50 | start = time.time() 51 | for i in range(rounds): 52 | # start_event = None 53 | # end_event = None 54 | # if profiling: 55 | # start_event = torch.xpu.Event(enable_timing=True) 56 | # end_event = torch.xpu.Event(enable_timing=True) 57 | a[0] += (data * 0.1) 58 | for j in range(repeat): 59 | a += 0.01 60 | #print("XPU: {} {}".format(i, a[0][0])) 61 | # if profiling: 62 | # start_event.record() 63 | dist.all_reduce(data) 64 | # if profiling: 65 | # end_event.record() 66 | data /= size 67 | sync = data.cpu() 68 | # if profiling: 69 | # start_events.append(start_event) 70 | # end_events.append(end_event) 71 | 72 | # print(data[0]) 73 | data = data.cpu() 74 | # torch.xpu.synchronize('xpu:{}'.format(rank)) 75 | span = time.time() - start 76 | print('{} rounds on reducing {} elements. Time used {}'.format(rounds, count, span)) 77 | 78 | tmp_a = torch.zeros(1, dtype=datatype) 79 | tmp_data = torch.ones(1, dtype=datatype) * 0.1 80 | for i in range(5): 81 | tmp_a += (tmp_data * 0.1) 82 | for j in range(repeat): 83 | tmp_a += 0.01 84 | tmp_data *= size 85 | tmp_data /= size 86 | 87 | for i in range(rounds): 88 | tmp_a += (tmp_data * 0.1) 89 | for j in range(repeat): 90 | tmp_a += 0.01 91 | #print("CPU: {} {}".format(i, tmp_a[0])) 92 | tmp_data *= size 93 | tmp_data /= size 94 | 95 | a = a.cpu() 96 | 97 | error = False 98 | for i in range(count): 99 | if tmp_a[0] != a[0][i]: 100 | if not error: 101 | print("Error on {}: {} vs {}".format(i, tmp_a[0], a[0][i])) 102 | error = True 103 | else: 104 | if error: 105 | print("No error on {}".format(i)) 106 | error = False 107 | 108 | #if profiling: 109 | # for i in range(len(start_events)): 110 | # allreduce_time = start_events[i].elapsed_time(end_events[i]) 111 | # print('Round %d allreduce time %.3fms' % (i, allreduce_time)) 112 | # if i != len(start_events) - 1: 113 | # compute_time = end_events[i].elapsed_time(start_events[i + 1]) 114 | # print('Round %d compute time %.3fms' % (i + 1, compute_time)) 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 2 | set(CMAKE_CXX_STANDARD 17) 3 | 4 | project(oneccl_bindings_for_pytorch C CXX) 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat") 6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=cpp") 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat-security") 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector") 9 | 10 | set(LINUX TRUE) 11 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 12 | set(CMAKE_INSTALL_MESSAGE NEVER) 13 | 14 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules) 15 | 16 | set(RPATH_VALUE) 17 | list(APPEND RPATH_VALUE "$ORIGIN") 18 | 19 | set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) 20 | 21 | option(USE_SYSTEM_ONECCL "Use oneCCL library in system" OFF) 22 | 23 | option(BUILD_NO_ONECCL_PACKAGE "Build with oneCCL excluded" OFF) 24 | 25 | set(DEPENDS_LIB) 26 | 27 | # Find the Torch lib 28 | find_package(Torch REQUIRED) 29 | list(APPEND DEPENDS_LIB torch) 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") 31 | 32 | # Find OneCCL Lib 33 | IF (USE_SYSTEM_ONECCL) 34 | # Find and link MPI lib 35 | find_package(MPI REQUIRED) 36 | 
list(APPEND DEPENDS_LIB ${MPI_LIBRARIES}) 37 | 38 | # Link CCL lib 39 | set(CCL_ROOT $ENV{CCL_ROOT}) 40 | set(CCL_CONFIGURATION_PATH $ENV{CCL_CONFIGURATION_PATH}) 41 | include_directories(${CCL_ROOT}/include) 42 | list(APPEND DEPENDS_LIB "${CCL_ROOT}/lib/${CCL_CONFIGURATION_PATH}/libccl.so") 43 | list(APPEND RPATH_VALUE "$ORIGIN/../../../../") 44 | ELSE() 45 | # Find OneCCL Lib 46 | find_package(oneCCL REQUIRED) 47 | link_directories(${MPI_LIB_DIR}) 48 | list(APPEND DEPENDS_LIB oneCCL mpi) 49 | ENDIF() 50 | 51 | if(COMPUTE_BACKEND STREQUAL "dpcpp") 52 | list(APPEND DEPENDS_LIB ze_loader) 53 | endif() 54 | 55 | set(CMAKE_SKIP_BUILD_RPATH FALSE) 56 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) 57 | set(CMAKE_INSTALL_RPATH "${RPATH_VALUE}") 58 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) 59 | 60 | SET(LIB_NAME "oneccl_bindings_for_pytorch") 61 | 62 | add_subdirectory(./src) 63 | 64 | function (print_configuration_summary) 65 | get_directory_property(CMAKE_COMPILE_DEFINITIONS DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) 66 | 67 | message(STATUS "") 68 | message(STATUS "******** Summary ********") 69 | message(STATUS "General:") 70 | message(STATUS " CMake version : ${CMAKE_VERSION}") 71 | message(STATUS " CMake command : ${CMAKE_COMMAND}") 72 | message(STATUS " System : ${CMAKE_SYSTEM_NAME}") 73 | message(STATUS " Target name : ${LIB_NAME}") 74 | message(STATUS " Install path : ${CMAKE_INSTALL_PREFIX}") 75 | message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") 76 | message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") 77 | message(STATUS " C++ compiler id : ${CMAKE_CXX_COMPILER_ID}") 78 | message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") 79 | message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") 80 | message(STATUS " Compile flags : ${IPEX_COMPILE_FLAGS}") 81 | message(STATUS " Compile definitions : ${CMAKE_COMPILE_DEFINITIONS}") 82 | message(STATUS " Linker options : ${CMAKE_SHARED_LINKER_FLAGS}") 83 | get_target_property(LINK_LIBRARIES oneccl_bindings_for_pytorch LINK_LIBRARIES) 84 | message(STATUS " Linker libraries : ${LINK_LIBRARIES}") 85 | get_target_property(LINK_DIRECTORS oneccl_bindings_for_pytorch LINK_DIRECTORIES) 86 | message(STATUS " Linker directors : ${LINK_DIRECTORS}") 87 | 88 | message(STATUS "") 89 | endfunction() 90 | 91 | print_configuration_summary() 92 | -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | PyTorch binding for Intel(R) oneAPI Collective Communications Library (oneCCL) 2 | Third Party Programs File 3 | 4 | This file is the "third-party-programs.txt" file specified in the associated 5 | Intel end user license agreement for the Intel software you are licensing. The 6 | third party programs and their corresponding required notices and/or license 7 | terms are listed below. 8 | 9 | ------------------------------------------------------------------------------- 10 | 11 | 1. 
PyTorch 12 | 13 | From PyTorch: 14 | 15 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 16 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 17 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 18 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 19 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 20 | Copyright (c) 2011-2013 NYU (Clement Farabet) 21 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, 22 | Iain Melvin, Jason Weston) 23 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 24 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, 25 | Johnny Mariethoz) 26 | 27 | From Caffe2: 28 | 29 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 30 | 31 | All contributions by Facebook: 32 | Copyright (c) 2016 Facebook Inc. 33 | 34 | All contributions by Google: 35 | Copyright (c) 2015 Google Inc. 36 | All rights reserved. 37 | 38 | All contributions by Yangqing Jia: 39 | Copyright (c) 2015 Yangqing Jia 40 | All rights reserved. 41 | 42 | All contributions from Caffe: 43 | Copyright(c) 2013, 2014, 2015, the respective contributors 44 | All rights reserved. 45 | 46 | All other contributions: 47 | Copyright(c) 2015, 2016 the respective contributors 48 | All rights reserved. 49 | 50 | 51 | The -3-Clause BSD license 52 | 53 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 54 | copyright over their contributions to Caffe2. The project versioning records 55 | all such contribution and copyright details. If a contributor wants to further 56 | mark their specific copyright on a particular contribution, they should 57 | indicate their copyright solely in the commit message of the change when it is 58 | committed. 59 | 60 | All rights reserved. 61 | 62 | Redistribution and use in source and binary forms, with or without 63 | modification, are permitted provided that the following conditions are met: 64 | 65 | 1. Redistributions of source code must retain the above copyright 66 | notice, this list of conditions and the following disclaimer. 67 | 68 | 2. Redistributions in binary form must reproduce the above copyright 69 | notice, this list of conditions and the following disclaimer in the 70 | documentation and/or other materials provided with the distribution. 71 | 72 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories 73 | America and IDIAP Research Institute nor the names of its contributors may 74 | be used to endorse or promote products derived from this software without 75 | specific prior written permission. 76 | 77 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 78 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 79 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 80 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 81 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 82 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 83 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 84 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 85 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 86 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 87 | POSSIBILITY OF SUCH DAMAGE. 
88 | 89 | ------------------------------------------------------------------------------- 90 | 91 | Other names and brands may be claimed as the property of others. 92 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | import intel_extension_for_pytorch 6 | import argparse 7 | import sys 8 | 9 | datatype_map = { 10 | 'bf16': torch.bfloat16, 11 | 'fp32': torch.float32 12 | } 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--datatype', '-d', type=str, default='bf16', help='Data type') 16 | parser.add_argument('--world_size', default=12, type=int, help='Number of gpu for distributed training') 17 | args = parser.parse_args() 18 | type = datatype_map.get(args.datatype) 19 | if type is None: 20 | print(f'Unknown datatype: {args.datatype}') 21 | sys.exit(1) 22 | 23 | world_size = args.world_size 24 | rounds = 100 25 | # input_file = "Example.csv" 26 | input_file = "DeepSpeed.csv" 27 | 28 | type = torch.bfloat16 29 | 30 | def worker(given_rank): 31 | os.environ['MASTER_ADDR'] = '127.0.0.1' # xpu 32 | os.environ['MASTER_PORT'] = '29500' # xpu 33 | os.environ['WORLD_SIZE'] = str(world_size) 34 | os.environ['RANK'] = str(given_rank) 35 | 36 | try: 37 | import oneccl_bindings_for_pytorch 38 | except ImportError: 39 | print("oneccl_bindings_for_pytorch not available!") 40 | dist.init_process_group(backend='ccl') 41 | 42 | rank = int(dist.get_rank()) 43 | torch.xpu.set_device(rank) 44 | device = "xpu:{}".format(rank) 45 | 46 | ops, sizes, roots = read_file(input_file) 47 | test_ccl(ops, sizes, roots, device, rank, rounds) 48 | 49 | 50 | def main(): 51 | 52 | process_list = [] 53 | for i in range(world_size): 54 | p = Process(target=worker, args=(i,)) 55 | p.start() 56 | process_list.append(p) 57 | 58 | for p in process_list: 59 | p.join() 60 | 61 | def read_file(filename): 62 | ops = [] 63 | sizes = [] 64 | roots = [] 65 | f = open(filename, "r") 66 | for line in f: 67 | op, size, root = line.strip().split(",") 68 | size = int(size) 69 | root = int(root) 70 | if root >= world_size: 71 | print("Invalid root {}".format(root)) 72 | exit() 73 | ops.append(op) 74 | sizes.append(size) 75 | roots.append(root) 76 | f.close() 77 | return ops, sizes, roots 78 | 79 | def test_ccl(ops, sizes, roots, device, rank, rounds): 80 | input = [] 81 | output = [] 82 | print("Rank {}: starting to initialize tensors ...".format(rank)) 83 | for i in range(0, len(sizes)): 84 | data = torch.randn(sizes[i], dtype = type) 85 | data = data.to(device) 86 | input.append(data) 87 | if ops[i] == 'allgather': 88 | tmp_output = [] 89 | for j in range(0, world_size): 90 | data = torch.randn(sizes[i], dtype = type) 91 | data = data.to(device) 92 | tmp_output.append(data) 93 | output.append(tmp_output) 94 | else: 95 | output.append(data) 96 | print("Rank {}: tensors initialization finished!".format(rank)) 97 | for k in range(0, rounds): 98 | print("test round: ", k) 99 | for i in range(0, len(ops)): 100 | if ops[i] == 'reduce': 101 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i]))) 102 | dist.reduce(input[i], roots[i], async_op=False) 103 | if ops[i] == 'allreduce': 104 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i]))) 105 | dist.all_reduce(input[i], async_op=False) 106 | if ops[i] == 'allgather': 107 | 
print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i]))) 108 | dist.all_gather(output[i], input[i], async_op=False) 109 | if ops[i] == 'broadcast': 110 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i]))) 111 | dist.broadcast(input[i], roots[i], async_op=False) 112 | 113 | torch.xpu.synchronize() 114 | 115 | if __name__ == '__main__': 116 | main() 117 | print("All tests finished!") 118 | -------------------------------------------------------------------------------- /src/gpu/allreduce.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "allreduce.h" 12 | #include 13 | 14 | #define REPEAT 10 15 | 16 | int work_only = -1; 17 | int sync_only = -1; 18 | 19 | int get_work_only(int init_value = 0) { 20 | int tmp_work_only = init_value; 21 | char *tmp_str = getenv("TORCH_CCL_WORK_ONLY"); 22 | if (tmp_str) { 23 | tmp_work_only = atoi(tmp_str); 24 | } 25 | work_only = tmp_work_only; 26 | return tmp_work_only; 27 | } 28 | 29 | int get_sync_only(int init_value = 0) { 30 | int tmp_sync_only = init_value; 31 | char *tmp_str = getenv("TORCH_CCL_SYNC_ONLY"); 32 | if (tmp_str) { 33 | tmp_sync_only = atoi(tmp_str); 34 | } 35 | sync_only = tmp_sync_only; 36 | return tmp_sync_only; 37 | } 38 | 39 | void act(allreducer& ar, sycl::queue& queue, void* inout_buffer, uint32_t size); 40 | 41 | int main(int argc, char* argv[]) { 42 | // init section 43 | auto ret = MPI_Init(&argc, &argv); 44 | if (ret == MPI_ERR_OTHER) { 45 | std::cout<<"MPI init error"< ar; 68 | ar.init(queue, rank, world); 69 | 70 | sycl::half* small_buffer = (sycl::half*)sycl::malloc_device(14336 * sizeof(sycl::half), queue); 71 | sycl::half* large_buffer = (sycl::half*)sycl::malloc_device(14336 * 32 * sizeof(sycl::half), queue); 72 | 73 | for (int i = 0; i < 140; i++) { 74 | act(ar, queue, large_buffer, 14336 * 32); 75 | } 76 | for (int i = 0; i < 31; i++) { 77 | for (int j = 0; j < 140; j++) { 78 | act(ar, queue, small_buffer, 14336); 79 | } 80 | } 81 | queue.wait(); 82 | 83 | uint64_t host_time[REPEAT]; 84 | uint64_t full_time[REPEAT]; 85 | 86 | for (int k = 0; k < REPEAT; k++) { 87 | MPI_Barrier(MPI_COMM_WORLD); 88 | uint64_t start = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 89 | 90 | for (int i = 0; i < 140; i++) { 91 | act(ar, queue, large_buffer, 14336 * 32); 92 | } 93 | for (int i = 0; i < 31; i++) { 94 | for (int j = 0; j < 140; j++) { 95 | act(ar, queue, small_buffer, 14336); 96 | } 97 | } 98 | uint64_t host_end = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 99 | queue.wait(); 100 | uint64_t full_end = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 101 | host_time[k] = host_end - start; 102 | full_time[k] = full_end - start; 103 | } 104 | 105 | uint64_t total_host_time = 0; 106 | uint64_t total_full_time = 0; 107 | for (int k = 0; k < REPEAT; k++) { 108 | total_host_time += host_time[k]; 109 | total_full_time += full_time[k]; 110 | } 111 | 112 | total_host_time /= REPEAT; 113 | total_full_time /= REPEAT; 114 | 115 | MPI_Barrier(MPI_COMM_WORLD); 116 | MPI_Finalize(); 117 | 118 | std::cout << "Average full time: " << total_full_time << std::endl; 119 | std::cout << "Average host time (for reference): " 
<< total_host_time << std::endl; 120 | for (int k = 0; k < REPEAT; k++) { 121 | std::cout << " Full time on round " << k << ": " << full_time[k] << std::endl; 122 | std::cout << " Host time on round " << k << " (for reference): " << host_time[k] << std::endl; 123 | } 124 | } 125 | 126 | void act(allreducer& ar, sycl::queue& queue, void* inout_buffer, uint32_t size) { 127 | if (work_only != 0) { 128 | ar.work_only(queue, inout_buffer, size); 129 | return; 130 | } 131 | if (sync_only != 0) { 132 | ar.sync_only(queue, inout_buffer, size); 133 | return; 134 | } 135 | ar.allreduce(queue, inout_buffer, size); 136 | } 137 | -------------------------------------------------------------------------------- /tests/ddp_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import time 4 | import os 5 | import argparse 6 | import torch.distributed as dist 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--ptrace', 10 | action='store_true', 11 | default=False, 12 | help='pytorch trace') 13 | parser.add_argument('--warm', type=int, default=10, help='#warmup') 14 | parser.add_argument('--iter', type=int, default=10, help='#iteration') 15 | parser.add_argument('--size', type=int, default=25557032, help='number of f32/bf16 elements') 16 | parser.add_argument('--no-cuda', action='store_true', default=False) 17 | parser.add_argument('--broadcast', action='store_true', default=False) 18 | parser.add_argument('--bf16', action='store_true', default=False) 19 | parser.add_argument('--fixed', 20 | action='store_true', 21 | default=False, 22 | help='fixed size') 23 | args = parser.parse_args() 24 | args.cuda = not args.no_cuda and torch.cuda.is_available() 25 | 26 | if 'PMI_RANK' in os.environ.keys() and 'PMI_SIZE' in os.environ.keys(): 27 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 28 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) # mpich set 29 | elif 'PMIX_RANK' in os.environ.keys() and 'PALS_LOCAL_SIZE' in os.environ.keys(): 30 | os.environ['RANK'] = os.environ.get('PMIX_RANK') 31 | os.environ['WORLD_SIZE'] = str(os.environ.get('PALS_LOCAL_SIZE', -1)) 32 | os.environ['MASTER_ADDR'] = '127.0.0.1' # your master address 33 | os.environ['MASTER_PORT'] = '29500' # your master port 34 | 35 | if 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ.keys(): 36 | local_rank = os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] 37 | elif 'MPI_LOCALRANKID' in os.environ.keys(): 38 | local_rank = os.environ['MPI_LOCALRANKID'] 39 | if 'MPI_LOCALNRANKS' in os.environ.keys(): 40 | os.environ['LOCAL_WORLD_SIZE'] = str(os.environ.get('MPI_LOCALNRANKS',-1)) 41 | else: 42 | local_rank = os.environ['PALS_LOCAL_RANKID'] 43 | 44 | local_rank = int(local_rank) 45 | devid = local_rank 46 | 47 | if not args.cuda: 48 | import intel_extension_for_pytorch 49 | try: 50 | import oneccl_bindings_for_pytorch 51 | except: 52 | import torch_ccl 53 | torch.xpu.set_device(devid) 54 | device = "xpu:{}".format(devid) 55 | dist.init_process_group(backend='ccl') 56 | else: 57 | torch.cuda.set_device(devid) 58 | device = "cuda" 59 | dist.init_process_group(backend='nccl') 60 | 61 | try: 62 | from horovod.torch import mpi_lib_v2 as mpi_lib 63 | if mpi_lib.ctenabled(): 64 | mpi_lib = mpi_lib 65 | except: 66 | mpi_lib = None 67 | 68 | print(f'DDP local rank: {devid}') 69 | 70 | if devid == 0: 71 | print(f'PyTorch DDP {"Broadcast" if args.broadcast else "AllReduce"} on {os.environ["WORLD_SIZE"]} {device} devices: ') 72 | 73 | def _time(): 74 | 
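    # _time() synchronizes the active device (CUDA or XPU) before reading the host
    # clock, so the timestamp is taken only after previously queued device work,
    # including the collective being measured, has completed.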
if args.cuda: 75 | torch.cuda.synchronize() 76 | else: 77 | torch.xpu.synchronize() 78 | return time.time() 79 | 80 | if args.fixed: 81 | N = args.size 82 | else: 83 | N = 1 84 | 85 | 86 | with torch.autograd.profiler.profile(enabled=args.ptrace) as prof: 87 | while N <= args.size: 88 | for i in range(args.warm): 89 | data = torch.randn(N, dtype=torch.bfloat16 if args.bf16 else torch.float32).to(device) 90 | with torch.no_grad(): 91 | if not args.broadcast: 92 | dist.all_reduce(data) 93 | else: 94 | dist.broadcast(data, 0) 95 | elapsed = [] 96 | for i in range(args.iter): 97 | data = torch.randn(N, dtype=torch.bfloat16 if args.bf16 else torch.float32).to(device) 98 | t = _time() 99 | if mpi_lib: 100 | mpi_lib.ctpush("IPEX_ALLREDUCE") 101 | with torch.no_grad(): 102 | if not args.broadcast: 103 | dist.all_reduce(data) 104 | else: 105 | dist.broadcast(data, 0) 106 | elapsed.append((_time() - t) * 1e6) 107 | if mpi_lib and mpi_lib.ctenabled(): 108 | mpi_lib.ctpop() 109 | if devid == 0: 110 | print( 111 | f'{N*(2 if args.bf16 else 4):<10}{np.mean(elapsed):>10.1f}us ({np.min(elapsed):.1f}-{np.max(elapsed):.1f}) +-{1.96 * np.std(elapsed):.1f}' 112 | ) 113 | if N == args.size: 114 | break 115 | N = 2 * N 116 | if N != args.size and N > args.size: 117 | N = args.size 118 | 119 | if args.ptrace: 120 | prof.export_chrome_trace('rank' + str(hvd.rank()) + '_timeline.json') 121 | dist.destroy_process_group() 122 | 123 | -------------------------------------------------------------------------------- /src/ccl_comm_collector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #pragma once 33 | 34 | #include 35 | #include 36 | #include 37 | #include "ProcessGroupCCL.hpp" 38 | 39 | namespace oneccl_bindings_for_pytorch { 40 | 41 | class Comms { 42 | public: 43 | // for cpu case 44 | explicit Comms(ccl::vector_class &comms) : 45 | comms(std::move(comms)), streams{} {} 46 | 47 | // for comms with streams 48 | explicit Comms(ccl::vector_class &comms, ccl::vector_class &streams, std::vector &torch_streams) : 49 | comms(std::move(comms)), streams(std::move(streams)), torch_streams(std::move(torch_streams)) {} 50 | 51 | ~Comms() noexcept(false) {} 52 | 53 | Comms() = delete; 54 | 55 | // Must not be copyable 56 | Comms(const Comms &) = delete; 57 | 58 | Comms &operator=(const Comms &) = delete; 59 | 60 | // Move constructable 61 | Comms(Comms &&other) : comms(std::move(other.comms)), streams(std::move(other.streams)), 62 | torch_streams(std::move(other.torch_streams)) {} 63 | 64 | // Move assignable 65 | Comms &operator=(Comms &&other) { 66 | std::swap(comms, other.comms); 67 | std::swap(streams, other.streams); 68 | std::swap(torch_streams, other.torch_streams); 69 | return *this; 70 | } 71 | 72 | public: 73 | // The Communicators used by CCL 74 | ccl::vector_class comms; 75 | // The streams used by CCL 76 | ccl::vector_class streams; 77 | // one to one mapping the torch streams to the ccl::stream. 78 | std::vector torch_streams; 79 | }; 80 | 81 | struct CCLCommCollector { 82 | 83 | CCLCommCollector() : kvs(nullptr) {}; 84 | 85 | ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store, 86 | bool singleP2POp, const std::string& p2pKey, int p2pRank); 87 | 88 | std::shared_ptr get_comms(const std::string& devices_key); 89 | void add_comms(const std::string& devices_key, std::shared_ptr comms); 90 | 91 | // ccl kvs to identify the community. 92 | ccl::shared_ptr_class kvs; 93 | // Collects the ccl communicator that the process group has used. 94 | // The key is a list of devices that an operation is operating on 95 | // The devices are stored in a device sequence and the cache CCL 96 | // communicator is associated with this device sequence 97 | // 98 | // e.g. If the process group op only uses device 0, then the value of 99 | // the used device string stored (value of the hashmap) would be "0". 100 | // 101 | // If the process group op uses device 0 - 7 and the each tensor of the 102 | // input tensor list is on device, 0, 1, 2, 3, 4, 5, 6, 7 separately, 103 | // then the value of the used device string (key) stored would be 104 | // "0,1,2,3,4,5,6,7" 105 | // 106 | // If the process group op uses device 0 - 7 and the each tensor of the 107 | // input tensor list is on device, 0, 4, 5, 6, 7, 1, 2, 3 separately, 108 | // then the value of the used device string stored would be 109 | // "0,4,5,6,7,1,2,3" 110 | // 111 | // Note that the order of the device for the tensor list matters. 112 | std::unordered_map> ccl_comms; 113 | 114 | }; 115 | 116 | } 117 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 
10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "init.h" 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include 46 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 47 | #if TORCH_VERSION_MAJOR > 1 48 | #include 49 | #else 50 | #include 51 | #endif 52 | #include 53 | #include 54 | #include 55 | #else 56 | #include 57 | #include 58 | #include 59 | #include 60 | #endif 61 | 62 | #include 63 | 64 | namespace py = pybind11; 65 | 66 | 67 | namespace { 68 | 69 | // This is a intrusive helper from pytorch. 
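// The wrapper below holds a c10::intrusive_ptr and, in its destructor, releases the
// Python GIL (when the calling thread currently holds it) before dropping the last
// reference, so the wrapped object is never destroyed while the GIL is held.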
70 | template 71 | class IntrusivePtrNoGilDestructor { 72 | c10::intrusive_ptr impl_; 73 | 74 | public: 75 | IntrusivePtrNoGilDestructor() = default; 76 | IntrusivePtrNoGilDestructor(const IntrusivePtrNoGilDestructor&) = default; 77 | IntrusivePtrNoGilDestructor(IntrusivePtrNoGilDestructor&&) = default; 78 | IntrusivePtrNoGilDestructor& operator=(const IntrusivePtrNoGilDestructor&) = 79 | default; 80 | IntrusivePtrNoGilDestructor& operator=(IntrusivePtrNoGilDestructor&&) = 81 | default; 82 | /* implicit */ IntrusivePtrNoGilDestructor(c10::intrusive_ptr impl) 83 | : impl_(std::move(impl)) {} 84 | // This ctor is very important; see 85 | // https://github.com/pybind/pybind11/issues/2957 86 | explicit IntrusivePtrNoGilDestructor(T* impl) 87 | : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} 88 | ~IntrusivePtrNoGilDestructor() { 89 | if (impl_) { 90 | if (PyGILState_Check()) { 91 | pybind11::gil_scoped_release release; 92 | impl_.reset(); 93 | } else { 94 | impl_.reset(); 95 | } 96 | } 97 | } 98 | T& operator*() const noexcept { 99 | return *impl_; 100 | } 101 | T* operator->() const noexcept { 102 | return impl_.get(); 103 | } 104 | [[nodiscard]] T* get() const noexcept { 105 | return impl_.get(); 106 | } 107 | void reset() noexcept { 108 | impl_.reset(); 109 | } 110 | operator bool() const noexcept { 111 | return impl_; 112 | } 113 | }; 114 | 115 | } // anonymous namespace 116 | 117 | PYBIND11_DECLARE_HOLDER_TYPE(T, IntrusivePtrNoGilDestructor, true); 118 | 119 | template 120 | using intrusive_ptr_no_gil_destructor_class_ = 121 | py::class_>; 122 | 123 | TORCH_CCL_CPP_API void torch_ccl_python_init(pybind11::module &m) { 124 | c10d::ProcessGroupCCL::cclInitOnce(); 125 | py::object module = py::module::import("torch.distributed"); 126 | py::object register_backend = module.attr("Backend").attr("register_backend"); 127 | #if TORCH_VERSION_MAJOR > 1 128 | auto backend = py::module::import("torch._C._distributed_c10d").attr("Backend"); 129 | #else 130 | auto backend = module.attr("ProcessGroup"); 131 | #endif 132 | register_backend("ccl", py::cpp_function(&c10d::ProcessGroupCCL::createProcessGroupCCL, 133 | py::arg("store"), 134 | py::arg("rank"), 135 | py::arg("size"), 136 | py::arg("timeout") = std::chrono::milliseconds( 137 | ::c10d::ProcessGroupCCL::OP_TIMEOUT_MILLIS)), 138 | false, std::vector{"xpu", "cpu"}); 139 | 140 | auto processGroupCCL = intrusive_ptr_no_gil_destructor_class_<::c10d::ProcessGroupCCL>( 141 | module, "ProcessGroupCCL", backend); 142 | 143 | processGroupCCL.def( 144 | py::init([](const c10::intrusive_ptr<::c10d::Store>& store, 145 | int rank, 146 | int size, 147 | std::chrono::milliseconds timeout) { 148 | return c10::make_intrusive<::c10d::ProcessGroupCCL>(store, rank, size, timeout); 149 | }), 150 | py::arg("store"), 151 | py::arg("rank"), 152 | py::arg("size"), 153 | py::arg("timeout") = std::chrono::milliseconds(10 * 1000)); 154 | 155 | } 156 | -------------------------------------------------------------------------------- /src/test/segfault/simple_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 
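// The pointer arrays declared below are filled in by exchange_mem(): buffer[i] maps
// peer i's payload region via a Level Zero IPC handle, while sync_buffer[i] and
// ready_buffer[i] point into that peer's sync area at offset
// buffer_base_size + rank * 128 (the ready flag sits a further 64 bytes in).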
24 | int use_tmp_buffer; 25 | 26 | void* buffer[MAX_RANK]; 27 | void* sync_buffer[MAX_RANK]; 28 | void* ready_buffer[MAX_RANK]; 29 | 30 | void exchange_mem(sycl::queue& queue, void* ptr); 31 | 32 | struct exchange_contents { 33 | union { 34 | ze_ipc_mem_handle_t ipc_handle; 35 | int fd = -1; 36 | }; 37 | size_t offset = 0; 38 | int pid = -1; 39 | }; 40 | 41 | #define sysCheck(x) \ 42 | if (x == -1) { \ 43 | throw std::system_error( \ 44 | std::make_error_code(std::errc(errno))); \ 45 | } 46 | 47 | int main(int argc, char* argv[]) { 48 | if (argc > 1) { 49 | use_tmp_buffer = 1; 50 | } 51 | 52 | size_t buffer_size = buffer_base_size + 1024 * 32768; 53 | 54 | auto ret = MPI_Init(&argc, &argv); 55 | if (ret == MPI_ERR_OTHER) { 56 | std::cout<<"MPI init error"< index) { 81 | ptr[index] = (uint32_t)temp_rank; 82 | })); 83 | }); 84 | queue.wait(); 85 | 86 | exchange_mem(queue, operate_buffer); 87 | 88 | MPI_Finalize(); 89 | } 90 | 91 | void exchange_mem(sycl::queue& queue, void* ptr) { 92 | // Step 1: Get base address of the pointer 93 | sycl::context ctx = queue.get_context(); 94 | auto l0_ctx = sycl::get_native(ctx); 95 | 96 | void *base_addr; 97 | size_t base_size; 98 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 99 | 100 | std::cout << "Base size: " << base_size << std::endl; 101 | std::cout << "Buffer base size: " << buffer_base_size << std::endl; 102 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 103 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 104 | 105 | // Step 2: Get IPC mem handle from base address 106 | alignas(64) exchange_contents send_buf; 107 | alignas(64) exchange_contents recv_buf[world]; 108 | 109 | // fill in the exchange info 110 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 111 | send_buf.offset = (char*)ptr - (char*)base_addr; 112 | send_buf.pid = getpid(); 113 | 114 | int* host_buffer = (int *)(malloc(1024)); 115 | void* tmp_buffer = sycl::malloc_device(1024, queue); 116 | 117 | void* sync_addr = NULL; 118 | sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 119 | std::cout << "Sync buffer content at " << sync_addr << ": "; 120 | queue.memcpy(host_buffer, sync_addr, 1024); 121 | queue.wait(); 122 | for (int i = 0; i < 256; i += 16) { 123 | std::cout << &host_buffer[i] << ": " << host_buffer[i] << std::endl; 124 | } 125 | 126 | // Step 3: Exchange the handles and offsets 127 | memset(recv_buf, 0, sizeof(recv_buf)); 128 | // Overkill if we don't really needs all peer's handles 129 | MPI_Allgather( 130 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 131 | 132 | 133 | for (uint32_t i = 0; i < world; i++){ 134 | // Step 4: Prepare pid file descriptor of next process 135 | auto* peer = recv_buf + i; 136 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 137 | sysCheck(pid_fd); 138 | // 139 | // Step 5: Duplicate GEM object handle to local process 140 | // and overwrite original file descriptor number 141 | // 142 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 143 | sysCheck(peer->fd); 144 | 145 | // Step 6: Open IPC handle of remote peer 146 | auto l0_device 147 | = sycl::get_native(queue.get_device()); 148 | void* peer_base; 149 | 150 | zeCheck(zeMemOpenIpcHandle( 151 | l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); 152 | buffer[i] = (char*)peer_base + peer->offset; 153 | 
sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 154 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 155 | 156 | char* end = (char*)peer_base + peer->offset + base_size; 157 | 158 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 159 | 160 | sync_addr = (void *)((char*)peer_base + peer->offset + buffer_base_size); 161 | //sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 162 | 163 | if (use_tmp_buffer == 0) { 164 | std::cout << "Copy sync buffer (mapped from rank " << i << ") at " << sync_addr << " to host" << std::endl; 165 | queue.memcpy(host_buffer, sync_addr, 1024); 166 | } else { 167 | std::cout << "Copy sync buffer (mapped from rank " << i << ") at " << sync_addr << " to temp buffer & then to host" << std::endl; 168 | queue.memcpy(tmp_buffer, sync_addr, 1024); 169 | queue.memcpy(host_buffer, tmp_buffer, 1024); 170 | } 171 | queue.wait(); 172 | 173 | std::cout << "Sync buffer content at " << sync_addr << std::endl; 174 | for (int i = 0; i < 256; i += 16) { 175 | std::cout << &host_buffer[i] << ": " << host_buffer[i] << std::endl; 176 | } 177 | } 178 | } 179 | 180 | -------------------------------------------------------------------------------- /tests/test_c10d_p2p.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import torch 4 | 5 | try: 6 | import intel_extension_for_pytorch 7 | xpu_is_available = torch.xpu.is_available() if hasattr(torch, 'xpu') else False 8 | except ImportError: 9 | # ignore the ipex 10 | xpu_is_available = False 11 | pass 12 | 13 | import oneccl_bindings_for_pytorch 14 | from torch.testing._internal.common_utils import run_tests 15 | from torch.testing._internal.common_distributed import MultiProcessTestCase 16 | 17 | import torch.distributed as dist 18 | 19 | class ProcessGroupCCLTest(MultiProcessTestCase): 20 | 21 | def setUp(self): 22 | super(ProcessGroupCCLTest, self).setUp() 23 | self._spawn_processes() 24 | 25 | @property 26 | def world_size(self): 27 | return 6 28 | 29 | def _build_tensor(self, size, value=None, dtype=torch.float, device=None): 30 | if value is None: 31 | value = size 32 | if device is None: 33 | return torch.empty(size, size, size, dtype=dtype).fill_(value) 34 | else: 35 | return torch.empty(size, size, size, dtype=dtype).fill_(value).to(device) 36 | 37 | def _test_send_recv_withincard(self): 38 | store = dist.FileStore(self.file_name, self.world_size) 39 | dist.init_process_group( 40 | "ccl", 41 | world_size=self.world_size, 42 | rank=self.rank, 43 | store=store, 44 | ) 45 | device = "xpu:{}".format(self.rank) 46 | 47 | # WA: allreduce 48 | # Ensure the process group has been fully initialized 49 | data = torch.zeros(1).to(device) 50 | dist.all_reduce(data) 51 | 52 | torch.xpu.set_device(device) 53 | tensor = self._build_tensor(self.rank + 1, device=device) 54 | 55 | # rank0 -> rank1 56 | src = 0 57 | dst = 1 58 | if self.rank == src: 59 | # Send 60 | dist.send(tensor, dst) 61 | elif self.rank == dst: 62 | # Recv 63 | expected_tensor = self._build_tensor(src + 1) 64 | output_tensor = self._build_tensor( 65 | src + 1, value=-1, device=device 66 | ) 67 | dist.recv(output_tensor, src) 68 | self.assertEqual(output_tensor, expected_tensor) 69 | 70 | def test_send_recv_withincard(self): 71 | self._test_send_recv_withincard() 72 | 73 | def _test_send_recv_3rank(self): 74 | # cross-cards p2p: rank1 -> rank3 -> rank5 75 
| store = dist.FileStore(self.file_name, self.world_size) 76 | dist.init_process_group( 77 | "ccl", 78 | world_size=self.world_size, 79 | rank=self.rank, 80 | store=store, 81 | ) 82 | device = "xpu:{}".format(self.rank) 83 | 84 | # WA: allreduce 85 | # Ensure the process group has been fully initialized 86 | data = torch.zeros(1).to(device) 87 | dist.all_reduce(data) 88 | 89 | torch.xpu.set_device(device) 90 | tensor = self._build_tensor(self.rank + 1, device=device) 91 | 92 | if self.rank == 1: 93 | dist.send(tensor, 3) 94 | if self.rank == 3: 95 | expected_tensor1 = self._build_tensor(1 + 1) 96 | output_tensor1 = self._build_tensor( 97 | 1 + 1, value=-1, device=device 98 | ) 99 | dist.recv(output_tensor1, 1) 100 | self.assertEqual(output_tensor1, expected_tensor1) 101 | 102 | # rank3 -> rank5 103 | dist.send(tensor, 5) 104 | if self.rank == 5: 105 | expected_tensor2 = self._build_tensor(3 + 1) 106 | output_tensor2 = self._build_tensor( 107 | 3 + 1, value=-1, device=device 108 | ) 109 | dist.recv(output_tensor2, 3) 110 | self.assertEqual(output_tensor2, expected_tensor2) 111 | 112 | def test_send_recv_3rank(self): 113 | self._test_send_recv_3rank() 114 | 115 | def _test_send_recv_crosscard(self): 116 | store = dist.FileStore(self.file_name, self.world_size) 117 | dist.init_process_group( 118 | "ccl", 119 | world_size=self.world_size, 120 | rank=self.rank, 121 | store=store, 122 | ) 123 | device = "xpu:{}".format(self.rank) 124 | 125 | # WA: allreduce 126 | # Ensure the process group has been fully initialized 127 | data = torch.zeros(1).to(device) 128 | dist.all_reduce(data) 129 | 130 | torch.xpu.set_device(device) 131 | tensor = self._build_tensor(self.rank + 1, device=device) 132 | 133 | for src in range(0, self.world_size): 134 | if src == self.rank: 135 | # Send mode 136 | for dst in range(0, self.world_size): 137 | if dst == self.rank: 138 | continue 139 | dist.send(tensor, dst) 140 | else: 141 | # Recv mode 142 | expected_tensor = self._build_tensor(src + 1) 143 | output_tensor = self._build_tensor( 144 | src + 1, value=-1, device=device 145 | ) 146 | dist.recv(output_tensor, src) 147 | self.assertEqual(output_tensor, expected_tensor) 148 | 149 | def test_send_recv_crosscard(self): 150 | self._test_send_recv_crosscard() 151 | 152 | def _test_send_recv_with_tag(self): 153 | store = dist.FileStore(self.file_name, self.world_size) 154 | dist.init_process_group( 155 | "ccl", 156 | world_size=self.world_size, 157 | rank=self.rank, 158 | store=store, 159 | ) 160 | device = "xpu:{}".format(self.rank) 161 | 162 | # WA: allreduce 163 | # Ensure the process group has been fully initialized 164 | data = torch.zeros(1).to(device) 165 | dist.all_reduce(data) 166 | 167 | torch.xpu.set_device(device) 168 | tensor = self._build_tensor(10, value=self.rank, device=device) 169 | 170 | for dst in range(0, self.world_size): 171 | if dst == self.rank: 172 | # Recv mode 173 | for src in range(0, self.world_size): 174 | if src == self.rank: 175 | continue 176 | output_tensor = self._build_tensor(10, value=-1, device=device) 177 | dist.recv(output_tensor, src, tag=src) 178 | self.assertTrue(output_tensor.eq(src).all()) 179 | else: 180 | # Send mode 181 | dist.send(tensor, dst, tag=self.rank) 182 | 183 | def test_send_recv_with_tag(self): 184 | self._test_send_recv_with_tag() 185 | 186 | if __name__ == '__main__': 187 | run_tests() 188 | -------------------------------------------------------------------------------- /src/test/remotesync/simple_test.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 24 | void* buffer[MAX_RANK]; 25 | void* sync_buffer[MAX_RANK]; 26 | void* ready_buffer[MAX_RANK]; 27 | 28 | void exchange_mem(sycl::queue& queue, void* ptr); 29 | void atomic_write_check_remote(sycl::queue& queue, uint32_t* ptr, int good); 30 | 31 | struct exchange_contents { 32 | union { 33 | ze_ipc_mem_handle_t ipc_handle; 34 | int fd = -1; 35 | }; 36 | size_t offset = 0; 37 | int pid = -1; 38 | }; 39 | 40 | #define sysCheck(x) \ 41 | if (x == -1) { \ 42 | throw std::system_error( \ 43 | std::make_error_code(std::errc(errno))); \ 44 | } 45 | 46 | int main(int argc, char* argv[]) { 47 | size_t buffer_size = buffer_base_size + 1024; 48 | 49 | auto ret = MPI_Init(&argc, &argv); 50 | if (ret == MPI_ERR_OTHER) { 51 | std::cout<<"MPI init error"< index) { 75 | ptr[index] = (uint32_t)0; 76 | })); 77 | }); 78 | queue.wait(); 79 | 80 | exchange_mem(queue, operate_buffer); 81 | 82 | atomic_write_check_remote(queue, ptr, (argc > 1)); 83 | 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | std::cout << "Host MPI barrier completed" << std::endl; 86 | 87 | MPI_Finalize(); 88 | } 89 | 90 | void atomic_write_check_remote(sycl::queue& queue, uint32_t* ptr, int good) { 91 | uint32_t temp_world = world; 92 | uint32_t temp_rank = rank; 93 | 94 | int *temp_sync_buffer[MAX_RANK]; 95 | for (int index = 0; index < temp_world; index++) { 96 | temp_sync_buffer[index] = (int *)sync_buffer[index]; 97 | } 98 | 99 | for (int index = 0; index < temp_world; index++) { 100 | if (index != temp_rank) { 101 | std::cout << "Setting " << temp_sync_buffer[index] << " (remote) to 1" << std::endl; 102 | } 103 | } 104 | for (int index = 0; index < temp_world; index++) { 105 | if (index != temp_rank) { 106 | std::cout << "Checking " << (int*)((int *)ptr + index * 32) << " (local) for 1" << std::endl; 107 | } 108 | } 109 | 110 | sycl::event e = queue.submit([&](sycl::handler& cgh) { 111 | if (good != 0) { 112 | sycl::stream str(8192, 1024, cgh); 113 | } 114 | cgh.parallel_for(sycl::range { temp_world * 2 }, ([=](sycl::id<1> index) { 115 | if (index < temp_world && index != temp_rank) { 116 | int * peer_sync_ptr = (int*)temp_sync_buffer[index]; 117 | auto v = 118 | sycl::atomic_ref(peer_sync_ptr[0]); 121 | v.store(1); 122 | } 123 | if (index >= temp_world && index - temp_world != temp_rank) { 124 | int * local_sync_ptr = (int*)(ptr + (index - temp_world) * 32); 125 | auto v = 126 | sycl::atomic_ref(local_sync_ptr[0]); 129 | int count = v.load(); 130 | while (count < 1) { 131 | count = v.load(); 132 | } 133 | } 134 | })); 135 | }); 136 | e.wait(); 137 | 138 | std::cout << "Kernel done" << std::endl; 139 | } 140 | 141 | void exchange_mem(sycl::queue& queue, void* ptr) { 142 | // Step 1: Get base address of the pointer 143 | sycl::context ctx = queue.get_context(); 144 | auto l0_ctx = sycl::get_native(ctx); 145 | 146 | void *base_addr; 147 | size_t base_size; 148 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 149 | 150 | std::cout << "Memory range size: " << base_size << std::endl; 151 | std::cout << "Buffer base size: " << 
buffer_base_size << std::endl; 152 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 153 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 154 | 155 | // Step 2: Get IPC mem handle from base address 156 | alignas(64) exchange_contents send_buf; 157 | alignas(64) exchange_contents recv_buf[world]; 158 | 159 | // fill in the exchange info 160 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 161 | send_buf.offset = (char*)ptr - (char*)base_addr; 162 | send_buf.pid = getpid(); 163 | 164 | // Step 3: Exchange the handles and offsets 165 | memset(recv_buf, 0, sizeof(recv_buf)); 166 | // Overkill if we don't really needs all peer's handles 167 | MPI_Allgather( 168 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 169 | 170 | 171 | for (uint32_t i = 0; i < world; i++){ 172 | // Step 4: Prepare pid file descriptor of next process 173 | auto* peer = recv_buf + i; 174 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 175 | sysCheck(pid_fd); 176 | // 177 | // Step 5: Duplicate GEM object handle to local process 178 | // and overwrite original file descriptor number 179 | // 180 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 181 | sysCheck(peer->fd); 182 | 183 | // Step 6: Open IPC handle of remote peer 184 | auto l0_device 185 | = sycl::get_native(queue.get_device()); 186 | void* peer_base; 187 | 188 | zeCheck(zeMemOpenIpcHandle( 189 | l0_ctx, l0_device, peer->ipc_handle, 0, &peer_base)); 190 | // l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); 191 | buffer[i] = (char*)peer_base + peer->offset; 192 | sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 193 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 194 | 195 | char* end = (char*)peer_base + peer->offset + base_size; 196 | 197 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 198 | } 199 | } 200 | 201 | -------------------------------------------------------------------------------- /src/test/writeremote/simple_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 24 | void* buffer[MAX_RANK]; 25 | void* sync_buffer[MAX_RANK]; 26 | void* ready_buffer[MAX_RANK]; 27 | 28 | void exchange_mem(sycl::queue& queue, void* ptr); 29 | void dump_buffer(sycl::queue& queue, void* gpu_addr); 30 | void atomic_write_remote(sycl::queue& queue, int good); 31 | 32 | struct exchange_contents { 33 | union { 34 | ze_ipc_mem_handle_t ipc_handle; 35 | int fd = -1; 36 | }; 37 | size_t offset = 0; 38 | int pid = -1; 39 | }; 40 | 41 | #define sysCheck(x) \ 42 | if (x == -1) { \ 43 | throw std::system_error( \ 44 | std::make_error_code(std::errc(errno))); \ 45 | } 46 | 47 | int main(int argc, char* argv[]) { 48 | size_t buffer_size = buffer_base_size + 1024; 49 | 50 | auto ret = MPI_Init(&argc, &argv); 51 | if (ret == MPI_ERR_OTHER) { 52 | 
std::cout<<"MPI init error"< index) { 75 | ptr[index] = (uint32_t)0; 76 | })); 77 | }); 78 | queue.wait(); 79 | 80 | exchange_mem(queue, operate_buffer); 81 | 82 | atomic_write_remote(queue, (argc > 1)); 83 | 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | std::cout << "Host MPI barrier completed" << std::endl; 86 | 87 | dump_buffer(queue, ptr); 88 | 89 | MPI_Barrier(MPI_COMM_WORLD); 90 | std::cout << "Host MPI barrier completed" << std::endl; 91 | 92 | MPI_Finalize(); 93 | 94 | } 95 | 96 | void atomic_write_remote(sycl::queue& queue, int good) { 97 | uint32_t temp_world = world; 98 | uint32_t temp_rank = rank; 99 | 100 | int *temp_sync_buffer[MAX_RANK]; 101 | for (int index = 0; index < temp_world; index++) { 102 | temp_sync_buffer[index] = (int *)sync_buffer[index]; 103 | } 104 | 105 | for (int index = 0; index < temp_world; index++) { 106 | if (index != temp_rank) { 107 | std::cout << "Setting " << temp_sync_buffer[index] << " (remote) to 1" << std::endl; 108 | } else { 109 | std::cout << "Setting " << temp_sync_buffer[index] << " (local mapped) to 1" << std::endl; 110 | } 111 | } 112 | 113 | queue.submit([&](sycl::handler& cgh) { 114 | if (good != 0) { 115 | sycl::stream str(8192, 1024, cgh); 116 | } 117 | cgh.parallel_for(sycl::range { temp_world }, ([=](sycl::id<1> index) { 118 | //if (index != temp_rank) { 119 | int * peer_sync_ptr = (int*)temp_sync_buffer[index]; 120 | auto v = 121 | sycl::atomic_ref(peer_sync_ptr[0]); 124 | v.store(1); 125 | //} 126 | })); 127 | }); 128 | queue.wait(); 129 | 130 | std::cout << "Kernel done" << std::endl; 131 | } 132 | 133 | void exchange_mem(sycl::queue& queue, void* ptr) { 134 | // Step 1: Get base address of the pointer 135 | sycl::context ctx = queue.get_context(); 136 | auto l0_ctx = sycl::get_native(ctx); 137 | 138 | void *base_addr; 139 | size_t base_size; 140 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 141 | 142 | std::cout << "Memory range size: " << base_size << std::endl; 143 | std::cout << "Buffer base size: " << buffer_base_size << std::endl; 144 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 145 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 146 | 147 | // Step 2: Get IPC mem handle from base address 148 | alignas(64) exchange_contents send_buf; 149 | alignas(64) exchange_contents recv_buf[world]; 150 | 151 | // fill in the exchange info 152 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 153 | send_buf.offset = (char*)ptr - (char*)base_addr; 154 | send_buf.pid = getpid(); 155 | 156 | void * sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 157 | dump_buffer(queue, sync_addr); 158 | 159 | // Step 3: Exchange the handles and offsets 160 | memset(recv_buf, 0, sizeof(recv_buf)); 161 | // Overkill if we don't really needs all peer's handles 162 | MPI_Allgather( 163 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 164 | 165 | 166 | for (uint32_t i = 0; i < world; i++){ 167 | // Step 4: Prepare pid file descriptor of next process 168 | auto* peer = recv_buf + i; 169 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 170 | sysCheck(pid_fd); 171 | // 172 | // Step 5: Duplicate GEM object handle to local process 173 | // and overwrite original file descriptor number 174 | // 175 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 176 | sysCheck(peer->fd); 177 | 178 | // Step 6: Open IPC 
handle of remote peer 179 | auto l0_device 180 | = sycl::get_native(queue.get_device()); 181 | void* peer_base; 182 | 183 | zeCheck(zeMemOpenIpcHandle( 184 | l0_ctx, l0_device, peer->ipc_handle, 0, &peer_base)); 185 | // l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_UNCACHED, &peer_base)); 186 | buffer[i] = (char*)peer_base + peer->offset; 187 | sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 188 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 189 | 190 | char* end = (char*)peer_base + peer->offset + base_size; 191 | 192 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 193 | } 194 | } 195 | 196 | void dump_buffer(sycl::queue& queue, void* gpu_addr) { 197 | int* host_buffer = (int *)(malloc(1024)); 198 | queue.memcpy(host_buffer, gpu_addr, 1024); 199 | queue.wait(); 200 | std::cout << "Buffer copied from " << gpu_addr << " to host" << std::endl; 201 | std::cout << "Dump content of " << gpu_addr << ": " << std::endl; 202 | for (int i = 0; i < world; i++) { 203 | //if (i != rank) { 204 | std::cout << (int *)gpu_addr + i * 32 << ": " << host_buffer[i * 32] << std::endl; 205 | //} 206 | } 207 | free(host_buffer); 208 | } 209 | 210 | -------------------------------------------------------------------------------- /tests/test_fsdp.py: -------------------------------------------------------------------------------- 1 | # Reference: https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html 2 | 3 | import os 4 | import argparse 5 | import functools 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torchvision import datasets, transforms 11 | import time 12 | 13 | from torch.optim.lr_scheduler import StepLR 14 | 15 | import torch.distributed as dist 16 | import torch.multiprocessing as mp 17 | from torch.nn.parallel import DistributedDataParallel as DDP 18 | from torch.utils.data.distributed import DistributedSampler 19 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP 20 | from torch.distributed.fsdp.fully_sharded_data_parallel import ( 21 | CPUOffload, 22 | BackwardPrefetch, 23 | ) 24 | from torch.distributed.fsdp.wrap import ( 25 | size_based_auto_wrap_policy, 26 | enable_wrap, 27 | wrap, 28 | ) 29 | 30 | import intel_extension_for_pytorch 31 | import oneccl_bindings_for_pytorch 32 | 33 | def setup(rank, world_size): 34 | os.environ['MASTER_ADDR'] = 'localhost' 35 | os.environ['MASTER_PORT'] = '12355' 36 | 37 | # initialize the process group 38 | dist.init_process_group("ccl", rank=rank, world_size=world_size) 39 | 40 | def cleanup(): 41 | dist.destroy_process_group() 42 | 43 | 44 | class Net(nn.Module): 45 | def __init__(self): 46 | super(Net, self).__init__() 47 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 48 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 49 | self.dropout1 = nn.Dropout(0.25) 50 | self.dropout2 = nn.Dropout(0.5) 51 | self.fc1 = nn.Linear(9216, 128) 52 | self.fc2 = nn.Linear(128, 10) 53 | 54 | def forward(self, x): 55 | 56 | x = self.conv1(x) 57 | x = F.relu(x) 58 | x = self.conv2(x) 59 | x = F.relu(x) 60 | x = F.max_pool2d(x, 2) 61 | x = self.dropout1(x) 62 | x = torch.flatten(x, 1) 63 | x = self.fc1(x) 64 | x = F.relu(x) 65 | x = self.dropout2(x) 66 | x = self.fc2(x) 67 | output = F.log_softmax(x, dim=1) 68 | return output 69 | 70 | def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): 71 | model.train() 72 | 
ddp_loss = torch.zeros(2).to("xpu:{}".format(rank)) 73 | if sampler: 74 | sampler.set_epoch(epoch) 75 | for batch_idx, (data, target) in enumerate(train_loader): 76 | if batch_idx < 3: 77 | data, target = data.to("xpu:{}".format(rank)), target.to("xpu:{}".format(rank)) 78 | optimizer.zero_grad() 79 | output = model(data) 80 | loss = F.nll_loss(output, target, reduction='sum') 81 | loss.backward() 82 | optimizer.step() 83 | ddp_loss[0] += loss.item() 84 | ddp_loss[1] += len(data) 85 | 86 | dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) 87 | if rank == 0: 88 | print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) 89 | 90 | 91 | def test(model, rank, world_size, test_loader): 92 | model.eval() 93 | correct = 0 94 | ddp_loss = torch.zeros(3).to("xpu:{}".format(rank)) 95 | with torch.no_grad(): 96 | for data, target in test_loader: 97 | data, target = data.to("xpu:{}".format(rank)), target.to("xpu:{}".format(rank)) 98 | output = model(data) 99 | ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 100 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 101 | ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() 102 | ddp_loss[2] += len(data) 103 | 104 | dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) 105 | 106 | if rank == 0: 107 | test_loss = ddp_loss[0] / ddp_loss[2] 108 | print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( 109 | test_loss, int(ddp_loss[1]), int(ddp_loss[2]), 110 | 100. * ddp_loss[1] / ddp_loss[2])) 111 | 112 | 113 | def fsdp_main(rank, world_size, args): 114 | torch.manual_seed(123) 115 | torch.xpu.manual_seed(123) 116 | setup(rank, world_size) 117 | 118 | transform=transforms.Compose([ 119 | transforms.ToTensor(), 120 | transforms.Normalize((0.1307,), (0.3081,)) 121 | ]) 122 | 123 | dataset1 = datasets.MNIST('../data', train=True, download=True, 124 | transform=transform) 125 | dataset2 = datasets.MNIST('../data', train=False, 126 | transform=transform) 127 | 128 | sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) 129 | sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) 130 | 131 | train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1} 132 | test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2} 133 | cuda_kwargs = {'num_workers': 2, 134 | 'pin_memory': True, 135 | 'shuffle': False} 136 | train_kwargs.update(cuda_kwargs) 137 | test_kwargs.update(cuda_kwargs) 138 | 139 | train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) 140 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 141 | my_auto_wrap_policy = functools.partial( 142 | size_based_auto_wrap_policy, min_num_params=100 143 | ) 144 | 145 | xpu_device = "xpu:{}".format(rank) 146 | torch.xpu.set_device(xpu_device) 147 | 148 | #init_start_event = torch.Event(enable_timing=True) 149 | #init_end_event = torch.Event(enable_timing=True) 150 | 151 | model = Net().to("xpu:{}".format(rank)) 152 | 153 | model = FSDP(model, device_id="xpu:{}".format(rank)) 154 | 155 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 156 | 157 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 158 | #init_start_event.record() 159 | elapsed = time.time() 160 | for epoch in range(1): 161 | train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) 162 | test(model, rank, world_size, test_loader) 163 | scheduler.step() 164 | 165 | #init_end_event.record() 
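    # Wall-clock timing is used below; the XPU event-based timing above is left commented out.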
166 | elapsed = time.time() - elapsed 167 | if rank == 0: 168 | #print(f"XPU event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") 169 | print(f"XPU event elapsed time: {elapsed}sec") 170 | print(f"{model}") 171 | 172 | if args.save_model: 173 | # use a barrier to make sure training is done on all ranks 174 | dist.barrier() 175 | # state_dict for FSDP model is only available on Nightlies for now 176 | states = model.state_dict() 177 | if rank == 0: 178 | torch.save(states, "mnist_cnn.pt") 179 | 180 | cleanup() 181 | 182 | 183 | 184 | if __name__ == '__main__': 185 | # Training settings 186 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 187 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 188 | help='input batch size for training (default: 64)') 189 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 190 | help='input batch size for testing (default: 1000)') 191 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 192 | help='number of epochs to train (default: 14)') 193 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 194 | help='learning rate (default: 1.0)') 195 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 196 | help='Learning rate step gamma (default: 0.7)') 197 | parser.add_argument('--no-cuda', action='store_true', default=False, 198 | help='disables CUDA training') 199 | parser.add_argument('--seed', type=int, default=1, metavar='S', 200 | help='random seed (default: 1)') 201 | parser.add_argument('--save-model', action='store_true', default=False, 202 | help='For Saving the current Model') 203 | args = parser.parse_args() 204 | 205 | torch.manual_seed(args.seed) 206 | 207 | # WORLD_SIZE = torch.xpu.device_count() 208 | WORLD_SIZE = 2 209 | mp.spawn(fsdp_main, 210 | args=(WORLD_SIZE, args), 211 | nprocs=WORLD_SIZE, 212 | join=True) 213 | -------------------------------------------------------------------------------- /tools/setup/cmake.py: -------------------------------------------------------------------------------- 1 | "Manages CMake." 2 | import os 3 | import re 4 | import shutil 5 | from subprocess import check_call, check_output 6 | import sys 7 | import distutils 8 | import distutils.sysconfig 9 | from distutils.version import LooseVersion 10 | from setuptools import Extension 11 | from collections import defaultdict 12 | from .env import BUILD_DIR, check_env_flag 13 | # from .numpy_ import USE_NUMPY, NUMPY_INCLUDE_DIR 14 | 15 | 16 | def _mkdir_p(d): 17 | try: 18 | os.makedirs(d) 19 | except OSError: 20 | pass 21 | 22 | 23 | # Ninja 24 | # Use ninja if it is on the PATH. Previous version of PyTorch required the 25 | # ninja python package, but we no longer use it, so we do not have to import it 26 | # USE_NINJA = (not check_negative_env_flag('USE_NINJA') and 27 | # shutil.which('ninja') is not None) 28 | def convert_cmake_value_to_python_value(cmake_value, cmake_type): 29 | r"""Convert a CMake value in a string form to a Python value. 30 | 31 | Arguments: 32 | cmake_value (string): The CMake value in a string form (e.g., "ON", "OFF", "1"). 33 | cmake_type (string): The CMake type of :attr:`cmake_value`. 34 | 35 | Returns: 36 | A Python value corresponding to :attr:`cmake_value` with type :attr:`cmake_type`. 
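    Example (illustrative, matching the logic below):
        convert_cmake_value_to_python_value('ON', 'BOOL')                # -> True
        convert_cmake_value_to_python_value('foo-NOTFOUND', 'FILEPATH')  # -> None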
37 | """ 38 | 39 | cmake_type = cmake_type.upper() 40 | up_val = cmake_value.upper() 41 | if cmake_type == 'BOOL': 42 | # https://gitlab.kitware.com/cmake/community/wikis/doc/cmake/VariablesListsStrings#boolean-values-in-cmake 43 | return not (up_val in ('FALSE', 'OFF', 'N', 'NO', '0', '', 'NOTFOUND') or up_val.endswith('-NOTFOUND')) 44 | elif cmake_type == 'FILEPATH': 45 | if up_val.endswith('-NOTFOUND'): 46 | return None 47 | else: 48 | return cmake_value 49 | else: # Directly return the cmake_value. 50 | return cmake_value 51 | 52 | 53 | def get_cmake_cache_variables_from_file(cmake_cache_file): 54 | r"""Gets values in CMakeCache.txt into a dictionary. 55 | 56 | Arguments: 57 | cmake_cache_file: A CMakeCache.txt file object. 58 | Returns: 59 | dict: A ``dict`` containing the value of cached CMake variables. 60 | """ 61 | 62 | results = dict() 63 | for i, line in enumerate(cmake_cache_file, 1): 64 | line = line.strip() 65 | if not line or line.startswith(('#', '//')): 66 | # Blank or comment line, skip 67 | continue 68 | 69 | # Almost any character can be part of variable name and value. As a practical matter, we assume the type must be 70 | # valid if it were a C variable name. It should match the following kinds of strings: 71 | # 72 | # USE_CUDA:BOOL=ON 73 | # "USE_CUDA":BOOL=ON 74 | # USE_CUDA=ON 75 | # USE_CUDA:=ON 76 | # Intel(R) MKL-DNN_SOURCE_DIR:STATIC=/path/to/pytorch/third_party/ideep/mkl-dnn 77 | # "OpenMP_COMPILE_RESULT_CXX_openmp:experimental":INTERNAL=FALSE 78 | matched = re.match(r'("?)(.+?)\1(?::\s*([a-zA-Z_-][a-zA-Z0-9_-]*)?)?\s*=\s*(.*)', line) 79 | if matched is None: # Illegal line 80 | raise ValueError('Unexpected line {} in {}: {}'.format(i, repr(cmake_cache_file), line)) 81 | _, variable, type_, value = matched.groups() 82 | if type_ is None: 83 | type_ = '' 84 | if type_.upper() in ('INTERNAL', 'STATIC'): 85 | # CMake internal variable, do not touch 86 | continue 87 | results[variable] = convert_cmake_value_to_python_value(value, type_) 88 | 89 | return results 90 | 91 | 92 | class CMakeExtension(Extension): 93 | """CMake extension""" 94 | def __init__(self, name, cmake_file): 95 | super().__init__(name, []) 96 | self.build_dir = BUILD_DIR 97 | self.cmake_file = cmake_file 98 | self._cmake_command = CMakeExtension._get_cmake_command() 99 | self.debug = True 100 | self.cmake_dir = os.path.dirname(cmake_file) 101 | 102 | @staticmethod 103 | def _get_version(cmd): 104 | """Returns cmake version.""" 105 | 106 | for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): 107 | if 'version' in line: 108 | return LooseVersion(line.strip().split(' ')[2]) 109 | raise RuntimeError('no version found') 110 | 111 | @staticmethod 112 | def _get_cmake_command(): 113 | """Returns cmake command.""" 114 | 115 | cmake_command = shutil.which('cmake') 116 | cmake3 = shutil.which('cmake3') 117 | if cmake3 is not None: 118 | cmake = shutil.which('cmake') 119 | if cmake is not None: 120 | bare_version = CMakeExtension._get_version(cmake) 121 | if (bare_version < LooseVersion("3.5.0") and 122 | CMakeExtension._get_version(cmake3) > bare_version): 123 | cmake_command = 'cmake3' 124 | return cmake_command 125 | 126 | @staticmethod 127 | def defines(args, **kwargs): 128 | "Adds definitions to a cmake argument list." 
129 | for key, value in sorted(kwargs.items()): 130 | if value is not None: 131 | args.append('-D{}={}'.format(key, value)) 132 | 133 | @staticmethod 134 | def _cmake_value(value): 135 | if type(value) is str: 136 | if value.startswith(('OFF', '0', 'False', 'FALSE')): 137 | return False 138 | if value.startswith(('ON', '1', 'True', 'TRUE')): 139 | return True 140 | return value 141 | 142 | @staticmethod 143 | def extract(args): 144 | "Adds definitions to a cmake argument list." 145 | build_options = {} 146 | pat = re.compile(r'^-D(.*)=(.*)') 147 | for arg in args: 148 | match = pat.match(arg) 149 | 150 | build_options[match.group(1)] = CMakeExtension._cmake_value(match.group(2)) 151 | 152 | return build_options 153 | 154 | @staticmethod 155 | def convert_cmake_dirs(paths): 156 | def converttostr(input_seq, seperator): 157 | # Join all the strings in list 158 | final_str = seperator.join(input_seq) 159 | return final_str 160 | try: 161 | return converttostr(paths, ";") 162 | except: 163 | return paths 164 | 165 | @property 166 | def _cmake_cache_file(self): 167 | r"""Returns the path to CMakeCache.txt. 168 | 169 | Returns: 170 | string: The path to CMakeCache.txt. 171 | """ 172 | return os.path.join(self.build_dir, 'CMakeCache.txt') 173 | 174 | def _get_cmake_cache_variables(self): 175 | r"""Gets values in CMakeCache.txt into a dictionary. 176 | Returns: 177 | dict: A ``dict`` containing the value of cached CMake variables. 178 | """ 179 | with open(self._cmake_cache_file) as f: 180 | return get_cmake_cache_variables_from_file(f) 181 | 182 | def _run(self, args, env): 183 | """Executes cmake with arguments and an environment.""" 184 | command = [self._cmake_command] + args + [self.cmake_dir] 185 | print(' '.join(command)) 186 | check_call(command, cwd=self.build_dir, env=env) 187 | 188 | def generate(self, build_options, env, build_dir, install_dir): 189 | """Runs cmake to generate native build files.""" 190 | 191 | self.build_dir = build_dir 192 | 193 | cmake_args = [] 194 | 195 | for var, val in env.items(): 196 | if var.startswith(('BUILD_', 'USE_', 'CMAKE_')): 197 | # TODO: DO NOT OVERWRITE CMAKE_PREFIX_PATH 198 | if var.strip() == "CMAKE_PREFIX_PATH": 199 | build_options[var] += ";" + val 200 | else: 201 | build_options[var] = val 202 | 203 | if 'CMAKE_BUILD_TYPE' not in env: 204 | if check_env_flag('DEBUG', env=env): 205 | build_options['CMAKE_BUILD_TYPE'] = 'Debug' 206 | elif check_env_flag('REL_WITH_DEB_INFO', env=env): 207 | build_options['CMAKE_BUILD_TYPE'] = 'RelWithDebInfo' 208 | else: 209 | build_options['CMAKE_BUILD_TYPE'] = 'Release' 210 | build_options['CMAKE_INSTALL_PREFIX'] = install_dir 211 | 212 | CMakeExtension.defines(cmake_args, **build_options) 213 | if os.path.exists(self._cmake_cache_file): 214 | try: 215 | cmake_cache_vars = defaultdict(lambda: False, self._get_cmake_cache_variables()) 216 | except FileNotFoundError: 217 | # CMakeCache.txt does not exist. Probably running "python setup.py clean" over a clean directory. 218 | cmake_cache_vars = defaultdict(lambda: False) 219 | 220 | cache_build_options = CMakeExtension.extract(cmake_args) 221 | if all(option in cmake_cache_vars and 222 | CMakeExtension._cmake_value(cache_build_options[option]) == CMakeExtension._cmake_value(cmake_cache_vars[option]) 223 | for option in cache_build_options): 224 | # Everything's in place. Do not rerun. 
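                # (Any mismatch between requested and cached options falls through to self._run below.)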
225 | return 226 | self._run(cmake_args, env=env) 227 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "utils.h" 33 | 34 | namespace oneccl_bindings_for_pytorch { 35 | 36 | // Op mapping 37 | using c10d::ReduceOp; 38 | std::map cclOps = 39 | { 40 | {ReduceOp::MIN, ccl::reduction::min}, 41 | {ReduceOp::MAX, ccl::reduction::max}, 42 | {ReduceOp::SUM, ccl::reduction::sum}, 43 | {ReduceOp::PRODUCT, ccl::reduction::prod}, 44 | }; 45 | 46 | std::map cclDatatypes = 47 | { 48 | {at::kByte, ccl::datatype::uint8}, 49 | {at::kChar, ccl::datatype::int8}, 50 | {at::kShort, ccl::datatype::int16}, 51 | {at::kInt, ccl::datatype::int32}, 52 | {at::kLong, ccl::datatype::int64}, 53 | {at::kHalf, ccl::datatype::float16}, 54 | {at::kFloat, ccl::datatype::float32}, 55 | {at::kDouble, ccl::datatype::float64}, 56 | {at::kBFloat16, ccl::datatype::bfloat16}, 57 | {at::kBool, ccl::datatype::uint8}, 58 | }; 59 | 60 | // Get the key from the list of devices 61 | std::string get_key_from_devs(const std::vector& devices) { 62 | std::string key = DeviceTypeName(devices[0].type(), /* lower case */ true) + ":"; 63 | for (auto& device : devices) { 64 | key.append(std::to_string(device.index()) + ","); 65 | } 66 | return key; 67 | } 68 | 69 | // Get the list of devices from list of tensors 70 | std::vector get_device_list(const std::vector& tensors) { 71 | std::vector res; 72 | res.reserve(tensors.size()); 73 | for (auto& tensor : tensors) { 74 | // Tensors must all be on the same device, or all on distinct devices. 
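  // Note: only devices equal to the first recorded device (res[0]) are collapsed by the check below.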
75 | if (res.size() == 0 || tensor.device() != res[0]) { 76 | res.push_back(tensor.device()); 77 | } 78 | } 79 | return res; 80 | } 81 | 82 | std::vector get_device_list(const std::vector >& tensors) { 83 | std::vector res; 84 | res.reserve(tensors.size()); 85 | for (auto& tensor : tensors) { 86 | res.push_back(tensor[0].device()); 87 | } 88 | return res; 89 | } 90 | 91 | bool check_same_size(const std::vector& tensors) { 92 | for (const auto& tensor : tensors) { 93 | if (!tensors[0].is_same_size(tensor)) { 94 | return false; 95 | } 96 | } 97 | return true; 98 | } 99 | 100 | std::vector flatten_tensor_lists(std::vector>& tensor_lists, std::vector& other, size_t world_size) { 101 | if (tensor_lists.size() != other.size()) { 102 | TORCH_CHECK( 103 | false, 104 | "Tensor list operands to scatter/gather must have the same length"); 105 | } 106 | const auto num_devices = tensor_lists.size(); 107 | 108 | std::vector flattened; 109 | flattened.resize(num_devices); 110 | 111 | for (const auto i : c10::irange(size_t{}, num_devices)) { 112 | if (tensor_lists[i].size() != world_size * num_devices) { 113 | TORCH_CHECK( 114 | false, 115 | c10::str( 116 | "Tensor list input to scatter/gather must match number of collective participants ", 117 | "but got ", 118 | tensor_lists[i].size(), 119 | " inputs", 120 | " with world_size ", 121 | world_size, 122 | " and ", 123 | num_devices, 124 | " devices.")); 125 | } 126 | 127 | // Only check device match for the first tensor in the list; the call to 128 | // newLikeFlat() below will check the rest. 129 | if (tensor_lists[i].front().get_device() != other[i].get_device()) { 130 | TORCH_CHECK( 131 | false, 132 | "Corresponding input/output tensors to scatter/gather must all reside" 133 | " on the same device"); 134 | } 135 | 136 | for (const auto& t : tensor_lists[i]) { 137 | if (t.numel() != other[i].numel()) { 138 | TORCH_CHECK( 139 | false, 140 | "All tensor operands to scatter/gather must have the same number of elements"); 141 | } 142 | } 143 | // Flatten the tensors (from all ranks) into a single big tensor. 144 | flattened[i] = c10d::newLikeFlat(tensor_lists, i); 145 | } 146 | return flattened; 147 | } 148 | 149 | std::string get_key_send_recv(int myRank, int peer) { 150 | int lowRank = myRank < peer ? myRank : peer; 151 | int highRank = myRank < peer ? 
peer : myRank; 152 | std::string sendRecvPair = 153 | std::to_string(lowRank) + ":" + std::to_string(highRank); 154 | return sendRecvPair; 155 | } 156 | 157 | FlatCheckResult computeLengthsAndCheckFlat( 158 | const std::vector& tensors, 159 | std::vector& lengths) 160 | { 161 | int64_t groupSize = lengths.size(); 162 | auto firstTensor = tensors[0]; 163 | int64_t offset = 0; 164 | auto firstLength = firstTensor.numel(); 165 | auto storage = firstTensor.storage(); 166 | auto firstStorageOffset = firstTensor.storage_offset(); 167 | bool isFlat = true; 168 | 169 | for (int i = 0; i < groupSize; i++) 170 | { 171 | auto& curTensor = tensors[i]; 172 | int64_t length = curTensor.numel(); 173 | 174 | if (firstLength == 0 && length != 0) 175 | { 176 | firstLength = length; 177 | firstTensor = curTensor; 178 | storage = curTensor.storage(); 179 | firstStorageOffset = curTensor.storage_offset(); 180 | } 181 | 182 | lengths[i] = length; 183 | 184 | if (isFlat && (length != 0 || firstLength != 0) && 185 | (!storage.is_alias_of(curTensor.storage()) || 186 | curTensor.storage_offset() != firstStorageOffset + offset)) 187 | isFlat = false; 188 | 189 | offset += length; 190 | } 191 | 192 | return FlatCheckResult{isFlat, offset, firstTensor}; 193 | } 194 | 195 | bool computeLengthsAndCheckAndGetFlat( 196 | const std::vector& tensors, 197 | std::vector& lengths, 198 | at::Tensor& flatTensor, 199 | int64_t& flatLength) 200 | { 201 | auto flatRes = computeLengthsAndCheckFlat(tensors, lengths); 202 | 203 | flatLength = flatRes.size; 204 | 205 | if (flatRes.isFlat) 206 | { 207 | flatTensor = flatRes.firstTensor; 208 | } 209 | else 210 | { 211 | flatTensor = at::empty({flatRes.size}, flatRes.firstTensor.options()); 212 | } 213 | 214 | return flatRes.isFlat; 215 | } 216 | 217 | void checkSingleTensorHelper(const at::Tensor& tensor) 218 | { 219 | TORCH_CHECK(tensor.is_sparse() || tensor.is_contiguous(tensor.suggest_memory_format()), "input dense tensor has to be contiguous"); 220 | TORCH_CHECK(!tensor.is_cuda(), "CUDA tensor detected and CCL doesn't support CUDA buffers"); 221 | TORCH_CHECK(tensor.numel() >= 0, "input tensor numel should be non-negative"); 222 | } 223 | 224 | void checkSingleTensor(const std::vector& tensors) 225 | { 226 | TORCH_CHECK(tensors.size() == 1, 227 | "CCL process group does not support tensors count " + std::to_string(tensors.size())); 228 | 229 | checkSingleTensorHelper(tensors[0]); 230 | } 231 | 232 | 233 | void checkSameType(const at::Tensor& tensor, 234 | const std::vector& tensors) 235 | { 236 | for (size_t i = 0; i < tensors.size(); ++i) 237 | { 238 | TORCH_CHECK(tensors[i].scalar_type() == tensor.scalar_type(), 239 | "Tensors are not equal in data type"); 240 | TORCH_CHECK(tensors[i].device().type() == tensor.device().type(), 241 | "Tensors are not in same device type. 
Expect: ", tensor.device().type(), 242 | " But got: ", tensors[i].device().type()); 243 | 244 | checkSingleTensorHelper(tensors[i]); 245 | } 246 | } 247 | 248 | void checkSameType(const at::Tensor& tensor, 249 | const std::vector>& tensors) 250 | { 251 | for (size_t i = 0; i < tensors.size(); ++i) 252 | { 253 | checkSameType(tensor, tensors[i]); 254 | } 255 | } 256 | 257 | } 258 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # DEBUG build with debug 2 | # 3 | # USE_SYSTEM_ONECCL=0 4 | # disables use of system-wide oneCCL (we will use our submoduled 5 | # copy in third_party/oneCCL) 6 | 7 | import os 8 | import sys 9 | import pathlib 10 | import shutil 11 | from subprocess import check_call, check_output 12 | 13 | import torch 14 | from torch.utils.cpp_extension import BuildExtension, CppExtension, library_paths 15 | from setuptools import setup 16 | from distutils.command.clean import clean 17 | from tools.setup.cmake import CMakeExtension 18 | from tools.setup.env import get_compiler 19 | 20 | # Constant known variables used throughout this file 21 | CWD = os.path.dirname(os.path.abspath(__file__)) 22 | ONECCL_BINDINGS_FOR_PYTORCH_PATH = os.path.join(CWD, "oneccl_bindings_for_pytorch") 23 | 24 | 25 | def _check_env_flag(name, default=''): 26 | return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] 27 | 28 | 29 | def check_file(f): 30 | if not os.path.exists(f): 31 | print("Could not find {}".format(f)) 32 | print("Did you run 'git submodule update --init --recursive'?") 33 | sys.exit(1) 34 | 35 | 36 | # all the work we need to do _before_ setup runs 37 | def create_version(): 38 | """Create the version string for torch-ccl""" 39 | package_name = os.getenv('CCL_PACKAGE_NAME', 'oneccl-bind-pt') 40 | version = open('version.txt', 'r').read().strip() 41 | sha = 'Unknown' 42 | 43 | try: 44 | sha = check_output(['git', 'rev-parse', 'HEAD'], cwd=CWD).decode('ascii').strip() 45 | except Exception: 46 | pass 47 | 48 | if os.getenv('CCL_SHA_VERSION', False): 49 | if sha != 'Unknown': 50 | version += '+' + sha[:7] 51 | 52 | if os.environ.get("COMPUTE_BACKEND") == "dpcpp": 53 | backend = "gpu" 54 | else: 55 | backend = os.environ.get("ONECCL_BINDINGS_FOR_PYTORCH_BACKEND", "cpu") 56 | 57 | if "+" not in version: 58 | version += '+' + backend 59 | 60 | print("Building {}-{}".format(package_name, version)) 61 | 62 | version_path = os.path.join(CWD, 'oneccl_bindings_for_pytorch', 'version.py') 63 | with open(version_path, 'w') as f: 64 | f.write("__version__ = '{}'\n".format(version)) 65 | f.write("git_version = {}\n".format(repr(sha))) 66 | 67 | return version, package_name 68 | 69 | 70 | class BuildCMakeExt(BuildExtension): 71 | """ 72 | Builds using cmake instead of the python setuptools implicit build 73 | """ 74 | 75 | def run(self): 76 | """ 77 | Perform build_cmake before doing the 'normal' stuff 78 | """ 79 | cmake_extensions = [ext for ext in self.extensions if isinstance(ext, CMakeExtension)] 80 | for ext in cmake_extensions: 81 | self.build_cmake(ext) 82 | 83 | self.extensions = [ext for ext in self.extensions if not isinstance(ext, CMakeExtension)] 84 | super(BuildCMakeExt, self).run() 85 | build_py = self.get_finalized_command('build_py') 86 | build_py.data_files = build_py._get_data_files() 87 | build_py.run() 88 | 89 | def build_cmake(self, extension: CMakeExtension): 90 | """ 91 | The steps required to build the extension 92 | """ 93 | 
build_dir = pathlib.Path('.'.join([self.build_temp, extension.name])) 94 | 95 | build_dir.mkdir(parents=True, exist_ok=True) 96 | install_dir = ONECCL_BINDINGS_FOR_PYTORCH_PATH 97 | 98 | # Now that the necessary directories are created, build 99 | my_env = os.environ.copy() 100 | my_env["CMAKE_DISABLE_FIND_PACKAGE_MKL"] = "TRUE" 101 | build_type = 'Release' 102 | 103 | if _check_env_flag('DEBUG'): 104 | build_type = 'Debug' 105 | 106 | build_options = { 107 | 'CMAKE_BUILD_TYPE': build_type, 108 | # The value cannot be easily obtained in CMakeLists.txt. 109 | 'CMAKE_PREFIX_PATH': torch.utils.cmake_prefix_path, 110 | # skip the example and test code in oneCCL 111 | 'BUILD_EXAMPLES': 'OFF', 112 | 'BUILD_CONFIG': 'OFF', 113 | 'BUILD_FT': 'OFF' 114 | } 115 | 116 | compute_backend = os.getenv('COMPUTE_BACKEND', 'n/a') 117 | runtime = 'gcc' 118 | if compute_backend == 'dpcpp': 119 | runtime = 'dpcpp' 120 | build_options['COMPUTE_BACKEND'] = compute_backend 121 | if "DPCPP_GCC_INSTALL_DIR" in my_env: 122 | exist_cflags = "CFLAGS" in my_env 123 | cflags = "" 124 | if exist_cflags: 125 | cflags = my_env["CFLAGS"] 126 | my_env["CFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} {cflags}" 127 | exist_cxxflags = "CXXFLAGS" in my_env 128 | cxxflags = "" 129 | if exist_cxxflags: 130 | cxxflags = my_env["CXXFLAGS"] 131 | my_env["CXXFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} {cxxflags}" 132 | exist_ldflags = "LDFLAGS" in my_env 133 | ldflags = "" 134 | if exist_ldflags: 135 | ldflags = my_env["LDFLAGS"] 136 | my_env["LDFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} -fuse-ld=lld -lrt -lpthread {ldflags}" 137 | 138 | cc, cxx = get_compiler(runtime) 139 | build_options['CMAKE_C_COMPILER'] = cc 140 | build_options['CMAKE_CXX_COMPILER'] = cxx 141 | 142 | extension.generate(build_options, my_env, build_dir, install_dir) 143 | 144 | if compute_backend == 'dpcpp': 145 | if "DPCPP_GCC_INSTALL_DIR" in my_env: 146 | if exist_cflags: 147 | my_env["CFLAGS"] = cflags 148 | else: 149 | del my_env["CFLAGS"] 150 | if exist_cxxflags: 151 | my_env["CXXFLAGS"] = cxxflags 152 | else: 153 | del my_env["CXXFLAGS"] 154 | if exist_ldflags: 155 | my_env["LDFLAGS"] = ldflags 156 | else: 157 | del my_env["LDFLAGS"] 158 | 159 | build_args = ['-j', str(os.cpu_count())] 160 | check_call(['make', 'oneccl_bindings_for_pytorch'] + build_args, cwd=str(build_dir)) 161 | if compute_backend == 'dpcpp': 162 | check_call(['make', 'oneccl_bindings_for_pytorch_xpu'] + build_args, cwd=str(build_dir)) 163 | check_call(['make', 'install'], cwd=str(build_dir)) 164 | 165 | 166 | class Clean(clean): 167 | def run(self): 168 | import glob 169 | import re 170 | 171 | with open('.gitignore', 'r') as f: 172 | ignores = f.read() 173 | pat = re.compile(r'^#( BEGIN NOT-CLEAN-FILES )?') 174 | for wildcard in filter(None, ignores.split('\n')): 175 | match = pat.match(wildcard) 176 | if match: 177 | if match.group(1): 178 | # Marker is found and stop reading .gitignore. 179 | break 180 | # Ignore lines which begin with '#'. 
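                    # Non-marker comment lines are skipped; any other pattern is globbed and removed below.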
181 | else: 182 | for filename in glob.glob(wildcard): 183 | try: 184 | os.remove(filename) 185 | except OSError: 186 | shutil.rmtree(filename, ignore_errors=True) 187 | 188 | clean.run(self) 189 | 190 | 191 | def get_python_c_module(): 192 | main_compile_args = [] 193 | main_libraries = ['oneccl_bindings_for_pytorch'] 194 | main_link_args = [] 195 | main_sources = ["oneccl_bindings_for_pytorch/csrc/_C.cpp", "oneccl_bindings_for_pytorch/csrc/init.cpp"] 196 | lib_path = os.path.join(ONECCL_BINDINGS_FOR_PYTORCH_PATH, "lib") 197 | library_dirs = [lib_path] 198 | include_path = os.path.join(CWD, "src") 199 | include_dirs = [include_path] 200 | extra_link_args = [] 201 | extra_compile_args = [ 202 | '-Wall', 203 | '-Wextra', 204 | '-Wno-strict-overflow', 205 | '-Wno-unused-parameter', 206 | '-Wno-missing-field-initializers', 207 | '-Wno-write-strings', 208 | '-Wno-unknown-pragmas', 209 | # This is required for Python 2 declarations that are deprecated in 3. 210 | '-Wno-deprecated-declarations', 211 | # Python 2.6 requires -fno-strict-aliasing, see 212 | # http://legacy.python.org/dev/peps/pep-3123/ 213 | # We also depend on it in our code (even Python 3). 214 | '-fno-strict-aliasing', 215 | # Clang has an unfixed bug leading to spurious missing 216 | # braces warnings, see 217 | # https://bugs.llvm.org/show_bug.cgi?id=21629 218 | '-Wno-missing-braces', 219 | ] 220 | 221 | def make_relative_rpath(path): 222 | ret = [] 223 | ret.append('-Wl,-rpath,$ORIGIN/' + path) 224 | if os.getenv('COMPUTE_BACKEND', 'n/a') == 'dpcpp': 225 | ret.append('-Wl,-rpath,$ORIGIN/../../../') 226 | ret.append('-Wl,--disable-new-dtags') 227 | return ret 228 | 229 | _c_module = CppExtension("oneccl_bindings_for_pytorch._C", 230 | libraries=main_libraries, 231 | sources=main_sources, 232 | language='c', 233 | extra_compile_args=main_compile_args + extra_compile_args, 234 | include_dirs=include_dirs, 235 | library_dirs=library_dirs, 236 | extra_link_args=extra_link_args + main_link_args + make_relative_rpath('lib')) 237 | 238 | return _c_module 239 | 240 | 241 | if __name__ == '__main__': 242 | version, package_name = create_version() 243 | c_module = get_python_c_module() 244 | cmake_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "CMakeLists.txt") 245 | modules = [CMakeExtension("liboneccl_bindings_for_pytorch", cmake_file), c_module] 246 | setup( 247 | name=package_name, 248 | version=version, 249 | ext_modules=modules, 250 | packages=['oneccl_bindings_for_pytorch'], 251 | package_data={ 252 | 'oneccl_bindings_for_pytorch': [ 253 | '*.py', 254 | '*/*.h', 255 | '*/*.hpp', 256 | 'lib/*.so*', 257 | 'opt/mpi/lib/*.so*', 258 | 'bin/*', 259 | 'opt/mpi/bin/*', 260 | 'env/*', 261 | 'etc/*', 262 | 'opt/mpi/etc/*', 263 | 'examples/*', 264 | 'include/native_device_api/*.h*', 265 | 'include/native_device_api/l0/*.h*', 266 | 'include/*.h*', 267 | 'opt/mpi/include/*.h*', 268 | 'lib/lib*', 269 | 'opt/mpi/libfabric/lib/lib*', 270 | 'lib/prov/lib*', 271 | 'lib/ccl/kernels/*', 272 | 'opt/mpi/libfabric/lib/prov/lib*', 273 | 'licensing/*', 274 | 'modulefiles/*', 275 | ]}, 276 | cmdclass={ 277 | 'build_ext': BuildCMakeExt, 278 | 'clean': Clean, 279 | } 280 | ) 281 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/DeepSpeed.csv: -------------------------------------------------------------------------------- 1 | allreduce,1,-1 2 | broadcast,154533888,0 3 | broadcast,6291456,0 4 | broadcast,3072,0 5 | broadcast,3072,0 6 | broadcast,28311552,0 7 | broadcast,9216,0 8 | 
broadcast,9437184,0 9 | broadcast,3072,0 10 | broadcast,3072,0 11 | broadcast,3072,0 12 | broadcast,37748736,0 13 | broadcast,12288,0 14 | broadcast,37748736,0 15 | broadcast,3072,0 16 | broadcast,3072,0 17 | broadcast,3072,0 18 | broadcast,28311552,0 19 | broadcast,9216,0 20 | broadcast,9437184,0 21 | broadcast,3072,0 22 | broadcast,3072,0 23 | broadcast,3072,0 24 | broadcast,37748736,0 25 | broadcast,12288,0 26 | broadcast,37748736,0 27 | broadcast,3072,0 28 | broadcast,3072,0 29 | broadcast,3072,0 30 | broadcast,28311552,0 31 | broadcast,9216,0 32 | broadcast,9437184,0 33 | broadcast,3072,0 34 | broadcast,3072,0 35 | broadcast,3072,0 36 | broadcast,37748736,0 37 | broadcast,12288,0 38 | broadcast,37748736,0 39 | broadcast,3072,0 40 | broadcast,3072,0 41 | broadcast,3072,0 42 | broadcast,28311552,0 43 | broadcast,9216,0 44 | broadcast,9437184,0 45 | broadcast,3072,0 46 | broadcast,3072,0 47 | broadcast,3072,0 48 | broadcast,37748736,0 49 | broadcast,12288,0 50 | broadcast,37748736,0 51 | broadcast,3072,0 52 | broadcast,3072,0 53 | broadcast,3072,0 54 | broadcast,28311552,0 55 | broadcast,9216,0 56 | broadcast,9437184,0 57 | broadcast,3072,0 58 | broadcast,3072,0 59 | broadcast,3072,0 60 | broadcast,37748736,0 61 | broadcast,12288,0 62 | broadcast,37748736,0 63 | broadcast,3072,0 64 | broadcast,3072,0 65 | broadcast,3072,0 66 | broadcast,28311552,0 67 | broadcast,9216,0 68 | broadcast,9437184,0 69 | broadcast,3072,0 70 | broadcast,3072,0 71 | broadcast,3072,0 72 | broadcast,37748736,0 73 | broadcast,12288,0 74 | broadcast,37748736,0 75 | broadcast,3072,0 76 | broadcast,3072,0 77 | broadcast,3072,0 78 | broadcast,28311552,0 79 | broadcast,9216,0 80 | broadcast,9437184,0 81 | broadcast,3072,0 82 | broadcast,3072,0 83 | broadcast,3072,0 84 | broadcast,37748736,0 85 | broadcast,12288,0 86 | broadcast,37748736,0 87 | broadcast,3072,0 88 | broadcast,3072,0 89 | broadcast,3072,0 90 | broadcast,28311552,0 91 | broadcast,9216,0 92 | broadcast,9437184,0 93 | broadcast,3072,0 94 | broadcast,3072,0 95 | broadcast,3072,0 96 | broadcast,37748736,0 97 | broadcast,12288,0 98 | broadcast,37748736,0 99 | broadcast,3072,0 100 | broadcast,3072,0 101 | broadcast,3072,0 102 | broadcast,28311552,0 103 | broadcast,9216,0 104 | broadcast,9437184,0 105 | broadcast,3072,0 106 | broadcast,3072,0 107 | broadcast,3072,0 108 | broadcast,37748736,0 109 | broadcast,12288,0 110 | broadcast,37748736,0 111 | broadcast,3072,0 112 | broadcast,3072,0 113 | broadcast,3072,0 114 | broadcast,28311552,0 115 | broadcast,9216,0 116 | broadcast,9437184,0 117 | broadcast,3072,0 118 | broadcast,3072,0 119 | broadcast,3072,0 120 | broadcast,37748736,0 121 | broadcast,12288,0 122 | broadcast,37748736,0 123 | broadcast,3072,0 124 | broadcast,3072,0 125 | broadcast,3072,0 126 | broadcast,28311552,0 127 | broadcast,9216,0 128 | broadcast,9437184,0 129 | broadcast,3072,0 130 | broadcast,3072,0 131 | broadcast,3072,0 132 | broadcast,37748736,0 133 | broadcast,12288,0 134 | broadcast,37748736,0 135 | broadcast,3072,0 136 | broadcast,3072,0 137 | broadcast,3072,0 138 | broadcast,28311552,0 139 | broadcast,9216,0 140 | broadcast,9437184,0 141 | broadcast,3072,0 142 | broadcast,3072,0 143 | broadcast,3072,0 144 | broadcast,37748736,0 145 | broadcast,12288,0 146 | broadcast,37748736,0 147 | broadcast,3072,0 148 | broadcast,3072,0 149 | broadcast,3072,0 150 | broadcast,28311552,0 151 | broadcast,9216,0 152 | broadcast,9437184,0 153 | broadcast,3072,0 154 | broadcast,3072,0 155 | broadcast,3072,0 156 | broadcast,37748736,0 157 | broadcast,12288,0 
158 | broadcast,37748736,0 159 | broadcast,3072,0 160 | broadcast,3072,0 161 | broadcast,3072,0 162 | broadcast,28311552,0 163 | broadcast,9216,0 164 | broadcast,9437184,0 165 | broadcast,3072,0 166 | broadcast,3072,0 167 | broadcast,3072,0 168 | broadcast,37748736,0 169 | broadcast,12288,0 170 | broadcast,37748736,0 171 | broadcast,3072,0 172 | broadcast,3072,0 173 | broadcast,3072,0 174 | broadcast,28311552,0 175 | broadcast,9216,0 176 | broadcast,9437184,0 177 | broadcast,3072,0 178 | broadcast,3072,0 179 | broadcast,3072,0 180 | broadcast,37748736,0 181 | broadcast,12288,0 182 | broadcast,37748736,0 183 | broadcast,3072,0 184 | broadcast,3072,0 185 | broadcast,3072,0 186 | broadcast,28311552,0 187 | broadcast,9216,0 188 | broadcast,9437184,0 189 | broadcast,3072,0 190 | broadcast,3072,0 191 | broadcast,3072,0 192 | broadcast,37748736,0 193 | broadcast,12288,0 194 | broadcast,37748736,0 195 | broadcast,3072,0 196 | broadcast,3072,0 197 | broadcast,3072,0 198 | broadcast,28311552,0 199 | broadcast,9216,0 200 | broadcast,9437184,0 201 | broadcast,3072,0 202 | broadcast,3072,0 203 | broadcast,3072,0 204 | broadcast,37748736,0 205 | broadcast,12288,0 206 | broadcast,37748736,0 207 | broadcast,3072,0 208 | broadcast,3072,0 209 | broadcast,3072,0 210 | broadcast,28311552,0 211 | broadcast,9216,0 212 | broadcast,9437184,0 213 | broadcast,3072,0 214 | broadcast,3072,0 215 | broadcast,3072,0 216 | broadcast,37748736,0 217 | broadcast,12288,0 218 | broadcast,37748736,0 219 | broadcast,3072,0 220 | broadcast,3072,0 221 | broadcast,3072,0 222 | broadcast,28311552,0 223 | broadcast,9216,0 224 | broadcast,9437184,0 225 | broadcast,3072,0 226 | broadcast,3072,0 227 | broadcast,3072,0 228 | broadcast,37748736,0 229 | broadcast,12288,0 230 | broadcast,37748736,0 231 | broadcast,3072,0 232 | broadcast,3072,0 233 | broadcast,3072,0 234 | broadcast,28311552,0 235 | broadcast,9216,0 236 | broadcast,9437184,0 237 | broadcast,3072,0 238 | broadcast,3072,0 239 | broadcast,3072,0 240 | broadcast,37748736,0 241 | broadcast,12288,0 242 | broadcast,37748736,0 243 | broadcast,3072,0 244 | broadcast,3072,0 245 | broadcast,3072,0 246 | broadcast,28311552,0 247 | broadcast,9216,0 248 | broadcast,9437184,0 249 | broadcast,3072,0 250 | broadcast,3072,0 251 | broadcast,3072,0 252 | broadcast,37748736,0 253 | broadcast,12288,0 254 | broadcast,37748736,0 255 | broadcast,3072,0 256 | broadcast,3072,0 257 | broadcast,3072,0 258 | broadcast,28311552,0 259 | broadcast,9216,0 260 | broadcast,9437184,0 261 | broadcast,3072,0 262 | broadcast,3072,0 263 | broadcast,3072,0 264 | broadcast,37748736,0 265 | broadcast,12288,0 266 | broadcast,37748736,0 267 | broadcast,3072,0 268 | broadcast,3072,0 269 | broadcast,3072,0 270 | broadcast,28311552,0 271 | broadcast,9216,0 272 | broadcast,9437184,0 273 | broadcast,3072,0 274 | broadcast,3072,0 275 | broadcast,3072,0 276 | broadcast,37748736,0 277 | broadcast,12288,0 278 | broadcast,37748736,0 279 | broadcast,3072,0 280 | broadcast,3072,0 281 | broadcast,3072,0 282 | broadcast,28311552,0 283 | broadcast,9216,0 284 | broadcast,9437184,0 285 | broadcast,3072,0 286 | broadcast,3072,0 287 | broadcast,3072,0 288 | broadcast,37748736,0 289 | broadcast,12288,0 290 | broadcast,37748736,0 291 | broadcast,3072,0 292 | broadcast,3072,0 293 | broadcast,3072,0 294 | broadcast,28311552,0 295 | broadcast,9216,0 296 | broadcast,9437184,0 297 | broadcast,3072,0 298 | broadcast,3072,0 299 | broadcast,3072,0 300 | broadcast,37748736,0 301 | broadcast,12288,0 302 | broadcast,37748736,0 303 | broadcast,3072,0 
304 | broadcast,3072,0 305 | broadcast,3072,0 306 | broadcast,28311552,0 307 | broadcast,9216,0 308 | broadcast,9437184,0 309 | broadcast,3072,0 310 | broadcast,3072,0 311 | broadcast,3072,0 312 | broadcast,37748736,0 313 | broadcast,12288,0 314 | broadcast,37748736,0 315 | broadcast,3072,0 316 | broadcast,3072,0 317 | broadcast,3072,0 318 | broadcast,28311552,0 319 | broadcast,9216,0 320 | broadcast,9437184,0 321 | broadcast,3072,0 322 | broadcast,3072,0 323 | broadcast,3072,0 324 | broadcast,37748736,0 325 | broadcast,12288,0 326 | broadcast,37748736,0 327 | broadcast,3072,0 328 | broadcast,3072,0 329 | broadcast,3072,0 330 | broadcast,28311552,0 331 | broadcast,9216,0 332 | broadcast,9437184,0 333 | broadcast,3072,0 334 | broadcast,3072,0 335 | broadcast,3072,0 336 | broadcast,37748736,0 337 | broadcast,12288,0 338 | broadcast,37748736,0 339 | broadcast,3072,0 340 | broadcast,3072,0 341 | broadcast,3072,0 342 | broadcast,28311552,0 343 | broadcast,9216,0 344 | broadcast,9437184,0 345 | broadcast,3072,0 346 | broadcast,3072,0 347 | broadcast,3072,0 348 | broadcast,37748736,0 349 | broadcast,12288,0 350 | broadcast,37748736,0 351 | broadcast,3072,0 352 | broadcast,3072,0 353 | broadcast,3072,0 354 | broadcast,28311552,0 355 | broadcast,9216,0 356 | broadcast,9437184,0 357 | broadcast,3072,0 358 | broadcast,3072,0 359 | broadcast,3072,0 360 | broadcast,37748736,0 361 | broadcast,12288,0 362 | broadcast,37748736,0 363 | broadcast,3072,0 364 | broadcast,3072,0 365 | broadcast,3072,0 366 | allreduce,1,-1 367 | allreduce,1,-1 368 | broadcast,3,0 369 | broadcast,5,0 370 | broadcast,16392,0 371 | allreduce,16384,-1 372 | allreduce,16384,-1 373 | allreduce,16384,-1 374 | allreduce,1,-1 375 | reduce,264330240,0 376 | reduce,1024,0 377 | reduce,11264,0 378 | reduce,5472256,0 379 | reduce,32276480,0 380 | reduce,188823552,0 381 | reduce,75515904,0 382 | reduce,5120,0 383 | reduce,1024,0 384 | reduce,10947584,0 385 | reduce,26804224,0 386 | reduce,264333312,0 387 | reduce,9216,0 388 | reduce,9437184,0 389 | reduce,16425984,0 390 | reduce,11894784,0 391 | reduce,75528192,0 392 | reduce,151044096,0 393 | reduce,4096,0 394 | reduce,2048,0 395 | reduce,3072,0 396 | reduce,37748736,0 397 | reduce,21901312,0 398 | reduce,15859712,0 399 | reduce,226572288,0 400 | reduce,5120,0 401 | reduce,1024,0 402 | reduce,3072,0 403 | reduce,9437184,0 404 | reduce,9216,0 405 | reduce,28311552,0 406 | reduce,9216,0 407 | reduce,27361280,0 408 | reduce,10387456,0 409 | reduce,188814336,0 410 | reduce,3072,0 411 | reduce,3072,0 412 | reduce,3072,0 413 | reduce,37748736,0 414 | reduce,12288,0 415 | reduce,37748736,0 416 | reduce,4531200,0 417 | reduce,4915200,0 418 | reduce,179377152,0 419 | reduce,37748736,0 420 | reduce,4096,0 421 | reduce,2048,0 422 | reduce,3072,0 423 | reduce,9437184,0 424 | reduce,9216,0 425 | reduce,28311552,0 426 | reduce,566272,0 427 | reduce,37191680,0 428 | reduce,188814336,0 429 | reduce,2048,0 430 | reduce,4096,0 431 | reduce,3072,0 432 | reduce,37748736,0 433 | reduce,12288,0 434 | reduce,37748736,0 435 | reduce,9216,0 436 | reduce,9437184,0 437 | reduce,6038528,0 438 | reduce,22282240,0 439 | reduce,75528192,0 440 | reduce,113276928,0 441 | reduce,3072,0 442 | reduce,3072,0 443 | reduce,3072,0 444 | reduce,9437184,0 445 | reduce,9216,0 446 | reduce,28311552,0 447 | reduce,9216,0 448 | reduce,37748736,0 449 | reduce,11513856,0 450 | reduce,26247168,0 451 | reduce,151053312,0 452 | reduce,1024,0 453 | reduce,5120,0 454 | reduce,3072,0 455 | reduce,37748736,0 456 | reduce,12288,0 457 | 
reduce,37748736,0 458 | reduce,9216,0 459 | reduce,9437184,0 460 | reduce,9216,0 461 | reduce,28311552,0 462 | reduce,9216,0 463 | reduce,16973824,0 464 | reduce,20774912,0 465 | reduce,151047168,0 466 | reduce,2048,0 467 | reduce,4096,0 468 | reduce,3072,0 469 | reduce,9437184,0 470 | reduce,9216,0 471 | reduce,28311552,0 472 | reduce,9216,0 473 | reduce,37748736,0 474 | reduce,12288,0 475 | reduce,37748736,0 476 | reduce,9216,0 477 | reduce,9437184,0 478 | reduce,22455296,0 479 | reduce,5865472,0 480 | reduce,119583744,0 481 | reduce,154533888,0 482 | allreduce,1,-1 483 | allreduce,1,-1 484 | allreduce,1,-1 485 | allgather,42359660,-1 486 | allgather,42359660,-1 487 | allgather,42359660,-1 488 | allgather,42359660,-1 489 | allgather,42359660,-1 490 | allgather,42359660,-1 491 | allgather,42359672,-1 492 | allgather,100352,-1 493 | -------------------------------------------------------------------------------- /src/ProcessGroupCCL.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #pragma once 33 | 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include 41 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 42 | #if TORCH_VERSION_MAJOR > 1 43 | #include 44 | #include 45 | #else 46 | #include 47 | #endif 48 | #include 49 | #include 50 | #include 51 | #include 52 | #else 53 | #include 54 | #include 55 | #include 56 | #include 57 | #endif 58 | 59 | 60 | namespace oneccl_bindings_for_pytorch { 61 | struct CCLCommCollector; 62 | 63 | static inline void format_tensors_param(std::vector& param, const at::Tensor& tensor) { 64 | param.emplace_back(tensor); 65 | } 66 | 67 | template 68 | static inline void format_tensors_param(std::vector& param, const std::vector& vec) { 69 | for (const auto& elem : vec) { 70 | format_tensors_param(param, elem); 71 | } 72 | } 73 | } 74 | 75 | namespace c10d { 76 | 77 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 78 | using C10D_Work = c10d::Work; 79 | #else 80 | using C10D_Work = c10d::ProcessGroup::Work; 81 | #endif 82 | 83 | // WorkCCL is the state associated with a CCL operarion. 84 | // 85 | // ProcessGroupCCL implements CCL bindings for c10d. 86 | // 87 | // All functions on this class are expected to be called in the same 88 | // order across processes in the group. 89 | // 90 | // All collective functions provided by this class are scheduled 91 | // for asynchronous execution by CCL. 92 | constexpr const char* CCL_BACKEND_NAME = "ccl"; 93 | 94 | // Environment variable which controls whether wait() and synchronize() are blocking or 95 | // non-blocking. 96 | constexpr const char* CCL_BLOCKING_WAIT = "CCL_BLOCKING_WAIT"; 97 | 98 | // Environment variable which controls whether or not use default stream as 99 | // communication stream for collectives 100 | constexpr const char* CCL_SAME_STREAM = "CCL_SAME_STREAM"; 101 | 102 | constexpr const char* TORCH_LLM_ALLREDUCE = "TORCH_LLM_ALLREDUCE"; 103 | 104 | // inline constexpr CoalActive = 0x01, CoalColl = 0x02, CoalP2P = 0x04; 105 | 106 | #if TORCH_VERSION_MAJOR > 1 107 | using Baseclass = Backend; 108 | #else 109 | using Baseclass = ProcessGroup; 110 | #endif 111 | class ProcessGroupCCL : public Baseclass 112 | { 113 | public: 114 | class AsyncWorkCCL : public C10D_Work { 115 | public: 116 | AsyncWorkCCL(std::vector> outputTensors, 117 | int rank = -1, 118 | c10d::OpType opType = OpType::UNKNOWN, 119 | const char* profilingTitle = nullptr, 120 | const c10::optional>& inputTensors = c10::nullopt); 121 | 122 | virtual void run() = 0; 123 | 124 | c10::intrusive_ptr getFuture() override; 125 | 126 | std::vector result() override; 127 | 128 | virtual void finishAsyncWorkCCL(); 129 | 130 | void finishAsyncWorkCCLError(std::exception_ptr eptr); 131 | 132 | public: 133 | std::string debugName; 134 | // Clone of blockingWait_ from ProcessGroupCCL. 135 | #if CCL_MINOR_VERSION < 14 136 | bool blockingWait_ = true; 137 | #else 138 | bool blockingWait_ = false; 139 | #endif 140 | // Clone of useSameStream_ from ProcessGroupCCL. 141 | bool useSameStream_ = false; 142 | 143 | protected: 144 | friend class ProcessGroupCCL; 145 | const std::vector> outputTensors_; 146 | // The future returned by getFuture. 
147 | c10::intrusive_ptr future_; 148 | }; 149 | 150 | explicit ProcessGroupCCL(const c10::intrusive_ptr& store, 151 | int rank, 152 | int size, 153 | std::chrono::milliseconds); 154 | virtual ~ProcessGroupCCL(); 155 | 156 | #if TORCH_VERSION_MINOR >= 11 157 | const std::string getBackendName() const override { 158 | return std::string(CCL_BACKEND_NAME); 159 | } 160 | #endif 161 | 162 | void startCoalescing() override; 163 | 164 | c10::intrusive_ptr endCoalescing() override; 165 | 166 | c10::intrusive_ptr broadcast( 167 | std::vector& data, 168 | const BroadcastOptions& opts = BroadcastOptions()) override; 169 | 170 | c10::intrusive_ptr allreduce( 171 | std::vector& tensors, 172 | const AllreduceOptions& opts = AllreduceOptions()) override; 173 | 174 | c10::intrusive_ptr allreduce_coalesced( 175 | std::vector& tensors, 176 | const AllreduceCoalescedOptions& opts = 177 | AllreduceCoalescedOptions()) override; 178 | 179 | c10::intrusive_ptr reduce( 180 | std::vector& tensors, 181 | const ReduceOptions& opts = ReduceOptions()) override; 182 | 183 | c10::intrusive_ptr allgather( 184 | std::vector>& outputTensors, 185 | std::vector& inputTensors, 186 | const AllgatherOptions& opts = AllgatherOptions()) override; 187 | 188 | c10::intrusive_ptr _allgather_base( 189 | at::Tensor& outputBuffer, 190 | at::Tensor& inputBuffer, 191 | const AllgatherOptions& opts = AllgatherOptions()) override; 192 | 193 | c10::intrusive_ptr allgather_coalesced( 194 | std::vector>& outputTensorLists, 195 | std::vector& inputTensors, 196 | const AllgatherOptions& opts = AllgatherOptions()) override; 197 | 198 | c10::intrusive_ptr allgather_into_tensor_coalesced( 199 | std::vector& outputTensors, 200 | std::vector& inputTensors, 201 | const AllgatherOptions& opts = AllgatherOptions()) override; 202 | 203 | c10::intrusive_ptr gather( 204 | std::vector>& outputTensors, 205 | std::vector& inputTensors, 206 | const GatherOptions& opts = GatherOptions()) override; 207 | 208 | c10::intrusive_ptr scatter( 209 | std::vector& outputTensors, 210 | std::vector>& inputTensors, 211 | const ScatterOptions& opts = ScatterOptions()) override; 212 | 213 | c10::intrusive_ptr reduce_scatter( 214 | std::vector& outputTensors, 215 | std::vector>& inputTensors, 216 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 217 | 218 | c10::intrusive_ptr _reduce_scatter_base( 219 | at::Tensor& outputBuffer, 220 | at::Tensor& inputBuffer, 221 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 222 | 223 | c10::intrusive_ptr reduce_scatter_tensor_coalesced( 224 | std::vector& outputs, 225 | std::vector& inputs, 226 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 227 | 228 | c10::intrusive_ptr alltoall_base( 229 | at::Tensor& outputTensor, 230 | at::Tensor& inputTensor, 231 | std::vector& outputSplitSizes, 232 | std::vector& inputSplitSizes, 233 | const AllToAllOptions& opts = AllToAllOptions()) override; 234 | 235 | c10::intrusive_ptr alltoall( 236 | std::vector& outputTensors, 237 | std::vector& inputTensors, 238 | const AllToAllOptions& opts = AllToAllOptions()) override; 239 | 240 | c10::intrusive_ptr send( 241 | std::vector& tensors, 242 | int dstRank, 243 | int tag) override; 244 | 245 | c10::intrusive_ptr recv( 246 | std::vector& tensors, 247 | int srcRank, 248 | int tag) override; 249 | 250 | c10::intrusive_ptr recvAnysource( 251 | std::vector& tensor, 252 | int tag) override; 253 | 254 | c10::intrusive_ptr barrier( 255 | const BarrierOptions& opts = BarrierOptions()) override; 256 | 
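  // groupStart()/groupEnd() bracket a group of operations issued together;
  // the nesting of active groups is tracked by cclActiveGroupCounter_ (see below).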
257 | void groupStart(); 258 | 259 | void groupEnd(); 260 | 261 | // create a new ProcessGroupCCL and initialize CCL if not initialized 262 | #if TORCH_VERSION_MAJOR > 1 263 | static c10::intrusive_ptr createProcessGroupCCL( 264 | #else 265 | static c10::intrusive_ptr createProcessGroupCCL( 266 | #endif 267 | const c10::intrusive_ptr& store, 268 | int rank = -1, 269 | int size = -1, 270 | std::chrono::milliseconds op_time_out = kNoTimeout); 271 | static const int64_t OP_TIMEOUT_MILLIS; 272 | public: 273 | 274 | static void cclInitOnce(); 275 | static void cclFini(); 276 | 277 | // Store that is used to exchange information between processes. 278 | c10::intrusive_ptr store_; 279 | 280 | std::chrono::milliseconds timeout; 281 | 282 | std::unique_ptr ccl_member_; 283 | 284 | static std::mutex globalMutex; 285 | 286 | // Whether or not wait() and synchronize() are blocking operations that wait 287 | // for the operation to complete. 288 | #if CCL_MINOR_VERSION < 14 289 | bool blockingWait_ = true; 290 | #else 291 | bool blockingWait_ = false; 292 | #endif 293 | 294 | // Environment variable which controls whether to keep same stream 295 | // for collectives and compute 296 | bool useSameStream_ = false; 297 | 298 | bool torch_llm_allreduce_ = false; 299 | 300 | // Flag to denote if a coalescing groupStart/groupEnd block is active 301 | bool is_coalescing_ = false; 302 | 303 | // Stores device indexes for all collectives run inside a coalescing block 304 | std::vector coalescedDevices_; 305 | 306 | // The number of active groupStart() calls. This counter will be increased 307 | // by 1 when groupStart() is called and decreased by 1 when group_end() 308 | // is called. 309 | static thread_local uint64_t cclActiveGroupCounter_; 310 | }; 311 | 312 | } // namespace c10d 313 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | **NOTE**: This repo has been **DEPRECATED**. Please use [PyTorch*](https://github.com/pytorch/pytorch) directly for distributed scenarios. We remain committed to providing robust support and high performance through PyTorch* for Intel® CPU and GPU platforms. 3 | 4 | =========================================================================== 5 | 6 | # Intel® oneCCL Bindings for PyTorch (formerly known as torch_ccl) 7 | 8 | This repository holds PyTorch bindings maintained by Intel® for the Intel® oneAPI Collective Communications Library (oneCCL). 9 | 10 | ## Introduction 11 | 12 | [PyTorch](https://github.com/pytorch/pytorch) is an open-source machine learning framework. 13 | 14 | [Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training, implementing collectives like `allreduce`, `allgather`, `alltoall`. For more information on oneCCL, please refer to the [oneCCL documentation](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/oneccl/source/). 15 | 16 | `oneccl_bindings_for_pytorch` module implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now. 17 | 18 | ## Capability 19 | 20 | The table below shows which functions are available for use with CPU / Intel dGPU tensors. 
21 | 22 | | | CPU | GPU | 23 | | :--------------- | :---: | :---: | 24 | | `send` | × | √ | 25 | | `recv` | × | √ | 26 | | `broadcast` | √ | √ | 27 | | `all_reduce` | √ | √ | 28 | | `reduce` | √ | √ | 29 | | `all_gather` | √ | √ | 30 | | `gather` | √ | √ | 31 | | `scatter` | √ | √ | 32 | | `reduce_scatter` | √ | √ | 33 | | `all_to_all` | √ | √ | 34 | | `barrier` | √ | √ | 35 | 36 | 37 | ## PyTorch API Align 38 | 39 | We recommend using Anaconda as the Python package management system. The following are the corresponding branches (tags) of `oneccl_bindings_for_pytorch` and the supported PyTorch versions. 40 | 41 | | `torch` | `oneccl_bindings_for_pytorch` | 42 | | :-------------------------------------------------------------: | :-----------------------------------------------------------------------: | 43 | | `master` | `master` | 44 | | [v2.8.0](https://github.com/pytorch/pytorch/tree/v2.8.0) | [ccl_torch2.8.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.8.0+xpu) | 45 | | [v2.7.0](https://github.com/pytorch/pytorch/tree/v2.7.0) | [ccl_torch2.7.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.7.0+xpu) | 46 | | [v2.6.0](https://github.com/pytorch/pytorch/tree/v2.6.0) | [ccl_torch2.6.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.6.0+xpu) | 47 | | [v2.5.0](https://github.com/pytorch/pytorch/tree/v2.5.0) | [ccl_torch2.5.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.5.0+xpu) | 48 | | [v2.3.1](https://github.com/pytorch/pytorch/tree/v2.3.1) | [ccl_torch2.3.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.3.100+xpu) | 49 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.400](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.400+xpu) | 50 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.300](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.300+xpu) | 51 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.200](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.200+xpu) | 52 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.100+xpu) | 53 | | [v2.0.1](https://github.com/pytorch/pytorch/tree/v2.0.1) | [ccl_torch2.0.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.0.100) | 54 | | [v1.13](https://github.com/pytorch/pytorch/tree/v1.13) | [ccl_torch1.13](https://github.com/intel/torch-ccl/tree/ccl_torch1.13) | 55 | | [v1.12.1](https://github.com/pytorch/pytorch/tree/v1.12.1) | [ccl_torch1.12.100](https://github.com/intel/torch-ccl/tree/ccl_torch1.12.100) | 56 | | [v1.12.0](https://github.com/pytorch/pytorch/tree/v1.12.0) | [ccl_torch1.12](https://github.com/intel/torch-ccl/tree/ccl_torch1.12) | 57 | | [v1.11.0](https://github.com/pytorch/pytorch/tree/v1.11.0) | [ccl_torch1.11](https://github.com/intel/torch-ccl/tree/ccl_torch1.11) | 58 | | [v1.10.0](https://github.com/pytorch/pytorch/tree/v1.10.0) | [ccl_torch1.10](https://github.com/intel/torch-ccl/tree/ccl_torch1.10) | 59 | | [v1.9.0](https://github.com/pytorch/pytorch/tree/v1.9.0) | [ccl_torch1.9](https://github.com/intel/torch-ccl/tree/ccl_torch1.9) | 60 | | [v1.8.1](https://github.com/pytorch/pytorch/tree/v1.8.1) | [ccl_torch1.8](https://github.com/intel/torch-ccl/tree/ccl_torch1.8) | 61 | | [v1.7.1](https://github.com/pytorch/pytorch/tree/v1.7.1) | [ccl_torch1.7](https://github.com/intel/torch-ccl/tree/ccl_torch1.7) | 62 | | [v1.6.0](https://github.com/pytorch/pytorch/tree/v1.6.0) | [ccl_torch1.6](https://github.com/intel/torch-ccl/tree/ccl_torch1.6) |
63 | | [v1.5-rc3](https://github.com/pytorch/pytorch/tree/v1.5.0-rc3) | [beta09](https://github.com/intel/torch-ccl/tree/beta09) | 64 | 65 | The usage details can be found in the README of corresponding branch. 66 | 67 | ## Requirements 68 | 69 | - Python 3.8 or later and a C++17 compiler 70 | 71 | - PyTorch v2.8.0 72 | 73 | ## Build Option List 74 | 75 | The following build options are supported in Intel® oneCCL Bindings for PyTorch*. 76 | 77 | | Build Option | Default Value | Description | 78 | | :---------------------------------- | :------------- | :-------------------------------------------------------------------------------------------------- | 79 | | COMPUTE_BACKEND | N/A | Set oneCCL `COMPUTE_BACKEND`, set to `dpcpp` and use DPC++ compiler to enable support for Intel XPU | 80 | | USE_SYSTEM_ONECCL | OFF | Use oneCCL library in system | 81 | | CCL_PACKAGE_NAME | oneccl-bind-pt | Set wheel name | 82 | | ONECCL_BINDINGS_FOR_PYTORCH_BACKEND | cpu | Set backend | 83 | | CCL_SHA_VERSION | False | Add git head sha version into wheel name | 84 | 85 | ## Launch Option List 86 | 87 | The following launch options are supported in Intel® oneCCL Bindings for PyTorch*. 88 | 89 | | Launch Option | Default Value | Description | 90 | | :--------------------------------------- | :------------ | :-------------------------------------------------------------------- | 91 | | ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE | 0 | Set verbose level in oneccl_bindings_for_pytorch | 92 | | ONECCL_BINDINGS_FOR_PYTORCH_ENV_WAIT_GDB | 0 | Set 1 to force the oneccl_bindings_for_pytorch wait for GDB attaching | 93 | | TORCH_LLM_ALLREDUCE | 0 | Set 1 to enable this prototype feature for better scale-up performance. This is a prototype feature to provide better scale-up performance by enabling optimized collective algorithms in oneCCL and asynchronous execution in torch-ccl. This feature requires XeLink enabled for cross-cards communication.| 94 | | CCL_BLOCKING_WAIT | 0 | Set 1 to enable this prototype feature, which is to control whether collectives execution on XPU is host blocking or non-blocking. | 95 | | CCL_SAME_STREAM | 0 | Set 1 to enable this prototype feature, which is to allow using a computation stream as communication stream to minimize overhead for streams synchronization. | 96 | 97 | ## Installation 98 | 99 | ### Install from Source 100 | 101 | 1. clone the `oneccl_bindings_for_pytorch`. 102 | 103 | ```bash 104 | git clone https://github.com/intel/torch-ccl.git && cd torch-ccl 105 | git checkout ccl_torch2.8.0+xpu 106 | git submodule sync 107 | git submodule update --init --recursive 108 | ``` 109 | 110 | 2. Install `oneccl_bindings_for_pytorch` 111 | 112 | ```bash 113 | # for CPU Backend Only 114 | python setup.py install 115 | # for XPU Backend: use DPC++ Compiler to enable support for Intel XPU 116 | # build with oneCCL from third party 117 | COMPUTE_BACKEND=dpcpp python setup.py install 118 | # build with oneCCL from basekit 119 | export INTELONEAPIROOT=${HOME}/intel/oneapi 120 | USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install 121 | ``` 122 | 123 | ### Install Prebuilt Wheel 124 | 125 | Wheel files are available for the following Python versions. Please always use the latest release to get started. 
126 | 127 | | Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 | Python 3.12 | Python 3.13 | Python 3.13t | 128 | | :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | :---------: | :---------: | :---------: | :----------: | 129 | | 2.8.0 | | | | √ | √ | √ | √ | √ | √ | 130 | | 2.7.0 | | | | √ | √ | √ | √ | √ | √ | 131 | | 2.6.0 | | | | √ | √ | √ | √ | √ | | 132 | | 2.5.1 | | | | √ | √ | √ | √ | | | 133 | | 2.3.100 | | | √ | √ | √ | √ | | | | 134 | | 2.1.400 | | | √ | √ | √ | √ | | | | 135 | | 2.1.300 | | | √ | √ | √ | √ | | | | 136 | | 2.1.200 | | | √ | √ | √ | √ | | | | 137 | | 2.1.100 | | | √ | √ | √ | √ | | | | 138 | | 2.0.100 | | | √ | √ | √ | √ | | | | 139 | | 1.13 | | √ | √ | √ | √ | | | | | 140 | | 1.12.100 | | √ | √ | √ | √ | | | | | 141 | | 1.12.0 | | √ | √ | √ | √ | | | | | 142 | | 1.11.0 | | √ | √ | √ | √ | | | | | 143 | | 1.10.0 | √ | √ | √ | √ | | | | | | 144 | 145 | ```bash 146 | python -m pip install oneccl_bind_pt==2.8.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ 147 | ``` 148 | 149 | **Note:** Please set a proxy or switch the URL to https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ if you encounter connection issues. 150 | 151 | ### Runtime Dynamic Linking 152 | 153 | - If oneccl_bindings_for_pytorch is built without oneCCL and uses the oneCCL library installed in the system, dynamically link oneCCL from the oneAPI basekit (recommended usage): 154 | 155 | ```bash 156 | source $basekit_root/ccl/latest/env/vars.sh 157 | ``` 158 | 159 | Note: Make sure you have installed [basekit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html#base-kit) when using Intel® oneCCL Bindings for PyTorch\* on Intel® GPUs. 160 | 161 | - If oneccl_bindings_for_pytorch is built with oneCCL from the third-party submodule or installed from a prebuilt wheel: 162 | Dynamically link the oneCCL and Intel MPI libraries: 163 | 164 | ```bash 165 | source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/setvars.sh 166 | ``` 167 | 168 | Dynamically link oneCCL only (not including Intel MPI): 169 | 170 | ```bash 171 | source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/vars.sh 172 | ``` 173 | 174 | ## Usage 175 | 176 | **Note:** Please `import torch` prior to `import oneccl_bindings_for_pytorch`. 177 | 178 | example.py 179 | 180 | ```python 181 | 182 | import torch 183 | import intel_extension_for_pytorch 184 | import oneccl_bindings_for_pytorch 185 | import torch.nn.parallel 186 | import torch.distributed as dist 187 | 188 | ... 189 | 190 | os.environ['MASTER_ADDR'] = '127.0.0.1' 191 | os.environ['MASTER_PORT'] = '29500' 192 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 193 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 194 | 195 | backend = 'ccl' 196 | dist.init_process_group(backend, ...) 197 | my_rank = dist.get_rank() 198 | my_size = dist.get_world_size() 199 | print("my rank = %d my size = %d" % (my_rank, my_size)) 200 | 201 | ... 202 | 203 | model = torch.nn.parallel.DistributedDataParallel(model, ...) 204 | 205 | ...
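# --- Illustrative sketch only (not part of the original example) ---
# A minimal training step, assuming `model`, `criterion`, `optimizer`, and a
# DataLoader `loader` are defined in the elided sections above. With the 'ccl'
# backend, DistributedDataParallel all-reduces gradients during backward().
for data, target in loader:
    optimizer.zero_grad()
    loss = criterion(model(data), target)
    loss.backward()   # gradient all-reduce runs through oneCCL here
    optimizer.step()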
206 | ``` 207 | 208 | (If oneccl_bindings_for_pytorch is built without oneCCL, use the oneCCL and, if needed, Intel MPI libraries installed in the system.) 209 | 210 | ```bash 211 | source $basekit_root/ccl/latest/env/vars.sh 212 | source $basekit_root/mpi/latest/env/vars.sh 213 | 214 | mpirun -n <N> -ppn <PPN> -f <hostfile> python example.py 215 | ``` 216 | 217 | ## Performance Debugging 218 | 219 | For debugging the performance of communication primitives, PyTorch's [Autograd profiler](https://pytorch.org/docs/stable/autograd.html#profiler) 220 | can be used to inspect the time spent inside oneCCL calls. 221 | 222 | Example: 223 | 224 | profiling.py 225 | 226 | ```python 227 | 228 | import torch.nn.parallel 229 | import torch.distributed as dist 230 | import oneccl_bindings_for_pytorch 231 | import os 232 | 233 | os.environ['MASTER_ADDR'] = '127.0.0.1' 234 | os.environ['MASTER_PORT'] = '29500' 235 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 236 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 237 | 238 | backend = 'ccl' 239 | dist.init_process_group(backend) 240 | my_rank = dist.get_rank() 241 | my_size = dist.get_world_size() 242 | print("my rank = %d my size = %d" % (my_rank, my_size)) 243 | 244 | x = torch.ones([2, 2]) 245 | y = torch.ones([4, 4]) 246 | with torch.autograd.profiler.profile(record_shapes=True) as prof: 247 | for _ in range(10): 248 | dist.all_reduce(x) 249 | dist.all_reduce(y) 250 | dist.barrier() 251 | print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cpu_time_total")) 252 | 253 | ``` 254 | 255 | ```bash 256 | mpirun -n 2 -l python profiling.py 257 | ``` 258 | 259 | ```bash 260 | [0] my rank = 0 my size = 2 261 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 262 | [0] Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes 263 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 264 | [0] oneccl_bindings_for_pytorch::allreduce 91.41% 297.900ms 91.41% 297.900ms 29.790ms 10 [[2, 2]] 265 | [0] oneccl_bindings_for_pytorch::wait::cpu::allreduce 8.24% 26.845ms 8.24% 26.845ms 2.684ms 10 [[2, 2], [2, 2]] 266 | [0] oneccl_bindings_for_pytorch::wait::cpu::allreduce 0.30% 973.651us 0.30% 973.651us 97.365us 10 [[4, 4], [4, 4]] 267 | [0] oneccl_bindings_for_pytorch::allreduce 0.06% 190.254us 0.06% 190.254us 19.025us 10 [[4, 4]] 268 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 269 | [0] Self CPU time total: 325.909ms 270 | [0] 271 | [1] my rank = 1 my size = 2 272 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 273 | [1] Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes 274 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 275 | [1] oneccl_bindings_for_pytorch::allreduce 96.03% 318.551ms 96.03% 318.551ms 31.855ms 10 [[2, 2]] 276 | [1] oneccl_bindings_for_pytorch::wait::cpu::allreduce 3.62% 12.019ms 3.62% 12.019ms 1.202ms 10 [[2, 2], [2, 2]] 277 | [1] oneccl_bindings_for_pytorch::allreduce 0.33% 1.082ms 0.33% 1.082ms 108.157us 10 [[4, 4]] 278 | [1] oneccl_bindings_for_pytorch::wait::cpu::allreduce
0.02% 56.505us 0.02% 56.505us 5.651us 10 [[4, 4], [4, 4]] 279 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 280 | [1] Self CPU time total: 331.708ms 281 | [1] 282 | 283 | ``` 284 | 285 | ## License 286 | 287 | [BSD License](https://github.com/intel/torch-ccl/blob/master/LICENSE) 288 | --------------------------------------------------------------------------------