├── tools
│   ├── __init__.py
│   └── setup
│       ├── __init__.py
│       ├── env.py
│       └── cmake.py
├── version.txt
├── requirements.txt
├── .gitmodules
├── tests
│   ├── DeepSpeed_test
│   │   ├── Example.csv
│   │   ├── testccl_cpu.py
│   │   ├── testccl_gpu_mpi.py
│   │   ├── testccl_gpu.py
│   │   └── DeepSpeed.csv
│   ├── test_barrier.py
│   ├── run_ds_llm.sh
│   ├── test_llm_allreduce.py
│   ├── ds_subgroup_allreduce.py
│   ├── test_p2p_crossnodes.py
│   ├── README.md
│   ├── ds_p2p_crossnodes.py
│   ├── test_allreduce.py
│   ├── ddp_allreduce.py
│   ├── test_c10d_p2p.py
│   └── test_fsdp.py
├── src
│   ├── gpu
│   │   ├── README.md
│   │   ├── Makefile
│   │   ├── CMakeLists.txt
│   │   ├── runtime.hpp
│   │   ├── sycl_misc.hpp
│   │   ├── ze_exception.hpp
│   │   └── allreduce.cpp
│   ├── env.h
│   ├── test
│   │   ├── remotesync
│   │   │   ├── test.sh
│   │   │   ├── Makefile
│   │   │   ├── sycl_misc.hpp
│   │   │   ├── ze_exception.hpp
│   │   │   └── simple_test.cpp
│   │   ├── segfault
│   │   │   ├── test.sh
│   │   │   ├── Makefile
│   │   │   ├── sycl_misc.hpp
│   │   │   ├── ze_exception.hpp
│   │   │   └── simple_test.cpp
│   │   └── writeremote
│   │       ├── test.sh
│   │       ├── Makefile
│   │       ├── sycl_misc.hpp
│   │       ├── ze_exception.hpp
│   │       └── simple_test.cpp
│   ├── env.cpp
│   ├── CMakeLists.txt
│   ├── ccl_comm_collector.cpp
│   ├── ccl_comm_collector.h
│   ├── utils.cpp
│   └── ProcessGroupCCL.hpp
├── SECURITY.md
├── cmake
│   └── Modules
│       └── FindoneCCL.cmake
├── demo
│   ├── README.md
│   └── demo.py
├── oneccl_bindings_for_pytorch
│   ├── __init__.py
│   └── csrc
│       ├── _C.cpp
│       ├── init.h
│       └── init.cpp
├── LICENSE
├── .gitignore
├── CMakeLists.txt
├── third-party-programs.txt
├── setup.py
└── README.md
/tools/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/tools/setup/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/version.txt: -------------------------------------------------------------------------------- 1 | 2.8.0+xpu 2 | --------------------------------------------------------------------------------
/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.10.0 2 | setuptools 3 | --------------------------------------------------------------------------------
/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/oneCCL"] 2 | path = third_party/oneCCL 3 | url = https://github.com/uxlfoundation/oneCCL.git 4 | --------------------------------------------------------------------------------
/tests/DeepSpeed_test/Example.csv: -------------------------------------------------------------------------------- 1 | allreduce,1,-1 2 | broadcast,3072,0 3 | allreduce,1,-1 4 | broadcast,3,0 5 | broadcast,5,0 6 | reduce,1024,0 7 | reduce,11264,0 8 | allgather,1024,-1 9 | --------------------------------------------------------------------------------
/src/gpu/README.md: -------------------------------------------------------------------------------- 1 | Dependencies: 2 | 1. MPI 3 | 2. Level-Zero 4 | 3. SYCL-enabled compiler 5 | 6 | Build: 7 | make 8 | 9 | Run: 10 | ```mpirun -np allreduce -c 1024 -t ``` 11 | --------------------------------------------------------------------------------
/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation.
3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /src/env.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | enum ONECCL_BINDINGS_FOR_PYTORCH_ENV { 4 | ENV_VERBOSE = 0, 5 | ENV_WAIT_GDB 6 | }; 7 | 8 | int oneccl_bindings_for_pytorch_env(int env); 9 | 10 | static inline int oneccl_bindings_for_pytorch_verbose() { 11 | return oneccl_bindings_for_pytorch_env(ENV_VERBOSE); 12 | } 13 | 14 | static inline int oneccl_bindings_for_pytorch_wait_gdb() { 15 | return oneccl_bindings_for_pytorch_env(ENV_WAIT_GDB); 16 | } -------------------------------------------------------------------------------- /src/test/remotesync/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/test/segfault/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/test/writeremote/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export ONEAPI_DEVICE_SELECTOR=level_zero:gpu 3 | mpirun --prepend-rank -n 2 -ppn 2 ./simple_test $1 4 | #mpirun --prepend-rank -n 8 -ppn 8 ./simple_test 5 | #mpirun --prepend-rank -n 2 -ppn 2 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test good 6 | #mpirun --prepend-rank -n 8 -ppn 8 -outfile-pattern log/out.%r.log -errfile-pattern log/err.%r.log ze_tracer -c ./simple_test 7 | -------------------------------------------------------------------------------- /src/gpu/Makefile: -------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : allreduce 15 | 16 | clean: 17 | rm -f allreduce 18 | -------------------------------------------------------------------------------- /src/test/remotesync/Makefile: 
-------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : simple_test 15 | 16 | clean: 17 | rm -f simple_test 18 | -------------------------------------------------------------------------------- /src/test/segfault/Makefile: -------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : simple_test 15 | 16 | clean: 17 | rm -f simple_test 18 | -------------------------------------------------------------------------------- /src/test/writeremote/Makefile: -------------------------------------------------------------------------------- 1 | CC=icx 2 | CXX=icpx 3 | 4 | OPT= 5 | 6 | SYCLFLAGS=-fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -internal_options -ze-intel-has-buffer-offset-arg -internal_options -cl-intel-greater-than-4GB-buffer-required" 7 | CCL_ROOT=../ccl/release/_install 8 | 9 | INCLUDES=-I$(CCL_ROOT)/include 10 | LIBRARIES=-L$(CCL_ROOT)/lib -lmpi -lze_loader 11 | 12 | CXXFLAGS=-std=c++17 $(SYCLFLAGS) $(OPT) -Wall $(INCLUDES) $(LIBRARIES) 13 | 14 | all : simple_test 15 | 16 | clean: 17 | rm -f simple_test 18 | -------------------------------------------------------------------------------- /tools/setup/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import platform 4 | 5 | 6 | IS_LINUX = (platform.system() == 'Linux') 7 | 8 | BUILD_DIR = 'build' 9 | 10 | 11 | def get_compiler(runtime): 12 | if runtime == 'dpcpp': 13 | c_compiler = 'icx' 14 | cpp_compiler = 'icpx' 15 | else: 16 | c_compiler = os.environ.get('CC', 'cc') 17 | cpp_compiler = os.environ.get('CXX', 'c++') 18 | 19 | cc = shutil.which(c_compiler) 20 | cpp = shutil.which(cpp_compiler) 21 | if cpp is None or cc is None: 22 | raise RuntimeError("couldn't find the compiler '{}' or '{}'".format(c_compiler, cpp_compiler)) 23 | return cc, cpp 24 | 25 | 26 | def check_env_flag(name, env=os.environ, default=''): 27 | return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] 28 | -------------------------------------------------------------------------------- /tests/test_barrier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--device', '-dev', type=str, default='cpu', help='Device type to use: cpu, xpu') 10 | args = parser.parse_args() 11 | 12 | os.environ['RANK'] = 
str(os.environ.get('PMI_RANK', 0)) 13 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 14 | os.environ['MASTER_ADDR'] = '127.0.0.1' 15 | os.environ['MASTER_PORT'] = '29500' 16 | 17 | dist.init_process_group("ccl") 18 | rank = dist.get_rank() 19 | size = dist.get_world_size() 20 | 21 | if args.device == 'xpu': 22 | device = "xpu:{}".format(rank) 23 | else: 24 | device = 'cpu' 25 | 26 | print("Barrier using device: ", args.device) 27 | dist.barrier() 28 | print("Finish") 29 | -------------------------------------------------------------------------------- /tests/run_ds_llm.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | # run the ds_subgroup_allreduce.py 3 | # for OAM (sub_group=2/4) 4 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=2 5 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=4 6 | # for Aurora System(TP=2/3/4/6) 7 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=2 8 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=3 9 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=4 10 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=6 11 | 12 | # should run the ds_p2p_crossnodes.py on 3 nodes 13 | # -host is the name for this 3 nodes 14 | # --dist_url is the IP on your node, you can use (hostname -I) to get. 15 | mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 python -u ds_p2p_crossnodes.py --dist_url 10.0.1.141 --world_size 36 16 | -------------------------------------------------------------------------------- /src/gpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CCL_DPCPP_SRCS dpcpp_ccl.cpp ze_exception.hpp allreduce.h sycl_misc.hpp runtime.hpp cxxopts.hpp) 2 | 3 | set_source_files_properties(${CCL_DPCPP_SRCS} PROPERTIES COMPILE_DEFINITIONS "USE_DPCPP;__STRICT_ANSI__") 4 | set_source_files_properties(${CCL_DPCPP_SRCS} PROPERTIES COMPILE_FLAGS -fsycl) 5 | 6 | add_library(oneccl_bindings_for_pytorch_xpu SHARED ${CCL_DPCPP_SRCS}) 7 | 8 | target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC ${DEPENDS_LIB}) 9 | target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC oneccl_bindings_for_pytorch) 10 | 11 | foreach(RPATH ${CMAKE_INSTALL_RPATH}) 12 | set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}") 13 | endforeach() 14 | set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,--disable-new-dtags") 15 | 16 | install(TARGETS oneccl_bindings_for_pytorch_xpu LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") 17 | 18 | -------------------------------------------------------------------------------- /src/env.cpp: -------------------------------------------------------------------------------- 1 | #include "env.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * All available launch options for ONECCL_BINDINGS_FOR_PYTORCH 7 | * ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE: Default = 0, Set verbose level in ONECCL_BINDINGS_FOR_PYTORCH 8 | * ONECCL_BINDINGS_FOR_PYTORCH_ENV_WAIT_GDB: Default = 0, Set 1 to force the oneccl_bindings_for_pytorch wait for GDB attaching 9 | */ 10 | 11 | #define ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(var) \ 12 | int var = [&]() -> int { \ 13 | if (auto env = std::getenv("ONECCL_BINDINGS_FOR_PYTORCH_" #var)) \ 14 | return std::stoi(env, 0, 10); \ 15 | return 0; \ 16 | } () 17 | 18 | int oneccl_bindings_for_pytorch_env(int 
env_type) { 19 | 20 | static struct { 21 | ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(ENV_VERBOSE); 22 | ONECCL_BINDINGS_FOR_PYTORCH_ENV_TYPE_DEF(ENV_WAIT_GDB); 23 | } env; 24 | 25 | switch (env_type) { 26 | case ENV_VERBOSE: 27 | return env.ENV_VERBOSE; 28 | case ENV_WAIT_GDB: 29 | return env.ENV_WAIT_GDB; 30 | default: 31 | return 0; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /cmake/Modules/FindoneCCL.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find oneCCL 2 | # 3 | # The following are set after configuration is done: 4 | # ONECCL_FOUND : set to true if oneCCL is found. 5 | # ONECCL_INCLUDE_DIRS : path to oneCCL include dir. 6 | # ONECCL_LIBRARIES : list of libraries for oneCCL 7 | # 8 | # and the following imported targets: 9 | # 10 | # oneCCL 11 | 12 | IF (NOT ONECCL_FOUND) 13 | SET(ONECCL_FOUND OFF) 14 | SET(ONECCL_LIBRARIES) 15 | SET(ONECCL_INCLUDE_DIRS) 16 | 17 | SET(ONECCL_ROOT "${PROJECT_SOURCE_DIR}/third_party/oneCCL") 18 | 19 | IF(BUILD_NO_ONECCL_PACKAGE) 20 | ADD_SUBDIRECTORY(${ONECCL_ROOT} oneCCL EXCLUDE_FROM_ALL) 21 | ELSE() 22 | ADD_SUBDIRECTORY(${ONECCL_ROOT}) 23 | ENDIF() 24 | 25 | IF(NOT TARGET ccl) 26 | MESSAGE(FATAL_ERROR "Failed to find oneCCL target") 27 | ENDIF() 28 | add_library(oneCCL ALIAS ccl) 29 | 30 | GET_TARGET_PROPERTY(INCLUDE_DIRS oneCCL INCLUDE_DIRECTORIES) 31 | SET(ONECCL_INCLUDE_DIRS ${INCLUDE_DIRS}) 32 | SET(ONECCL_LIBRARIES oneCCL) 33 | 34 | find_package_handle_standard_args(oneCCL FOUND_VAR ONECCL_FOUND REQUIRED_VARS ONECCL_LIBRARIES ONECCL_INCLUDE_DIRS) 35 | 36 | set(MPI_INCLUDE_DIR "${ONECCL_ROOT}/deps/mpi/include/") 37 | set(MPI_LIB_DIR "${ONECCL_ROOT}/deps/mpi/lib/") 38 | 39 | ENDIF(NOT ONECCL_FOUND) 40 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Simple Demo for Intel® oneCCL Bindings for PyTorch* 2 | 3 | This simple demo show case the functionality for collective communication primitives in Intel® oneCCL Bindings for PyTorch*. 4 | 5 | ## Single Node Run 6 | To run the simple demo on a single node with 2 instances, run: 7 | 8 | ```bash 9 | mpirun -n 2 -l python demo.py 10 | 11 | ``` 12 | The demo could be also run on XPU with " --device xpu " argument. 13 | 14 | ```bash 15 | mpirun -n 2 -l python demo.py --device xpu 16 | ``` 17 | 18 | ## Multiple Nodes Run 19 | To run the simple demo on multiple nodes, please follow below instructions: 20 | 21 | ### Ethernet 22 | 1. Identify the network interface name for collective communication. ex: eth0 23 | 2. Identify the IPs of all nodes. ex: 10.0.0.1,10.0.0.2 24 | 3. Identify the master node IP. ex: 10.0.0.1 25 | 4. Set the value of np for the total number of instances. ex: 2 26 | 5. Set the value of ppn for the number of instance per node. ex: 1 27 | 28 | Here is a run command example for cpu according to above steps: 29 | 30 | ```bash 31 | FI_TCP_IFACE=eth0 I_MPI_OFI_PROVIDER=tcp I_MPI_HYDRA_IFACE=eth0 I_MPI_DEBUG=121 mpirun -host 10.0.0.1,10.0.0.2 -np 2 -ppn 1 --map-by node python demo.py --device cpu --dist_url 10.0.0.1 --dist_port 29500 32 | ``` 33 | The demo could be also run on XPU by changing " --device cpu " to " --device xpu " argument. 
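For orientation, here is a minimal sketch of how the `--dist_url`/`--dist_port` arguments above are typically consumed on each rank. The pattern mirrors the test scripts in this repository (e.g. `ds_subgroup_allreduce.py`); `demo.py` itself may differ in detail, and the address, port, and tensor size below are placeholder values, not taken from the demo:

```python
# Hedged sketch, not the literal demo.py: initialize the "ccl" backend from an
# MPI launch and run one allreduce. Address/port are example values.
import os
import torch
import torch.distributed as dist
import oneccl_bindings_for_pytorch  # noqa: F401  (registers the "ccl" backend)

dist_url, dist_port = "10.0.0.1", "29500"             # master node IP / port
init_method = "tcp://" + dist_url + ":" + dist_port

rank = int(os.environ.get("PMI_RANK", 0))             # set by mpirun
world_size = int(os.environ.get("PMI_SIZE", 1))
dist.init_process_group(backend="ccl", init_method=init_method,
                        world_size=world_size, rank=rank)

data = torch.ones(4)                                  # move to f"xpu:{rank}" for --device xpu
dist.all_reduce(data)                                 # every element becomes world_size
print(rank, data)
```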
34 | 35 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | import torch 5 | 6 | 7 | cwd = os.path.dirname(os.path.abspath(__file__)) 8 | if not os.path.exists(os.path.join(cwd, "version.py")): 9 | raise RuntimeError("oneccl_bindings_for_pytorch is not installed!") 10 | 11 | 12 | def set_env_default(env, key, value): 13 | new_value = env.get(key, value) 14 | env[key] = new_value 15 | 16 | from .version import __version__, git_version 17 | from . import _C as ccl_lib 18 | 19 | if hasattr(torch, 'xpu') and torch.xpu._is_compiled(): 20 | try: 21 | # load the CCL/XPU library 22 | import ctypes 23 | my_c_library = ctypes.cdll.LoadLibrary(os.path.join(cwd, "lib/liboneccl_bindings_for_pytorch_xpu.so")) 24 | except OSError as e: 25 | warnings.warn(f"Warning: Cannot load xpu CCL. CCL doesn't work for XPU device due to {e}") 26 | 27 | __all__ = [] 28 | __all__ += [name for name in dir(ccl_lib) 29 | if name[0] != '_' and 30 | not name.endswith('Base')] 31 | 32 | 33 | def is_available(tensors): 34 | devices = set() 35 | for tensor in tensors: 36 | if not tensor.is_contiguous(): 37 | return False 38 | device = tensor.get_device() 39 | if device in devices: 40 | return False 41 | devices.add(device) 42 | 43 | return True 44 | 45 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CCL_SRCS ProcessGroupCCL.cpp dispatch_stub.cpp utils.cpp ccl_comm_collector.cpp env.cpp) 2 | set(CCL_CPU_SRCS cpu/cpu_ccl.cpp) 3 | add_library(oneccl_bindings_for_pytorch SHARED ${CCL_SRCS} ${CCL_CPU_SRCS}) 4 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES OUTPUT_NAME ${LIB_NAME}) 5 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES POSITION_INDEPENDENT_CODE ON) 6 | 7 | target_compile_options(oneccl_bindings_for_pytorch PUBLIC -Wall 8 | -Wno-sign-compare 9 | -Wno-unused-function) 10 | 11 | if(COMPUTE_BACKEND STREQUAL "dpcpp") 12 | add_subdirectory(./gpu) 13 | add_definitions (-DUSE_GPU) 14 | target_compile_options(oneccl_bindings_for_pytorch PUBLIC -fsycl) 15 | target_link_options(oneccl_bindings_for_pytorch PUBLIC -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc -options -vc-codegen") 16 | endif() 17 | 18 | target_include_directories(oneccl_bindings_for_pytorch PUBLIC ./) 19 | 20 | target_link_libraries(oneccl_bindings_for_pytorch PUBLIC ${DEPENDS_LIB}) 21 | 22 | foreach(RPATH ${CMAKE_INSTALL_RPATH}) 23 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}") 24 | endforeach() 25 | set_target_properties(oneccl_bindings_for_pytorch PROPERTIES LINK_FLAGS "-Wl,--disable-new-dtags") 26 | 27 | install(TARGETS oneccl_bindings_for_pytorch LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib") 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020-2021, Intel Corporation 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the Intel Corporation nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /tests/test_llm_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 8 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 9 | os.environ['MASTER_ADDR'] = '127.0.0.1' 10 | os.environ['MASTER_PORT'] = '29500' 11 | dist.init_process_group("ccl") 12 | rank = dist.get_rank() 13 | size = dist.get_world_size() 14 | 15 | device = "xpu:{}".format(rank) 16 | llm_shapes = [ 17 | # GPT-J 6B 18 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 19 | # Llama 7B 20 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 21 | # Llama 13B 22 | (1, 32, 5120), (1, 1024, 5120), (1, 4, 5120), (1, 1, 5120), 23 | # Llama2 7B 24 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 25 | # Llama2 13B 26 | (1, 32, 5120), (1, 1024, 5120), (1, 4, 5120), (1, 1, 5120), 27 | # Llama2 70B 28 | (1, 32, 8192), (1, 1024, 8192), (1, 1, 8192), (1, 4, 8192), 29 | # OPT 6.7B 30 | (1, 32, 4096), (1, 1024, 4096), (1, 1, 4096), (1, 4, 4096), 31 | # OPT 30B 32 | (1, 32, 7168), (1, 1, 7168), (1, 1024, 7168), (1, 4, 7168), 33 | # Bloom 7B 34 | (1, 33, 4096), (1, 1, 4096), (1, 4, 4096), (1, 1028, 4096), 35 | # Bloom 176B 36 | (1, 4, 14336), (1, 1028, 14336), (1, 33, 14336), (1, 1, 14336) 37 | ] 38 | 39 | os.environ['TORCH_LLM_ALLREDUCE_DEBUG'] = '1' 40 | for shape in llm_shapes: 41 | data = torch.rand(shape, dtype=torch.float16).to(device) 42 | # Expected value is identical to input for average allreduce. 43 | expect_result = data 44 | # Allreduce is an inplace op, data will represent output. 
45 | dist.all_reduce(data) 46 | assert torch.allclose(data, expect_result) 47 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/_C.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "init.h" 33 | 34 | PYBIND11_MODULE(_C, m) { 35 | torch_ccl_python_init(m); 36 | } -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/init.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #pragma once 33 | 34 | #include 35 | 36 | #define TORCH_CCL_CPP_API __attribute__ ((visibility ("default"))) 37 | 38 | void torch_ccl_python_init(pybind11::module &m); 39 | -------------------------------------------------------------------------------- /src/gpu/runtime.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | sycl::queue currentQueue(int ndev, int nsub) { 32 | switch(ndev) { 33 | case 0: 34 | if (nsub == 0) 35 | return getQueue<0,0>(); 36 | else 37 | return getQueue<0,1>(); 38 | break; 39 | case 1: 40 | if (nsub == 0) 41 | return getQueue<1,0>(); 42 | else 43 | return getQueue<1,1>(); 44 | break; 45 | } 46 | throw std::exception(); 47 | } 48 | 49 | sycl::device currentSubDevice(int ndev, int nsub) { 50 | switch(ndev) { 51 | case 0: 52 | if (nsub == 0) 53 | return getSubDevice<0,0>(); 54 | else 55 | return getSubDevice<0,1>(); 56 | break; 57 | case 1: 58 | if (nsub == 0) 59 | return getSubDevice<1,0>(); 60 | else 61 | return getSubDevice<1,1>(); 62 | break; 63 | } 64 | throw std::exception(); 65 | } 66 | 67 | static uint32_t g_dev_num = 1; 68 | static uint32_t g_part_num = 0; 69 | 70 | sycl::device currentSubDevice() { 71 | return currentSubDevice(g_dev_num, g_part_num); 72 | } 73 | 74 | sycl::queue currentQueue() { 75 | return currentQueue(g_dev_num, g_part_num); 76 | } 77 | -------------------------------------------------------------------------------- /src/gpu/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return 
getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/remotesync/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/segfault/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | 
sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | -------------------------------------------------------------------------------- /src/test/writeremote/sycl_misc.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | sycl::device getSubDevice() { 6 | static auto devs = sycl::device::get_devices(sycl::info::device_type::gpu); 7 | auto dev = devs[ndev]; 8 | try { 9 | static auto subs = dev.template create_sub_devices< 10 | sycl::info::partition_property::partition_by_affinity_domain>( 11 | sycl::info::partition_affinity_domain::numa); 12 | 13 | return subs[nsub]; 14 | } catch (sycl::exception &e) { 15 | std::cout< 21 | sycl::queue getQueue() { 22 | static sycl::queue q( 23 | getSubDevice(), 24 | sycl::property_list { 25 | sycl::property::queue::enable_profiling(), 26 | sycl::property::queue::in_order() 27 | }); 28 | return q; 29 | } 30 | 31 | #define queue_case(x) \ 32 | case x: \ 33 | if (nsub == 0) \ 34 | return getQueue(); \ 35 | else \ 36 | return getQueue(); 37 | 38 | sycl::queue currentQueue(int ndev, int nsub) { 39 | switch(ndev) { 40 | queue_case(0); 41 | queue_case(1); 42 | queue_case(2); 43 | queue_case(3); 44 | queue_case(4); 45 | queue_case(5); 46 | queue_case(6); 47 | queue_case(7); 48 | } 49 | throw std::exception(); 50 | } 51 | 52 | #define subdev_case(x) \ 53 | case x: \ 54 | if (nsub == 0) \ 55 | return getSubDevice(); \ 56 | else \ 57 | return getSubDevice(); 58 | 59 | sycl::device currentSubDevice(int ndev, int nsub) { 60 | switch(ndev) { 61 | subdev_case(0); 62 | subdev_case(1); 63 | subdev_case(2); 64 | subdev_case(3); 65 | subdev_case(4); 66 | subdev_case(5); 67 | subdev_case(6); 68 | subdev_case(7); 69 | } 70 | throw std::exception(); 71 | } 72 | 73 | static uint32_t g_dev_num = 1; 74 | static uint32_t g_part_num = 0; 75 | 76 | sycl::device currentSubDevice() { 77 | return currentSubDevice(g_dev_num, g_part_num); 78 | } 79 | 80 | sycl::queue currentQueue() { 81 | return currentQueue(g_dev_num, g_part_num); 82 | } 83 | 
-------------------------------------------------------------------------------- /src/ccl_comm_collector.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "ccl_comm_collector.h" 5 | #include "utils.h" 6 | 7 | 8 | namespace oneccl_bindings_for_pytorch { 9 | 10 | ccl::shared_ptr_class CCLCommCollector::get_kvs(int rank, c10d::Store& store, 11 | bool singleP2POp = false, const std::string& p2pKey = "", int p2pRank = 0) { 12 | 13 | std::string storeKey; 14 | 15 | if (!singleP2POp) { 16 | storeKey = std::to_string(ccl_comms.size()); 17 | } else { 18 | storeKey = p2pKey; 19 | } 20 | // Rank 0 broadcast the bootstrap network information to other ranks 21 | if (rank == 0 || (singleP2POp && p2pRank == 0)) { 22 | call_with_lock(c10d::ProcessGroupCCL::globalMutex, [&]() { 23 | kvs = ccl::create_main_kvs(); 24 | }); 25 | ccl::kvs::address_type main_addr = kvs->get_address(); 26 | auto ccl_kvs_addr = std::vector(main_addr.begin(), main_addr.end()); 27 | store.set(storeKey, ccl_kvs_addr); 28 | } 29 | else { 30 | auto ccl_kvs_addr = store.get(storeKey); 31 | if (ccl_kvs_addr.size() != ccl::kvs::address_max_size) { 32 | throw std::runtime_error( 33 | "Unexpected ccl kvs addr from the store\n"); 34 | } 35 | ccl::kvs::address_type main_addr; 36 | std::copy_n(std::make_move_iterator(ccl_kvs_addr.begin()), 37 | ccl::kvs::address_max_size, 38 | main_addr.begin()); 39 | call_with_lock(c10d::ProcessGroupCCL::globalMutex, [&]() { 40 | kvs = ccl::create_kvs(main_addr); 41 | }); 42 | } 43 | 44 | return kvs; 45 | } 46 | 47 | std::shared_ptr CCLCommCollector::get_comms(const std::string& devices_key) { 48 | if (ccl_comms.find(devices_key) != ccl_comms.end()) { 49 | // Reuse the cached communicator if there is one. 
50 | return ccl_comms[devices_key]; 51 | } 52 | return {nullptr}; 53 | } 54 | 55 | void CCLCommCollector::add_comms(const std::string& devices_key, 56 | std::shared_ptr comms) { 57 | if (ccl_comms.find(devices_key) != ccl_comms.end()) { 58 | // Replace the cached comms 59 | ccl_comms[devices_key] = comms; 60 | } else { 61 | ccl_comms.emplace(devices_key, comms); 62 | } 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /tests/ds_subgroup_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | import time 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 11 | parser.add_argument('--dist_port', default='29500', type=str, help='url port used to set up distributed training') 12 | parser.add_argument('--sub_group', default=4, type=int, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | if 'PMI_RANK' in os.environ.keys() and 'PMI_SIZE' in os.environ.keys(): 16 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 17 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) # mpich set 18 | elif 'PMIX_RANK' in os.environ.keys() and 'PALS_LOCAL_SIZE' in os.environ.keys(): 19 | os.environ['RANK'] = os.environ.get('PMIX_RANK') 20 | os.environ['WORLD_SIZE'] = str(os.environ.get('PALS_LOCAL_SIZE', -1)) 21 | 22 | os.environ['MASTER_ADDR'] = '127.0.0.1' 23 | os.environ['MASTER_PORT'] = '29500' 24 | 25 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 26 | dist.init_process_group(backend='ccl', init_method=init_method, 27 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 28 | 29 | rank = dist.get_rank() 30 | size = dist.get_world_size() 31 | device = "xpu:{}".format(rank) 32 | print('world_size:{}, global rank:{}'.format(size, rank)) 33 | 34 | shape = int(2048) 35 | warm_shape = int(1) 36 | warm = torch.ones(warm_shape).bfloat16().to(device) 37 | 38 | input_shape = shape 39 | input = torch.ones(input_shape).bfloat16().to(device) 40 | 41 | #warm_up 42 | dist.all_reduce(warm) 43 | 44 | #sub_group=1(TP=12) 45 | group1 = dist.new_group([0]) 46 | if rank ==0: 47 | dist.all_reduce(input, group=group1) 48 | 49 | group_size = [[i+(size // args.sub_group)*j for j in range(args.sub_group)] for i in range(size // args.sub_group)] 50 | sub_group = [] 51 | 52 | #construct sub group 53 | for i in range(len(group_size)): 54 | sub_group.append(dist.new_group(group_size[i])) 55 | 56 | for i in range(len(group_size)): 57 | if dist.get_rank() in group_size[i]: 58 | dist.all_reduce(input, group=sub_group[i]) 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # setup.py uses the list of patterns in this file to decide 2 | # what to delete when clean up 3 | 4 | .coverage 5 | .hypothesis 6 | .mypy_cache 7 | */*.pyc 8 | */*.so* 9 | */**/__pycache__ 10 | */**/*.dylib* 11 | */**/*.pyc 12 | */**/*.pyd 13 | */**/*.so* 14 | */**/**/*.pyc 15 | */**/**/**/*.pyc 16 | */**/**/**/**/*.pyc 17 | 18 | oneccl_bindings_for_pytorch/include/ 19 | oneccl_bindings_for_pytorch/lib/ 20 | oneccl_bindings_for_pytorch/bin/ 21 | oneccl_bindings_for_pytorch/etc/ 22 | 
oneccl_bindings_for_pytorch/env/ 23 | oneccl_bindings_for_pytorch/examples/ 24 | oneccl_bindings_for_pytorch/licensing/ 25 | oneccl_bindings_for_pytorch/modulefiles/ 26 | oneccl_bindings_for_pytorch/version.py 27 | 28 | ## General 29 | 30 | # Debug Shell Script 31 | *.sh 32 | 33 | # Compiled Object files 34 | *.slo 35 | *.lo 36 | *.o 37 | *.cuo 38 | *.obj 39 | 40 | # Compiled Dynamic libraries 41 | *.so 42 | *.dylib 43 | *.dll 44 | 45 | # Compiled Static libraries 46 | *.lai 47 | *.la 48 | *.a 49 | *.lib 50 | 51 | # Compiled protocol buffers 52 | *.pb.h 53 | *.pb.cc 54 | *_pb2.py 55 | 56 | # Compiled python 57 | *.pyc 58 | *.pyd 59 | 60 | # Compiled MATLAB 61 | *.mex* 62 | 63 | # IPython notebook checkpoints 64 | .ipynb_checkpoints 65 | 66 | # Editor temporaries 67 | *.swn 68 | *.swo 69 | *.swp 70 | *~ 71 | 72 | # Sublime Text settings 73 | *.sublime-workspace 74 | *.sublime-project 75 | 76 | # Eclipse Project settings 77 | *.*project 78 | .settings 79 | 80 | # Files generated by CLion 81 | cmake-build-debug 82 | 83 | # QtCreator files 84 | *.user 85 | 86 | # OSX dir files 87 | .DS_Store 88 | 89 | # GDB history 90 | .gdb_history 91 | 92 | ## Caffe2 93 | 94 | # build, distribute, and bins (+ python proto bindings) 95 | build 96 | /build_* 97 | .build_debug/* 98 | .build_release/* 99 | distribute/* 100 | dist/ 101 | *.testbin 102 | *.bin 103 | cmake_build 104 | .cmake_build 105 | gen 106 | .setuptools-cmake-build 107 | 108 | # setup.py intermediates 109 | .eggs 110 | oneccl_bindings_for_pytorch.egg-info 111 | oneccl_bind_pt.egg-info 112 | 113 | # Files generated by ctags 114 | CTAGS 115 | tags 116 | TAGS 117 | 118 | # BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.) 119 | # 120 | # Below files are not deleted by "setup.py clean". 121 | 122 | # Visual Studio Code files 123 | .vscode 124 | .vs 125 | .idea 126 | 127 | # Files generated when a patch is rejected 128 | *.orig 129 | *.rej 130 | -------------------------------------------------------------------------------- /src/gpu/ze_exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | 
{ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not 
signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 4 | #include 5 | #include 6 | #include 7 | 8 | // Mapping from status to human readable string 9 | class zeException : std::exception { 10 | const char * zeResultToString(ze_result_t status) const { 11 | static const std::unordered_map zeResultToStringMap{ 12 | {ZE_RESULT_SUCCESS, "[Core] success"}, 13 | {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, 14 | {ZE_RESULT_ERROR_DEVICE_LOST, "[Core] device hung, reset, was removed, or driver update occurred"}, 15 | {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, "[Core] insufficient host memory to satisfy call"}, 16 | {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, "[Core] insufficient device memory to satisfy call"}, 17 | {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, "[Core] error occurred when building module, see build log for details"}, 18 | {ZE_RESULT_ERROR_UNINITIALIZED, "[Validation] driver is not initialized"}, 19 | {ZE_RESULT_ERROR_INVALID_NULL_POINTER, "[Validation] pointer argument may not be nullptr"}, 20 | {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, "[Validation] object pointed to by handle still in-use by device"}, 21 | {ZE_RESULT_ERROR_INVALID_ENUMERATION, "[Validation] enumerator argument is not valid"}, 22 | {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, 23 | {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, "[Validation] size argument is not supported by the device"}, 24 | {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, "[Validation] alignment argument is not supported by the device"}, 25 | {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, "[Validation] handle argument is not valid"}, 26 | {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, "[Validation] generic error code for unsupported 
features"}, 27 | {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, "[Validation] native binary is not supported by the device"}, 28 | }; 29 | auto it = zeResultToStringMap.find(status); 30 | if (it != zeResultToStringMap.end()) 31 | return it->second; 32 | else 33 | return "Unknown Reason"; 34 | } 35 | 36 | public: 37 | zeException(ze_result_t ret) : result_(ret) {} 38 | 39 | ze_result_t result_; 40 | 41 | const char* what() const noexcept override { 42 | return zeResultToString(result_); 43 | } 44 | }; 45 | 46 | #define zeCheck(x) \ 47 | if (x != ZE_RESULT_SUCCESS) { \ 48 | auto e = zeException(x); \ 49 | std::cout<<"Throw "< 0: 34 | os.environ['RANK'] = str(mpi_rank) 35 | os.environ['WORLD_SIZE'] = str(mpi_world_size) 36 | else: 37 | # set the default rank and world size to 0 and 1 38 | os.environ['RANK'] = str(os.environ.get('RANK', 0)) 39 | os.environ['WORLD_SIZE'] = str(os.environ.get('WORLD_SIZE', 1)) 40 | os.environ['MASTER_ADDR'] = '127.0.0.1' # your master address 41 | os.environ['MASTER_PORT'] = '29500' # your master port 42 | rank = int(os.environ.get('PMI_RANK', -1)) # global rank 43 | world_size = int(os.environ.get("WORLD_SIZE", -1)) 44 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 45 | 46 | # Initialize the process group with ccl backend 47 | dist.init_process_group(backend='ccl', init_method=init_method, world_size=world_size, rank=rank) 48 | 49 | local_rank = os.environ['MPI_LOCALRANKID'] 50 | if args.device == 'xpu': 51 | device = "xpu:{}".format(local_rank) 52 | else: 53 | device = 'cpu' 54 | 55 | model = Model().to(device) 56 | if dist.get_world_size() > 1: 57 | model = DDP(model, device_ids=[device] if (device != 'cpu') else None) 58 | 59 | optimizer = torch.optim.SGD(model.parameters(), lr=0.001) 60 | loss_fn = nn.MSELoss().to(device) 61 | for i in range(3): 62 | print("Runing Iteration: {} on device {}".format(i, device)) 63 | input = torch.randn(2, 4).to(device) 64 | labels = torch.randn(2, 5).to(device) 65 | # forward 66 | print("Runing forward: {} on device {}".format(i, device)) 67 | res = model(input) 68 | # loss 69 | print("Runing loss: {} on device {}".format(i, device)) 70 | L = loss_fn(res, labels) 71 | # backward 72 | print("Runing backward: {} on device {}".format(i, device)) 73 | with torch.autograd.profiler_legacy.profile(enabled=True) as prof: 74 | L.backward() 75 | #print(prof) 76 | # update 77 | print("Runing optim: {} on device {}".format(i, device)) 78 | optimizer.step() 79 | print("Finish") -------------------------------------------------------------------------------- /tests/test_p2p_crossnodes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--world_size', default=-1, type=int, help='number of gpu for distributed training') 11 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to set up distributed training') 12 | parser.add_argument('--dist_port', default='29800', type=str, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | os.environ['RANK'] = str(os.environ.get('PMIX_RANK',0)) 16 | os.environ['WORLD_SIZE'] = str(args.world_size) 17 | os.environ['MASTER_ADDR'] = '127.0.0.1' 18 | os.environ['MASTER_PORT'] = '29500' 19 | 20 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 21 | 
dist.init_process_group(backend='ccl', init_method=init_method, 22 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 23 | 24 | rank = dist.get_rank() 25 | size = dist.get_world_size() 26 | local_rank = os.environ['PALS_LOCAL_RANKID'] 27 | device = "xpu:{}".format(local_rank) 28 | print('world_size:{}, global rank:{}, local_rank:{}'.format(size, rank, local_rank)) 29 | 30 | # allreduce is WA 31 | data = torch.randn(2, dtype=torch.float32).to(device) 32 | dist.all_reduce(data) 33 | 34 | def send_tensor(buffer, recv_stage): 35 | if isinstance(buffer, torch.Tensor): 36 | type_tensor = torch.LongTensor(data=[0]).to(device) 37 | dist.send(type_tensor, recv_stage) 38 | send_shape = torch.LongTensor(data=buffer.size()).to(device) 39 | send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(device) 40 | dist.send(send_ndims, recv_stage) 41 | dist.send(send_shape, recv_stage) 42 | 43 | def recv_tensor(send_stage): 44 | type_tensor = torch.LongTensor(data=[0]).to(device) 45 | dist.recv(type_tensor, send_stage) 46 | recv_type = type_tensor.item() 47 | 48 | if recv_type == 0: 49 | recv_ndims = torch.LongTensor(data=[0]).to(device) 50 | dist.recv(recv_ndims, send_stage) 51 | recv_ndims = recv_ndims.item() 52 | recv_shape = torch.LongTensor([1] * recv_ndims).to(device) 53 | dist.recv(recv_shape, send_stage) 54 | print("recv_ndims", recv_ndims) 55 | print("recv_shape", recv_shape) 56 | else: 57 | print("----------------error-------------------") 58 | size = dist.get_world_size() 59 | device = "xpu:{}".format(local_rank) 60 | 61 | data = torch.randn(1, dtype=torch.float32).to(device) 62 | dist.all_reduce(data) 63 | 64 | # rank1 -> rank3 -> rank15 -> rank23 -> rank8 65 | if rank == 1: 66 | tensor = torch.ones(2048,3,256).xpu(device) 67 | send_tensor(tensor, 3) 68 | if rank == 3: 69 | recv_tensor(1) 70 | tensor = torch.ones(2048,3,256).xpu(device) 71 | send_tensor(tensor, 15) 72 | if rank == 15: 73 | recv_tensor(3) 74 | tensor = torch.ones(2048,3,256).xpu(device) 75 | send_tensor(tensor, 23) 76 | if rank == 23: 77 | recv_tensor(15) 78 | tensor = torch.ones(2048,3,256).xpu(device) 79 | send_tensor(tensor, 8) 80 | if rank == 8: 81 | recv_tensor(23) 82 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Intel® oneCCL Bindings for PyTorch* unit tests 2 | 3 | These tests validate the functionality and performance for collective communication primitives in Intel® oneCCL Bindings for PyTorch*. 
4 | 5 | ## functionality validation of collective communication primitives 6 | To start the test_c10d_ccl.py test, run: 7 | 8 | ```bash 9 | python test_c10d_ccl.py 10 | ``` 11 | 12 | ## functionality validation of point-to-point communication primitives 13 | For the within-card and cross-card p2p tests, run: 14 | 15 | ```bash 16 | python test_c10d_p2p.py 17 | ``` 18 | 19 | For the cross-node p2p test, run: 20 | 21 | ```bash 22 | # MPICH 23 | mpiexec -host nodeA,nodeB -np 24 -ppn 12 python -u test_p2p_crossnodes.py --dist_url $NODE_IP --world_size 24 24 | ``` 25 | 26 | ## functionality validation of barrier 27 | For the CPU barrier, run: 28 | 29 | ```bash 30 | mpirun -np 2 python test_barrier.py 31 | ``` 32 | 33 | For the XPU barrier (requires a build with "COMPUTE_BACKEND=dpcpp"), run: 34 | 35 | ```bash 36 | mpirun -np 2 python test_barrier.py --device xpu 37 | ``` 38 | 39 | ## broadcast/allreduce profiling 40 | To start the broadcast/allreduce profiling test, run: 41 | 42 | ```bash 43 | mpirun -np 12 -ppn 12 python ddp_allreduce.py --warm 10 --iter 20 --fixed 44 | ``` 45 | 46 | ## DeepSpeed test 47 | CPU test: 48 | ```bash 49 | python testccl_cpu.py 50 | ``` 51 | 52 | GPU test (runs on 1 node with 6 cards / 12 tiles): 53 | ```bash 54 | python testccl_gpu.py --world_size 12 55 | ``` 56 | GPU scale-out test (runs on 2 nodes with 24 ranks): 57 | ```bash 58 | mpirun -np 24 -ppn 12 python testccl_gpu_mpi.py 59 | ``` 60 | 61 | Note: this unit test is a stress test and takes a long time to start. You may need to wait ~5 minutes before the log line "starting to initialize tensors ..." appears. 62 | 63 | ## allreduce of LLM path 64 | This test case takes a special allreduce path on the XPU device when the number of launched ranks (-np) is <= 8. Run: 65 | ```bash 66 | mpirun -np 2 python test_llm_allreduce.py 67 | ``` 68 | If you want to disable this path and use oneCCL allreduce instead, set TORCH_CCL_GPU_ALLREDUCE to 0. Run: 69 | ```bash 70 | TORCH_CCL_GPU_ALLREDUCE=0 mpirun -np 2 python test_llm_allreduce.py ``` 71 | ## Test Functionality of FSDP 72 | ```bash 73 | export CCL_ZE_IPC_EXCHANGE=sockets # for pytorch multiprocessing launch 74 | python test_fsdp.py 75 | ``` 76 | 77 | ## subgroup tests (ds_subgroup_allreduce.py) 78 | For OAM (sub_group=2/4): 79 | ```bash 80 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=2 81 | mpirun -np 8 -ppn 8 python -u ds_subgroup_allreduce.py --sub_group=4 82 | ``` 83 | For the Aurora system (TP=2/3/4/6): 84 | ```bash 85 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=2 86 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=3 87 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=4 88 | mpirun -np 12 -ppn 12 python -u ds_subgroup_allreduce.py --sub_group=6 89 | ``` 90 | 91 | ## DeepSpeed scale-out tests 92 | The ds_p2p_crossnodes.py test case should be run on 3 nodes: 93 | ```bash 94 | mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 python -u ds_p2p_crossnodes.py --dist_url 10.0.1.141 --world_size 36 95 | ``` 96 | -host lists the hostnames of the 3 nodes. 97 | --dist_url is the IP of your node; you can get it with (hostname -I).
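A minimal launch sketch, assuming the same three sample hostnames as in the command above and that the first address printed by `hostname -I` is the one reachable from all nodes (adjust both for your cluster):

```bash
# Assumption: the first address reported by hostname -I is routable from every node.
NODE_IP=$(hostname -I | awk '{print $1}')
mpirun -host x1002c4s1b0n0,x1002c4s2b0n0,x1002c4s3b0n0 -np 36 -ppn 12 \
    python -u ds_p2p_crossnodes.py --dist_url ${NODE_IP} --world_size 36
```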
-------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | 6 | world_size = 12 7 | rounds = 100 8 | # input_file = "Example.csv" 9 | input_file = "DeepSpeed.csv" 10 | 11 | type = torch.float16 12 | 13 | 14 | def worker(given_rank): 15 | os.environ['MASTER_ADDR'] = 'localhost' 16 | os.environ['MASTER_PORT'] = '6789' 17 | os.environ['WORLD_SIZE'] = str(world_size) 18 | os.environ['RANK'] = str(given_rank) 19 | 20 | dist.init_process_group(backend = 'gloo') 21 | rank = int(dist.get_rank()) 22 | 23 | device = "cpu" 24 | 25 | ops, sizes, roots = read_file(input_file) 26 | test_ccl(ops, sizes, roots, device, rank, rounds) 27 | 28 | 29 | def main(): 30 | 31 | process_list = [] 32 | for i in range(world_size): 33 | p = Process(target=worker, args=(i,)) 34 | p.start() 35 | process_list.append(p) 36 | 37 | for p in process_list: 38 | p.join() 39 | 40 | def read_file(filename): 41 | ops = [] 42 | sizes = [] 43 | roots = [] 44 | f = open(filename, "r") 45 | for line in f: 46 | op, size, root = line.strip().split(",") 47 | size = int(size) 48 | root = int(root) 49 | if root >= world_size: 50 | print("Invalid root {}".format(root)) 51 | exit() 52 | ops.append(op) 53 | sizes.append(size) 54 | roots.append(root) 55 | f.close() 56 | return ops, sizes, roots 57 | 58 | def test_ccl(ops, sizes, roots, device, rank, rounds): 59 | input = [] 60 | output = [] 61 | print("Rank {}: starting to initialize tensors ...".format(rank)) 62 | for i in range(0, len(sizes)): 63 | data = torch.randn(sizes[i], dtype = type) 64 | data.to(device) 65 | input.append(data) 66 | if ops[i] == 'allgather': 67 | tmp_output = [] 68 | for j in range(0, world_size): 69 | data = torch.randn(sizes[i], dtype = type) 70 | data.to(device) 71 | tmp_output.append(data) 72 | output.append(tmp_output) 73 | else: 74 | output.append(data) 75 | print("Rank {}: tensors initialization finished!".format(rank)) 76 | for k in range(0, rounds): 77 | for i in range(0, len(ops)): 78 | if ops[i] == 'reduce': 79 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i]))) 80 | dist.reduce(input[i], roots[i], async_op=False) 81 | if ops[i] == 'allreduce': 82 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i]))) 83 | dist.all_reduce(input[i], async_op=False) 84 | if ops[i] == 'allgather': 85 | print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i]))) 86 | dist.all_gather(output[i], input[i], async_op=False) 87 | if ops[i] == 'broadcast': 88 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i]))) 89 | dist.broadcast(input[i], roots[i], async_op=False) 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /tests/ds_p2p_crossnodes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--world_size', default=-1, type=int, help='number of gpu for distributed training') 11 | parser.add_argument('--dist_url', default='127.0.0.1', type=str, help='url used to 
set up distributed training') 12 | parser.add_argument('--dist_port', default='29600', type=str, help='url port used to set up distributed training') 13 | args = parser.parse_args() 14 | 15 | os.environ['RANK'] = str(os.environ.get('PMIX_RANK',0)) 16 | os.environ['WORLD_SIZE'] = str(args.world_size) 17 | os.environ['MASTER_ADDR'] = '127.0.0.1' 18 | os.environ['MASTER_PORT'] = '29600' 19 | 20 | init_method = 'tcp://' + args.dist_url + ':' + args.dist_port 21 | dist.init_process_group(backend='ccl', init_method=init_method, 22 | world_size=int(os.environ['WORLD_SIZE']), rank=int(os.environ['RANK'])) 23 | 24 | rank = dist.get_rank() 25 | print("-----global rank: ", rank) 26 | size = dist.get_world_size() 27 | local_rank = os.environ['PALS_LOCAL_RANKID'] 28 | device = "xpu:{}".format(local_rank) 29 | print('world_size:{}, global rank:{}, local_rank:{}'.format(size, rank, local_rank)) 30 | 31 | def send_tensor(buffer, recv_stage): 32 | if isinstance(buffer, torch.Tensor): 33 | type_tensor = torch.LongTensor(data=[0]).to(device) 34 | dist.send(type_tensor, recv_stage) 35 | send_shape = torch.LongTensor(data=buffer.size()).to(device) 36 | send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(device) 37 | dist.send(send_ndims, recv_stage) 38 | dist.send(send_shape, recv_stage) 39 | 40 | def recv_tensor(send_stage): 41 | type_tensor = torch.LongTensor(data=[0]).to(device) 42 | dist.recv(type_tensor, send_stage) 43 | recv_type = type_tensor.item() 44 | 45 | if recv_type == 0: 46 | recv_ndims = torch.LongTensor(data=[0]).to(device) 47 | dist.recv(recv_ndims, send_stage) 48 | recv_ndims = recv_ndims.item() 49 | recv_shape = torch.LongTensor([1] * recv_ndims).to(device) 50 | dist.recv(recv_shape, send_stage) 51 | print("recv_ndims", recv_ndims) 52 | print("recv_shape", recv_shape) 53 | else: 54 | print("----------------error-------------------") 55 | 56 | size = dist.get_world_size() 57 | device = "xpu:{}".format(local_rank) 58 | 59 | data = torch.randn(1, dtype=torch.float32).to(device) 60 | dist.all_reduce(data) 61 | 62 | # send/recv(rank0 -> rank12 -> rank24) 63 | if rank <= 11: 64 | tensor = torch.ones(2048,3,256).xpu(device) 65 | send_tensor(tensor, rank+12) 66 | elif rank >= 24 : 67 | recv_tensor(rank-12) 68 | else: 69 | recv_tensor(rank-12) 70 | tensor = torch.ones(2048,3,256).xpu(device) 71 | send_tensor(tensor, rank+12) 72 | print("-----finished send/recv-----") 73 | 74 | # all_gather_base after p2p 75 | torch.distributed.barrier() 76 | world_size=36 77 | device = "xpu:{}".format(local_rank) 78 | rank_name_to_time = torch.zeros((world_size, 2), 79 | dtype=torch.float, 80 | device=device) 81 | 82 | torch.distributed._all_gather_base(rank_name_to_time.view(-1), 83 | rank_name_to_time[rank, :].view(-1)) 84 | print("all_gather is done") 85 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_gpu_mpi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | import intel_extension_for_pytorch 6 | import oneccl_bindings_for_pytorch 7 | import argparse 8 | import sys 9 | 10 | rounds = 40 11 | # input_file = "Example.csv" 12 | input_file = "DeepSpeed.csv" 13 | 14 | data_type = torch.bfloat16 15 | 16 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 17 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 18 | os.environ['MASTER_ADDR'] = '127.0.0.1' 19 | 
os.environ['MASTER_PORT'] = '29500' 20 | dist.init_process_group("ccl") 21 | rank = dist.get_rank() 22 | world_size = dist.get_world_size() 23 | 24 | def main(): 25 | torch.xpu.set_device(rank) 26 | device = "xpu:{}".format(rank) 27 | ops, sizes, roots = read_file(input_file) 28 | test_ccl(ops, sizes, roots, device, rank, rounds) 29 | 30 | def read_file(filename): 31 | ops = [] 32 | sizes = [] 33 | roots = [] 34 | f = open(filename, "r") 35 | for line in f: 36 | op, size, root = line.strip().split(",") 37 | size = int(size) 38 | root = int(root) 39 | if root >= world_size: 40 | print("Invalid root {}".format(root)) 41 | exit() 42 | ops.append(op) 43 | sizes.append(size) 44 | roots.append(root) 45 | f.close() 46 | return ops, sizes, roots 47 | 48 | def test_ccl(ops, sizes, roots, device, rank, rounds): 49 | input = [] 50 | output = [] 51 | print("Rank {}: starting to initialize tensors ...".format(rank)) 52 | for i in range(0, len(sizes)): 53 | data = torch.randn(sizes[i], dtype = data_type) 54 | data = data.to(device) 55 | input.append(data) 56 | if ops[i] == 'allgather': 57 | tmp_output = [] 58 | for j in range(0, world_size): 59 | data = torch.randn(sizes[i], dtype = data_type) 60 | data = data.to(device) 61 | tmp_output.append(data) 62 | output.append(tmp_output) 63 | else: 64 | output.append(data) 65 | print("Rank {}: tensors initialization finished!".format(rank), flush=True) 66 | for k in range(0, rounds): 67 | print("test round: ", k) 68 | for i in range(0, len(ops)): 69 | if ops[i] == 'reduce': 70 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i])), flush=True) 71 | dist.reduce(input[i], roots[i], async_op=False) 72 | if ops[i] == 'allreduce': 73 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i])), flush=True) 74 | dist.all_reduce(input[i], async_op=False) 75 | if ops[i] == 'allgather': 76 | print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i])), flush=True) 77 | dist.all_gather(output[i], input[i], async_op=False) 78 | if ops[i] == 'broadcast': 79 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i])), flush=True) 80 | dist.broadcast(input[i], roots[i], async_op=False) 81 | 82 | torch.xpu.synchronize() 83 | 84 | if __name__ == '__main__': 85 | main() 86 | print("All tests finished!") 87 | -------------------------------------------------------------------------------- /tests/test_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import intel_extension_for_pytorch 3 | import oneccl_bindings_for_pytorch 4 | import torch.distributed as dist 5 | import os 6 | import time 7 | 8 | tokens = 16 9 | rounds = 70 * 2 * tokens 10 | 11 | count = 14336 12 | 13 | total = 1024 * 1024 * 72 14 | repeat = 4 15 | 16 | # profiling = False 17 | # profiling = True 18 | 19 | datatype = torch.float16 20 | # datatype = torch.float32 21 | 22 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 23 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 24 | os.environ['MASTER_ADDR'] = '127.0.0.1' 25 | os.environ['MASTER_PORT'] = '29500' 26 | 27 | dist.init_process_group("ccl") 28 | rank = dist.get_rank() 29 | size = dist.get_world_size() 30 | 31 | device = "xpu:{}".format(rank) 32 | # allreduce data 33 | data = (torch.ones(count, dtype=datatype) * 0.1).to(device) 34 | 35 | a = (torch.zeros((int(total / count), count), dtype=datatype)).to(device) 36 | 37 | # warm up 38 | for i in range(5): 39 | a[0] += (data * 0.1) 
40 | for j in range(repeat): 41 | a += 0.01 42 | dist.all_reduce(data) 43 | data /= size 44 | sync = data.cpu() 45 | 46 | #start_events = [] 47 | #end_events = [] 48 | 49 | dist.barrier() 50 | start = time.time() 51 | for i in range(rounds): 52 | # start_event = None 53 | # end_event = None 54 | # if profiling: 55 | # start_event = torch.xpu.Event(enable_timing=True) 56 | # end_event = torch.xpu.Event(enable_timing=True) 57 | a[0] += (data * 0.1) 58 | for j in range(repeat): 59 | a += 0.01 60 | #print("XPU: {} {}".format(i, a[0][0])) 61 | # if profiling: 62 | # start_event.record() 63 | dist.all_reduce(data) 64 | # if profiling: 65 | # end_event.record() 66 | data /= size 67 | sync = data.cpu() 68 | # if profiling: 69 | # start_events.append(start_event) 70 | # end_events.append(end_event) 71 | 72 | # print(data[0]) 73 | data = data.cpu() 74 | # torch.xpu.synchronize('xpu:{}'.format(rank)) 75 | span = time.time() - start 76 | print('{} rounds on reducing {} elements. Time used {}'.format(rounds, count, span)) 77 | 78 | tmp_a = torch.zeros(1, dtype=datatype) 79 | tmp_data = torch.ones(1, dtype=datatype) * 0.1 80 | for i in range(5): 81 | tmp_a += (tmp_data * 0.1) 82 | for j in range(repeat): 83 | tmp_a += 0.01 84 | tmp_data *= size 85 | tmp_data /= size 86 | 87 | for i in range(rounds): 88 | tmp_a += (tmp_data * 0.1) 89 | for j in range(repeat): 90 | tmp_a += 0.01 91 | #print("CPU: {} {}".format(i, tmp_a[0])) 92 | tmp_data *= size 93 | tmp_data /= size 94 | 95 | a = a.cpu() 96 | 97 | error = False 98 | for i in range(count): 99 | if tmp_a[0] != a[0][i]: 100 | if not error: 101 | print("Error on {}: {} vs {}".format(i, tmp_a[0], a[0][i])) 102 | error = True 103 | else: 104 | if error: 105 | print("No error on {}".format(i)) 106 | error = False 107 | 108 | #if profiling: 109 | # for i in range(len(start_events)): 110 | # allreduce_time = start_events[i].elapsed_time(end_events[i]) 111 | # print('Round %d allreduce time %.3fms' % (i, allreduce_time)) 112 | # if i != len(start_events) - 1: 113 | # compute_time = end_events[i].elapsed_time(start_events[i + 1]) 114 | # print('Round %d compute time %.3fms' % (i + 1, compute_time)) 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 2 | set(CMAKE_CXX_STANDARD 17) 3 | 4 | project(oneccl_bindings_for_pytorch C CXX) 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat") 6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=cpp") 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat-security") 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector") 9 | 10 | set(LINUX TRUE) 11 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 12 | set(CMAKE_INSTALL_MESSAGE NEVER) 13 | 14 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules) 15 | 16 | set(RPATH_VALUE) 17 | list(APPEND RPATH_VALUE "$ORIGIN") 18 | 19 | set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) 20 | 21 | option(USE_SYSTEM_ONECCL "Use oneCCL library in system" OFF) 22 | 23 | option(BUILD_NO_ONECCL_PACKAGE "Build with oneCCL excluded" OFF) 24 | 25 | set(DEPENDS_LIB) 26 | 27 | # Find the Torch lib 28 | find_package(Torch REQUIRED) 29 | list(APPEND DEPENDS_LIB torch) 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") 31 | 32 | # Find OneCCL Lib 33 | IF (USE_SYSTEM_ONECCL) 34 | # Find and link MPI lib 35 | find_package(MPI REQUIRED) 36 | 
list(APPEND DEPENDS_LIB ${MPI_LIBRARIES}) 37 | 38 | # Link CCL lib 39 | set(CCL_ROOT $ENV{CCL_ROOT}) 40 | set(CCL_CONFIGURATION_PATH $ENV{CCL_CONFIGURATION_PATH}) 41 | include_directories(${CCL_ROOT}/include) 42 | list(APPEND DEPENDS_LIB "${CCL_ROOT}/lib/${CCL_CONFIGURATION_PATH}/libccl.so") 43 | list(APPEND RPATH_VALUE "$ORIGIN/../../../../") 44 | ELSE() 45 | # Find OneCCL Lib 46 | find_package(oneCCL REQUIRED) 47 | link_directories(${MPI_LIB_DIR}) 48 | list(APPEND DEPENDS_LIB oneCCL mpi) 49 | ENDIF() 50 | 51 | if(COMPUTE_BACKEND STREQUAL "dpcpp") 52 | list(APPEND DEPENDS_LIB ze_loader) 53 | endif() 54 | 55 | set(CMAKE_SKIP_BUILD_RPATH FALSE) 56 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) 57 | set(CMAKE_INSTALL_RPATH "${RPATH_VALUE}") 58 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) 59 | 60 | SET(LIB_NAME "oneccl_bindings_for_pytorch") 61 | 62 | add_subdirectory(./src) 63 | 64 | function (print_configuration_summary) 65 | get_directory_property(CMAKE_COMPILE_DEFINITIONS DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) 66 | 67 | message(STATUS "") 68 | message(STATUS "******** Summary ********") 69 | message(STATUS "General:") 70 | message(STATUS " CMake version : ${CMAKE_VERSION}") 71 | message(STATUS " CMake command : ${CMAKE_COMMAND}") 72 | message(STATUS " System : ${CMAKE_SYSTEM_NAME}") 73 | message(STATUS " Target name : ${LIB_NAME}") 74 | message(STATUS " Install path : ${CMAKE_INSTALL_PREFIX}") 75 | message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") 76 | message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") 77 | message(STATUS " C++ compiler id : ${CMAKE_CXX_COMPILER_ID}") 78 | message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") 79 | message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") 80 | message(STATUS " Compile flags : ${IPEX_COMPILE_FLAGS}") 81 | message(STATUS " Compile definitions : ${CMAKE_COMPILE_DEFINITIONS}") 82 | message(STATUS " Linker options : ${CMAKE_SHARED_LINKER_FLAGS}") 83 | get_target_property(LINK_LIBRARIES oneccl_bindings_for_pytorch LINK_LIBRARIES) 84 | message(STATUS " Linker libraries : ${LINK_LIBRARIES}") 85 | get_target_property(LINK_DIRECTORS oneccl_bindings_for_pytorch LINK_DIRECTORIES) 86 | message(STATUS " Linker directors : ${LINK_DIRECTORS}") 87 | 88 | message(STATUS "") 89 | endfunction() 90 | 91 | print_configuration_summary() 92 | -------------------------------------------------------------------------------- /third-party-programs.txt: -------------------------------------------------------------------------------- 1 | PyTorch binding for Intel(R) oneAPI Collective Communications Library (oneCCL) 2 | Third Party Programs File 3 | 4 | This file is the "third-party-programs.txt" file specified in the associated 5 | Intel end user license agreement for the Intel software you are licensing. The 6 | third party programs and their corresponding required notices and/or license 7 | terms are listed below. 8 | 9 | ------------------------------------------------------------------------------- 10 | 11 | 1. 
PyTorch 12 | 13 | From PyTorch: 14 | 15 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 16 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 17 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 18 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 19 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 20 | Copyright (c) 2011-2013 NYU (Clement Farabet) 21 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, 22 | Iain Melvin, Jason Weston) 23 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 24 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, 25 | Johnny Mariethoz) 26 | 27 | From Caffe2: 28 | 29 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 30 | 31 | All contributions by Facebook: 32 | Copyright (c) 2016 Facebook Inc. 33 | 34 | All contributions by Google: 35 | Copyright (c) 2015 Google Inc. 36 | All rights reserved. 37 | 38 | All contributions by Yangqing Jia: 39 | Copyright (c) 2015 Yangqing Jia 40 | All rights reserved. 41 | 42 | All contributions from Caffe: 43 | Copyright(c) 2013, 2014, 2015, the respective contributors 44 | All rights reserved. 45 | 46 | All other contributions: 47 | Copyright(c) 2015, 2016 the respective contributors 48 | All rights reserved. 49 | 50 | 51 | The -3-Clause BSD license 52 | 53 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 54 | copyright over their contributions to Caffe2. The project versioning records 55 | all such contribution and copyright details. If a contributor wants to further 56 | mark their specific copyright on a particular contribution, they should 57 | indicate their copyright solely in the commit message of the change when it is 58 | committed. 59 | 60 | All rights reserved. 61 | 62 | Redistribution and use in source and binary forms, with or without 63 | modification, are permitted provided that the following conditions are met: 64 | 65 | 1. Redistributions of source code must retain the above copyright 66 | notice, this list of conditions and the following disclaimer. 67 | 68 | 2. Redistributions in binary form must reproduce the above copyright 69 | notice, this list of conditions and the following disclaimer in the 70 | documentation and/or other materials provided with the distribution. 71 | 72 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories 73 | America and IDIAP Research Institute nor the names of its contributors may 74 | be used to endorse or promote products derived from this software without 75 | specific prior written permission. 76 | 77 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 78 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 79 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 80 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 81 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 82 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 83 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 84 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 85 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 86 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 87 | POSSIBILITY OF SUCH DAMAGE. 
88 | 89 | ------------------------------------------------------------------------------- 90 | 91 | Other names and brands may be claimed as the property of others. 92 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/testccl_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.multiprocessing import Process 4 | import os 5 | import intel_extension_for_pytorch 6 | import argparse 7 | import sys 8 | 9 | datatype_map = { 10 | 'bf16': torch.bfloat16, 11 | 'fp32': torch.float32 12 | } 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--datatype', '-d', type=str, default='bf16', help='Data type') 16 | parser.add_argument('--world_size', default=12, type=int, help='Number of gpu for distributed training') 17 | args = parser.parse_args() 18 | type = datatype_map.get(args.datatype) 19 | if type is None: 20 | print(f'Unknown datatype: {args.datatype}') 21 | sys.exit(1) 22 | 23 | world_size = args.world_size 24 | rounds = 100 25 | # input_file = "Example.csv" 26 | input_file = "DeepSpeed.csv" 27 | 28 | type = torch.bfloat16 29 | 30 | def worker(given_rank): 31 | os.environ['MASTER_ADDR'] = '127.0.0.1' # xpu 32 | os.environ['MASTER_PORT'] = '29500' # xpu 33 | os.environ['WORLD_SIZE'] = str(world_size) 34 | os.environ['RANK'] = str(given_rank) 35 | 36 | try: 37 | import oneccl_bindings_for_pytorch 38 | except ImportError: 39 | print("oneccl_bindings_for_pytorch not available!") 40 | dist.init_process_group(backend='ccl') 41 | 42 | rank = int(dist.get_rank()) 43 | torch.xpu.set_device(rank) 44 | device = "xpu:{}".format(rank) 45 | 46 | ops, sizes, roots = read_file(input_file) 47 | test_ccl(ops, sizes, roots, device, rank, rounds) 48 | 49 | 50 | def main(): 51 | 52 | process_list = [] 53 | for i in range(world_size): 54 | p = Process(target=worker, args=(i,)) 55 | p.start() 56 | process_list.append(p) 57 | 58 | for p in process_list: 59 | p.join() 60 | 61 | def read_file(filename): 62 | ops = [] 63 | sizes = [] 64 | roots = [] 65 | f = open(filename, "r") 66 | for line in f: 67 | op, size, root = line.strip().split(",") 68 | size = int(size) 69 | root = int(root) 70 | if root >= world_size: 71 | print("Invalid root {}".format(root)) 72 | exit() 73 | ops.append(op) 74 | sizes.append(size) 75 | roots.append(root) 76 | f.close() 77 | return ops, sizes, roots 78 | 79 | def test_ccl(ops, sizes, roots, device, rank, rounds): 80 | input = [] 81 | output = [] 82 | print("Rank {}: starting to initialize tensors ...".format(rank)) 83 | for i in range(0, len(sizes)): 84 | data = torch.randn(sizes[i], dtype = type) 85 | data = data.to(device) 86 | input.append(data) 87 | if ops[i] == 'allgather': 88 | tmp_output = [] 89 | for j in range(0, world_size): 90 | data = torch.randn(sizes[i], dtype = type) 91 | data = data.to(device) 92 | tmp_output.append(data) 93 | output.append(tmp_output) 94 | else: 95 | output.append(data) 96 | print("Rank {}: tensors initialization finished!".format(rank)) 97 | for k in range(0, rounds): 98 | print("test round: ", k) 99 | for i in range(0, len(ops)): 100 | if ops[i] == 'reduce': 101 | print("Rank {}: reduce to {} w/ size {}".format(rank, roots[i], len(input[i]))) 102 | dist.reduce(input[i], roots[i], async_op=False) 103 | if ops[i] == 'allreduce': 104 | print("Rank {}: all_reduce w/ size {}".format(rank, len(input[i]))) 105 | dist.all_reduce(input[i], async_op=False) 106 | if ops[i] == 'allgather': 107 | 
print("Rank {}: all_gather w/ size {} & {} elements".format(rank, len(input[i]), len(output[i]))) 108 | dist.all_gather(output[i], input[i], async_op=False) 109 | if ops[i] == 'broadcast': 110 | print("Rank {}: broadcast from {} w/ size {}".format(rank, roots[i], len(input[i]))) 111 | dist.broadcast(input[i], roots[i], async_op=False) 112 | 113 | torch.xpu.synchronize() 114 | 115 | if __name__ == '__main__': 116 | main() 117 | print("All tests finished!") 118 | -------------------------------------------------------------------------------- /src/gpu/allreduce.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "allreduce.h" 12 | #include 13 | 14 | #define REPEAT 10 15 | 16 | int work_only = -1; 17 | int sync_only = -1; 18 | 19 | int get_work_only(int init_value = 0) { 20 | int tmp_work_only = init_value; 21 | char *tmp_str = getenv("TORCH_CCL_WORK_ONLY"); 22 | if (tmp_str) { 23 | tmp_work_only = atoi(tmp_str); 24 | } 25 | work_only = tmp_work_only; 26 | return tmp_work_only; 27 | } 28 | 29 | int get_sync_only(int init_value = 0) { 30 | int tmp_sync_only = init_value; 31 | char *tmp_str = getenv("TORCH_CCL_SYNC_ONLY"); 32 | if (tmp_str) { 33 | tmp_sync_only = atoi(tmp_str); 34 | } 35 | sync_only = tmp_sync_only; 36 | return tmp_sync_only; 37 | } 38 | 39 | void act(allreducer& ar, sycl::queue& queue, void* inout_buffer, uint32_t size); 40 | 41 | int main(int argc, char* argv[]) { 42 | // init section 43 | auto ret = MPI_Init(&argc, &argv); 44 | if (ret == MPI_ERR_OTHER) { 45 | std::cout<<"MPI init error"< ar; 68 | ar.init(queue, rank, world); 69 | 70 | sycl::half* small_buffer = (sycl::half*)sycl::malloc_device(14336 * sizeof(sycl::half), queue); 71 | sycl::half* large_buffer = (sycl::half*)sycl::malloc_device(14336 * 32 * sizeof(sycl::half), queue); 72 | 73 | for (int i = 0; i < 140; i++) { 74 | act(ar, queue, large_buffer, 14336 * 32); 75 | } 76 | for (int i = 0; i < 31; i++) { 77 | for (int j = 0; j < 140; j++) { 78 | act(ar, queue, small_buffer, 14336); 79 | } 80 | } 81 | queue.wait(); 82 | 83 | uint64_t host_time[REPEAT]; 84 | uint64_t full_time[REPEAT]; 85 | 86 | for (int k = 0; k < REPEAT; k++) { 87 | MPI_Barrier(MPI_COMM_WORLD); 88 | uint64_t start = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 89 | 90 | for (int i = 0; i < 140; i++) { 91 | act(ar, queue, large_buffer, 14336 * 32); 92 | } 93 | for (int i = 0; i < 31; i++) { 94 | for (int j = 0; j < 140; j++) { 95 | act(ar, queue, small_buffer, 14336); 96 | } 97 | } 98 | uint64_t host_end = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 99 | queue.wait(); 100 | uint64_t full_end = int64_t(std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count()); 101 | host_time[k] = host_end - start; 102 | full_time[k] = full_end - start; 103 | } 104 | 105 | uint64_t total_host_time = 0; 106 | uint64_t total_full_time = 0; 107 | for (int k = 0; k < REPEAT; k++) { 108 | total_host_time += host_time[k]; 109 | total_full_time += full_time[k]; 110 | } 111 | 112 | total_host_time /= REPEAT; 113 | total_full_time /= REPEAT; 114 | 115 | MPI_Barrier(MPI_COMM_WORLD); 116 | MPI_Finalize(); 117 | 118 | std::cout << "Average full time: " << total_full_time << std::endl; 119 | std::cout << "Average host time (for reference): " 
<< total_host_time << std::endl; 120 | for (int k = 0; k < REPEAT; k++) { 121 | std::cout << " Full time on round " << k << ": " << full_time[k] << std::endl; 122 | std::cout << " Host time on round " << k << " (for reference): " << host_time[k] << std::endl; 123 | } 124 | } 125 | 126 | void act(allreducer& ar, sycl::queue& queue, void* inout_buffer, uint32_t size) { 127 | if (work_only != 0) { 128 | ar.work_only(queue, inout_buffer, size); 129 | return; 130 | } 131 | if (sync_only != 0) { 132 | ar.sync_only(queue, inout_buffer, size); 133 | return; 134 | } 135 | ar.allreduce(queue, inout_buffer, size); 136 | } 137 | -------------------------------------------------------------------------------- /tests/ddp_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import time 4 | import os 5 | import argparse 6 | import torch.distributed as dist 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--ptrace', 10 | action='store_true', 11 | default=False, 12 | help='pytorch trace') 13 | parser.add_argument('--warm', type=int, default=10, help='#warmup') 14 | parser.add_argument('--iter', type=int, default=10, help='#iteration') 15 | parser.add_argument('--size', type=int, default=25557032, help='number of f32/bf16 elements') 16 | parser.add_argument('--no-cuda', action='store_true', default=False) 17 | parser.add_argument('--broadcast', action='store_true', default=False) 18 | parser.add_argument('--bf16', action='store_true', default=False) 19 | parser.add_argument('--fixed', 20 | action='store_true', 21 | default=False, 22 | help='fixed size') 23 | args = parser.parse_args() 24 | args.cuda = not args.no_cuda and torch.cuda.is_available() 25 | 26 | if 'PMI_RANK' in os.environ.keys() and 'PMI_SIZE' in os.environ.keys(): 27 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 28 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) # mpich set 29 | elif 'PMIX_RANK' in os.environ.keys() and 'PALS_LOCAL_SIZE' in os.environ.keys(): 30 | os.environ['RANK'] = os.environ.get('PMIX_RANK') 31 | os.environ['WORLD_SIZE'] = str(os.environ.get('PALS_LOCAL_SIZE', -1)) 32 | os.environ['MASTER_ADDR'] = '127.0.0.1' # your master address 33 | os.environ['MASTER_PORT'] = '29500' # your master port 34 | 35 | if 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ.keys(): 36 | local_rank = os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] 37 | elif 'MPI_LOCALRANKID' in os.environ.keys(): 38 | local_rank = os.environ['MPI_LOCALRANKID'] 39 | if 'MPI_LOCALNRANKS' in os.environ.keys(): 40 | os.environ['LOCAL_WORLD_SIZE'] = str(os.environ.get('MPI_LOCALNRANKS',-1)) 41 | else: 42 | local_rank = os.environ['PALS_LOCAL_RANKID'] 43 | 44 | local_rank = int(local_rank) 45 | devid = local_rank 46 | 47 | if not args.cuda: 48 | import intel_extension_for_pytorch 49 | try: 50 | import oneccl_bindings_for_pytorch 51 | except: 52 | import torch_ccl 53 | torch.xpu.set_device(devid) 54 | device = "xpu:{}".format(devid) 55 | dist.init_process_group(backend='ccl') 56 | else: 57 | torch.cuda.set_device(devid) 58 | device = "cuda" 59 | dist.init_process_group(backend='nccl') 60 | 61 | try: 62 | from horovod.torch import mpi_lib_v2 as mpi_lib 63 | if mpi_lib.ctenabled(): 64 | mpi_lib = mpi_lib 65 | except: 66 | mpi_lib = None 67 | 68 | print(f'DDP local rank: {devid}') 69 | 70 | if devid == 0: 71 | print(f'PyTorch DDP {"Broadcast" if args.broadcast else "AllReduce"} on {os.environ["WORLD_SIZE"]} {device} devices: ') 72 | 73 | def _time(): 74 | 
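    # _time() synchronizes the active device (CUDA or XPU) before reading the host
    # clock, so the timestamp is taken only after previously queued device work,
    # including the collective being measured, has completed.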
if args.cuda: 75 | torch.cuda.synchronize() 76 | else: 77 | torch.xpu.synchronize() 78 | return time.time() 79 | 80 | if args.fixed: 81 | N = args.size 82 | else: 83 | N = 1 84 | 85 | 86 | with torch.autograd.profiler.profile(enabled=args.ptrace) as prof: 87 | while N <= args.size: 88 | for i in range(args.warm): 89 | data = torch.randn(N, dtype=torch.bfloat16 if args.bf16 else torch.float32).to(device) 90 | with torch.no_grad(): 91 | if not args.broadcast: 92 | dist.all_reduce(data) 93 | else: 94 | dist.broadcast(data, 0) 95 | elapsed = [] 96 | for i in range(args.iter): 97 | data = torch.randn(N, dtype=torch.bfloat16 if args.bf16 else torch.float32).to(device) 98 | t = _time() 99 | if mpi_lib: 100 | mpi_lib.ctpush("IPEX_ALLREDUCE") 101 | with torch.no_grad(): 102 | if not args.broadcast: 103 | dist.all_reduce(data) 104 | else: 105 | dist.broadcast(data, 0) 106 | elapsed.append((_time() - t) * 1e6) 107 | if mpi_lib and mpi_lib.ctenabled(): 108 | mpi_lib.ctpop() 109 | if devid == 0: 110 | print( 111 | f'{N*(2 if args.bf16 else 4):<10}{np.mean(elapsed):>10.1f}us ({np.min(elapsed):.1f}-{np.max(elapsed):.1f}) +-{1.96 * np.std(elapsed):.1f}' 112 | ) 113 | if N == args.size: 114 | break 115 | N = 2 * N 116 | if N != args.size and N > args.size: 117 | N = args.size 118 | 119 | if args.ptrace: 120 | prof.export_chrome_trace('rank' + str(hvd.rank()) + '_timeline.json') 121 | dist.destroy_process_group() 122 | 123 | -------------------------------------------------------------------------------- /src/ccl_comm_collector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #pragma once 33 | 34 | #include 35 | #include 36 | #include 37 | #include "ProcessGroupCCL.hpp" 38 | 39 | namespace oneccl_bindings_for_pytorch { 40 | 41 | class Comms { 42 | public: 43 | // for cpu case 44 | explicit Comms(ccl::vector_class &comms) : 45 | comms(std::move(comms)), streams{} {} 46 | 47 | // for comms with streams 48 | explicit Comms(ccl::vector_class &comms, ccl::vector_class &streams, std::vector &torch_streams) : 49 | comms(std::move(comms)), streams(std::move(streams)), torch_streams(std::move(torch_streams)) {} 50 | 51 | ~Comms() noexcept(false) {} 52 | 53 | Comms() = delete; 54 | 55 | // Must not be copyable 56 | Comms(const Comms &) = delete; 57 | 58 | Comms &operator=(const Comms &) = delete; 59 | 60 | // Move constructable 61 | Comms(Comms &&other) : comms(std::move(other.comms)), streams(std::move(other.streams)), 62 | torch_streams(std::move(other.torch_streams)) {} 63 | 64 | // Move assignable 65 | Comms &operator=(Comms &&other) { 66 | std::swap(comms, other.comms); 67 | std::swap(streams, other.streams); 68 | std::swap(torch_streams, other.torch_streams); 69 | return *this; 70 | } 71 | 72 | public: 73 | // The Communicators used by CCL 74 | ccl::vector_class comms; 75 | // The streams used by CCL 76 | ccl::vector_class streams; 77 | // one to one mapping the torch streams to the ccl::stream. 78 | std::vector torch_streams; 79 | }; 80 | 81 | struct CCLCommCollector { 82 | 83 | CCLCommCollector() : kvs(nullptr) {}; 84 | 85 | ccl::shared_ptr_class get_kvs(int rank, c10d::Store& store, 86 | bool singleP2POp, const std::string& p2pKey, int p2pRank); 87 | 88 | std::shared_ptr get_comms(const std::string& devices_key); 89 | void add_comms(const std::string& devices_key, std::shared_ptr comms); 90 | 91 | // ccl kvs to identify the community. 92 | ccl::shared_ptr_class kvs; 93 | // Collects the ccl communicator that the process group has used. 94 | // The key is a list of devices that an operation is operating on 95 | // The devices are stored in a device sequence and the cache CCL 96 | // communicator is associated with this device sequence 97 | // 98 | // e.g. If the process group op only uses device 0, then the value of 99 | // the used device string stored (value of the hashmap) would be "0". 100 | // 101 | // If the process group op uses device 0 - 7 and the each tensor of the 102 | // input tensor list is on device, 0, 1, 2, 3, 4, 5, 6, 7 separately, 103 | // then the value of the used device string (key) stored would be 104 | // "0,1,2,3,4,5,6,7" 105 | // 106 | // If the process group op uses device 0 - 7 and the each tensor of the 107 | // input tensor list is on device, 0, 4, 5, 6, 7, 1, 2, 3 separately, 108 | // then the value of the used device string stored would be 109 | // "0,4,5,6,7,1,2,3" 110 | // 111 | // Note that the order of the device for the tensor list matters. 112 | std::unordered_map> ccl_comms; 113 | 114 | }; 115 | 116 | } 117 | -------------------------------------------------------------------------------- /oneccl_bindings_for_pytorch/csrc/init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 
10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "init.h" 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include 46 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 47 | #if TORCH_VERSION_MAJOR > 1 48 | #include 49 | #else 50 | #include 51 | #endif 52 | #include 53 | #include 54 | #include 55 | #else 56 | #include 57 | #include 58 | #include 59 | #include 60 | #endif 61 | 62 | #include 63 | 64 | namespace py = pybind11; 65 | 66 | 67 | namespace { 68 | 69 | // This is a intrusive helper from pytorch. 
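// The wrapper below holds a c10::intrusive_ptr and, in its destructor, releases the
// Python GIL (when the calling thread currently holds it) before dropping the last
// reference, so the wrapped object is never destroyed while the GIL is held.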
70 | template 71 | class IntrusivePtrNoGilDestructor { 72 | c10::intrusive_ptr impl_; 73 | 74 | public: 75 | IntrusivePtrNoGilDestructor() = default; 76 | IntrusivePtrNoGilDestructor(const IntrusivePtrNoGilDestructor&) = default; 77 | IntrusivePtrNoGilDestructor(IntrusivePtrNoGilDestructor&&) = default; 78 | IntrusivePtrNoGilDestructor& operator=(const IntrusivePtrNoGilDestructor&) = 79 | default; 80 | IntrusivePtrNoGilDestructor& operator=(IntrusivePtrNoGilDestructor&&) = 81 | default; 82 | /* implicit */ IntrusivePtrNoGilDestructor(c10::intrusive_ptr impl) 83 | : impl_(std::move(impl)) {} 84 | // This ctor is very important; see 85 | // https://github.com/pybind/pybind11/issues/2957 86 | explicit IntrusivePtrNoGilDestructor(T* impl) 87 | : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} 88 | ~IntrusivePtrNoGilDestructor() { 89 | if (impl_) { 90 | if (PyGILState_Check()) { 91 | pybind11::gil_scoped_release release; 92 | impl_.reset(); 93 | } else { 94 | impl_.reset(); 95 | } 96 | } 97 | } 98 | T& operator*() const noexcept { 99 | return *impl_; 100 | } 101 | T* operator->() const noexcept { 102 | return impl_.get(); 103 | } 104 | [[nodiscard]] T* get() const noexcept { 105 | return impl_.get(); 106 | } 107 | void reset() noexcept { 108 | impl_.reset(); 109 | } 110 | operator bool() const noexcept { 111 | return impl_; 112 | } 113 | }; 114 | 115 | } // anonymous namespace 116 | 117 | PYBIND11_DECLARE_HOLDER_TYPE(T, IntrusivePtrNoGilDestructor, true); 118 | 119 | template 120 | using intrusive_ptr_no_gil_destructor_class_ = 121 | py::class_>; 122 | 123 | TORCH_CCL_CPP_API void torch_ccl_python_init(pybind11::module &m) { 124 | c10d::ProcessGroupCCL::cclInitOnce(); 125 | py::object module = py::module::import("torch.distributed"); 126 | py::object register_backend = module.attr("Backend").attr("register_backend"); 127 | #if TORCH_VERSION_MAJOR > 1 128 | auto backend = py::module::import("torch._C._distributed_c10d").attr("Backend"); 129 | #else 130 | auto backend = module.attr("ProcessGroup"); 131 | #endif 132 | register_backend("ccl", py::cpp_function(&c10d::ProcessGroupCCL::createProcessGroupCCL, 133 | py::arg("store"), 134 | py::arg("rank"), 135 | py::arg("size"), 136 | py::arg("timeout") = std::chrono::milliseconds( 137 | ::c10d::ProcessGroupCCL::OP_TIMEOUT_MILLIS)), 138 | false, std::vector{"xpu", "cpu"}); 139 | 140 | auto processGroupCCL = intrusive_ptr_no_gil_destructor_class_<::c10d::ProcessGroupCCL>( 141 | module, "ProcessGroupCCL", backend); 142 | 143 | processGroupCCL.def( 144 | py::init([](const c10::intrusive_ptr<::c10d::Store>& store, 145 | int rank, 146 | int size, 147 | std::chrono::milliseconds timeout) { 148 | return c10::make_intrusive<::c10d::ProcessGroupCCL>(store, rank, size, timeout); 149 | }), 150 | py::arg("store"), 151 | py::arg("rank"), 152 | py::arg("size"), 153 | py::arg("timeout") = std::chrono::milliseconds(10 * 1000)); 154 | 155 | } 156 | -------------------------------------------------------------------------------- /src/test/segfault/simple_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 
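// The pointer arrays declared below are filled in by exchange_mem(): buffer[i] maps
// peer i's payload region via a Level Zero IPC handle, while sync_buffer[i] and
// ready_buffer[i] point into that peer's sync area at offset
// buffer_base_size + rank * 128 (the ready flag sits a further 64 bytes in).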
24 | int use_tmp_buffer; 25 | 26 | void* buffer[MAX_RANK]; 27 | void* sync_buffer[MAX_RANK]; 28 | void* ready_buffer[MAX_RANK]; 29 | 30 | void exchange_mem(sycl::queue& queue, void* ptr); 31 | 32 | struct exchange_contents { 33 | union { 34 | ze_ipc_mem_handle_t ipc_handle; 35 | int fd = -1; 36 | }; 37 | size_t offset = 0; 38 | int pid = -1; 39 | }; 40 | 41 | #define sysCheck(x) \ 42 | if (x == -1) { \ 43 | throw std::system_error( \ 44 | std::make_error_code(std::errc(errno))); \ 45 | } 46 | 47 | int main(int argc, char* argv[]) { 48 | if (argc > 1) { 49 | use_tmp_buffer = 1; 50 | } 51 | 52 | size_t buffer_size = buffer_base_size + 1024 * 32768; 53 | 54 | auto ret = MPI_Init(&argc, &argv); 55 | if (ret == MPI_ERR_OTHER) { 56 | std::cout<<"MPI init error"< index) { 81 | ptr[index] = (uint32_t)temp_rank; 82 | })); 83 | }); 84 | queue.wait(); 85 | 86 | exchange_mem(queue, operate_buffer); 87 | 88 | MPI_Finalize(); 89 | } 90 | 91 | void exchange_mem(sycl::queue& queue, void* ptr) { 92 | // Step 1: Get base address of the pointer 93 | sycl::context ctx = queue.get_context(); 94 | auto l0_ctx = sycl::get_native(ctx); 95 | 96 | void *base_addr; 97 | size_t base_size; 98 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 99 | 100 | std::cout << "Base size: " << base_size << std::endl; 101 | std::cout << "Buffer base size: " << buffer_base_size << std::endl; 102 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 103 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 104 | 105 | // Step 2: Get IPC mem handle from base address 106 | alignas(64) exchange_contents send_buf; 107 | alignas(64) exchange_contents recv_buf[world]; 108 | 109 | // fill in the exchange info 110 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 111 | send_buf.offset = (char*)ptr - (char*)base_addr; 112 | send_buf.pid = getpid(); 113 | 114 | int* host_buffer = (int *)(malloc(1024)); 115 | void* tmp_buffer = sycl::malloc_device(1024, queue); 116 | 117 | void* sync_addr = NULL; 118 | sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 119 | std::cout << "Sync buffer content at " << sync_addr << ": "; 120 | queue.memcpy(host_buffer, sync_addr, 1024); 121 | queue.wait(); 122 | for (int i = 0; i < 256; i += 16) { 123 | std::cout << &host_buffer[i] << ": " << host_buffer[i] << std::endl; 124 | } 125 | 126 | // Step 3: Exchange the handles and offsets 127 | memset(recv_buf, 0, sizeof(recv_buf)); 128 | // Overkill if we don't really needs all peer's handles 129 | MPI_Allgather( 130 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 131 | 132 | 133 | for (uint32_t i = 0; i < world; i++){ 134 | // Step 4: Prepare pid file descriptor of next process 135 | auto* peer = recv_buf + i; 136 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 137 | sysCheck(pid_fd); 138 | // 139 | // Step 5: Duplicate GEM object handle to local process 140 | // and overwrite original file descriptor number 141 | // 142 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 143 | sysCheck(peer->fd); 144 | 145 | // Step 6: Open IPC handle of remote peer 146 | auto l0_device 147 | = sycl::get_native(queue.get_device()); 148 | void* peer_base; 149 | 150 | zeCheck(zeMemOpenIpcHandle( 151 | l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); 152 | buffer[i] = (char*)peer_base + peer->offset; 153 | 
sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 154 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 155 | 156 | char* end = (char*)peer_base + peer->offset + base_size; 157 | 158 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 159 | 160 | sync_addr = (void *)((char*)peer_base + peer->offset + buffer_base_size); 161 | //sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 162 | 163 | if (use_tmp_buffer == 0) { 164 | std::cout << "Copy sync buffer (mapped from rank " << i << ") at " << sync_addr << " to host" << std::endl; 165 | queue.memcpy(host_buffer, sync_addr, 1024); 166 | } else { 167 | std::cout << "Copy sync buffer (mapped from rank " << i << ") at " << sync_addr << " to temp buffer & then to host" << std::endl; 168 | queue.memcpy(tmp_buffer, sync_addr, 1024); 169 | queue.memcpy(host_buffer, tmp_buffer, 1024); 170 | } 171 | queue.wait(); 172 | 173 | std::cout << "Sync buffer content at " << sync_addr << std::endl; 174 | for (int i = 0; i < 256; i += 16) { 175 | std::cout << &host_buffer[i] << ": " << host_buffer[i] << std::endl; 176 | } 177 | } 178 | } 179 | 180 | -------------------------------------------------------------------------------- /tests/test_c10d_p2p.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import torch 4 | 5 | try: 6 | import intel_extension_for_pytorch 7 | xpu_is_available = torch.xpu.is_available() if hasattr(torch, 'xpu') else False 8 | except ImportError: 9 | # ignore the ipex 10 | xpu_is_available = False 11 | pass 12 | 13 | import oneccl_bindings_for_pytorch 14 | from torch.testing._internal.common_utils import run_tests 15 | from torch.testing._internal.common_distributed import MultiProcessTestCase 16 | 17 | import torch.distributed as dist 18 | 19 | class ProcessGroupCCLTest(MultiProcessTestCase): 20 | 21 | def setUp(self): 22 | super(ProcessGroupCCLTest, self).setUp() 23 | self._spawn_processes() 24 | 25 | @property 26 | def world_size(self): 27 | return 6 28 | 29 | def _build_tensor(self, size, value=None, dtype=torch.float, device=None): 30 | if value is None: 31 | value = size 32 | if device is None: 33 | return torch.empty(size, size, size, dtype=dtype).fill_(value) 34 | else: 35 | return torch.empty(size, size, size, dtype=dtype).fill_(value).to(device) 36 | 37 | def _test_send_recv_withincard(self): 38 | store = dist.FileStore(self.file_name, self.world_size) 39 | dist.init_process_group( 40 | "ccl", 41 | world_size=self.world_size, 42 | rank=self.rank, 43 | store=store, 44 | ) 45 | device = "xpu:{}".format(self.rank) 46 | 47 | # WA: allreduce 48 | # Ensure the process group has been fully initialized 49 | data = torch.zeros(1).to(device) 50 | dist.all_reduce(data) 51 | 52 | torch.xpu.set_device(device) 53 | tensor = self._build_tensor(self.rank + 1, device=device) 54 | 55 | # rank0 -> rank1 56 | src = 0 57 | dst = 1 58 | if self.rank == src: 59 | # Send 60 | dist.send(tensor, dst) 61 | elif self.rank == dst: 62 | # Recv 63 | expected_tensor = self._build_tensor(src + 1) 64 | output_tensor = self._build_tensor( 65 | src + 1, value=-1, device=device 66 | ) 67 | dist.recv(output_tensor, src) 68 | self.assertEqual(output_tensor, expected_tensor) 69 | 70 | def test_send_recv_withincard(self): 71 | self._test_send_recv_withincard() 72 | 73 | def _test_send_recv_3rank(self): 74 | # cross-cards p2p: rank1 -> rank3 -> rank5 75 
| store = dist.FileStore(self.file_name, self.world_size) 76 | dist.init_process_group( 77 | "ccl", 78 | world_size=self.world_size, 79 | rank=self.rank, 80 | store=store, 81 | ) 82 | device = "xpu:{}".format(self.rank) 83 | 84 | # WA: allreduce 85 | # Ensure the process group has been fully initialized 86 | data = torch.zeros(1).to(device) 87 | dist.all_reduce(data) 88 | 89 | torch.xpu.set_device(device) 90 | tensor = self._build_tensor(self.rank + 1, device=device) 91 | 92 | if self.rank == 1: 93 | dist.send(tensor, 3) 94 | if self.rank == 3: 95 | expected_tensor1 = self._build_tensor(1 + 1) 96 | output_tensor1 = self._build_tensor( 97 | 1 + 1, value=-1, device=device 98 | ) 99 | dist.recv(output_tensor1, 1) 100 | self.assertEqual(output_tensor1, expected_tensor1) 101 | 102 | # rank3 -> rank5 103 | dist.send(tensor, 5) 104 | if self.rank == 5: 105 | expected_tensor2 = self._build_tensor(3 + 1) 106 | output_tensor2 = self._build_tensor( 107 | 3 + 1, value=-1, device=device 108 | ) 109 | dist.recv(output_tensor2, 3) 110 | self.assertEqual(output_tensor2, expected_tensor2) 111 | 112 | def test_send_recv_3rank(self): 113 | self._test_send_recv_3rank() 114 | 115 | def _test_send_recv_crosscard(self): 116 | store = dist.FileStore(self.file_name, self.world_size) 117 | dist.init_process_group( 118 | "ccl", 119 | world_size=self.world_size, 120 | rank=self.rank, 121 | store=store, 122 | ) 123 | device = "xpu:{}".format(self.rank) 124 | 125 | # WA: allreduce 126 | # Ensure the process group has been fully initialized 127 | data = torch.zeros(1).to(device) 128 | dist.all_reduce(data) 129 | 130 | torch.xpu.set_device(device) 131 | tensor = self._build_tensor(self.rank + 1, device=device) 132 | 133 | for src in range(0, self.world_size): 134 | if src == self.rank: 135 | # Send mode 136 | for dst in range(0, self.world_size): 137 | if dst == self.rank: 138 | continue 139 | dist.send(tensor, dst) 140 | else: 141 | # Recv mode 142 | expected_tensor = self._build_tensor(src + 1) 143 | output_tensor = self._build_tensor( 144 | src + 1, value=-1, device=device 145 | ) 146 | dist.recv(output_tensor, src) 147 | self.assertEqual(output_tensor, expected_tensor) 148 | 149 | def test_send_recv_crosscard(self): 150 | self._test_send_recv_crosscard() 151 | 152 | def _test_send_recv_with_tag(self): 153 | store = dist.FileStore(self.file_name, self.world_size) 154 | dist.init_process_group( 155 | "ccl", 156 | world_size=self.world_size, 157 | rank=self.rank, 158 | store=store, 159 | ) 160 | device = "xpu:{}".format(self.rank) 161 | 162 | # WA: allreduce 163 | # Ensure the process group has been fully initialized 164 | data = torch.zeros(1).to(device) 165 | dist.all_reduce(data) 166 | 167 | torch.xpu.set_device(device) 168 | tensor = self._build_tensor(10, value=self.rank, device=device) 169 | 170 | for dst in range(0, self.world_size): 171 | if dst == self.rank: 172 | # Recv mode 173 | for src in range(0, self.world_size): 174 | if src == self.rank: 175 | continue 176 | output_tensor = self._build_tensor(10, value=-1, device=device) 177 | dist.recv(output_tensor, src, tag=src) 178 | self.assertTrue(output_tensor.eq(src).all()) 179 | else: 180 | # Send mode 181 | dist.send(tensor, dst, tag=self.rank) 182 | 183 | def test_send_recv_with_tag(self): 184 | self._test_send_recv_with_tag() 185 | 186 | if __name__ == '__main__': 187 | run_tests() 188 | -------------------------------------------------------------------------------- /src/test/remotesync/simple_test.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 24 | void* buffer[MAX_RANK]; 25 | void* sync_buffer[MAX_RANK]; 26 | void* ready_buffer[MAX_RANK]; 27 | 28 | void exchange_mem(sycl::queue& queue, void* ptr); 29 | void atomic_write_check_remote(sycl::queue& queue, uint32_t* ptr, int good); 30 | 31 | struct exchange_contents { 32 | union { 33 | ze_ipc_mem_handle_t ipc_handle; 34 | int fd = -1; 35 | }; 36 | size_t offset = 0; 37 | int pid = -1; 38 | }; 39 | 40 | #define sysCheck(x) \ 41 | if (x == -1) { \ 42 | throw std::system_error( \ 43 | std::make_error_code(std::errc(errno))); \ 44 | } 45 | 46 | int main(int argc, char* argv[]) { 47 | size_t buffer_size = buffer_base_size + 1024; 48 | 49 | auto ret = MPI_Init(&argc, &argv); 50 | if (ret == MPI_ERR_OTHER) { 51 | std::cout<<"MPI init error"< index) { 75 | ptr[index] = (uint32_t)0; 76 | })); 77 | }); 78 | queue.wait(); 79 | 80 | exchange_mem(queue, operate_buffer); 81 | 82 | atomic_write_check_remote(queue, ptr, (argc > 1)); 83 | 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | std::cout << "Host MPI barrier completed" << std::endl; 86 | 87 | MPI_Finalize(); 88 | } 89 | 90 | void atomic_write_check_remote(sycl::queue& queue, uint32_t* ptr, int good) { 91 | uint32_t temp_world = world; 92 | uint32_t temp_rank = rank; 93 | 94 | int *temp_sync_buffer[MAX_RANK]; 95 | for (int index = 0; index < temp_world; index++) { 96 | temp_sync_buffer[index] = (int *)sync_buffer[index]; 97 | } 98 | 99 | for (int index = 0; index < temp_world; index++) { 100 | if (index != temp_rank) { 101 | std::cout << "Setting " << temp_sync_buffer[index] << " (remote) to 1" << std::endl; 102 | } 103 | } 104 | for (int index = 0; index < temp_world; index++) { 105 | if (index != temp_rank) { 106 | std::cout << "Checking " << (int*)((int *)ptr + index * 32) << " (local) for 1" << std::endl; 107 | } 108 | } 109 | 110 | sycl::event e = queue.submit([&](sycl::handler& cgh) { 111 | if (good != 0) { 112 | sycl::stream str(8192, 1024, cgh); 113 | } 114 | cgh.parallel_for(sycl::range { temp_world * 2 }, ([=](sycl::id<1> index) { 115 | if (index < temp_world && index != temp_rank) { 116 | int * peer_sync_ptr = (int*)temp_sync_buffer[index]; 117 | auto v = 118 | sycl::atomic_ref(peer_sync_ptr[0]); 121 | v.store(1); 122 | } 123 | if (index >= temp_world && index - temp_world != temp_rank) { 124 | int * local_sync_ptr = (int*)(ptr + (index - temp_world) * 32); 125 | auto v = 126 | sycl::atomic_ref(local_sync_ptr[0]); 129 | int count = v.load(); 130 | while (count < 1) { 131 | count = v.load(); 132 | } 133 | } 134 | })); 135 | }); 136 | e.wait(); 137 | 138 | std::cout << "Kernel done" << std::endl; 139 | } 140 | 141 | void exchange_mem(sycl::queue& queue, void* ptr) { 142 | // Step 1: Get base address of the pointer 143 | sycl::context ctx = queue.get_context(); 144 | auto l0_ctx = sycl::get_native(ctx); 145 | 146 | void *base_addr; 147 | size_t base_size; 148 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 149 | 150 | std::cout << "Memory range size: " << base_size << std::endl; 151 | std::cout << "Buffer base size: " << 
buffer_base_size << std::endl; 152 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 153 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 154 | 155 | // Step 2: Get IPC mem handle from base address 156 | alignas(64) exchange_contents send_buf; 157 | alignas(64) exchange_contents recv_buf[world]; 158 | 159 | // fill in the exchange info 160 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 161 | send_buf.offset = (char*)ptr - (char*)base_addr; 162 | send_buf.pid = getpid(); 163 | 164 | // Step 3: Exchange the handles and offsets 165 | memset(recv_buf, 0, sizeof(recv_buf)); 166 | // Overkill if we don't really needs all peer's handles 167 | MPI_Allgather( 168 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 169 | 170 | 171 | for (uint32_t i = 0; i < world; i++){ 172 | // Step 4: Prepare pid file descriptor of next process 173 | auto* peer = recv_buf + i; 174 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 175 | sysCheck(pid_fd); 176 | // 177 | // Step 5: Duplicate GEM object handle to local process 178 | // and overwrite original file descriptor number 179 | // 180 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 181 | sysCheck(peer->fd); 182 | 183 | // Step 6: Open IPC handle of remote peer 184 | auto l0_device 185 | = sycl::get_native(queue.get_device()); 186 | void* peer_base; 187 | 188 | zeCheck(zeMemOpenIpcHandle( 189 | l0_ctx, l0_device, peer->ipc_handle, 0, &peer_base)); 190 | // l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_base)); 191 | buffer[i] = (char*)peer_base + peer->offset; 192 | sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 193 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 194 | 195 | char* end = (char*)peer_base + peer->offset + base_size; 196 | 197 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 198 | } 199 | } 200 | 201 | -------------------------------------------------------------------------------- /src/test/writeremote/simple_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ze_exception.hpp" 11 | #include "sycl_misc.hpp" 12 | 13 | #define MAGIC_NUM 15 14 | #define MAX_RANK 8 15 | #define MAX_BUFFER 4096 16 | #define OPERATE_SIZE 14336 17 | 18 | size_t buffer_base_size = MAX_BUFFER * 1024 * MAX_RANK; 19 | size_t check_size = 4096; 20 | 21 | int world = -1; 22 | int rank = -1; 23 | 24 | void* buffer[MAX_RANK]; 25 | void* sync_buffer[MAX_RANK]; 26 | void* ready_buffer[MAX_RANK]; 27 | 28 | void exchange_mem(sycl::queue& queue, void* ptr); 29 | void dump_buffer(sycl::queue& queue, void* gpu_addr); 30 | void atomic_write_remote(sycl::queue& queue, int good); 31 | 32 | struct exchange_contents { 33 | union { 34 | ze_ipc_mem_handle_t ipc_handle; 35 | int fd = -1; 36 | }; 37 | size_t offset = 0; 38 | int pid = -1; 39 | }; 40 | 41 | #define sysCheck(x) \ 42 | if (x == -1) { \ 43 | throw std::system_error( \ 44 | std::make_error_code(std::errc(errno))); \ 45 | } 46 | 47 | int main(int argc, char* argv[]) { 48 | size_t buffer_size = buffer_base_size + 1024; 49 | 50 | auto ret = MPI_Init(&argc, &argv); 51 | if (ret == MPI_ERR_OTHER) { 52 | 
std::cout<<"MPI init error"< index) { 75 | ptr[index] = (uint32_t)0; 76 | })); 77 | }); 78 | queue.wait(); 79 | 80 | exchange_mem(queue, operate_buffer); 81 | 82 | atomic_write_remote(queue, (argc > 1)); 83 | 84 | MPI_Barrier(MPI_COMM_WORLD); 85 | std::cout << "Host MPI barrier completed" << std::endl; 86 | 87 | dump_buffer(queue, ptr); 88 | 89 | MPI_Barrier(MPI_COMM_WORLD); 90 | std::cout << "Host MPI barrier completed" << std::endl; 91 | 92 | MPI_Finalize(); 93 | 94 | } 95 | 96 | void atomic_write_remote(sycl::queue& queue, int good) { 97 | uint32_t temp_world = world; 98 | uint32_t temp_rank = rank; 99 | 100 | int *temp_sync_buffer[MAX_RANK]; 101 | for (int index = 0; index < temp_world; index++) { 102 | temp_sync_buffer[index] = (int *)sync_buffer[index]; 103 | } 104 | 105 | for (int index = 0; index < temp_world; index++) { 106 | if (index != temp_rank) { 107 | std::cout << "Setting " << temp_sync_buffer[index] << " (remote) to 1" << std::endl; 108 | } else { 109 | std::cout << "Setting " << temp_sync_buffer[index] << " (local mapped) to 1" << std::endl; 110 | } 111 | } 112 | 113 | queue.submit([&](sycl::handler& cgh) { 114 | if (good != 0) { 115 | sycl::stream str(8192, 1024, cgh); 116 | } 117 | cgh.parallel_for(sycl::range { temp_world }, ([=](sycl::id<1> index) { 118 | //if (index != temp_rank) { 119 | int * peer_sync_ptr = (int*)temp_sync_buffer[index]; 120 | auto v = 121 | sycl::atomic_ref(peer_sync_ptr[0]); 124 | v.store(1); 125 | //} 126 | })); 127 | }); 128 | queue.wait(); 129 | 130 | std::cout << "Kernel done" << std::endl; 131 | } 132 | 133 | void exchange_mem(sycl::queue& queue, void* ptr) { 134 | // Step 1: Get base address of the pointer 135 | sycl::context ctx = queue.get_context(); 136 | auto l0_ctx = sycl::get_native(ctx); 137 | 138 | void *base_addr; 139 | size_t base_size; 140 | zeCheck(zeMemGetAddressRange(l0_ctx, ptr, &base_addr, &base_size)); 141 | 142 | std::cout << "Memory range size: " << base_size << std::endl; 143 | std::cout << "Buffer base size: " << buffer_base_size << std::endl; 144 | std::cout << "Actual buffer size: " << (buffer_base_size + 1024) << std::endl; 145 | std::cout << "Local: " << base_addr << "+" << (char*)ptr - (char*)base_addr << " ~ " << (void *)((char *)base_addr + base_size) << std::endl; 146 | 147 | // Step 2: Get IPC mem handle from base address 148 | alignas(64) exchange_contents send_buf; 149 | alignas(64) exchange_contents recv_buf[world]; 150 | 151 | // fill in the exchange info 152 | zeCheck(zeMemGetIpcHandle(l0_ctx, base_addr, &send_buf.ipc_handle)); 153 | send_buf.offset = (char*)ptr - (char*)base_addr; 154 | send_buf.pid = getpid(); 155 | 156 | void * sync_addr = (void *)((char*)base_addr + send_buf.offset + buffer_base_size); 157 | dump_buffer(queue, sync_addr); 158 | 159 | // Step 3: Exchange the handles and offsets 160 | memset(recv_buf, 0, sizeof(recv_buf)); 161 | // Overkill if we don't really needs all peer's handles 162 | MPI_Allgather( 163 | &send_buf, sizeof(send_buf), MPI_BYTE, recv_buf, sizeof(send_buf), MPI_BYTE, MPI_COMM_WORLD); 164 | 165 | 166 | for (uint32_t i = 0; i < world; i++){ 167 | // Step 4: Prepare pid file descriptor of next process 168 | auto* peer = recv_buf + i; 169 | auto pid_fd = syscall(__NR_pidfd_open, peer->pid, 0); 170 | sysCheck(pid_fd); 171 | // 172 | // Step 5: Duplicate GEM object handle to local process 173 | // and overwrite original file descriptor number 174 | // 175 | peer->fd = syscall(__NR_pidfd_getfd, pid_fd, peer->fd, 0); 176 | sysCheck(peer->fd); 177 | 178 | // Step 6: Open IPC 
handle of remote peer 179 | auto l0_device 180 | = sycl::get_native(queue.get_device()); 181 | void* peer_base; 182 | 183 | zeCheck(zeMemOpenIpcHandle( 184 | l0_ctx, l0_device, peer->ipc_handle, 0, &peer_base)); 185 | // l0_ctx, l0_device, peer->ipc_handle, ZE_IPC_MEMORY_FLAG_BIAS_UNCACHED, &peer_base)); 186 | buffer[i] = (char*)peer_base + peer->offset; 187 | sync_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128; 188 | ready_buffer[i] = (char*)peer_base + peer->offset + buffer_base_size + rank * 128 + 64; 189 | 190 | char* end = (char*)peer_base + peer->offset + base_size; 191 | 192 | std::cout << "Rank " << i << ": " << peer_base << "+" << peer->offset << " ~ " << (void *)end << std::endl; 193 | } 194 | } 195 | 196 | void dump_buffer(sycl::queue& queue, void* gpu_addr) { 197 | int* host_buffer = (int *)(malloc(1024)); 198 | queue.memcpy(host_buffer, gpu_addr, 1024); 199 | queue.wait(); 200 | std::cout << "Buffer copied from " << gpu_addr << " to host" << std::endl; 201 | std::cout << "Dump content of " << gpu_addr << ": " << std::endl; 202 | for (int i = 0; i < world; i++) { 203 | //if (i != rank) { 204 | std::cout << (int *)gpu_addr + i * 32 << ": " << host_buffer[i * 32] << std::endl; 205 | //} 206 | } 207 | free(host_buffer); 208 | } 209 | 210 | -------------------------------------------------------------------------------- /tests/test_fsdp.py: -------------------------------------------------------------------------------- 1 | # Reference: https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html 2 | 3 | import os 4 | import argparse 5 | import functools 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torchvision import datasets, transforms 11 | import time 12 | 13 | from torch.optim.lr_scheduler import StepLR 14 | 15 | import torch.distributed as dist 16 | import torch.multiprocessing as mp 17 | from torch.nn.parallel import DistributedDataParallel as DDP 18 | from torch.utils.data.distributed import DistributedSampler 19 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP 20 | from torch.distributed.fsdp.fully_sharded_data_parallel import ( 21 | CPUOffload, 22 | BackwardPrefetch, 23 | ) 24 | from torch.distributed.fsdp.wrap import ( 25 | size_based_auto_wrap_policy, 26 | enable_wrap, 27 | wrap, 28 | ) 29 | 30 | import intel_extension_for_pytorch 31 | import oneccl_bindings_for_pytorch 32 | 33 | def setup(rank, world_size): 34 | os.environ['MASTER_ADDR'] = 'localhost' 35 | os.environ['MASTER_PORT'] = '12355' 36 | 37 | # initialize the process group 38 | dist.init_process_group("ccl", rank=rank, world_size=world_size) 39 | 40 | def cleanup(): 41 | dist.destroy_process_group() 42 | 43 | 44 | class Net(nn.Module): 45 | def __init__(self): 46 | super(Net, self).__init__() 47 | self.conv1 = nn.Conv2d(1, 32, 3, 1) 48 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 49 | self.dropout1 = nn.Dropout(0.25) 50 | self.dropout2 = nn.Dropout(0.5) 51 | self.fc1 = nn.Linear(9216, 128) 52 | self.fc2 = nn.Linear(128, 10) 53 | 54 | def forward(self, x): 55 | 56 | x = self.conv1(x) 57 | x = F.relu(x) 58 | x = self.conv2(x) 59 | x = F.relu(x) 60 | x = F.max_pool2d(x, 2) 61 | x = self.dropout1(x) 62 | x = torch.flatten(x, 1) 63 | x = self.fc1(x) 64 | x = F.relu(x) 65 | x = self.dropout2(x) 66 | x = self.fc2(x) 67 | output = F.log_softmax(x, dim=1) 68 | return output 69 | 70 | def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): 71 | model.train() 72 | 
ddp_loss = torch.zeros(2).to("xpu:{}".format(rank)) 73 | if sampler: 74 | sampler.set_epoch(epoch) 75 | for batch_idx, (data, target) in enumerate(train_loader): 76 | if batch_idx < 3: 77 | data, target = data.to("xpu:{}".format(rank)), target.to("xpu:{}".format(rank)) 78 | optimizer.zero_grad() 79 | output = model(data) 80 | loss = F.nll_loss(output, target, reduction='sum') 81 | loss.backward() 82 | optimizer.step() 83 | ddp_loss[0] += loss.item() 84 | ddp_loss[1] += len(data) 85 | 86 | dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) 87 | if rank == 0: 88 | print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) 89 | 90 | 91 | def test(model, rank, world_size, test_loader): 92 | model.eval() 93 | correct = 0 94 | ddp_loss = torch.zeros(3).to("xpu:{}".format(rank)) 95 | with torch.no_grad(): 96 | for data, target in test_loader: 97 | data, target = data.to("xpu:{}".format(rank)), target.to("xpu:{}".format(rank)) 98 | output = model(data) 99 | ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss 100 | pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability 101 | ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() 102 | ddp_loss[2] += len(data) 103 | 104 | dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) 105 | 106 | if rank == 0: 107 | test_loss = ddp_loss[0] / ddp_loss[2] 108 | print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( 109 | test_loss, int(ddp_loss[1]), int(ddp_loss[2]), 110 | 100. * ddp_loss[1] / ddp_loss[2])) 111 | 112 | 113 | def fsdp_main(rank, world_size, args): 114 | torch.manual_seed(123) 115 | torch.xpu.manual_seed(123) 116 | setup(rank, world_size) 117 | 118 | transform=transforms.Compose([ 119 | transforms.ToTensor(), 120 | transforms.Normalize((0.1307,), (0.3081,)) 121 | ]) 122 | 123 | dataset1 = datasets.MNIST('../data', train=True, download=True, 124 | transform=transform) 125 | dataset2 = datasets.MNIST('../data', train=False, 126 | transform=transform) 127 | 128 | sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) 129 | sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) 130 | 131 | train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1} 132 | test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2} 133 | cuda_kwargs = {'num_workers': 2, 134 | 'pin_memory': True, 135 | 'shuffle': False} 136 | train_kwargs.update(cuda_kwargs) 137 | test_kwargs.update(cuda_kwargs) 138 | 139 | train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) 140 | test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) 141 | my_auto_wrap_policy = functools.partial( 142 | size_based_auto_wrap_policy, min_num_params=100 143 | ) 144 | 145 | xpu_device = "xpu:{}".format(rank) 146 | torch.xpu.set_device(xpu_device) 147 | 148 | #init_start_event = torch.Event(enable_timing=True) 149 | #init_end_event = torch.Event(enable_timing=True) 150 | 151 | model = Net().to("xpu:{}".format(rank)) 152 | 153 | model = FSDP(model, device_id="xpu:{}".format(rank)) 154 | 155 | optimizer = optim.Adadelta(model.parameters(), lr=args.lr) 156 | 157 | scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) 158 | #init_start_event.record() 159 | elapsed = time.time() 160 | for epoch in range(1): 161 | train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) 162 | test(model, rank, world_size, test_loader) 163 | scheduler.step() 164 | 165 | #init_end_event.record() 
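    # Wall-clock timing is used below; the XPU event-based timing above is left commented out.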
166 | elapsed = time.time() - elapsed 167 | if rank == 0: 168 | #print(f"XPU event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") 169 | print(f"XPU event elapsed time: {elapsed}sec") 170 | print(f"{model}") 171 | 172 | if args.save_model: 173 | # use a barrier to make sure training is done on all ranks 174 | dist.barrier() 175 | # state_dict for FSDP model is only available on Nightlies for now 176 | states = model.state_dict() 177 | if rank == 0: 178 | torch.save(states, "mnist_cnn.pt") 179 | 180 | cleanup() 181 | 182 | 183 | 184 | if __name__ == '__main__': 185 | # Training settings 186 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 187 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 188 | help='input batch size for training (default: 64)') 189 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 190 | help='input batch size for testing (default: 1000)') 191 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 192 | help='number of epochs to train (default: 14)') 193 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 194 | help='learning rate (default: 1.0)') 195 | parser.add_argument('--gamma', type=float, default=0.7, metavar='M', 196 | help='Learning rate step gamma (default: 0.7)') 197 | parser.add_argument('--no-cuda', action='store_true', default=False, 198 | help='disables CUDA training') 199 | parser.add_argument('--seed', type=int, default=1, metavar='S', 200 | help='random seed (default: 1)') 201 | parser.add_argument('--save-model', action='store_true', default=False, 202 | help='For Saving the current Model') 203 | args = parser.parse_args() 204 | 205 | torch.manual_seed(args.seed) 206 | 207 | # WORLD_SIZE = torch.xpu.device_count() 208 | WORLD_SIZE = 2 209 | mp.spawn(fsdp_main, 210 | args=(WORLD_SIZE, args), 211 | nprocs=WORLD_SIZE, 212 | join=True) 213 | -------------------------------------------------------------------------------- /tools/setup/cmake.py: -------------------------------------------------------------------------------- 1 | "Manages CMake." 2 | import os 3 | import re 4 | import shutil 5 | from subprocess import check_call, check_output 6 | import sys 7 | import distutils 8 | import distutils.sysconfig 9 | from distutils.version import LooseVersion 10 | from setuptools import Extension 11 | from collections import defaultdict 12 | from .env import BUILD_DIR, check_env_flag 13 | # from .numpy_ import USE_NUMPY, NUMPY_INCLUDE_DIR 14 | 15 | 16 | def _mkdir_p(d): 17 | try: 18 | os.makedirs(d) 19 | except OSError: 20 | pass 21 | 22 | 23 | # Ninja 24 | # Use ninja if it is on the PATH. Previous version of PyTorch required the 25 | # ninja python package, but we no longer use it, so we do not have to import it 26 | # USE_NINJA = (not check_negative_env_flag('USE_NINJA') and 27 | # shutil.which('ninja') is not None) 28 | def convert_cmake_value_to_python_value(cmake_value, cmake_type): 29 | r"""Convert a CMake value in a string form to a Python value. 30 | 31 | Arguments: 32 | cmake_value (string): The CMake value in a string form (e.g., "ON", "OFF", "1"). 33 | cmake_type (string): The CMake type of :attr:`cmake_value`. 34 | 35 | Returns: 36 | A Python value corresponding to :attr:`cmake_value` with type :attr:`cmake_type`. 
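    Example (illustrative, matching the logic below):
        convert_cmake_value_to_python_value('ON', 'BOOL')                # -> True
        convert_cmake_value_to_python_value('foo-NOTFOUND', 'FILEPATH')  # -> None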
37 | """ 38 | 39 | cmake_type = cmake_type.upper() 40 | up_val = cmake_value.upper() 41 | if cmake_type == 'BOOL': 42 | # https://gitlab.kitware.com/cmake/community/wikis/doc/cmake/VariablesListsStrings#boolean-values-in-cmake 43 | return not (up_val in ('FALSE', 'OFF', 'N', 'NO', '0', '', 'NOTFOUND') or up_val.endswith('-NOTFOUND')) 44 | elif cmake_type == 'FILEPATH': 45 | if up_val.endswith('-NOTFOUND'): 46 | return None 47 | else: 48 | return cmake_value 49 | else: # Directly return the cmake_value. 50 | return cmake_value 51 | 52 | 53 | def get_cmake_cache_variables_from_file(cmake_cache_file): 54 | r"""Gets values in CMakeCache.txt into a dictionary. 55 | 56 | Arguments: 57 | cmake_cache_file: A CMakeCache.txt file object. 58 | Returns: 59 | dict: A ``dict`` containing the value of cached CMake variables. 60 | """ 61 | 62 | results = dict() 63 | for i, line in enumerate(cmake_cache_file, 1): 64 | line = line.strip() 65 | if not line or line.startswith(('#', '//')): 66 | # Blank or comment line, skip 67 | continue 68 | 69 | # Almost any character can be part of variable name and value. As a practical matter, we assume the type must be 70 | # valid if it were a C variable name. It should match the following kinds of strings: 71 | # 72 | # USE_CUDA:BOOL=ON 73 | # "USE_CUDA":BOOL=ON 74 | # USE_CUDA=ON 75 | # USE_CUDA:=ON 76 | # Intel(R) MKL-DNN_SOURCE_DIR:STATIC=/path/to/pytorch/third_party/ideep/mkl-dnn 77 | # "OpenMP_COMPILE_RESULT_CXX_openmp:experimental":INTERNAL=FALSE 78 | matched = re.match(r'("?)(.+?)\1(?::\s*([a-zA-Z_-][a-zA-Z0-9_-]*)?)?\s*=\s*(.*)', line) 79 | if matched is None: # Illegal line 80 | raise ValueError('Unexpected line {} in {}: {}'.format(i, repr(cmake_cache_file), line)) 81 | _, variable, type_, value = matched.groups() 82 | if type_ is None: 83 | type_ = '' 84 | if type_.upper() in ('INTERNAL', 'STATIC'): 85 | # CMake internal variable, do not touch 86 | continue 87 | results[variable] = convert_cmake_value_to_python_value(value, type_) 88 | 89 | return results 90 | 91 | 92 | class CMakeExtension(Extension): 93 | """CMake extension""" 94 | def __init__(self, name, cmake_file): 95 | super().__init__(name, []) 96 | self.build_dir = BUILD_DIR 97 | self.cmake_file = cmake_file 98 | self._cmake_command = CMakeExtension._get_cmake_command() 99 | self.debug = True 100 | self.cmake_dir = os.path.dirname(cmake_file) 101 | 102 | @staticmethod 103 | def _get_version(cmd): 104 | """Returns cmake version.""" 105 | 106 | for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): 107 | if 'version' in line: 108 | return LooseVersion(line.strip().split(' ')[2]) 109 | raise RuntimeError('no version found') 110 | 111 | @staticmethod 112 | def _get_cmake_command(): 113 | """Returns cmake command.""" 114 | 115 | cmake_command = shutil.which('cmake') 116 | cmake3 = shutil.which('cmake3') 117 | if cmake3 is not None: 118 | cmake = shutil.which('cmake') 119 | if cmake is not None: 120 | bare_version = CMakeExtension._get_version(cmake) 121 | if (bare_version < LooseVersion("3.5.0") and 122 | CMakeExtension._get_version(cmake3) > bare_version): 123 | cmake_command = 'cmake3' 124 | return cmake_command 125 | 126 | @staticmethod 127 | def defines(args, **kwargs): 128 | "Adds definitions to a cmake argument list." 
129 | for key, value in sorted(kwargs.items()): 130 | if value is not None: 131 | args.append('-D{}={}'.format(key, value)) 132 | 133 | @staticmethod 134 | def _cmake_value(value): 135 | if type(value) is str: 136 | if value.startswith(('OFF', '0', 'False', 'FALSE')): 137 | return False 138 | if value.startswith(('ON', '1', 'True', 'TRUE')): 139 | return True 140 | return value 141 | 142 | @staticmethod 143 | def extract(args): 144 | "Adds definitions to a cmake argument list." 145 | build_options = {} 146 | pat = re.compile(r'^-D(.*)=(.*)') 147 | for arg in args: 148 | match = pat.match(arg) 149 | 150 | build_options[match.group(1)] = CMakeExtension._cmake_value(match.group(2)) 151 | 152 | return build_options 153 | 154 | @staticmethod 155 | def convert_cmake_dirs(paths): 156 | def converttostr(input_seq, seperator): 157 | # Join all the strings in list 158 | final_str = seperator.join(input_seq) 159 | return final_str 160 | try: 161 | return converttostr(paths, ";") 162 | except: 163 | return paths 164 | 165 | @property 166 | def _cmake_cache_file(self): 167 | r"""Returns the path to CMakeCache.txt. 168 | 169 | Returns: 170 | string: The path to CMakeCache.txt. 171 | """ 172 | return os.path.join(self.build_dir, 'CMakeCache.txt') 173 | 174 | def _get_cmake_cache_variables(self): 175 | r"""Gets values in CMakeCache.txt into a dictionary. 176 | Returns: 177 | dict: A ``dict`` containing the value of cached CMake variables. 178 | """ 179 | with open(self._cmake_cache_file) as f: 180 | return get_cmake_cache_variables_from_file(f) 181 | 182 | def _run(self, args, env): 183 | """Executes cmake with arguments and an environment.""" 184 | command = [self._cmake_command] + args + [self.cmake_dir] 185 | print(' '.join(command)) 186 | check_call(command, cwd=self.build_dir, env=env) 187 | 188 | def generate(self, build_options, env, build_dir, install_dir): 189 | """Runs cmake to generate native build files.""" 190 | 191 | self.build_dir = build_dir 192 | 193 | cmake_args = [] 194 | 195 | for var, val in env.items(): 196 | if var.startswith(('BUILD_', 'USE_', 'CMAKE_')): 197 | # TODO: DO NOT OVERWRITE CMAKE_PREFIX_PATH 198 | if var.strip() == "CMAKE_PREFIX_PATH": 199 | build_options[var] += ";" + val 200 | else: 201 | build_options[var] = val 202 | 203 | if 'CMAKE_BUILD_TYPE' not in env: 204 | if check_env_flag('DEBUG', env=env): 205 | build_options['CMAKE_BUILD_TYPE'] = 'Debug' 206 | elif check_env_flag('REL_WITH_DEB_INFO', env=env): 207 | build_options['CMAKE_BUILD_TYPE'] = 'RelWithDebInfo' 208 | else: 209 | build_options['CMAKE_BUILD_TYPE'] = 'Release' 210 | build_options['CMAKE_INSTALL_PREFIX'] = install_dir 211 | 212 | CMakeExtension.defines(cmake_args, **build_options) 213 | if os.path.exists(self._cmake_cache_file): 214 | try: 215 | cmake_cache_vars = defaultdict(lambda: False, self._get_cmake_cache_variables()) 216 | except FileNotFoundError: 217 | # CMakeCache.txt does not exist. Probably running "python setup.py clean" over a clean directory. 218 | cmake_cache_vars = defaultdict(lambda: False) 219 | 220 | cache_build_options = CMakeExtension.extract(cmake_args) 221 | if all(option in cmake_cache_vars and 222 | CMakeExtension._cmake_value(cache_build_options[option]) == CMakeExtension._cmake_value(cmake_cache_vars[option]) 223 | for option in cache_build_options): 224 | # Everything's in place. Do not rerun. 
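                # (Any mismatch between requested and cached options falls through to self._run below.)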
225 | return 226 | self._run(cmake_args, env=env) 227 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #include "utils.h" 33 | 34 | namespace oneccl_bindings_for_pytorch { 35 | 36 | // Op mapping 37 | using c10d::ReduceOp; 38 | std::map cclOps = 39 | { 40 | {ReduceOp::MIN, ccl::reduction::min}, 41 | {ReduceOp::MAX, ccl::reduction::max}, 42 | {ReduceOp::SUM, ccl::reduction::sum}, 43 | {ReduceOp::PRODUCT, ccl::reduction::prod}, 44 | }; 45 | 46 | std::map cclDatatypes = 47 | { 48 | {at::kByte, ccl::datatype::uint8}, 49 | {at::kChar, ccl::datatype::int8}, 50 | {at::kShort, ccl::datatype::int16}, 51 | {at::kInt, ccl::datatype::int32}, 52 | {at::kLong, ccl::datatype::int64}, 53 | {at::kHalf, ccl::datatype::float16}, 54 | {at::kFloat, ccl::datatype::float32}, 55 | {at::kDouble, ccl::datatype::float64}, 56 | {at::kBFloat16, ccl::datatype::bfloat16}, 57 | {at::kBool, ccl::datatype::uint8}, 58 | }; 59 | 60 | // Get the key from the list of devices 61 | std::string get_key_from_devs(const std::vector& devices) { 62 | std::string key = DeviceTypeName(devices[0].type(), /* lower case */ true) + ":"; 63 | for (auto& device : devices) { 64 | key.append(std::to_string(device.index()) + ","); 65 | } 66 | return key; 67 | } 68 | 69 | // Get the list of devices from list of tensors 70 | std::vector get_device_list(const std::vector& tensors) { 71 | std::vector res; 72 | res.reserve(tensors.size()); 73 | for (auto& tensor : tensors) { 74 | // Tensors must all be on the same device, or all on distinct devices. 
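  // Note: only devices equal to the first recorded device (res[0]) are collapsed by the check below.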
75 | if (res.size() == 0 || tensor.device() != res[0]) { 76 | res.push_back(tensor.device()); 77 | } 78 | } 79 | return res; 80 | } 81 | 82 | std::vector get_device_list(const std::vector >& tensors) { 83 | std::vector res; 84 | res.reserve(tensors.size()); 85 | for (auto& tensor : tensors) { 86 | res.push_back(tensor[0].device()); 87 | } 88 | return res; 89 | } 90 | 91 | bool check_same_size(const std::vector& tensors) { 92 | for (const auto& tensor : tensors) { 93 | if (!tensors[0].is_same_size(tensor)) { 94 | return false; 95 | } 96 | } 97 | return true; 98 | } 99 | 100 | std::vector flatten_tensor_lists(std::vector>& tensor_lists, std::vector& other, size_t world_size) { 101 | if (tensor_lists.size() != other.size()) { 102 | TORCH_CHECK( 103 | false, 104 | "Tensor list operands to scatter/gather must have the same length"); 105 | } 106 | const auto num_devices = tensor_lists.size(); 107 | 108 | std::vector flattened; 109 | flattened.resize(num_devices); 110 | 111 | for (const auto i : c10::irange(size_t{}, num_devices)) { 112 | if (tensor_lists[i].size() != world_size * num_devices) { 113 | TORCH_CHECK( 114 | false, 115 | c10::str( 116 | "Tensor list input to scatter/gather must match number of collective participants ", 117 | "but got ", 118 | tensor_lists[i].size(), 119 | " inputs", 120 | " with world_size ", 121 | world_size, 122 | " and ", 123 | num_devices, 124 | " devices.")); 125 | } 126 | 127 | // Only check device match for the first tensor in the list; the call to 128 | // newLikeFlat() below will check the rest. 129 | if (tensor_lists[i].front().get_device() != other[i].get_device()) { 130 | TORCH_CHECK( 131 | false, 132 | "Corresponding input/output tensors to scatter/gather must all reside" 133 | " on the same device"); 134 | } 135 | 136 | for (const auto& t : tensor_lists[i]) { 137 | if (t.numel() != other[i].numel()) { 138 | TORCH_CHECK( 139 | false, 140 | "All tensor operands to scatter/gather must have the same number of elements"); 141 | } 142 | } 143 | // Flatten the tensors (from all ranks) into a single big tensor. 144 | flattened[i] = c10d::newLikeFlat(tensor_lists, i); 145 | } 146 | return flattened; 147 | } 148 | 149 | std::string get_key_send_recv(int myRank, int peer) { 150 | int lowRank = myRank < peer ? myRank : peer; 151 | int highRank = myRank < peer ? 
peer : myRank; 152 | std::string sendRecvPair = 153 | std::to_string(lowRank) + ":" + std::to_string(highRank); 154 | return sendRecvPair; 155 | } 156 | 157 | FlatCheckResult computeLengthsAndCheckFlat( 158 | const std::vector& tensors, 159 | std::vector& lengths) 160 | { 161 | int64_t groupSize = lengths.size(); 162 | auto firstTensor = tensors[0]; 163 | int64_t offset = 0; 164 | auto firstLength = firstTensor.numel(); 165 | auto storage = firstTensor.storage(); 166 | auto firstStorageOffset = firstTensor.storage_offset(); 167 | bool isFlat = true; 168 | 169 | for (int i = 0; i < groupSize; i++) 170 | { 171 | auto& curTensor = tensors[i]; 172 | int64_t length = curTensor.numel(); 173 | 174 | if (firstLength == 0 && length != 0) 175 | { 176 | firstLength = length; 177 | firstTensor = curTensor; 178 | storage = curTensor.storage(); 179 | firstStorageOffset = curTensor.storage_offset(); 180 | } 181 | 182 | lengths[i] = length; 183 | 184 | if (isFlat && (length != 0 || firstLength != 0) && 185 | (!storage.is_alias_of(curTensor.storage()) || 186 | curTensor.storage_offset() != firstStorageOffset + offset)) 187 | isFlat = false; 188 | 189 | offset += length; 190 | } 191 | 192 | return FlatCheckResult{isFlat, offset, firstTensor}; 193 | } 194 | 195 | bool computeLengthsAndCheckAndGetFlat( 196 | const std::vector& tensors, 197 | std::vector& lengths, 198 | at::Tensor& flatTensor, 199 | int64_t& flatLength) 200 | { 201 | auto flatRes = computeLengthsAndCheckFlat(tensors, lengths); 202 | 203 | flatLength = flatRes.size; 204 | 205 | if (flatRes.isFlat) 206 | { 207 | flatTensor = flatRes.firstTensor; 208 | } 209 | else 210 | { 211 | flatTensor = at::empty({flatRes.size}, flatRes.firstTensor.options()); 212 | } 213 | 214 | return flatRes.isFlat; 215 | } 216 | 217 | void checkSingleTensorHelper(const at::Tensor& tensor) 218 | { 219 | TORCH_CHECK(tensor.is_sparse() || tensor.is_contiguous(tensor.suggest_memory_format()), "input dense tensor has to be contiguous"); 220 | TORCH_CHECK(!tensor.is_cuda(), "CUDA tensor detected and CCL doesn't support CUDA buffers"); 221 | TORCH_CHECK(tensor.numel() >= 0, "input tensor numel should be non-negative"); 222 | } 223 | 224 | void checkSingleTensor(const std::vector& tensors) 225 | { 226 | TORCH_CHECK(tensors.size() == 1, 227 | "CCL process group does not support tensors count " + std::to_string(tensors.size())); 228 | 229 | checkSingleTensorHelper(tensors[0]); 230 | } 231 | 232 | 233 | void checkSameType(const at::Tensor& tensor, 234 | const std::vector& tensors) 235 | { 236 | for (size_t i = 0; i < tensors.size(); ++i) 237 | { 238 | TORCH_CHECK(tensors[i].scalar_type() == tensor.scalar_type(), 239 | "Tensors are not equal in data type"); 240 | TORCH_CHECK(tensors[i].device().type() == tensor.device().type(), 241 | "Tensors are not in same device type. 
Expect: ", tensor.device().type(), 242 | " But got: ", tensors[i].device().type()); 243 | 244 | checkSingleTensorHelper(tensors[i]); 245 | } 246 | } 247 | 248 | void checkSameType(const at::Tensor& tensor, 249 | const std::vector>& tensors) 250 | { 251 | for (size_t i = 0; i < tensors.size(); ++i) 252 | { 253 | checkSameType(tensor, tensors[i]); 254 | } 255 | } 256 | 257 | } 258 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # DEBUG build with debug 2 | # 3 | # USE_SYSTEM_ONECCL=0 4 | # disables use of system-wide oneCCL (we will use our submoduled 5 | # copy in third_party/oneCCL) 6 | 7 | import os 8 | import sys 9 | import pathlib 10 | import shutil 11 | from subprocess import check_call, check_output 12 | 13 | import torch 14 | from torch.utils.cpp_extension import BuildExtension, CppExtension, library_paths 15 | from setuptools import setup 16 | from distutils.command.clean import clean 17 | from tools.setup.cmake import CMakeExtension 18 | from tools.setup.env import get_compiler 19 | 20 | # Constant known variables used throughout this file 21 | CWD = os.path.dirname(os.path.abspath(__file__)) 22 | ONECCL_BINDINGS_FOR_PYTORCH_PATH = os.path.join(CWD, "oneccl_bindings_for_pytorch") 23 | 24 | 25 | def _check_env_flag(name, default=''): 26 | return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] 27 | 28 | 29 | def check_file(f): 30 | if not os.path.exists(f): 31 | print("Could not find {}".format(f)) 32 | print("Did you run 'git submodule update --init --recursive'?") 33 | sys.exit(1) 34 | 35 | 36 | # all the work we need to do _before_ setup runs 37 | def create_version(): 38 | """Create the version string for torch-ccl""" 39 | package_name = os.getenv('CCL_PACKAGE_NAME', 'oneccl-bind-pt') 40 | version = open('version.txt', 'r').read().strip() 41 | sha = 'Unknown' 42 | 43 | try: 44 | sha = check_output(['git', 'rev-parse', 'HEAD'], cwd=CWD).decode('ascii').strip() 45 | except Exception: 46 | pass 47 | 48 | if os.getenv('CCL_SHA_VERSION', False): 49 | if sha != 'Unknown': 50 | version += '+' + sha[:7] 51 | 52 | if os.environ.get("COMPUTE_BACKEND") == "dpcpp": 53 | backend = "gpu" 54 | else: 55 | backend = os.environ.get("ONECCL_BINDINGS_FOR_PYTORCH_BACKEND", "cpu") 56 | 57 | if "+" not in version: 58 | version += '+' + backend 59 | 60 | print("Building {}-{}".format(package_name, version)) 61 | 62 | version_path = os.path.join(CWD, 'oneccl_bindings_for_pytorch', 'version.py') 63 | with open(version_path, 'w') as f: 64 | f.write("__version__ = '{}'\n".format(version)) 65 | f.write("git_version = {}\n".format(repr(sha))) 66 | 67 | return version, package_name 68 | 69 | 70 | class BuildCMakeExt(BuildExtension): 71 | """ 72 | Builds using cmake instead of the python setuptools implicit build 73 | """ 74 | 75 | def run(self): 76 | """ 77 | Perform build_cmake before doing the 'normal' stuff 78 | """ 79 | cmake_extensions = [ext for ext in self.extensions if isinstance(ext, CMakeExtension)] 80 | for ext in cmake_extensions: 81 | self.build_cmake(ext) 82 | 83 | self.extensions = [ext for ext in self.extensions if not isinstance(ext, CMakeExtension)] 84 | super(BuildCMakeExt, self).run() 85 | build_py = self.get_finalized_command('build_py') 86 | build_py.data_files = build_py._get_data_files() 87 | build_py.run() 88 | 89 | def build_cmake(self, extension: CMakeExtension): 90 | """ 91 | The steps required to build the extension 92 | """ 93 | 
build_dir = pathlib.Path('.'.join([self.build_temp, extension.name])) 94 | 95 | build_dir.mkdir(parents=True, exist_ok=True) 96 | install_dir = ONECCL_BINDINGS_FOR_PYTORCH_PATH 97 | 98 | # Now that the necessary directories are created, build 99 | my_env = os.environ.copy() 100 | my_env["CMAKE_DISABLE_FIND_PACKAGE_MKL"] = "TRUE" 101 | build_type = 'Release' 102 | 103 | if _check_env_flag('DEBUG'): 104 | build_type = 'Debug' 105 | 106 | build_options = { 107 | 'CMAKE_BUILD_TYPE': build_type, 108 | # The value cannot be easily obtained in CMakeLists.txt. 109 | 'CMAKE_PREFIX_PATH': torch.utils.cmake_prefix_path, 110 | # skip the example and test code in oneCCL 111 | 'BUILD_EXAMPLES': 'OFF', 112 | 'BUILD_CONFIG': 'OFF', 113 | 'BUILD_FT': 'OFF' 114 | } 115 | 116 | compute_backend = os.getenv('COMPUTE_BACKEND', 'n/a') 117 | runtime = 'gcc' 118 | if compute_backend == 'dpcpp': 119 | runtime = 'dpcpp' 120 | build_options['COMPUTE_BACKEND'] = compute_backend 121 | if "DPCPP_GCC_INSTALL_DIR" in my_env: 122 | exist_cflags = "CFLAGS" in my_env 123 | cflags = "" 124 | if exist_cflags: 125 | cflags = my_env["CFLAGS"] 126 | my_env["CFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} {cflags}" 127 | exist_cxxflags = "CXXFLAGS" in my_env 128 | cxxflags = "" 129 | if exist_cxxflags: 130 | cxxflags = my_env["CXXFLAGS"] 131 | my_env["CXXFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} {cxxflags}" 132 | exist_ldflags = "LDFLAGS" in my_env 133 | ldflags = "" 134 | if exist_ldflags: 135 | ldflags = my_env["LDFLAGS"] 136 | my_env["LDFLAGS"] = f"--gcc-install-dir={my_env['DPCPP_GCC_INSTALL_DIR']} -fuse-ld=lld -lrt -lpthread {ldflags}" 137 | 138 | cc, cxx = get_compiler(runtime) 139 | build_options['CMAKE_C_COMPILER'] = cc 140 | build_options['CMAKE_CXX_COMPILER'] = cxx 141 | 142 | extension.generate(build_options, my_env, build_dir, install_dir) 143 | 144 | if compute_backend == 'dpcpp': 145 | if "DPCPP_GCC_INSTALL_DIR" in my_env: 146 | if exist_cflags: 147 | my_env["CFLAGS"] = cflags 148 | else: 149 | del my_env["CFLAGS"] 150 | if exist_cxxflags: 151 | my_env["CXXFLAGS"] = cxxflags 152 | else: 153 | del my_env["CXXFLAGS"] 154 | if exist_ldflags: 155 | my_env["LDFLAGS"] = ldflags 156 | else: 157 | del my_env["LDFLAGS"] 158 | 159 | build_args = ['-j', str(os.cpu_count())] 160 | check_call(['make', 'oneccl_bindings_for_pytorch'] + build_args, cwd=str(build_dir)) 161 | if compute_backend == 'dpcpp': 162 | check_call(['make', 'oneccl_bindings_for_pytorch_xpu'] + build_args, cwd=str(build_dir)) 163 | check_call(['make', 'install'], cwd=str(build_dir)) 164 | 165 | 166 | class Clean(clean): 167 | def run(self): 168 | import glob 169 | import re 170 | 171 | with open('.gitignore', 'r') as f: 172 | ignores = f.read() 173 | pat = re.compile(r'^#( BEGIN NOT-CLEAN-FILES )?') 174 | for wildcard in filter(None, ignores.split('\n')): 175 | match = pat.match(wildcard) 176 | if match: 177 | if match.group(1): 178 | # Marker is found and stop reading .gitignore. 179 | break 180 | # Ignore lines which begin with '#'. 
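                    # Non-marker comment lines are skipped; any other pattern is globbed and removed below.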
181 | else: 182 | for filename in glob.glob(wildcard): 183 | try: 184 | os.remove(filename) 185 | except OSError: 186 | shutil.rmtree(filename, ignore_errors=True) 187 | 188 | clean.run(self) 189 | 190 | 191 | def get_python_c_module(): 192 | main_compile_args = [] 193 | main_libraries = ['oneccl_bindings_for_pytorch'] 194 | main_link_args = [] 195 | main_sources = ["oneccl_bindings_for_pytorch/csrc/_C.cpp", "oneccl_bindings_for_pytorch/csrc/init.cpp"] 196 | lib_path = os.path.join(ONECCL_BINDINGS_FOR_PYTORCH_PATH, "lib") 197 | library_dirs = [lib_path] 198 | include_path = os.path.join(CWD, "src") 199 | include_dirs = [include_path] 200 | extra_link_args = [] 201 | extra_compile_args = [ 202 | '-Wall', 203 | '-Wextra', 204 | '-Wno-strict-overflow', 205 | '-Wno-unused-parameter', 206 | '-Wno-missing-field-initializers', 207 | '-Wno-write-strings', 208 | '-Wno-unknown-pragmas', 209 | # This is required for Python 2 declarations that are deprecated in 3. 210 | '-Wno-deprecated-declarations', 211 | # Python 2.6 requires -fno-strict-aliasing, see 212 | # http://legacy.python.org/dev/peps/pep-3123/ 213 | # We also depend on it in our code (even Python 3). 214 | '-fno-strict-aliasing', 215 | # Clang has an unfixed bug leading to spurious missing 216 | # braces warnings, see 217 | # https://bugs.llvm.org/show_bug.cgi?id=21629 218 | '-Wno-missing-braces', 219 | ] 220 | 221 | def make_relative_rpath(path): 222 | ret = [] 223 | ret.append('-Wl,-rpath,$ORIGIN/' + path) 224 | if os.getenv('COMPUTE_BACKEND', 'n/a') == 'dpcpp': 225 | ret.append('-Wl,-rpath,$ORIGIN/../../../') 226 | ret.append('-Wl,--disable-new-dtags') 227 | return ret 228 | 229 | _c_module = CppExtension("oneccl_bindings_for_pytorch._C", 230 | libraries=main_libraries, 231 | sources=main_sources, 232 | language='c', 233 | extra_compile_args=main_compile_args + extra_compile_args, 234 | include_dirs=include_dirs, 235 | library_dirs=library_dirs, 236 | extra_link_args=extra_link_args + main_link_args + make_relative_rpath('lib')) 237 | 238 | return _c_module 239 | 240 | 241 | if __name__ == '__main__': 242 | version, package_name = create_version() 243 | c_module = get_python_c_module() 244 | cmake_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "CMakeLists.txt") 245 | modules = [CMakeExtension("liboneccl_bindings_for_pytorch", cmake_file), c_module] 246 | setup( 247 | name=package_name, 248 | version=version, 249 | ext_modules=modules, 250 | packages=['oneccl_bindings_for_pytorch'], 251 | package_data={ 252 | 'oneccl_bindings_for_pytorch': [ 253 | '*.py', 254 | '*/*.h', 255 | '*/*.hpp', 256 | 'lib/*.so*', 257 | 'opt/mpi/lib/*.so*', 258 | 'bin/*', 259 | 'opt/mpi/bin/*', 260 | 'env/*', 261 | 'etc/*', 262 | 'opt/mpi/etc/*', 263 | 'examples/*', 264 | 'include/native_device_api/*.h*', 265 | 'include/native_device_api/l0/*.h*', 266 | 'include/*.h*', 267 | 'opt/mpi/include/*.h*', 268 | 'lib/lib*', 269 | 'opt/mpi/libfabric/lib/lib*', 270 | 'lib/prov/lib*', 271 | 'lib/ccl/kernels/*', 272 | 'opt/mpi/libfabric/lib/prov/lib*', 273 | 'licensing/*', 274 | 'modulefiles/*', 275 | ]}, 276 | cmdclass={ 277 | 'build_ext': BuildCMakeExt, 278 | 'clean': Clean, 279 | } 280 | ) 281 | -------------------------------------------------------------------------------- /tests/DeepSpeed_test/DeepSpeed.csv: -------------------------------------------------------------------------------- 1 | allreduce,1,-1 2 | broadcast,154533888,0 3 | broadcast,6291456,0 4 | broadcast,3072,0 5 | broadcast,3072,0 6 | broadcast,28311552,0 7 | broadcast,9216,0 8 | 
broadcast,9437184,0 9 | broadcast,3072,0 10 | broadcast,3072,0 11 | broadcast,3072,0 12 | broadcast,37748736,0 13 | broadcast,12288,0 14 | broadcast,37748736,0 15 | broadcast,3072,0 16 | broadcast,3072,0 17 | broadcast,3072,0 18 | broadcast,28311552,0 19 | broadcast,9216,0 20 | broadcast,9437184,0 21 | broadcast,3072,0 22 | broadcast,3072,0 23 | broadcast,3072,0 24 | broadcast,37748736,0 25 | broadcast,12288,0 26 | broadcast,37748736,0 27 | broadcast,3072,0 28 | broadcast,3072,0 29 | broadcast,3072,0 30 | broadcast,28311552,0 31 | broadcast,9216,0 32 | broadcast,9437184,0 33 | broadcast,3072,0 34 | broadcast,3072,0 35 | broadcast,3072,0 36 | broadcast,37748736,0 37 | broadcast,12288,0 38 | broadcast,37748736,0 39 | broadcast,3072,0 40 | broadcast,3072,0 41 | broadcast,3072,0 42 | broadcast,28311552,0 43 | broadcast,9216,0 44 | broadcast,9437184,0 45 | broadcast,3072,0 46 | broadcast,3072,0 47 | broadcast,3072,0 48 | broadcast,37748736,0 49 | broadcast,12288,0 50 | broadcast,37748736,0 51 | broadcast,3072,0 52 | broadcast,3072,0 53 | broadcast,3072,0 54 | broadcast,28311552,0 55 | broadcast,9216,0 56 | broadcast,9437184,0 57 | broadcast,3072,0 58 | broadcast,3072,0 59 | broadcast,3072,0 60 | broadcast,37748736,0 61 | broadcast,12288,0 62 | broadcast,37748736,0 63 | broadcast,3072,0 64 | broadcast,3072,0 65 | broadcast,3072,0 66 | broadcast,28311552,0 67 | broadcast,9216,0 68 | broadcast,9437184,0 69 | broadcast,3072,0 70 | broadcast,3072,0 71 | broadcast,3072,0 72 | broadcast,37748736,0 73 | broadcast,12288,0 74 | broadcast,37748736,0 75 | broadcast,3072,0 76 | broadcast,3072,0 77 | broadcast,3072,0 78 | broadcast,28311552,0 79 | broadcast,9216,0 80 | broadcast,9437184,0 81 | broadcast,3072,0 82 | broadcast,3072,0 83 | broadcast,3072,0 84 | broadcast,37748736,0 85 | broadcast,12288,0 86 | broadcast,37748736,0 87 | broadcast,3072,0 88 | broadcast,3072,0 89 | broadcast,3072,0 90 | broadcast,28311552,0 91 | broadcast,9216,0 92 | broadcast,9437184,0 93 | broadcast,3072,0 94 | broadcast,3072,0 95 | broadcast,3072,0 96 | broadcast,37748736,0 97 | broadcast,12288,0 98 | broadcast,37748736,0 99 | broadcast,3072,0 100 | broadcast,3072,0 101 | broadcast,3072,0 102 | broadcast,28311552,0 103 | broadcast,9216,0 104 | broadcast,9437184,0 105 | broadcast,3072,0 106 | broadcast,3072,0 107 | broadcast,3072,0 108 | broadcast,37748736,0 109 | broadcast,12288,0 110 | broadcast,37748736,0 111 | broadcast,3072,0 112 | broadcast,3072,0 113 | broadcast,3072,0 114 | broadcast,28311552,0 115 | broadcast,9216,0 116 | broadcast,9437184,0 117 | broadcast,3072,0 118 | broadcast,3072,0 119 | broadcast,3072,0 120 | broadcast,37748736,0 121 | broadcast,12288,0 122 | broadcast,37748736,0 123 | broadcast,3072,0 124 | broadcast,3072,0 125 | broadcast,3072,0 126 | broadcast,28311552,0 127 | broadcast,9216,0 128 | broadcast,9437184,0 129 | broadcast,3072,0 130 | broadcast,3072,0 131 | broadcast,3072,0 132 | broadcast,37748736,0 133 | broadcast,12288,0 134 | broadcast,37748736,0 135 | broadcast,3072,0 136 | broadcast,3072,0 137 | broadcast,3072,0 138 | broadcast,28311552,0 139 | broadcast,9216,0 140 | broadcast,9437184,0 141 | broadcast,3072,0 142 | broadcast,3072,0 143 | broadcast,3072,0 144 | broadcast,37748736,0 145 | broadcast,12288,0 146 | broadcast,37748736,0 147 | broadcast,3072,0 148 | broadcast,3072,0 149 | broadcast,3072,0 150 | broadcast,28311552,0 151 | broadcast,9216,0 152 | broadcast,9437184,0 153 | broadcast,3072,0 154 | broadcast,3072,0 155 | broadcast,3072,0 156 | broadcast,37748736,0 157 | broadcast,12288,0 
158 | broadcast,37748736,0 159 | broadcast,3072,0 160 | broadcast,3072,0 161 | broadcast,3072,0 162 | broadcast,28311552,0 163 | broadcast,9216,0 164 | broadcast,9437184,0 165 | broadcast,3072,0 166 | broadcast,3072,0 167 | broadcast,3072,0 168 | broadcast,37748736,0 169 | broadcast,12288,0 170 | broadcast,37748736,0 171 | broadcast,3072,0 172 | broadcast,3072,0 173 | broadcast,3072,0 174 | broadcast,28311552,0 175 | broadcast,9216,0 176 | broadcast,9437184,0 177 | broadcast,3072,0 178 | broadcast,3072,0 179 | broadcast,3072,0 180 | broadcast,37748736,0 181 | broadcast,12288,0 182 | broadcast,37748736,0 183 | broadcast,3072,0 184 | broadcast,3072,0 185 | broadcast,3072,0 186 | broadcast,28311552,0 187 | broadcast,9216,0 188 | broadcast,9437184,0 189 | broadcast,3072,0 190 | broadcast,3072,0 191 | broadcast,3072,0 192 | broadcast,37748736,0 193 | broadcast,12288,0 194 | broadcast,37748736,0 195 | broadcast,3072,0 196 | broadcast,3072,0 197 | broadcast,3072,0 198 | broadcast,28311552,0 199 | broadcast,9216,0 200 | broadcast,9437184,0 201 | broadcast,3072,0 202 | broadcast,3072,0 203 | broadcast,3072,0 204 | broadcast,37748736,0 205 | broadcast,12288,0 206 | broadcast,37748736,0 207 | broadcast,3072,0 208 | broadcast,3072,0 209 | broadcast,3072,0 210 | broadcast,28311552,0 211 | broadcast,9216,0 212 | broadcast,9437184,0 213 | broadcast,3072,0 214 | broadcast,3072,0 215 | broadcast,3072,0 216 | broadcast,37748736,0 217 | broadcast,12288,0 218 | broadcast,37748736,0 219 | broadcast,3072,0 220 | broadcast,3072,0 221 | broadcast,3072,0 222 | broadcast,28311552,0 223 | broadcast,9216,0 224 | broadcast,9437184,0 225 | broadcast,3072,0 226 | broadcast,3072,0 227 | broadcast,3072,0 228 | broadcast,37748736,0 229 | broadcast,12288,0 230 | broadcast,37748736,0 231 | broadcast,3072,0 232 | broadcast,3072,0 233 | broadcast,3072,0 234 | broadcast,28311552,0 235 | broadcast,9216,0 236 | broadcast,9437184,0 237 | broadcast,3072,0 238 | broadcast,3072,0 239 | broadcast,3072,0 240 | broadcast,37748736,0 241 | broadcast,12288,0 242 | broadcast,37748736,0 243 | broadcast,3072,0 244 | broadcast,3072,0 245 | broadcast,3072,0 246 | broadcast,28311552,0 247 | broadcast,9216,0 248 | broadcast,9437184,0 249 | broadcast,3072,0 250 | broadcast,3072,0 251 | broadcast,3072,0 252 | broadcast,37748736,0 253 | broadcast,12288,0 254 | broadcast,37748736,0 255 | broadcast,3072,0 256 | broadcast,3072,0 257 | broadcast,3072,0 258 | broadcast,28311552,0 259 | broadcast,9216,0 260 | broadcast,9437184,0 261 | broadcast,3072,0 262 | broadcast,3072,0 263 | broadcast,3072,0 264 | broadcast,37748736,0 265 | broadcast,12288,0 266 | broadcast,37748736,0 267 | broadcast,3072,0 268 | broadcast,3072,0 269 | broadcast,3072,0 270 | broadcast,28311552,0 271 | broadcast,9216,0 272 | broadcast,9437184,0 273 | broadcast,3072,0 274 | broadcast,3072,0 275 | broadcast,3072,0 276 | broadcast,37748736,0 277 | broadcast,12288,0 278 | broadcast,37748736,0 279 | broadcast,3072,0 280 | broadcast,3072,0 281 | broadcast,3072,0 282 | broadcast,28311552,0 283 | broadcast,9216,0 284 | broadcast,9437184,0 285 | broadcast,3072,0 286 | broadcast,3072,0 287 | broadcast,3072,0 288 | broadcast,37748736,0 289 | broadcast,12288,0 290 | broadcast,37748736,0 291 | broadcast,3072,0 292 | broadcast,3072,0 293 | broadcast,3072,0 294 | broadcast,28311552,0 295 | broadcast,9216,0 296 | broadcast,9437184,0 297 | broadcast,3072,0 298 | broadcast,3072,0 299 | broadcast,3072,0 300 | broadcast,37748736,0 301 | broadcast,12288,0 302 | broadcast,37748736,0 303 | broadcast,3072,0 
304 | broadcast,3072,0 305 | broadcast,3072,0 306 | broadcast,28311552,0 307 | broadcast,9216,0 308 | broadcast,9437184,0 309 | broadcast,3072,0 310 | broadcast,3072,0 311 | broadcast,3072,0 312 | broadcast,37748736,0 313 | broadcast,12288,0 314 | broadcast,37748736,0 315 | broadcast,3072,0 316 | broadcast,3072,0 317 | broadcast,3072,0 318 | broadcast,28311552,0 319 | broadcast,9216,0 320 | broadcast,9437184,0 321 | broadcast,3072,0 322 | broadcast,3072,0 323 | broadcast,3072,0 324 | broadcast,37748736,0 325 | broadcast,12288,0 326 | broadcast,37748736,0 327 | broadcast,3072,0 328 | broadcast,3072,0 329 | broadcast,3072,0 330 | broadcast,28311552,0 331 | broadcast,9216,0 332 | broadcast,9437184,0 333 | broadcast,3072,0 334 | broadcast,3072,0 335 | broadcast,3072,0 336 | broadcast,37748736,0 337 | broadcast,12288,0 338 | broadcast,37748736,0 339 | broadcast,3072,0 340 | broadcast,3072,0 341 | broadcast,3072,0 342 | broadcast,28311552,0 343 | broadcast,9216,0 344 | broadcast,9437184,0 345 | broadcast,3072,0 346 | broadcast,3072,0 347 | broadcast,3072,0 348 | broadcast,37748736,0 349 | broadcast,12288,0 350 | broadcast,37748736,0 351 | broadcast,3072,0 352 | broadcast,3072,0 353 | broadcast,3072,0 354 | broadcast,28311552,0 355 | broadcast,9216,0 356 | broadcast,9437184,0 357 | broadcast,3072,0 358 | broadcast,3072,0 359 | broadcast,3072,0 360 | broadcast,37748736,0 361 | broadcast,12288,0 362 | broadcast,37748736,0 363 | broadcast,3072,0 364 | broadcast,3072,0 365 | broadcast,3072,0 366 | allreduce,1,-1 367 | allreduce,1,-1 368 | broadcast,3,0 369 | broadcast,5,0 370 | broadcast,16392,0 371 | allreduce,16384,-1 372 | allreduce,16384,-1 373 | allreduce,16384,-1 374 | allreduce,1,-1 375 | reduce,264330240,0 376 | reduce,1024,0 377 | reduce,11264,0 378 | reduce,5472256,0 379 | reduce,32276480,0 380 | reduce,188823552,0 381 | reduce,75515904,0 382 | reduce,5120,0 383 | reduce,1024,0 384 | reduce,10947584,0 385 | reduce,26804224,0 386 | reduce,264333312,0 387 | reduce,9216,0 388 | reduce,9437184,0 389 | reduce,16425984,0 390 | reduce,11894784,0 391 | reduce,75528192,0 392 | reduce,151044096,0 393 | reduce,4096,0 394 | reduce,2048,0 395 | reduce,3072,0 396 | reduce,37748736,0 397 | reduce,21901312,0 398 | reduce,15859712,0 399 | reduce,226572288,0 400 | reduce,5120,0 401 | reduce,1024,0 402 | reduce,3072,0 403 | reduce,9437184,0 404 | reduce,9216,0 405 | reduce,28311552,0 406 | reduce,9216,0 407 | reduce,27361280,0 408 | reduce,10387456,0 409 | reduce,188814336,0 410 | reduce,3072,0 411 | reduce,3072,0 412 | reduce,3072,0 413 | reduce,37748736,0 414 | reduce,12288,0 415 | reduce,37748736,0 416 | reduce,4531200,0 417 | reduce,4915200,0 418 | reduce,179377152,0 419 | reduce,37748736,0 420 | reduce,4096,0 421 | reduce,2048,0 422 | reduce,3072,0 423 | reduce,9437184,0 424 | reduce,9216,0 425 | reduce,28311552,0 426 | reduce,566272,0 427 | reduce,37191680,0 428 | reduce,188814336,0 429 | reduce,2048,0 430 | reduce,4096,0 431 | reduce,3072,0 432 | reduce,37748736,0 433 | reduce,12288,0 434 | reduce,37748736,0 435 | reduce,9216,0 436 | reduce,9437184,0 437 | reduce,6038528,0 438 | reduce,22282240,0 439 | reduce,75528192,0 440 | reduce,113276928,0 441 | reduce,3072,0 442 | reduce,3072,0 443 | reduce,3072,0 444 | reduce,9437184,0 445 | reduce,9216,0 446 | reduce,28311552,0 447 | reduce,9216,0 448 | reduce,37748736,0 449 | reduce,11513856,0 450 | reduce,26247168,0 451 | reduce,151053312,0 452 | reduce,1024,0 453 | reduce,5120,0 454 | reduce,3072,0 455 | reduce,37748736,0 456 | reduce,12288,0 457 | 
reduce,37748736,0 458 | reduce,9216,0 459 | reduce,9437184,0 460 | reduce,9216,0 461 | reduce,28311552,0 462 | reduce,9216,0 463 | reduce,16973824,0 464 | reduce,20774912,0 465 | reduce,151047168,0 466 | reduce,2048,0 467 | reduce,4096,0 468 | reduce,3072,0 469 | reduce,9437184,0 470 | reduce,9216,0 471 | reduce,28311552,0 472 | reduce,9216,0 473 | reduce,37748736,0 474 | reduce,12288,0 475 | reduce,37748736,0 476 | reduce,9216,0 477 | reduce,9437184,0 478 | reduce,22455296,0 479 | reduce,5865472,0 480 | reduce,119583744,0 481 | reduce,154533888,0 482 | allreduce,1,-1 483 | allreduce,1,-1 484 | allreduce,1,-1 485 | allgather,42359660,-1 486 | allgather,42359660,-1 487 | allgather,42359660,-1 488 | allgather,42359660,-1 489 | allgather,42359660,-1 490 | allgather,42359660,-1 491 | allgather,42359672,-1 492 | allgather,100352,-1 493 | -------------------------------------------------------------------------------- /src/ProcessGroupCCL.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, Intel Corporation 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * 11 | * 2. Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * 3. Neither the name of the Intel Corporation nor the names of its contributors 16 | * may be used to endorse or promote products derived from this software 17 | * without specific prior written permission. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | * POSSIBILITY OF SUCH DAMAGE. 
30 | */ 31 | 32 | #pragma once 33 | 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include 41 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 42 | #if TORCH_VERSION_MAJOR > 1 43 | #include 44 | #include 45 | #else 46 | #include 47 | #endif 48 | #include 49 | #include 50 | #include 51 | #include 52 | #else 53 | #include 54 | #include 55 | #include 56 | #include 57 | #endif 58 | 59 | 60 | namespace oneccl_bindings_for_pytorch { 61 | struct CCLCommCollector; 62 | 63 | static inline void format_tensors_param(std::vector& param, const at::Tensor& tensor) { 64 | param.emplace_back(tensor); 65 | } 66 | 67 | template 68 | static inline void format_tensors_param(std::vector& param, const std::vector& vec) { 69 | for (const auto& elem : vec) { 70 | format_tensors_param(param, elem); 71 | } 72 | } 73 | } 74 | 75 | namespace c10d { 76 | 77 | #if TORCH_VERSION_MAJOR > 1 || TORCH_VERSION_MINOR >= 13 78 | using C10D_Work = c10d::Work; 79 | #else 80 | using C10D_Work = c10d::ProcessGroup::Work; 81 | #endif 82 | 83 | // WorkCCL is the state associated with a CCL operarion. 84 | // 85 | // ProcessGroupCCL implements CCL bindings for c10d. 86 | // 87 | // All functions on this class are expected to be called in the same 88 | // order across processes in the group. 89 | // 90 | // All collective functions provided by this class are scheduled 91 | // for asynchronous execution by CCL. 92 | constexpr const char* CCL_BACKEND_NAME = "ccl"; 93 | 94 | // Environment variable which controls whether wait() and synchronize() are blocking or 95 | // non-blocking. 96 | constexpr const char* CCL_BLOCKING_WAIT = "CCL_BLOCKING_WAIT"; 97 | 98 | // Environment variable which controls whether or not use default stream as 99 | // communication stream for collectives 100 | constexpr const char* CCL_SAME_STREAM = "CCL_SAME_STREAM"; 101 | 102 | constexpr const char* TORCH_LLM_ALLREDUCE = "TORCH_LLM_ALLREDUCE"; 103 | 104 | // inline constexpr CoalActive = 0x01, CoalColl = 0x02, CoalP2P = 0x04; 105 | 106 | #if TORCH_VERSION_MAJOR > 1 107 | using Baseclass = Backend; 108 | #else 109 | using Baseclass = ProcessGroup; 110 | #endif 111 | class ProcessGroupCCL : public Baseclass 112 | { 113 | public: 114 | class AsyncWorkCCL : public C10D_Work { 115 | public: 116 | AsyncWorkCCL(std::vector> outputTensors, 117 | int rank = -1, 118 | c10d::OpType opType = OpType::UNKNOWN, 119 | const char* profilingTitle = nullptr, 120 | const c10::optional>& inputTensors = c10::nullopt); 121 | 122 | virtual void run() = 0; 123 | 124 | c10::intrusive_ptr getFuture() override; 125 | 126 | std::vector result() override; 127 | 128 | virtual void finishAsyncWorkCCL(); 129 | 130 | void finishAsyncWorkCCLError(std::exception_ptr eptr); 131 | 132 | public: 133 | std::string debugName; 134 | // Clone of blockingWait_ from ProcessGroupCCL. 135 | #if CCL_MINOR_VERSION < 14 136 | bool blockingWait_ = true; 137 | #else 138 | bool blockingWait_ = false; 139 | #endif 140 | // Clone of useSameStream_ from ProcessGroupCCL. 141 | bool useSameStream_ = false; 142 | 143 | protected: 144 | friend class ProcessGroupCCL; 145 | const std::vector> outputTensors_; 146 | // The future returned by getFuture. 
147 | c10::intrusive_ptr future_; 148 | }; 149 | 150 | explicit ProcessGroupCCL(const c10::intrusive_ptr& store, 151 | int rank, 152 | int size, 153 | std::chrono::milliseconds); 154 | virtual ~ProcessGroupCCL(); 155 | 156 | #if TORCH_VERSION_MINOR >= 11 157 | const std::string getBackendName() const override { 158 | return std::string(CCL_BACKEND_NAME); 159 | } 160 | #endif 161 | 162 | void startCoalescing() override; 163 | 164 | c10::intrusive_ptr endCoalescing() override; 165 | 166 | c10::intrusive_ptr broadcast( 167 | std::vector& data, 168 | const BroadcastOptions& opts = BroadcastOptions()) override; 169 | 170 | c10::intrusive_ptr allreduce( 171 | std::vector& tensors, 172 | const AllreduceOptions& opts = AllreduceOptions()) override; 173 | 174 | c10::intrusive_ptr allreduce_coalesced( 175 | std::vector& tensors, 176 | const AllreduceCoalescedOptions& opts = 177 | AllreduceCoalescedOptions()) override; 178 | 179 | c10::intrusive_ptr reduce( 180 | std::vector& tensors, 181 | const ReduceOptions& opts = ReduceOptions()) override; 182 | 183 | c10::intrusive_ptr allgather( 184 | std::vector>& outputTensors, 185 | std::vector& inputTensors, 186 | const AllgatherOptions& opts = AllgatherOptions()) override; 187 | 188 | c10::intrusive_ptr _allgather_base( 189 | at::Tensor& outputBuffer, 190 | at::Tensor& inputBuffer, 191 | const AllgatherOptions& opts = AllgatherOptions()) override; 192 | 193 | c10::intrusive_ptr allgather_coalesced( 194 | std::vector>& outputTensorLists, 195 | std::vector& inputTensors, 196 | const AllgatherOptions& opts = AllgatherOptions()) override; 197 | 198 | c10::intrusive_ptr allgather_into_tensor_coalesced( 199 | std::vector& outputTensors, 200 | std::vector& inputTensors, 201 | const AllgatherOptions& opts = AllgatherOptions()) override; 202 | 203 | c10::intrusive_ptr gather( 204 | std::vector>& outputTensors, 205 | std::vector& inputTensors, 206 | const GatherOptions& opts = GatherOptions()) override; 207 | 208 | c10::intrusive_ptr scatter( 209 | std::vector& outputTensors, 210 | std::vector>& inputTensors, 211 | const ScatterOptions& opts = ScatterOptions()) override; 212 | 213 | c10::intrusive_ptr reduce_scatter( 214 | std::vector& outputTensors, 215 | std::vector>& inputTensors, 216 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 217 | 218 | c10::intrusive_ptr _reduce_scatter_base( 219 | at::Tensor& outputBuffer, 220 | at::Tensor& inputBuffer, 221 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 222 | 223 | c10::intrusive_ptr reduce_scatter_tensor_coalesced( 224 | std::vector& outputs, 225 | std::vector& inputs, 226 | const ReduceScatterOptions& opts = ReduceScatterOptions()) override; 227 | 228 | c10::intrusive_ptr alltoall_base( 229 | at::Tensor& outputTensor, 230 | at::Tensor& inputTensor, 231 | std::vector& outputSplitSizes, 232 | std::vector& inputSplitSizes, 233 | const AllToAllOptions& opts = AllToAllOptions()) override; 234 | 235 | c10::intrusive_ptr alltoall( 236 | std::vector& outputTensors, 237 | std::vector& inputTensors, 238 | const AllToAllOptions& opts = AllToAllOptions()) override; 239 | 240 | c10::intrusive_ptr send( 241 | std::vector& tensors, 242 | int dstRank, 243 | int tag) override; 244 | 245 | c10::intrusive_ptr recv( 246 | std::vector& tensors, 247 | int srcRank, 248 | int tag) override; 249 | 250 | c10::intrusive_ptr recvAnysource( 251 | std::vector& tensor, 252 | int tag) override; 253 | 254 | c10::intrusive_ptr barrier( 255 | const BarrierOptions& opts = BarrierOptions()) override; 256 | 
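  // groupStart()/groupEnd() bracket a group of operations issued together;
  // the nesting of active groups is tracked by cclActiveGroupCounter_ (see below).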
257 | void groupStart(); 258 | 259 | void groupEnd(); 260 | 261 | // create a new ProcessGroupCCL and initialize CCL if not initialized 262 | #if TORCH_VERSION_MAJOR > 1 263 | static c10::intrusive_ptr createProcessGroupCCL( 264 | #else 265 | static c10::intrusive_ptr createProcessGroupCCL( 266 | #endif 267 | const c10::intrusive_ptr& store, 268 | int rank = -1, 269 | int size = -1, 270 | std::chrono::milliseconds op_time_out = kNoTimeout); 271 | static const int64_t OP_TIMEOUT_MILLIS; 272 | public: 273 | 274 | static void cclInitOnce(); 275 | static void cclFini(); 276 | 277 | // Store that is used to exchange information between processes. 278 | c10::intrusive_ptr store_; 279 | 280 | std::chrono::milliseconds timeout; 281 | 282 | std::unique_ptr ccl_member_; 283 | 284 | static std::mutex globalMutex; 285 | 286 | // Whether or not wait() and synchronize() are blocking operations that wait 287 | // for the operation to complete. 288 | #if CCL_MINOR_VERSION < 14 289 | bool blockingWait_ = true; 290 | #else 291 | bool blockingWait_ = false; 292 | #endif 293 | 294 | // Environment variable which controls whether to keep same stream 295 | // for collectives and compute 296 | bool useSameStream_ = false; 297 | 298 | bool torch_llm_allreduce_ = false; 299 | 300 | // Flag to denote if a coalescing groupStart/groupEnd block is active 301 | bool is_coalescing_ = false; 302 | 303 | // Stores device indexes for all collectives run inside a coalescing block 304 | std::vector coalescedDevices_; 305 | 306 | // The number of active groupStart() calls. This counter will be increased 307 | // by 1 when groupStart() is called and decreased by 1 when group_end() 308 | // is called. 309 | static thread_local uint64_t cclActiveGroupCounter_; 310 | }; 311 | 312 | } // namespace c10d 313 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | **NOTE**: This repo has been **DEPRECATED**. Please use [PyTorch*](https://github.com/pytorch/pytorch) directly for distributed scenarios. We remain committed to providing robust support and high performance through PyTorch* for Intel® CPU and GPU platforms. 3 | 4 | =========================================================================== 5 | 6 | # Intel® oneCCL Bindings for PyTorch (formerly known as torch_ccl) 7 | 8 | This repository holds PyTorch bindings maintained by Intel® for the Intel® oneAPI Collective Communications Library (oneCCL). 9 | 10 | ## Introduction 11 | 12 | [PyTorch](https://github.com/pytorch/pytorch) is an open-source machine learning framework. 13 | 14 | [Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training, implementing collectives like `allreduce`, `allgather`, `alltoall`. For more information on oneCCL, please refer to the [oneCCL documentation](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/oneccl/source/). 15 | 16 | `oneccl_bindings_for_pytorch` module implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now. 17 | 18 | ## Capability 19 | 20 | The table below shows which functions are available for use with CPU / Intel dGPU tensors. 
21 | 22 | | | CPU | GPU | 23 | | :--------------- | :---: | :---: | 24 | | `send` | × | √ | 25 | | `recv` | × | √ | 26 | | `broadcast` | √ | √ | 27 | | `all_reduce` | √ | √ | 28 | | `reduce` | √ | √ | 29 | | `all_gather` | √ | √ | 30 | | `gather` | √ | √ | 31 | | `scatter` | √ | √ | 32 | | `reduce_scatter` | √ | √ | 33 | | `all_to_all` | √ | √ | 34 | | `barrier` | √ | √ | 35 | 36 | 37 | ## PyTorch API Align 38 | 39 | We recommend using Anaconda as the Python package management system. The following are the corresponding branches (tags) of `oneccl_bindings_for_pytorch` and the supported PyTorch versions. 40 | 41 | | `torch` | `oneccl_bindings_for_pytorch` | 42 | | :-------------------------------------------------------------: | :-----------------------------------------------------------------------: | 43 | | `master` | `master` | 44 | | [v2.8.0](https://github.com/pytorch/pytorch/tree/v2.8.0) | [ccl_torch2.8.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.8.0+xpu) | 45 | | [v2.7.0](https://github.com/pytorch/pytorch/tree/v2.7.0) | [ccl_torch2.7.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.7.0+xpu) | 46 | | [v2.6.0](https://github.com/pytorch/pytorch/tree/v2.6.0) | [ccl_torch2.6.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.6.0+xpu) | 47 | | [v2.5.0](https://github.com/pytorch/pytorch/tree/v2.5.0) | [ccl_torch2.5.0](https://github.com/intel/torch-ccl/tree/ccl_torch2.5.0+xpu) | 48 | | [v2.3.1](https://github.com/pytorch/pytorch/tree/v2.3.1) | [ccl_torch2.3.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.3.100+xpu) | 49 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.400](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.400+xpu) | 50 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.300](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.300+xpu) | 51 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.200](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.200+xpu) | 52 | | [v2.1.0](https://github.com/pytorch/pytorch/tree/v2.1.0) | [ccl_torch2.1.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.100+xpu) | 53 | | [v2.0.1](https://github.com/pytorch/pytorch/tree/v2.0.1) | [ccl_torch2.0.100](https://github.com/intel/torch-ccl/tree/ccl_torch2.0.100) | 54 | | [v1.13](https://github.com/pytorch/pytorch/tree/v1.13) | [ccl_torch1.13](https://github.com/intel/torch-ccl/tree/ccl_torch1.13) | 55 | | [v1.12.1](https://github.com/pytorch/pytorch/tree/v1.12.1) | [ccl_torch1.12.100](https://github.com/intel/torch-ccl/tree/ccl_torch1.12.100) | 56 | | [v1.12.0](https://github.com/pytorch/pytorch/tree/v1.12.0) | [ccl_torch1.12](https://github.com/intel/torch-ccl/tree/ccl_torch1.12) | 57 | | [v1.11.0](https://github.com/pytorch/pytorch/tree/v1.11.0) | [ccl_torch1.11](https://github.com/intel/torch-ccl/tree/ccl_torch1.11) | 58 | | [v1.10.0](https://github.com/pytorch/pytorch/tree/v1.10.0) | [ccl_torch1.10](https://github.com/intel/torch-ccl/tree/ccl_torch1.10) | 59 | | [v1.9.0](https://github.com/pytorch/pytorch/tree/v1.9.0) | [ccl_torch1.9](https://github.com/intel/torch-ccl/tree/ccl_torch1.9) | 60 | | [v1.8.1](https://github.com/pytorch/pytorch/tree/v1.8.1) | [ccl_torch1.8](https://github.com/intel/torch-ccl/tree/ccl_torch1.8) | 61 | | [v1.7.1](https://github.com/pytorch/pytorch/tree/v1.7.1) | [ccl_torch1.7](https://github.com/intel/torch-ccl/tree/ccl_torch1.7) | 62 | | [v1.6.0](https://github.com/pytorch/pytorch/tree/v1.6.0) | [ccl_torch1.6](https://github.com/intel/torch-ccl/tree/ccl_torch1.6) |
63 | | [v1.5-rc3](https://github.com/pytorch/pytorch/tree/v1.5.0-rc3) | [beta09](https://github.com/intel/torch-ccl/tree/beta09) | 64 | 65 | The usage details can be found in the README of corresponding branch. 66 | 67 | ## Requirements 68 | 69 | - Python 3.8 or later and a C++17 compiler 70 | 71 | - PyTorch v2.8.0 72 | 73 | ## Build Option List 74 | 75 | The following build options are supported in Intel® oneCCL Bindings for PyTorch*. 76 | 77 | | Build Option | Default Value | Description | 78 | | :---------------------------------- | :------------- | :-------------------------------------------------------------------------------------------------- | 79 | | COMPUTE_BACKEND | N/A | Set oneCCL `COMPUTE_BACKEND`, set to `dpcpp` and use DPC++ compiler to enable support for Intel XPU | 80 | | USE_SYSTEM_ONECCL | OFF | Use oneCCL library in system | 81 | | CCL_PACKAGE_NAME | oneccl-bind-pt | Set wheel name | 82 | | ONECCL_BINDINGS_FOR_PYTORCH_BACKEND | cpu | Set backend | 83 | | CCL_SHA_VERSION | False | Add git head sha version into wheel name | 84 | 85 | ## Launch Option List 86 | 87 | The following launch options are supported in Intel® oneCCL Bindings for PyTorch*. 88 | 89 | | Launch Option | Default Value | Description | 90 | | :--------------------------------------- | :------------ | :-------------------------------------------------------------------- | 91 | | ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE | 0 | Set verbose level in oneccl_bindings_for_pytorch | 92 | | ONECCL_BINDINGS_FOR_PYTORCH_ENV_WAIT_GDB | 0 | Set 1 to force the oneccl_bindings_for_pytorch wait for GDB attaching | 93 | | TORCH_LLM_ALLREDUCE | 0 | Set 1 to enable this prototype feature for better scale-up performance. This is a prototype feature to provide better scale-up performance by enabling optimized collective algorithms in oneCCL and asynchronous execution in torch-ccl. This feature requires XeLink enabled for cross-cards communication.| 94 | | CCL_BLOCKING_WAIT | 0 | Set 1 to enable this prototype feature, which is to control whether collectives execution on XPU is host blocking or non-blocking. | 95 | | CCL_SAME_STREAM | 0 | Set 1 to enable this prototype feature, which is to allow using a computation stream as communication stream to minimize overhead for streams synchronization. | 96 | 97 | ## Installation 98 | 99 | ### Install from Source 100 | 101 | 1. clone the `oneccl_bindings_for_pytorch`. 102 | 103 | ```bash 104 | git clone https://github.com/intel/torch-ccl.git && cd torch-ccl 105 | git checkout ccl_torch2.8.0+xpu 106 | git submodule sync 107 | git submodule update --init --recursive 108 | ``` 109 | 110 | 2. Install `oneccl_bindings_for_pytorch` 111 | 112 | ```bash 113 | # for CPU Backend Only 114 | python setup.py install 115 | # for XPU Backend: use DPC++ Compiler to enable support for Intel XPU 116 | # build with oneCCL from third party 117 | COMPUTE_BACKEND=dpcpp python setup.py install 118 | # build with oneCCL from basekit 119 | export INTELONEAPIROOT=${HOME}/intel/oneapi 120 | USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install 121 | ``` 122 | 123 | ### Install Prebuilt Wheel 124 | 125 | Wheel files are available for the following Python versions. Please always use the latest release to get started. 
126 | 127 | | Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 | Python 3.12 | Python 3.13 | Python 3.13t | 128 | | :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | :---------: | :---------: | :---------: | :----------: | 129 | | 2.8.0 | | | | √ | √ | √ | √ | √ | √ | 130 | | 2.7.0 | | | | √ | √ | √ | √ | √ | √ | 131 | | 2.6.0 | | | | √ | √ | √ | √ | √ | | 132 | | 2.5.1 | | | | √ | √ | √ | √ | | | 133 | | 2.3.100 | | | √ | √ | √ | √ | | | | 134 | | 2.1.400 | | | √ | √ | √ | √ | | | | 135 | | 2.1.300 | | | √ | √ | √ | √ | | | | 136 | | 2.1.200 | | | √ | √ | √ | √ | | | | 137 | | 2.1.100 | | | √ | √ | √ | √ | | | | 138 | | 2.0.100 | | | √ | √ | √ | √ | | | | 139 | | 1.13 | | √ | √ | √ | √ | | | | | 140 | | 1.12.100 | | √ | √ | √ | √ | | | | | 141 | | 1.12.0 | | √ | √ | √ | √ | | | | | 142 | | 1.11.0 | | √ | √ | √ | √ | | | | | 143 | | 1.10.0 | √ | √ | √ | √ | | | | | | 144 | 145 | ```bash 146 | python -m pip install oneccl_bind_pt==2.8.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ 147 | ``` 148 | 149 | **Note:** Please set a proxy or switch the URL to https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ if you encounter connection issues. 150 | 151 | ### Runtime Dynamic Linking 152 | 153 | - If oneccl_bindings_for_pytorch is built without oneCCL and uses the oneCCL library installed in the system, dynamically link oneCCL from the oneAPI basekit (recommended usage): 154 | 155 | ```bash 156 | source $basekit_root/ccl/latest/env/vars.sh 157 | ``` 158 | 159 | Note: Make sure you have installed [basekit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html#base-kit) when using Intel® oneCCL Bindings for PyTorch\* on Intel® GPUs. 160 | 161 | - If oneccl_bindings_for_pytorch is built with oneCCL from the third-party submodule or installed from a prebuilt wheel: 162 | Dynamically link the oneCCL and Intel MPI libraries: 163 | 164 | ```bash 165 | source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/setvars.sh 166 | ``` 167 | 168 | Dynamically link oneCCL only (not including Intel MPI): 169 | 170 | ```bash 171 | source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/vars.sh 172 | ``` 173 | 174 | ## Usage 175 | 176 | **Note:** Please `import torch` prior to `import oneccl_bindings_for_pytorch`. 177 | 178 | example.py 179 | 180 | ```python 181 | 182 | import torch 183 | import intel_extension_for_pytorch 184 | import oneccl_bindings_for_pytorch 185 | import torch.nn.parallel 186 | import torch.distributed as dist 187 | 188 | ... 189 | 190 | os.environ['MASTER_ADDR'] = '127.0.0.1' 191 | os.environ['MASTER_PORT'] = '29500' 192 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 193 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 194 | 195 | backend = 'ccl' 196 | dist.init_process_group(backend, ...) 197 | my_rank = dist.get_rank() 198 | my_size = dist.get_world_size() 199 | print("my rank = %d my size = %d" % (my_rank, my_size)) 200 | 201 | ... 202 | 203 | model = torch.nn.parallel.DistributedDataParallel(model, ...) 204 | 205 | ...
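# --- Illustrative sketch only (not part of the original example) ---
# A minimal training step, assuming `model`, `criterion`, `optimizer`, and a
# DataLoader `loader` are defined in the elided sections above. With the 'ccl'
# backend, DistributedDataParallel all-reduces gradients during backward().
for data, target in loader:
    optimizer.zero_grad()
    loss = criterion(model(data), target)
    loss.backward()   # gradient all-reduce runs through oneCCL here
    optimizer.step()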
206 | ``` 207 | 208 | (If oneccl_bindings_for_pytorch is built without oneCCL, use the oneCCL and, if needed, Intel MPI libraries installed in the system.) 209 | 210 | ```bash 211 | source $basekit_root/ccl/latest/env/vars.sh 212 | source $basekit_root/mpi/latest/env/vars.sh 213 | 214 | mpirun -n <N> -ppn <PPN> -f <hostfile> python example.py 215 | ``` 216 | 217 | ## Performance Debugging 218 | 219 | For debugging the performance of communication primitives, PyTorch's [Autograd profiler](https://pytorch.org/docs/stable/autograd.html#profiler) 220 | can be used to inspect the time spent inside oneCCL calls. 221 | 222 | Example: 223 | 224 | profiling.py 225 | 226 | ```python 227 | 228 | import torch.nn.parallel 229 | import torch.distributed as dist 230 | import oneccl_bindings_for_pytorch 231 | import os 232 | 233 | os.environ['MASTER_ADDR'] = '127.0.0.1' 234 | os.environ['MASTER_PORT'] = '29500' 235 | os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0)) 236 | os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1)) 237 | 238 | backend = 'ccl' 239 | dist.init_process_group(backend) 240 | my_rank = dist.get_rank() 241 | my_size = dist.get_world_size() 242 | print("my rank = %d my size = %d" % (my_rank, my_size)) 243 | 244 | x = torch.ones([2, 2]) 245 | y = torch.ones([4, 4]) 246 | with torch.autograd.profiler.profile(record_shapes=True) as prof: 247 | for _ in range(10): 248 | dist.all_reduce(x) 249 | dist.all_reduce(y) 250 | dist.barrier() 251 | print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cpu_time_total")) 252 | 253 | ``` 254 | 255 | ```bash 256 | mpirun -n 2 -l python profiling.py 257 | ``` 258 | 259 | ```bash 260 | [0] my rank = 0 my size = 2 261 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 262 | [0] Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes 263 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 264 | [0] oneccl_bindings_for_pytorch::allreduce 91.41% 297.900ms 91.41% 297.900ms 29.790ms 10 [[2, 2]] 265 | [0] oneccl_bindings_for_pytorch::wait::cpu::allreduce 8.24% 26.845ms 8.24% 26.845ms 2.684ms 10 [[2, 2], [2, 2]] 266 | [0] oneccl_bindings_for_pytorch::wait::cpu::allreduce 0.30% 973.651us 0.30% 973.651us 97.365us 10 [[4, 4], [4, 4]] 267 | [0] oneccl_bindings_for_pytorch::allreduce 0.06% 190.254us 0.06% 190.254us 19.025us 10 [[4, 4]] 268 | [0] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 269 | [0] Self CPU time total: 325.909ms 270 | [0] 271 | [1] my rank = 1 my size = 2 272 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 273 | [1] Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes 274 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 275 | [1] oneccl_bindings_for_pytorch::allreduce 96.03% 318.551ms 96.03% 318.551ms 31.855ms 10 [[2, 2]] 276 | [1] oneccl_bindings_for_pytorch::wait::cpu::allreduce 3.62% 12.019ms 3.62% 12.019ms 1.202ms 10 [[2, 2], [2, 2]] 277 | [1] oneccl_bindings_for_pytorch::allreduce 0.33% 1.082ms 0.33% 1.082ms 108.157us 10 [[4, 4]] 278 | [1] oneccl_bindings_for_pytorch::wait::cpu::allreduce
0.02% 56.505us 0.02% 56.505us 5.651us 10 [[4, 4], [4, 4]] 279 | [1] ----------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------- 280 | [1] Self CPU time total: 331.708ms 281 | [1] 282 | 283 | ``` 284 | 285 | ## License 286 | 287 | [BSD License](https://github.com/intel/torch-ccl/blob/master/LICENSE) 288 | --------------------------------------------------------------------------------