├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── ext-net ├── README.md ├── example │ ├── Makefile │ ├── nccl │ │ ├── common.h │ │ ├── err.h │ │ ├── net.h │ │ ├── net_device.h │ │ ├── net_v10.h │ │ ├── net_v2.h │ │ ├── net_v3.h │ │ ├── net_v4.h │ │ ├── net_v5.h │ │ ├── net_v6.h │ │ ├── net_v7.h │ │ ├── net_v8.h │ │ ├── net_v9.h │ │ └── types.h │ └── plugin.c └── google-fastsocket │ └── Makefile ├── ext-profiler ├── README.md └── example │ ├── Makefile │ ├── README.md │ ├── event.c │ ├── event.h │ ├── nccl │ ├── common.h │ ├── err.h │ ├── net_ib_v1.h │ ├── net_socket_v1.h │ ├── profiler.h │ ├── profiler_net.h │ ├── profiler_v1.h │ ├── profiler_v2.h │ ├── profiler_v3.h │ ├── profiler_v4.h │ └── types.h │ ├── plugin.c │ ├── plugin.h │ ├── print_event.c │ └── print_event.h ├── ext-tuner └── example │ ├── Makefile │ ├── nccl │ ├── common.h │ ├── err.h │ └── tuner.h │ └── plugin.c ├── makefiles ├── common.mk ├── formatting.mk └── version.mk ├── pkg ├── Makefile ├── debian │ ├── .gitignore │ ├── Makefile │ ├── changelog.in │ ├── compat │ ├── control.in │ ├── copyright │ ├── gbp.conf │ ├── libnccl-dev.install.in │ ├── libnccl2.install.in │ ├── rules │ └── source │ │ └── format ├── redhat │ ├── Makefile │ └── nccl.spec.in ├── srctxz │ ├── Makefile │ └── create_srctxz.sh.in └── txz │ ├── Makefile │ └── create_txz.sh.in └── src ├── Makefile ├── allocator.cc ├── bootstrap.cc ├── channel.cc ├── collectives.cc ├── debug.cc ├── device ├── Makefile ├── all_gather.h ├── all_reduce.h ├── broadcast.h ├── common.cu ├── common.h ├── common_kernel.h ├── generate.py ├── network │ └── unpack │ │ ├── unpack.h │ │ └── unpack_defs.h ├── onerank.cu ├── op128.h ├── primitives.h ├── prims_ll.h ├── prims_ll128.h ├── prims_simple.h ├── reduce.h ├── reduce_kernel.h ├── reduce_scatter.h ├── sendrecv.h └── symmetric │ ├── all_gather.cuh │ ├── all_reduce.cuh │ ├── generate.py │ ├── kernel.cuh │ ├── primitives.cuh │ └── reduce_scatter.cuh ├── enhcompat.cc ├── enqueue.cc ├── graph ├── 
connect.cc ├── paths.cc ├── rings.cc ├── rings.h ├── search.cc ├── topo.cc ├── topo.h ├── trees.cc ├── tuning.cc ├── xml.cc └── xml.h ├── group.cc ├── include ├── alloc.h ├── allocator.h ├── argcheck.h ├── bitops.h ├── bootstrap.h ├── channel.h ├── checks.h ├── coll_net.h ├── collectives.h ├── comm.h ├── core.h ├── cpuset.h ├── cudawrap.h ├── debug.h ├── device.h ├── enqueue.h ├── gdrwrap.h ├── graph.h ├── group.h ├── ibvcore.h ├── ibvsymbols.h ├── ibvwrap.h ├── info.h ├── ipcsocket.h ├── mlx5 │ ├── mlx5dvcore.h │ ├── mlx5dvsymbols.h │ └── mlx5dvwrap.h ├── mnnvl.h ├── nccl_common.h ├── net.h ├── net_device.h ├── nvmlwrap.h ├── nvtx.h ├── nvtx3 │ ├── nvToolsExt.h │ ├── nvToolsExtCounters.h │ ├── nvToolsExtCuda.h │ ├── nvToolsExtCudaRt.h │ ├── nvToolsExtMem.h │ ├── nvToolsExtMemCudaRt.h │ ├── nvToolsExtOpenCL.h │ ├── nvToolsExtPayload.h │ ├── nvToolsExtPayloadHelper.h │ ├── nvToolsExtSemanticsCounters.h │ ├── nvToolsExtSemanticsScope.h │ ├── nvToolsExtSync.h │ ├── nvtx3.hpp │ └── nvtxDetail │ │ ├── nvtxExtHelperMacros.h │ │ ├── nvtxExtImpl.h │ │ ├── nvtxExtImplCounters_v1.h │ │ ├── nvtxExtImplMemCudaRt_v1.h │ │ ├── nvtxExtImplMem_v1.h │ │ ├── nvtxExtImplPayload_v1.h │ │ ├── nvtxExtInit.h │ │ ├── nvtxExtPayloadHelperInternal.h │ │ ├── nvtxExtPayloadTypeInfo.h │ │ ├── nvtxExtTypes.h │ │ ├── nvtxImpl.h │ │ ├── nvtxImplCore.h │ │ ├── nvtxImplCudaRt_v3.h │ │ ├── nvtxImplCuda_v3.h │ │ ├── nvtxImplOpenCL_v3.h │ │ ├── nvtxImplSync_v3.h │ │ ├── nvtxInit.h │ │ ├── nvtxInitDecls.h │ │ ├── nvtxInitDefs.h │ │ ├── nvtxLinkOnce.h │ │ └── nvtxTypes.h ├── nvtx_payload_schemas.h ├── p2p.h ├── param.h ├── plugin │ ├── nccl_net.h │ ├── nccl_profiler.h │ ├── nccl_tuner.h │ ├── net │ │ ├── net_v10.h │ │ ├── net_v6.h │ │ ├── net_v7.h │ │ ├── net_v8.h │ │ └── net_v9.h │ ├── plugin.h │ ├── profiler │ │ ├── net_ib.h │ │ ├── net_ib_v1.h │ │ ├── net_socket.h │ │ ├── net_socket_v1.h │ │ ├── profiler_v1.h │ │ ├── profiler_v2.h │ │ ├── profiler_v3.h │ │ └── profiler_v4.h │ └── tuner │ │ ├── 
tuner_v2.h │ │ ├── tuner_v3.h │ │ └── tuner_v4.h ├── profiler.h ├── proxy.h ├── ras.h ├── register.h ├── register_inline.h ├── shm.h ├── shmutils.h ├── socket.h ├── strongstream.h ├── symmetric.h ├── timer.h ├── transport.h ├── trees.h ├── tuner.h └── utils.h ├── init.cc ├── init_nvtx.cc ├── misc ├── argcheck.cc ├── cudawrap.cc ├── gdrwrap.cc ├── ibvsymbols.cc ├── ibvwrap.cc ├── ipcsocket.cc ├── mlx5dvsymbols.cc ├── mlx5dvwrap.cc ├── nvmlwrap.cc ├── param.cc ├── shmutils.cc ├── socket.cc ├── strongstream.cc └── utils.cc ├── mnnvl.cc ├── nccl.h.in ├── nccl.pc.in ├── plugin ├── net.cc ├── net │ ├── net_v10.cc │ ├── net_v6.cc │ ├── net_v7.cc │ ├── net_v8.cc │ └── net_v9.cc ├── plugin_open.cc ├── profiler.cc ├── profiler │ ├── profiler_v1.cc │ ├── profiler_v2.cc │ ├── profiler_v3.cc │ └── profiler_v4.cc ├── tuner.cc └── tuner │ ├── tuner_v2.cc │ ├── tuner_v3.cc │ └── tuner_v4.cc ├── proxy.cc ├── ras ├── client.cc ├── client_support.cc ├── collectives.cc ├── peers.cc ├── ras.cc ├── ras_internal.h └── rasnet.cc ├── register ├── coll_reg.cc ├── register.cc └── sendrecv_reg.cc ├── symmetric.cc ├── transport.cc └── transport ├── coll_net.cc ├── generic.cc ├── net.cc ├── net_ib.cc ├── net_socket.cc ├── nvls.cc ├── p2p.cc ├── profiler.cc └── shm.cc /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 2 | /build 3 | *.gcov 4 | /coverage/ 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National 13 | Laboratory, the U.S. Department of Energy, nor the names of their 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | The U.S. Department of Energy funded the development of this software 30 | under subcontract 7078610 with Lawrence Berkeley National Laboratory. 31 | 32 | 33 | This code also includes files from the NVIDIA Tools Extension SDK project. 34 | 35 | See: 36 | 37 | https://github.com/NVIDIA/NVTX 38 | 39 | for more information and license details. 
40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : src.build 9 | install : src.install 10 | BUILDDIR ?= $(abspath ./build) 11 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 12 | TARGETS := src pkg 13 | clean: ${TARGETS:%=%.clean} 14 | test.build: src.build 15 | LICENSE_FILES := LICENSE.txt 16 | LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) 17 | lic: $(LICENSE_TARGETS) 18 | 19 | ${BUILDDIR}/%.txt: %.txt 20 | @printf "Copying %-35s > %s\n" $< $@ 21 | mkdir -p ${BUILDDIR} 22 | cp $< $@ 23 | 24 | src.%: 25 | ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} 26 | 27 | pkg.%: 28 | ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} 29 | 30 | pkg.debian.prep: lic 31 | pkg.txz.prep: lic 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NCCL 2 | 3 | Optimized primitives for inter-GPU communication. 4 | 5 | ## Introduction 6 | 7 | NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. 8 | 9 | For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html). 
10 | 11 | ## Build 12 | 13 | Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds. 14 | 15 | To build the library : 16 | 17 | ```shell 18 | $ cd nccl 19 | $ make -j src.build 20 | ``` 21 | 22 | If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with : 23 | 24 | ```shell 25 | $ make src.build CUDA_HOME=<path to cuda install> 26 | ``` 27 | 28 | NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set. 29 | 30 | By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform : 31 | ```shell 32 | $ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" 33 | ``` 34 | 35 | ## Install 36 | 37 | To install NCCL on the system, create a package then install it as root. 38 | 39 | Debian/Ubuntu : 40 | ```shell 41 | $ # Install tools to create debian packages 42 | $ sudo apt install build-essential devscripts debhelper fakeroot 43 | $ # Build NCCL deb package 44 | $ make pkg.debian.build 45 | $ ls build/pkg/deb/ 46 | ``` 47 | 48 | RedHat/CentOS : 49 | ```shell 50 | $ # Install tools to create rpm packages 51 | $ sudo yum install rpm-build rpmdevtools 52 | $ # Build NCCL rpm package 53 | $ make pkg.redhat.build 54 | $ ls build/pkg/rpm/ 55 | ``` 56 | 57 | OS-agnostic tarball : 58 | ```shell 59 | $ make pkg.txz.build 60 | $ ls build/pkg/txz/ 61 | ``` 62 | 63 | ## Tests 64 | 65 | Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests. 
66 | 67 | ```shell 68 | $ git clone https://github.com/NVIDIA/nccl-tests.git 69 | $ cd nccl-tests 70 | $ make 71 | $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus> 72 | ``` 73 | 74 | ## Copyright 75 | 76 | All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 77 | -------------------------------------------------------------------------------- /ext-net/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl 9 | PLUGIN_SO:=libnccl-net.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | #include 11 | 12 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 13 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; 14 | 15 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 16 | 17 | enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop }; 18 | 19 | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /ext-net/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclInvalidUsage = 5, 15 | ncclRemoteError = 6 } ncclResult_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | */ 4 | 5 | #ifndef NET_H_ 6 | #define NET_H_ 7 | 8 | #include 9 | #include 10 | 11 | #include "err.h" 12 | #include "net_device.h" 13 | #include "common.h" 14 | 15 | #define NCCL_NET_HANDLE_MAXSIZE 128 16 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB 17 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 18 | 19 | #define NCCL_PTR_HOST 0x1 20 | #define NCCL_PTR_CUDA 0x2 21 | #define NCCL_PTR_DMABUF 0x4 22 | 23 | // Maximum number of requests per comm object 24 | #define NCCL_NET_MAX_REQUESTS 32 25 | 26 | #include "net_v10.h" 27 | #include "net_v9.h" 28 | #include "net_v8.h" 29 | #include "net_v7.h" 30 | #include "net_v6.h" 31 | #include "net_v5.h" 32 | #include "net_v4.h" 33 | #include "net_v3.h" 34 | #include "net_v2.h" 35 | 36 | typedef ncclNet_v10_t ncclNet_t; 37 | typedef ncclNetProperties_v10_t ncclNetProperties_t; 38 | typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; 39 | typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; 40 | 41 | #endif // end include guard 42 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_device.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_DEVICE_H_ 8 | #define NET_DEVICE_H_ 9 | 10 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 11 | #define NCCL_NET_MTU_SIZE 4096 12 | 13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin 14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 
15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 16 | 17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; 18 | 19 | typedef struct { 20 | ncclNetDeviceType netDeviceType; // Network offload type 21 | int netDeviceVersion; // Version number for network offload 22 | void* handle; 23 | size_t size; 24 | int needsProxyProgress; 25 | } ncclNetDeviceHandle_v7_t; 26 | 27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; 28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; 29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; 30 | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NET_V2_H_ 6 | #define NET_V2_H_ 7 | 8 | typedef struct { 9 | // Name of the network (mainly for logs) 10 | const char* name; 11 | // Initialize the network. 12 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 13 | // Return the number of adapters. 14 | ncclResult_t (*devices)(int* ndev); 15 | // Return the device path in /sys. NCCL will call free on this path. 16 | ncclResult_t (*pciPath)(int dev, char** path); 17 | // Return whether this device supports host pointers and/or CUDA pointers 18 | // as data from the current GPU. Supported types should be composed with 19 | // NCCL_PTR_HOST and NCCL_PTR_CUDA. 20 | ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); 21 | // Create a receiving object and provide a handle to connect to it. The 22 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 23 | // between ranks to create a connection. 
24 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 25 | // Connect to a handle and return a sending comm object for that peer. 26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connectHandle 28 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 29 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 
43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v2_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NET_V3_H_ 6 | #define NET_V3_H_ 7 | 8 | #define NCCL_NET_MAX_REQUESTS_V3 16 9 | 10 | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; 11 | typedef struct { 12 | // Name of the network (mainly for logs) 13 | const char* name; 14 | // Initialize the network. 15 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 16 | // Return the number of adapters. 17 | ncclResult_t (*devices)(int* ndev); 18 | // Get various device properties. 19 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); 20 | // Create a receiving object and provide a handle to connect to it. The 21 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 22 | // between ranks to create a connection. 23 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 24 | // Connect to a handle and return a sending comm object for that peer. 25 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 26 | // Finalize connection establishment after remote peer has called connectHandle 27 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 28 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 29 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
30 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 31 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 32 | // Asynchronous send to a peer. 33 | // May return request == NULL if the call cannot be performed (or would block) 34 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 35 | // Asynchronous recv from a peer. 36 | // May return request == NULL if the call cannot be performed (or would block) 37 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 38 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 39 | // visible to the GPU 40 | ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); 41 | // Test whether a request is complete. If size is not NULL, it returns the 42 | // number of bytes sent/received. 43 | ncclResult_t (*test)(void* request, int* done, int* size); 44 | // Close and free send/recv comm objects 45 | ncclResult_t (*closeSend)(void* sendComm); 46 | ncclResult_t (*closeRecv)(void* recvComm); 47 | ncclResult_t (*closeListen)(void* listenComm); 48 | } ncclNet_v3_t; 49 | 50 | #endif // end include guard 51 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v4.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NET_V4_H_ 6 | #define NET_V4_H_ 7 | 8 | #define NCCL_NET_HANDLE_MAXSIZE_V4 64 9 | 10 | typedef struct { 11 | char* name; // Used mostly for logging. 12 | char* pciPath; // Path to the PCI device in /sys. 13 | uint64_t guid; // Unique identifier for the NIC chip. Important for 14 | // cards with multiple PCI functions (Physical or virtual). 15 | int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA 16 | int speed; // Port speed in Mbps. 17 | int port; // Port number. 
18 | int maxComms; // Maximum number of comms we can create 19 | } ncclNetProperties_v4_t; 20 | 21 | // v4 struct for backwards compatibility 22 | typedef struct { 23 | // Name of the network (mainly for logs) 24 | const char* name; 25 | // Initialize the network. 26 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 27 | // Return the number of adapters. 28 | ncclResult_t (*devices)(int* ndev); 29 | // Get various device properties. 30 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); 31 | // Create a receiving object and provide a handle to connect to it. The 32 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 33 | // between ranks to create a connection. 34 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 35 | // Connect to a handle and return a sending comm object for that peer. 36 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 37 | // Finalize connection establishment after remote peer has called connectHandle 38 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 39 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 40 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 41 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 42 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 43 | // Asynchronous send to a peer. 44 | // May return request == NULL if the call cannot be performed (or would block) 45 | ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); 46 | // Asynchronous recv from a peer. 
47 | // May return request == NULL if the call cannot be performed (or would block) 48 | ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); 49 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 50 | // visible to the GPU 51 | ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); 52 | // Test whether a request is complete. If size is not NULL, it returns the 53 | // number of bytes sent/received. 54 | ncclResult_t (*test)(void* request, int* done, int* size); 55 | // Close and free send/recv comm objects 56 | ncclResult_t (*closeSend)(void* sendComm); 57 | ncclResult_t (*closeRecv)(void* recvComm); 58 | ncclResult_t (*closeListen)(void* listenComm); 59 | } ncclNet_v4_t; 60 | 61 | #endif // end include guard 62 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v5.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NET_V5_H_ 6 | #define NET_V5_H_ 7 | 8 | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; 9 | typedef struct { 10 | // Name of the network (mainly for logs) 11 | const char* name; 12 | // Initialize the network. 13 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 14 | // Return the number of adapters. 15 | ncclResult_t (*devices)(int* ndev); 16 | // Get various device properties. 17 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); 18 | // Create a receiving object and provide a handle to connect to it. The 19 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 20 | // between ranks to create a connection. 21 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 22 | // Connect to a handle and return a sending comm object for that peer. 
23 | // This call must not block for the connection to be established, and instead 24 | // should return successfully with sendComm == NULL with the expectation that 25 | // it will be called again until sendComm != NULL. 26 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 27 | // Finalize connection establishment after remote peer has called connect. 28 | // This call must not block for the connection to be established, and instead 29 | // should return successfully with recvComm == NULL with the expectation that 30 | // it will be called again until recvComm != NULL. 31 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 32 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 33 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 34 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 35 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 36 | // Asynchronous send to a peer. 37 | // May return request == NULL if the call cannot be performed (or would block) 38 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 39 | // Asynchronous recv from a peer. 40 | // May return request == NULL if the call cannot be performed (or would block) 41 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 42 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 43 | // visible to the GPU 44 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 45 | // Test whether a request is complete. If size is not NULL, it returns the 46 | // number of bytes sent/received. 
47 | ncclResult_t (*test)(void* request, int* done, int* sizes); 48 | // Close and free send/recv comm objects 49 | ncclResult_t (*closeSend)(void* sendComm); 50 | ncclResult_t (*closeRecv)(void* recvComm); 51 | ncclResult_t (*closeListen)(void* listenComm); 52 | } ncclNet_v5_t; 53 | 54 | #endif // end include guard 55 | -------------------------------------------------------------------------------- /ext-net/example/nccl/net_v6.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NET_V6_H_ 6 | #define NET_V6_H_ 7 | 8 | typedef struct { 9 | char* name; // Used mostly for logging. 10 | char* pciPath; // Path to the PCI device in /sys. 11 | uint64_t guid; // Unique identifier for the NIC chip. Important for 12 | // cards with multiple PCI functions (Physical or virtual). 13 | int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] 14 | int speed; // Port speed in Mbps. 15 | int port; // Port number. 16 | float latency; // Network latency 17 | int maxComms; // Maximum number of comms we can create 18 | int maxRecvs; // Maximum number of grouped receives. 19 | }ncclNetProperties_v6_t; 20 | 21 | typedef struct { 22 | // Name of the network (mainly for logs) 23 | const char* name; 24 | // Initialize the network. 25 | ncclResult_t (*init)(ncclDebugLogger_t logFunction); 26 | // Return the number of adapters. 27 | ncclResult_t (*devices)(int* ndev); 28 | // Get various device properties. 29 | ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); 30 | // Create a receiving object and provide a handle to connect to it. The 31 | // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged 32 | // between ranks to create a connection. 33 | ncclResult_t (*listen)(int dev, void* handle, void** listenComm); 34 | // Connect to a handle and return a sending comm object for that peer. 
35 | // This call must not block for the connection to be established, and instead 36 | // should return successfully with sendComm == NULL with the expectation that 37 | // it will be called again until sendComm != NULL. 38 | ncclResult_t (*connect)(int dev, void* handle, void** sendComm); 39 | // Finalize connection establishment after remote peer has called connect. 40 | // This call must not block for the connection to be established, and instead 41 | // should return successfully with recvComm == NULL with the expectation that 42 | // it will be called again until recvComm != NULL. 43 | ncclResult_t (*accept)(void* listenComm, void** recvComm); 44 | // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 45 | // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 46 | ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); 47 | /* DMA-BUF support */ 48 | ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); 49 | ncclResult_t (*deregMr)(void* comm, void* mhandle); 50 | // Asynchronous send to a peer. 51 | // May return request == NULL if the call cannot be performed (or would block) 52 | ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); 53 | // Asynchronous recv from a peer. 54 | // May return request == NULL if the call cannot be performed (or would block) 55 | ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); 56 | // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is 57 | // visible to the GPU 58 | ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); 59 | // Test whether a request is complete. If size is not NULL, it returns the 60 | // number of bytes sent/received. 
61 | ncclResult_t (*test)(void* request, int* done, int* sizes); 62 | // Close and free send/recv comm objects 63 | ncclResult_t (*closeSend)(void* sendComm); 64 | ncclResult_t (*closeRecv)(void* recvComm); 65 | ncclResult_t (*closeListen)(void* listenComm); 66 | } ncclNet_v6_t; 67 | 68 | #endif // end include guard 69 | -------------------------------------------------------------------------------- /ext-net/example/nccl/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_TYPES_H_ 6 | #define NCCL_TYPES_H_ 7 | 8 | /* Data types */ 9 | typedef enum { ncclInt8 = 0, ncclChar = 0, 10 | ncclUint8 = 1, 11 | ncclInt32 = 2, ncclInt = 2, 12 | ncclUint32 = 3, 13 | ncclInt64 = 4, 14 | ncclUint64 = 5, 15 | ncclFloat16 = 6, ncclHalf = 6, 16 | ncclFloat32 = 7, ncclFloat = 7, 17 | ncclFloat64 = 8, ncclDouble = 8, 18 | ncclBfloat16 = 9, 19 | } ncclDataType_t; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /ext-net/google-fastsocket/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_HOME?=/usr/local/cuda 2 | INC:=-I$(CUDA_HOME)/include 3 | PLUGIN_SO:=libnccl-net.so 4 | 5 | default: $(PLUGIN_SO) 6 | 7 | $(PLUGIN_SO): nccl-fastsocket/*.cc 8 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 9 | 10 | nccl-fastsocket/*.cc: 11 | git clone https://github.com/google/nccl-fastsocket.git 12 | 13 | install: $(BUILDDIR)/lib/$(PLUGIN_SO) 14 | 15 | $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) 16 | @printf "Grabbing %-35s > %s\n" $< $@ 17 | mkdir -p $(BUILDDIR)/lib 18 | install -m 644 $< $@ 19 | 20 | clean: 21 | rm -f $(PLUGIN_SO) 22 | rm -Rf nccl-fastsocket 23 | -------------------------------------------------------------------------------- /ext-profiler/example/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME := ../../build 7 | INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl 8 | PLUGIN_SO := libnccl-profiler.so 9 | 10 | default: $(PLUGIN_SO) 11 | 12 | $(PLUGIN_SO): plugin.c event.c print_event.c 13 | $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 14 | 15 | clean: 16 | rm -f $(PLUGIN_SO) 17 | -------------------------------------------------------------------------------- /ext-profiler/example/event.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | #include "event.h" 9 | 10 | int taskEventQueueEmpty(struct group* g) { 11 | return g->eventHead == NULL; 12 | } 13 | 14 | void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) { 15 | event->next = NULL; 16 | if (g->eventHead) g->eventTail->next = event; 17 | else g->eventHead = event; 18 | g->eventTail = event; 19 | } 20 | 21 | struct taskEventBase* taskEventQueueHead(struct group* g) { 22 | return g->eventHead; 23 | } 24 | 25 | struct taskEventBase* taskEventQueueDequeue(struct group* g) { 26 | struct taskEventBase* tmp = g->eventHead; 27 | g->eventHead = g->eventHead->next; 28 | if (g->eventHead == NULL) g->eventTail = NULL; 29 | return tmp; 30 | } 31 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | 
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; 12 | 13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ERR_H_ 8 | #define NCCL_ERR_H_ 9 | 10 | /* Error type for plugins */ 11 | typedef enum { ncclSuccess = 0, 12 | ncclUnhandledCudaError = 1, 13 | ncclSystemError = 2, 14 | ncclInternalError = 3, 15 | ncclInvalidArgument = 4, 16 | ncclInvalidUsage = 5, 17 | ncclRemoteError = 6 } ncclResult_t; 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/net_ib_v1.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_IB_V1_H_ 8 | #define NET_IB_V1_H_ 9 | 10 | #define NCCL_PROFILER_NET_IB_VER 1 11 | 12 | enum { 13 | ncclProfileQp = (1 << 0), 14 | }; 15 | 16 | // The data structure version is encoded in the plugin identifier bitmask and 17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin 18 | // identifier in the event descriptor before calling the profiler startEvent 19 | // function. The profiler should inspect the plugin id to find out the source 20 | // plugin as well as the version of the event struct 21 | typedef struct { 22 | uint8_t type; // event type (plugin defined) 23 | union { 24 | struct { 25 | int device; // network device id 26 | uint64_t wr_id; // work request id 27 | int opcode; // ibv opcode 28 | int qpNum; // QP number 29 | size_t length; // work request data length 30 | } qp; 31 | }; 32 | } ncclProfilerNetIbDescr_v1_t; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/net_socket_v1.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_SOCKET_V1_H_ 8 | #define NET_SOCKET_V1_H_ 9 | 10 | #define NCCL_PROFILER_NET_SOCKET_VER 1 11 | 12 | enum { 13 | ncclProfileSocket = (1 << 0), 14 | }; 15 | 16 | // The data structure version is encoded in the plugin identifier bitmask and 17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin 18 | // identifier in the event descriptor before calling the profiler startEvent 19 | // function. 
The profiler should inspect the plugin id to find out the source 20 | // plugin as well as the version of the event struct 21 | typedef struct { 22 | uint8_t type; // event type (plugin defined) 23 | union { 24 | struct { 25 | int fd; 26 | int op; 27 | size_t length; 28 | } sock; 29 | }; 30 | } ncclProfilerNetSockDescr_v1_t; 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_H_ 8 | #define PROFILER_H_ 9 | 10 | #include 11 | #include 12 | 13 | #include "common.h" 14 | #include "err.h" 15 | 16 | enum { 17 | ncclProfileGroup = (1 << 0), // group event type 18 | ncclProfileColl = (1 << 1), // host collective call event type 19 | ncclProfileP2p = (1 << 2), // host point-to-point call event type 20 | ncclProfileProxyOp = (1 << 3), // proxy operation event type 21 | ncclProfileProxyStep = (1 << 4), // proxy step event type 22 | ncclProfileProxyCtrl = (1 << 5), // proxy control event type 23 | ncclProfileKernelCh = (1 << 6), // kernel channel event type 24 | ncclProfileNetPlugin = (1 << 7), // network plugin-defined events 25 | }; 26 | 27 | typedef enum { 28 | ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 29 | ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 30 | ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 31 | ncclProfilerProxyOpSendDone = 3, // deprecated in v4 32 | ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 33 | ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 34 | ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 35 |
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 36 | ncclProfilerProxyOpInProgress_v4 = 19, 37 | 38 | /* Legacy proxy profiler states */ 39 | ncclProfilerProxyStepSendGPUWait = 8, 40 | ncclProfilerProxyStepSendPeerWait_v4 = 20, 41 | ncclProfilerProxyStepSendWait = 9, 42 | ncclProfilerProxyStepRecvWait = 10, 43 | ncclProfilerProxyStepRecvFlushWait = 11, 44 | ncclProfilerProxyStepRecvGPUWait = 12, 45 | 46 | /* Legacy proxy control states */ 47 | ncclProfilerProxyCtrlIdle = 13, 48 | ncclProfilerProxyCtrlActive = 14, 49 | ncclProfilerProxyCtrlSleep = 15, 50 | ncclProfilerProxyCtrlWakeup = 16, 51 | ncclProfilerProxyCtrlAppend = 17, 52 | ncclProfilerProxyCtrlAppendEnd = 18, 53 | 54 | /* Network defined events states */ 55 | ncclProfilerNetPluginUpdate = 21, 56 | 57 | /* Kernel event states */ 58 | ncclProfilerKernelChStop = 22, 59 | } ncclProfilerEventState_t; 60 | 61 | typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; 62 | typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; 63 | typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; 64 | typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; 65 | 66 | #include "profiler_v4.h" 67 | #include "profiler_v3.h" 68 | #include "profiler_v2.h" 69 | #include "profiler_v1.h" 70 | #include "profiler_net.h" 71 | 72 | typedef ncclProfiler_v4_t ncclProfiler_t; 73 | typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; 74 | typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; 75 | 76 | #endif // end include guard 77 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/profiler_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_NET_H_ 8 | #define PROFILER_NET_H_ 9 | 10 | #define NCCL_PROFILER_NET_VER_BITS (16) 11 | #define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) 12 | #define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) 13 | 14 | typedef enum { 15 | NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), 16 | NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), 17 | } ncclProfilerNetType; 18 | 19 | #include "net_ib_v1.h" 20 | #include "net_socket_v1.h" 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/profiler_v1.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_V1_H_ 8 | #define PROFILER_V1_H_ 9 | 10 | #include 11 | 12 | typedef struct { 13 | uint8_t type; // event type descriptor: ncclProfileColl, ... 
14 | void* parentObj; // pointer to the profiler parent object (for coll is the group) 15 | int rank; // originating rank 16 | union { 17 | struct { 18 | const char* name; 19 | uint64_t commHash; 20 | uint64_t seqNumber; 21 | uint8_t func; 22 | void const* sendBuff; 23 | void* recvBuff; 24 | size_t count; 25 | int root; 26 | uint8_t datatype; 27 | uint32_t op; 28 | size_t trafficBytes; 29 | uint8_t nMaxChannels; 30 | uint8_t nWarps; 31 | uint8_t algo; 32 | uint8_t proto; 33 | int isCollnet; 34 | int isNvls; 35 | } coll; 36 | 37 | struct { 38 | const char* name; 39 | uint64_t commHash; 40 | uint8_t func; 41 | void* buff; 42 | uint8_t datatype; 43 | size_t count; 44 | int peer; 45 | } p2p; 46 | 47 | struct { 48 | pid_t pid; // pid of the originating process 49 | uint8_t channelId; // channel id for this proxy operation 50 | int peer; // remote rank for send/recv 51 | int nSteps; // number of steps for this proxy operation 52 | int chunkSize; // amount of data transferred by this proxy operation 53 | int isSend; 54 | } proxyOp; 55 | 56 | struct { 57 | int step; 58 | } proxyStep; 59 | }; 60 | } ncclProfilerEventDescr_v1_t; 61 | 62 | typedef union { 63 | struct { 64 | size_t transSize; 65 | int steps; 66 | } proxyOp; 67 | 68 | struct { 69 | int appendedProxyOps; 70 | } proxyCtrl; 71 | } ncclProfilerEventStateArgs_v1_t; 72 | 73 | typedef struct { 74 | const char* name; 75 | 76 | // init - initialize the profiler plugin 77 | // Input 78 | // - context : opaque profiler context object for separating profiler behavior across comms 79 | // Output 80 | // - eActivationMask: bitmask of active events set by the plugin 81 | ncclResult_t (*init)(void** context, int* eActivationMask); 82 | 83 | // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset 84 | // Input 85 | // - context: opaque profiler context object 86 | // - eDescr : pointer to ncclProfilerEventDescr_t object 87 | // Output 88 | // - eHandle: return event handle for 
supplied event descriptor object 89 | ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); 90 | 91 | // stopEvent - stop/finalize an event inside an event set 92 | // Input 93 | // - eHandle: handle to event object 94 | ncclResult_t (*stopEvent)(void* eHandle); 95 | 96 | // recordEventState - record event state transitions and event attribute updates 97 | // Input 98 | // - eHandle : handle to event object created through startEvent 99 | // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition 100 | // - eState : event state transition 101 | ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); 102 | 103 | // finalize - finalize the profiler plugin 104 | // Input 105 | // - context: opaque profiler context object 106 | ncclResult_t (*finalize)(void* context); 107 | } ncclProfiler_v1_t; 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/profiler_v2.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_V2_H_ 8 | #define PROFILER_V2_H_ 9 | 10 | #include 11 | 12 | typedef struct { 13 | uint8_t type; // event type descriptor: ncclProfileColl, ...
14 | void* parentObj; // pointer to the profiler parent object (for coll is the group) 15 | int rank; // originating rank 16 | union { 17 | struct { 18 | const char* name; 19 | uint64_t commHash; 20 | uint64_t seqNumber; 21 | const char* func; 22 | void const* sendBuff; 23 | void* recvBuff; 24 | size_t count; 25 | int root; 26 | const char* datatype; 27 | size_t trafficBytes; 28 | uint8_t nMaxChannels; 29 | uint8_t nWarps; 30 | const char* algo; 31 | const char* proto; 32 | } coll; 33 | 34 | struct { 35 | const char* name; 36 | uint64_t commHash; 37 | const char* func; 38 | void* buff; 39 | const char* datatype; 40 | size_t count; 41 | int peer; 42 | } p2p; 43 | 44 | struct { 45 | pid_t pid; // pid of the originating process 46 | uint8_t channelId; // channel id for this proxy operation 47 | int peer; // remote rank for send/recv 48 | int nSteps; // number of steps for this proxy operation 49 | int chunkSize; // amount of data transferred by this proxy operation 50 | int isSend; 51 | } proxyOp; 52 | 53 | struct { 54 | int step; 55 | } proxyStep; 56 | }; 57 | } ncclProfilerEventDescr_v2_t; 58 | 59 | typedef union { 60 | struct { 61 | size_t transSize; 62 | int steps; 63 | } proxyOp; 64 | 65 | struct { 66 | int appendedProxyOps; 67 | } proxyCtrl; 68 | } ncclProfilerEventStateArgs_v2_t; 69 | 70 | typedef struct { 71 | const char* name; 72 | 73 | // init - initialize the profiler plugin 74 | // Input 75 | // - context : opaque profiler context object for separating profiler behavior across comms 76 | // Output 77 | // - eActivationMask: bitmask of active events set by the plugin 78 | ncclResult_t (*init)(void** context, int* eActivationMask); 79 | 80 | // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset 81 | // Input 82 | // - context: opaque profiler context object 83 | // - eDescr : pointer to ncclProfilerEventDescr_t object 84 | // Output 85 | // - eHandle: return event handle for supplied event descriptor object 86 
| ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); 87 | 88 | // stopEvent - stop/finalize an event inside an event set 89 | // Input 90 | // - eHandle: handle to event object 91 | ncclResult_t (*stopEvent)(void* eHandle); 92 | 93 | // recordEventState - record event state transitions and event attribute updates 94 | // Input 95 | // - eHandle : handle to event object created through startEvent 96 | // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition 97 | // - eState : event state transition 98 | ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); 99 | 100 | // finalize - finalize the profiler plugin 101 | // Input 102 | // - context: opaque profiler context object 103 | ncclResult_t (*finalize)(void* context); 104 | } ncclProfiler_v2_t; 105 | 106 | #endif 107 | -------------------------------------------------------------------------------- /ext-profiler/example/nccl/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | */ 4 | 5 | #ifndef NCCL_TYPES_H_ 6 | #define NCCL_TYPES_H_ 7 | 8 | /* Data types */ 9 | typedef enum { ncclInt8 = 0, ncclChar = 0, 10 | ncclUint8 = 1, 11 | ncclInt32 = 2, ncclInt = 2, 12 | ncclUint32 = 3, 13 | ncclInt64 = 4, 14 | ncclUint64 = 5, 15 | ncclFloat16 = 6, ncclHalf = 6, 16 | ncclFloat32 = 7, ncclFloat = 7, 17 | ncclFloat64 = 8, ncclDouble = 8, 18 | ncclBfloat16 = 9, 19 | } ncclDataType_t; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /ext-profiler/example/plugin.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION.
All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PLUGIN_H_ 8 | #define PLUGIN_H_ 9 | 10 | int exampleProfilerStart(int eActivationMask); 11 | int exampleProfilerStop(void); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /ext-profiler/example/print_event.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PRINT_EVENT_H_ 8 | #define PRINT_EVENT_H_ 9 | 10 | #include "nccl/common.h" 11 | extern ncclDebugLogger_t logFn; 12 | 13 | void debugEvent(void* eHandle, const char* tag); 14 | void printEvent(FILE* fh, void* handle); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /ext-tuner/example/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | NCCL_HOME:=../../build/ 7 | CUDA_HOME:=/usr/local/cuda 8 | INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl 9 | PLUGIN_SO:=libnccl-tuner.so 10 | 11 | default: $(PLUGIN_SO) 12 | 13 | $(PLUGIN_SO): plugin.c 14 | $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ 15 | 16 | clean: 17 | rm -f $(PLUGIN_SO) 18 | -------------------------------------------------------------------------------- /ext-tuner/example/nccl/common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 11 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; 12 | 13 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /ext-tuner/example/nccl/err.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
3 | */ 4 | 5 | #ifndef NCCL_ERR_H_ 6 | #define NCCL_ERR_H_ 7 | 8 | /* Error type for plugins */ 9 | typedef enum { ncclSuccess = 0, 10 | ncclUnhandledCudaError = 1, 11 | ncclSystemError = 2, 12 | ncclInternalError = 3, 13 | ncclInvalidArgument = 4, 14 | ncclInvalidUsage = 5, 15 | ncclRemoteError = 6 } ncclResult_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ext-tuner/example/nccl/tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_TUNER_H_ 9 | #define NCCL_TUNER_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "common.h" 15 | #include "err.h" 16 | 17 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now 18 | typedef enum { 19 | ncclFuncBroadcast = 0, 20 | ncclFuncReduce = 1, 21 | ncclFuncAllGather = 2, 22 | ncclFuncReduceScatter = 3, 23 | ncclFuncAllReduce = 4, 24 | ncclFuncSendRecv = 5, 25 | ncclFuncSend = 6, 26 | ncclFuncRecv = 7, 27 | ncclNumFuncs = 8 28 | } ncclFunc_t; 29 | 30 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* 31 | #define NCCL_ALGO_UNDEF -1 32 | #define NCCL_ALGO_TREE 0 33 | #define NCCL_ALGO_RING 1 34 | #define NCCL_ALGO_COLLNET_DIRECT 2 35 | #define NCCL_ALGO_COLLNET_CHAIN 3 36 | #define NCCL_ALGO_NVLS 4 37 | #define NCCL_ALGO_NVLS_TREE 5 38 | #define NCCL_ALGO_PAT 6 39 | 40 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 41 | #define NCCL_PROTO_UNDEF -1 42 | #define NCCL_PROTO_LL 0 43 | #define NCCL_PROTO_LL128 1 44 | #define NCCL_PROTO_SIMPLE 2 45 | 46 | #define NCCL_ALGO_PROTO_IGNORE -1.0 47 | 48 | // API to be implemented by external tuner 49 | typedef struct { 50 | 
// Name of the tuner 51 | const char* name; 52 | 53 | // Initializes tuner states. 54 | // Inputs: 55 | // - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner. 56 | // - nNodes: number of nodes in current communicator. 57 | // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. 58 | // Outputs: 59 | // - context: tuner context object 60 | ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); 61 | 62 | // Gets info (algo, protocol, number of ctas and threads) for a given collective. 63 | // Inputs: 64 | // - context: tuner context object 65 | // - collType: collective type, e.g., allreduce, allgather… 66 | // - nBytes: collective size in bytes 67 | // - numPipeOps: number of operations in the group 68 | // - numAlgo: number of algorithms in collCostTable 69 | // - numProto: number of protocols in collCostTable 70 | // - regBuff: can register user buffer 71 | // 72 | // Outputs: 73 | // - nChannels: number of channels (hence SMs) to be used. 74 | // 75 | // InOut: 76 | // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. 77 | // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). 78 | // 79 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 80 | // default tuning for the given collective. 81 | // Also, the plugin is allowed to not set any output, or set only the 82 | // algorithm and protocol, but not only the algorithm or only the protocol. 83 | // Unset fields will be set automatically by NCCL. 84 | ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, 85 | int numPipeOps, float** collCostTable, int numAlgo, int numProto, 86 | int regBuff, int* nChannels); 87 | 88 | // Terminates the plugin and cleans up any resources that the plugin allocated.
89 | // context: tuner context object 90 | ncclResult_t (*destroy)(void* context); 91 | } ncclTuner_v4_t; 92 | 93 | typedef ncclTuner_v4_t ncclTuner_t; 94 | 95 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /ext-tuner/example/plugin.c: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "tuner.h" 8 | 9 | #define __hidden __attribute__ ((visibility("hidden"))) 10 | 11 | __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } 12 | 13 | __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, 14 | int numPipeOps, float** collCostTable, int numAlgo, int numProto, 15 | int regBuff, int* nChannels) { 16 | // Update NCCL core generated cost table. 
Updated table will be evaluated by NCCL to pick the best algo/proto combo 17 | float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; 18 | if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { 19 | table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; 20 | } 21 | *nChannels = 1; 22 | return ncclSuccess; 23 | } 24 | 25 | __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } 26 | 27 | #define PLUGIN_NAME "Example" 28 | 29 | const ncclTuner_v4_t ncclTunerPlugin_v4 = { 30 | .name = PLUGIN_NAME, 31 | .init = pluginInit, 32 | .getCollInfo = pluginGetCollInfo, 33 | .destroy = pluginDestroy 34 | }; 35 | -------------------------------------------------------------------------------- /makefiles/formatting.mk: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting 8 | # As this file defines a new target (format), it should be included at least after the definition of the 9 | # default target. 
10 | 11 | ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none 12 | ASTYLEDIR := $(BUILDDIR)/contrib 13 | ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz 14 | ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle 15 | ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ 16 | ASTYLEVER := 3.1 17 | ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" 18 | 19 | $(ASTYLEDIR) : 20 | @mkdir -p $(ASTYLEDIR) 21 | 22 | $(ASTYLETAR) : $(ASTYLEDIR) 23 | @wget -q -O $(ASTYLETAR) $(ASTYLEURL) 24 | 25 | $(ASTYLEBLD) : $(ASTYLETAR) 26 | @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) 27 | 28 | $(ASTYLEBIN) : $(ASTYLEBLD) 29 | ${MAKE} -C $(ASTYLEBLD) 30 | 31 | .PHONY : format 32 | format : $(ASTYLEBIN) 33 | @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) 34 | -------------------------------------------------------------------------------- /makefiles/version.mk: -------------------------------------------------------------------------------- 1 | ##### version 2 | NCCL_MAJOR := 2 3 | NCCL_MINOR := 27 4 | NCCL_PATCH := 3 5 | NCCL_SUFFIX := 6 | PKG_REVISION := 1 7 | -------------------------------------------------------------------------------- /pkg/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | .PHONY : all clean 7 | 8 | default : build 9 | build : debian.build txz.build 10 | 11 | BUILDDIR ?= $(abspath ../build) 12 | ABSBUILDDIR := $(abspath $(BUILDDIR)) 13 | TARGETS := debian txz 14 | all: ${TARGETS:%=%.build} 15 | prep: ${TARGETS:%=%.prep} 16 | build: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.prep: 20 | ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} 21 | 22 | %.build: 23 | ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} 24 | 25 | %.clean: 26 | ${MAKE} -C $* clean 27 | -------------------------------------------------------------------------------- /pkg/debian/.gitignore: -------------------------------------------------------------------------------- 1 | /*.debhelper.log 2 | /*.debhelper 3 | /*.substvars 4 | /tmp/ 5 | /files 6 | /libnccl1/ 7 | /libnccl-dev/ 8 | -------------------------------------------------------------------------------- /pkg/debian/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | DEBPREPDIR := $(BUILDDIR)/debian 11 | PKGDIR := $(BUILDDIR)/pkg/deb/ 12 | 13 | DEBGEN_IN := $(wildcard *.in) 14 | DEBGEN := $(DEBGEN_IN:.in=) 15 | DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) 16 | DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) 20 | PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) 21 | 22 | prep : $(DEBTARGETS) 23 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 24 | 25 | build : prep 26 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 27 | @printf "Building Debian package\n" 28 | (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz) 29 | mkdir -p $(PKGDIR) 30 | mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ 31 | 32 | clean: 33 | rm -Rf $(DEBPREPDIR) $(PKGDIR) 34 | 35 | $(DEBPREPDIR)/% : %.in 36 | @printf "Generating %-35s > %s\n" $< $@ 37 | mkdir -p $(DEBPREPDIR) 38 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 39 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 40 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 41 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 42 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 43 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 44 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 45 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 46 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 47 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 48 | $< > $@ 49 | 50 | $(DEBPREPDIR)/% : % 51 | @printf "Grabbing %-35s > %s\n" $< $@ 52 | mkdir -p $(DEBPREPDIR) 53 | cp -f $< $@ 54 | -------------------------------------------------------------------------------- /pkg/debian/changelog.in: -------------------------------------------------------------------------------- 1 | nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium 2 | 3 | * Automatic Debian package from build 4 | 5 | -- cudatools ${pkg:Timestamp} 6 | -------------------------------------------------------------------------------- /pkg/debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /pkg/debian/control.in: -------------------------------------------------------------------------------- 1 | Source: nccl 2 | Section: libs 3 | Maintainer: cudatools 4 | Priority: optional 5 | Build-depends: debhelper(>=9) 6 | Standards-Version: 3.9.5 7 | 8 | Package: libnccl${nccl:Major} 9 | Section: libs 10 | Architecture: 
${pkg:Arch} 11 | Depends: ${misc:Depends}, ${shlibs:Depends} 12 | Description: NVIDIA Collective Communication Library (NCCL) Runtime 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | Package: libnccl-dev 21 | Section: libdevel 22 | Architecture: ${pkg:Arch} 23 | Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) 24 | Description: NVIDIA Collective Communication Library (NCCL) Development Files 25 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 26 | communication routines for GPUs, implementing all-reduce, all-gather, reduce, 27 | broadcast, and reduce-scatter. 28 | It has been optimized to achieve high bandwidth on any platform using PCIe, 29 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 30 | sockets. 
31 | -------------------------------------------------------------------------------- /pkg/debian/copyright: -------------------------------------------------------------------------------- 1 | ../../LICENSE.txt -------------------------------------------------------------------------------- /pkg/debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | debian-branch = master 3 | upstream-branch = master 4 | 5 | ignore-new = True 6 | 7 | [git-buildpackage] 8 | 9 | no-purge = True 10 | -------------------------------------------------------------------------------- /pkg/debian/libnccl-dev.install.in: -------------------------------------------------------------------------------- 1 | bin/ncclras /usr/bin 2 | include/nccl.h /usr/include 3 | lib/libnccl.so /usr/lib/${pkg:MultiArch} 4 | lib/libnccl_static.a /usr/lib/${pkg:MultiArch} 5 | -------------------------------------------------------------------------------- /pkg/debian/libnccl2.install.in: -------------------------------------------------------------------------------- 1 | lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} 2 | lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} 3 | -------------------------------------------------------------------------------- /pkg/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | %: 4 | dh $@ --parallel 5 | 6 | override_dh_auto_install: 7 | PREFIX=debian/tmp dh_auto_install 8 | 9 | override_dh_auto_test: 10 | # Do not make test 11 | 12 | override_dh_auto_clean: 13 | # Do not make clean 14 | 15 | override_dh_builddeb: 16 | dh_builddeb -- -Zxz 17 | -------------------------------------------------------------------------------- /pkg/debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | 
-------------------------------------------------------------------------------- /pkg/redhat/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | RPMPREPDIR := $(BUILDDIR)/redhat 11 | PKGDIR := $(BUILDDIR)/pkg/rpm/ 12 | 13 | RPMGEN_IN := $(wildcard *.in) 14 | RPMGEN := $(RPMGEN_IN:.in=) 15 | RPMFILES := $(RPMGEN) 16 | RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) 17 | 18 | PKG_TIMESTAMP := $(shell date -R) 19 | ARCH := $(shell uname -m) 20 | PKG_ARCH ?= $(shell uname -m) 21 | PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) 22 | ifeq ($(PKG_MULTIARCH),) 23 | # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it 24 | PKG_MULTIARCH := $(ARCH)-linux-gnu 25 | endif 26 | 27 | prep : $(RPMTARGETS) 28 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 29 | 30 | build : prep 31 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 32 | $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) 33 | @printf "Building Redhat package\n" 34 | mkdir -p $(PKGDIR) 35 | rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ 36 | --define "_rpmdir $(PKGDIR)" \ 37 | --define "_builddir $(PKGDIR)/build/" \ 38 | --define "_buildrootdir $(PKGDIR)/buildroot/" \ 39 | -bb $(BUILDDIR)/redhat/nccl.spec 40 | 41 | clean: 42 | rm -Rf $(RPMPREPDIR) $(PKGDIR) 43 | 44 | $(RPMPREPDIR)/% : %.in 45 | @printf "Generating %-35s > %s\n" $< $@ 46 | mkdir -p $(RPMPREPDIR) 47 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 48 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 49 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 50 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 51 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 52 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 53 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 54 | -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ 55 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 56 | -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ 57 | $< > $@ 58 | 59 | $(RPMPREPDIR)/% : % 60 | @printf "Grabbing %-35s > %s\n" $< $@ 61 | mkdir -p $(RPMPREPDIR) 62 | cp -f $< $@ 63 | -------------------------------------------------------------------------------- /pkg/redhat/nccl.spec.in: -------------------------------------------------------------------------------- 1 | Name: libnccl 2 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} 3 | Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} 4 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 5 | 6 | Group: Development/Libraries 7 | License: BSD 8 | URL: http://developer.nvidia.com/nccl 9 | Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz 10 | Requires(pre,preun): /sbin/ldconfig 11 | 12 | %description 13 | NCCL (pronounced "Nickel") is a stand-alone library of standard collective 14 | communication routines for GPUs, implementing all-reduce, 
all-gather, reduce, 15 | broadcast, and reduce-scatter. 16 | It has been optimized to achieve high bandwidth on any platform using PCIe, 17 | NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP 18 | sockets. 19 | 20 | %package devel 21 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 22 | Group: Development/Libraries 23 | Requires: libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 24 | %description devel 25 | NCCL development files 26 | 27 | %package static 28 | Summary: NVIDIA Collective Communication Library (NCCL) Runtime 29 | Group: Development/Libraries 30 | %description static 31 | NCCL static library 32 | 33 | %define debug_package %{nil} 34 | 35 | %prep 36 | %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q 37 | 38 | %build 39 | 40 | %install 41 | rm -rf $RPM_BUILD_ROOT 42 | install -m 755 -d $RPM_BUILD_ROOT 43 | install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} 44 | install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} 45 | ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} 46 | 47 | # devel 48 | install -m 755 -d $RPM_BUILD_ROOT/%{_bindir} 49 | install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} 50 | install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir} 51 | install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} 52 | ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so 53 | 54 | # static 55 | install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} 56 | 57 | %post -p /sbin/ldconfig 58 | %postun -p /sbin/ldconfig 59 | 60 | %post devel -p /sbin/ldconfig 61 | %postun devel -p /sbin/ldconfig 62 | 63 | %clean 64 | rm -rf $RPM_BUILD_ROOT 65 | 66 | %files devel 67 | %doc LICENSE.txt 68 | %defattr(-,root,root,-) 69 | %{_bindir}/ncclras 70 | %{_includedir}/nccl.h 71 | %{_libdir}/libnccl.so 72 | 73 | %files static 74 | 
%doc LICENSE.txt 75 | %defattr(-,root,root,-) 76 | %{_libdir}/libnccl_static.a 77 | 78 | %files 79 | %doc LICENSE.txt 80 | %defattr(-,root,root,-) 81 | %{_libdir}/libnccl.so.${nccl:Major} 82 | %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} 83 | 84 | %changelog 85 | -------------------------------------------------------------------------------- /pkg/srctxz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/srctxz 11 | PKGDIR := $(BUILDDIR)/pkg/srctxz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_REVISION ?= 3 18 | PKG_ARCH := $(shell uname -m) 19 | 20 | prep: $(TXZTARGETS) 21 | 22 | build: prep 23 | $(MAKE) -C ../../src clean 24 | @printf "Building source tar.xz package\n" 25 | (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 40 | $< > $@ 41 | -------------------------------------------------------------------------------- /pkg/srctxz/create_srctxz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | cd .. 11 | NCCLDIR=`basename $PWD` 12 | 13 | echo "Checking for unclean directory ..." 14 | git clean -x -i 15 | echo "Clean done." 16 | echo "Checking for uncommited files ..." 17 | if [ "`git status -s | wc -l`" != "0" ]; then 18 | git status -s 19 | echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" 20 | read 21 | fi 22 | 23 | cd .. 24 | NCCL_MAJOR=${nccl:Major} 25 | NCCL_MINOR=${nccl:Minor} 26 | NCCL_PATCH=${nccl:Patch} 27 | NCCL_SUFFIX=${nccl:Suffix} 28 | NCCL_BUILD=${pkg:Revision} 29 | 30 | NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" 31 | 32 | tar --exclude build \ 33 | --exclude ".git*" \ 34 | --exclude pkg/srctxz \ 35 | --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR 36 | -------------------------------------------------------------------------------- /pkg/txz/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include ../../makefiles/common.mk 8 | include ../../makefiles/version.mk 9 | BUILDDIR ?= $(abspath ../../build) 10 | TXZPREPDIR := $(BUILDDIR)/txz 11 | PKGDIR := $(BUILDDIR)/pkg/txz/ 12 | 13 | TXZGEN_IN := $(wildcard *.in) 14 | TXZGEN := $(TXZGEN_IN:.in=) 15 | TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) 16 | 17 | PKG_ARCH := $(shell uname -m) 18 | 19 | prep: $(TXZTARGETS) 20 | $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) 21 | 22 | build: prep 23 | $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) 24 | @printf "Building tar.xz package\n" 25 | (cd $(BUILDDIR); bash txz/create_txz.sh) 26 | mkdir -p $(PKGDIR) 27 | mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) 28 | 29 | clean: 30 | rm -Rf $(TXZPREPDIR) $(PKGDIR) 31 | 32 | $(TXZPREPDIR)/% : %.in 33 | @printf "Generating %-35s > %s\n" $< $@ 34 | mkdir -p $(TXZPREPDIR) 35 | sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ 36 | -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ 37 | -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ 38 | -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ 39 | -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ 40 | -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ 41 | -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ 42 | -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ 43 | $< > $@ 44 | -------------------------------------------------------------------------------- /pkg/txz/create_txz.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # See LICENSE.txt for license information 6 | # 7 | 8 | # To run from $BUILDDIR/ 9 | 10 | BUILDDIR=`basename $PWD` 11 | 12 | cd .. 
13 | NCCL_MAJOR=${nccl:Major} 14 | NCCL_MINOR=${nccl:Minor} 15 | NCCL_PATCH=${nccl:Patch} 16 | NCCL_SUFFIX=${nccl:Suffix} 17 | CUDA_MAJOR=${cuda:Major} 18 | CUDA_MINOR=${cuda:Minor} 19 | PKG_REVISION=${pkg:Revision} 20 | PKG_ARCH=${pkg:Arch} 21 | 22 | NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" 23 | 24 | tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt 25 | -------------------------------------------------------------------------------- /src/device/broadcast.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "device.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { 14 | ncclRing *ring = &ncclShmem.channel.ring; 15 | const int rank = ring->userRanks[0]; 16 | const int nextRank = ring->userRanks[1]; 17 | const int root = work->root; 18 | ssize_t chunkCount; 19 | ssize_t channelCount; 20 | ssize_t gridOffset; 21 | ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); 22 | size_t offset; 23 | int nelem; 24 | int workNthreads; 25 | bool isNetOffload = work->isOneRPN && work->netRegUsed; 26 | 27 | T *inputBuf = (T*)work->sendbuff; 28 | T *outputBuf = (T*)work->recvbuff; 29 | workNthreads = isNetOffload ? WARP_SIZE : nthreads; 30 | 31 | if (tid < workNthreads) { 32 | // Coverity reports that the callee treats &ring->next as an array. 
However, due to the use of 33 | // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 34 | // coverity[callee_ptr_arith:FALSE] 35 | Primitives, 1, Proto, 0> 36 | prims(tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); 37 | 38 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 39 | offset = gridOffset + elemOffset; 40 | nelem = min(chunkCount, channelCount - elemOffset); 41 | 42 | if (rank == root) { 43 | if (inputBuf == outputBuf || isNetOffload) { 44 | prims.directSend(offset, offset, nelem); 45 | } else { 46 | prims.directCopySend(offset, offset, nelem); 47 | } 48 | } else if (nextRank == root) { 49 | prims.directRecv(offset, nelem); 50 | } else { 51 | prims.directRecvCopyDirectSend(offset, offset, nelem); 52 | } 53 | } 54 | } else if (inputBuf != outputBuf && rank == root) { 55 | inputBuf = inputBuf + gridOffset; 56 | outputBuf = outputBuf + gridOffset; 57 | reduceCopy 58 | (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, channelCount); 59 | } 60 | if (isNetOffload) barrier_sync(14, nthreads); 61 | } 62 | } 63 | 64 | template 65 | struct RunWorkColl { 66 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 67 | using Proto = ProtoSimple; 68 | runRing(tid, nthreads, work); 69 | } 70 | }; 71 | 72 | template 73 | struct RunWorkColl { 74 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 75 | runRing(tid, nthreads, work); 76 | } 77 | }; 78 | 79 | template 80 | struct RunWorkColl { 81 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 82 | runRing(tid, nthreads, work); 83 | } 84 | }; 85 | -------------------------------------------------------------------------------- /src/device/common.cu: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "device.h" 8 | #include "collectives.h" 9 | #include "common.h" 10 | 11 | __shared__ ncclShmemData ncclShmem; 12 | #if __CUDA_ARCH__ < 700 13 | __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; 14 | #endif 15 | 16 | struct RunWorkNop { 17 | __device__ void run() {} 18 | }; 19 | 20 | __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { 21 | ncclKernelMain<-1, RunWorkNop>(&args4K.args); 22 | } 23 | 24 | __device__ void ncclDevFunc_Nop() {} 25 | -------------------------------------------------------------------------------- /src/device/network/unpack/unpack_defs.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, Google LLC. All rights reserved. 3 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | #ifndef NET_DEVICE_UNPACK_DEFS_H 8 | #define NET_DEVICE_UNPACK_DEFS_H 9 | 10 | #include 11 | 12 | #include "device.h" 13 | 14 | #define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16 15 | 16 | union alignas(16) loadMeta { 17 | uint64_t r64[2]; 18 | struct { 19 | uint32_t src_off; 20 | uint32_t len; 21 | uint64_t dst_off; 22 | }; 23 | }; 24 | static_assert(sizeof(union loadMeta) == 16, "Must be 16-byte aligned"); 25 | 26 | /****** global memory ******/ 27 | 28 | #define NET_UNPACK_MAX_QUEUE_DEPTH 16 // MAX_REQUESTS 29 | #define NET_UNPACK_MAX_SLICE_SIZE 4194304 // 4MB per Irecv call 30 | #define SLICE_PAGE_SIZE 4096 31 | #define NET_UNPACK_MAX_SLICE_PAGES \ 32 | (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2) // * 2 for slack, wasteful.. 33 | 34 | struct netUnpackMeta { 35 | loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES]; 36 | uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH]; 37 | }; 38 | 39 | struct unpackNetDeviceHandle { 40 | struct netUnpackMeta *meta; // mapped 41 | void* bounce_buf; 42 | uint64_t head; 43 | }; 44 | 45 | /****** shared memory ******/ 46 | 47 | #define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h 48 | #define NET_UNPACK_MAX_NPEERS 2 // The most you should have is 2 network peers per-group (indexed by index) 49 | #define WARP_SHM_PAGE_CNT 4 50 | #define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta)) 51 | struct unpackShmem { 52 | void* bounce_buf; 53 | }; 54 | 55 | struct unpackGroupShmem { 56 | int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv 57 | uint64_t head[NET_UNPACK_MAX_NPEERS]; 58 | struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy 59 | }; 60 | 61 | #endif // NET_DEVICE_UNPACK_DEFS_H 62 |
-------------------------------------------------------------------------------- /src/device/onerank.cu: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "alloc.h" 8 | #include "collectives.h" 9 | #include "common_kernel.h" 10 | #include "common.h" 11 | #include 12 | 13 | namespace { 14 | template 15 | __global__ __launch_bounds__(512, 1) 16 | void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) { 17 | using T = typename RedOp::EltType; 18 | int tid = threadIdx.x; 19 | int tn = blockDim.x; 20 | int bid = blockIdx.x; 21 | int bn = gridDim.x; 22 | 23 | // each block/channel gets a roughly equal segment of 16 byte packs 24 | constexpr int EltPerPack = 16/sizeof(T); 25 | intptr_t i0 = (bid+0)*alignUp(nElts/bn, EltPerPack); 26 | intptr_t i1 = (bid+1)*alignUp(nElts/bn, EltPerPack); 27 | i0 = min(i0, nElts); 28 | i1 = min(i1, nElts); 29 | src = (T*)src + i0; 30 | dst = (T*)dst + i0; 31 | 32 | if (redOpArgIsPtr) { 33 | if (redOpArg%2 != 0) { 34 | redOpArg = *reinterpret_cast(redOpArg); 35 | } else if (redOpArg%4 != 0) { 36 | redOpArg = *reinterpret_cast(redOpArg); 37 | } else if (redOpArg%8 != 0) { 38 | redOpArg = *reinterpret_cast(redOpArg); 39 | } else { 40 | redOpArg = *reinterpret_cast(redOpArg); 41 | } 42 | } 43 | reduceCopy 44 | (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0); 45 | } 46 | } 47 | 48 | ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) { 49 | size_t eltSize = ncclTypeSize(eltType); 50 | if (redOp.op != ncclDevPreMulSum) { 51 | if (dst != src) { 52 | 
NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream)); 53 | } 54 | return ncclSuccess; 55 | } 56 | 57 | void const* kernel; 58 | switch (eltType) { 59 | case ncclInt8: kernel = (void const*)&oneRankReduce>; break; 60 | case ncclUint8: kernel = (void const*)&oneRankReduce>; break; 61 | case ncclInt32: kernel = (void const*)&oneRankReduce>; break; 62 | case ncclUint32: kernel = (void const*)&oneRankReduce>; break; 63 | case ncclInt64: kernel = (void const*)&oneRankReduce>; break; 64 | case ncclUint64: kernel = (void const*)&oneRankReduce>; break; 65 | #if defined(__CUDA_FP8_TYPES_EXIST__) && __CUDA_ARCH__ >= 900 66 | case ncclFloat8e4m3: kernel = (void const*)&oneRankReduce>; break; 67 | case ncclFloat8e5m2: kernel = (void const*)&oneRankReduce>; break; 68 | #endif 69 | case ncclFloat16: kernel = (void const*)&oneRankReduce>; break; 70 | #if defined(__CUDA_BF16_TYPES_EXIST__) 71 | case ncclBfloat16: kernel = (void const*)&oneRankReduce>; break; 72 | #endif 73 | case ncclFloat32: kernel = (void const*)&oneRankReduce>; break; 74 | case ncclFloat64: kernel = (void const*)&oneRankReduce>; break; 75 | default: return ncclInvalidArgument; 76 | } 77 | dim3 grid = {0, 1, 1}; 78 | grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10)); 79 | dim3 block = {512, 1, 1}; 80 | void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr}; 81 | CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream)); 82 | return ncclSuccess; 83 | } 84 | -------------------------------------------------------------------------------- /src/device/reduce.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "device.h" 8 | #include "collectives.h" 9 | #include "primitives.h" 10 | 11 | namespace { 12 | template 13 | __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { 14 | ncclRing *ring = &ncclShmem.channel.ring; 15 | const int nranks = ncclShmem.comm.nRanks; 16 | const int rank = ncclShmem.comm.rank; 17 | const int prevRank = ring->userRanks[nranks-1]; 18 | const int root = work->root; 19 | size_t chunkCount; 20 | size_t channelCount; 21 | size_t gridOffset; 22 | ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); 23 | size_t offset; 24 | int nelem; 25 | 26 | // Coverity reports that the callee treats &ring->next as an array. However, due to the use of 27 | // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 28 | // coverity[callee_ptr_arith:FALSE] 29 | Primitives, 0, Proto, 0> 30 | prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg); 31 | 32 | if (prevRank == root) { 33 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 34 | offset = gridOffset + elemOffset; 35 | nelem = min(chunkCount, channelCount - elemOffset); 36 | prims.send(offset, nelem); 37 | } 38 | } 39 | else if (rank == root) { 40 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 41 | offset = gridOffset + elemOffset; 42 | nelem = min(chunkCount, channelCount - elemOffset); 43 | prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); 44 | } 45 | } 46 | else { 47 | for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { 48 | offset = gridOffset + elemOffset; 49 | nelem = min(chunkCount, channelCount - elemOffset); 50 | prims.recvReduceSend(offset, nelem); 51 | } 52 | } 53 | } 54 | } 55 
| 56 | template 57 | struct RunWorkColl { 58 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 59 | using Proto = ProtoSimple; 60 | runRing(tid, nthreads, work); 61 | } 62 | }; 63 | 64 | template 65 | struct RunWorkColl { 66 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 67 | runRing(tid, nthreads, work); 68 | } 69 | }; 70 | 71 | template 72 | struct RunWorkColl { 73 | __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { 74 | runRing(tid, nthreads, work); 75 | } 76 | }; 77 | -------------------------------------------------------------------------------- /src/device/symmetric/kernel.cuh: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_ 2 | #define NCCL_DEVICE_SYMMETRIC_KERNEL_H_ 3 | 4 | #include "symmetric.h" 5 | 6 | template typename Red, typename T> 7 | __device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args); 8 | template typename Red, typename T> 9 | __device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args); 10 | 11 | template typename Red, typename T> 12 | __device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args); 13 | template typename Red, typename T> 14 | __device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args); 15 | 16 | __device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args); 17 | __device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args); 18 | __device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args); 19 | __device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args); 20 | 21 | template typename Red, typename T> 22 | __device__ __forceinline__ void 
ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args); 23 | template typename Red, typename T> 24 | __device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args); 25 | template typename Red, typename T> 26 | __device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args); 27 | #endif 28 | -------------------------------------------------------------------------------- /src/enhcompat.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ 8 | 9 | enum cudaError_t { cudaErrorStubLibrary = 34 }; 10 | 11 | extern "C" { 12 | 13 | cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); 14 | cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } 15 | 16 | cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); 17 | cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } 18 | 19 | cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); 20 | cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } 21 | 22 | cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak)); 23 | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } 24 | 25 | cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); 26 | cudaError_t cudaGetDriverEntryPoint(...) 
{ return cudaErrorStubLibrary; } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/graph/rings.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "core.h" 8 | 9 | void dumpLine(int* values, int nranks, const char* prefix) { 10 | constexpr int line_length = 128; 11 | char line[line_length]; 12 | int num_width = snprintf(nullptr, 0, "%d", nranks-1); // safe as per "man snprintf" 13 | int n = snprintf(line, line_length, "%s", prefix); 14 | for (int i = 0; i < nranks && n < line_length-1; i++) { 15 | n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]); 16 | // At this point n may be more than line_length-1, so don't use it 17 | // for indexing into "line". 18 | } 19 | if (n >= line_length) { 20 | // Sprintf wanted to write more than would fit in the buffer. Assume 21 | // line_length is at least 4 and replace the end with "..." to 22 | // indicate that it was truncated. 
23 | snprintf(line+line_length-4, 4, "..."); 24 | } 25 | INFO(NCCL_INIT, "%s", line); 26 | } 27 | 28 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { 29 | for (int r=0; r 13 | 14 | ncclResult_t initChannel(struct ncclComm* comm, int channelid); 15 | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 16 | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); 17 | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); 18 | 19 | inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { 20 | if (comm->nNodes > 1) { 21 | int nodeDelta = p2pRound/comm->maxLocalRanks; 22 | int localDelta = p2pRound%comm->maxLocalRanks; 23 | int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); 24 | base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; 25 | return base & 0xff; 26 | } else { 27 | return p2pRound & 0xff; 28 | } 29 | } 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/include/coll_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef COLL_NET_H_ 8 | #define COLL_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | 13 | typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 14 | 15 | // Translation to external API 16 | static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } 17 | static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } 18 | static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } 19 | static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } 20 | static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } 21 | static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } 22 | static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } 23 | /* DMA-BUF support */ 24 | static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } 25 | static ncclResult_t 
collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } 26 | static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { 27 | NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } 28 | static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } 29 | static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } 30 | static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } 31 | static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } 32 | 33 | static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/include/core.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CORE_H_ 8 | #define NCCL_CORE_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include // For std::min/std::max 15 | #include "nccl.h" 16 | 17 | #ifdef PROFAPI 18 | #define NCCL_API(ret, func, args...) \ 19 | __attribute__ ((visibility("default"))) \ 20 | __attribute__ ((alias(#func))) \ 21 | ret p##func (args); \ 22 | extern "C" \ 23 | __attribute__ ((visibility("default"))) \ 24 | __attribute__ ((weak)) \ 25 | ret func(args) 26 | #else 27 | #define NCCL_API(ret, func, args...) \ 28 | extern "C" \ 29 | __attribute__ ((visibility("default"))) \ 30 | ret func(args) 31 | #endif // end PROFAPI 32 | 33 | #include "debug.h" 34 | #include "checks.h" 35 | #include "cudawrap.h" 36 | #include "alloc.h" 37 | #include "utils.h" 38 | #include "param.h" 39 | #include "nvtx.h" 40 | 41 | #endif // end include guard 42 | -------------------------------------------------------------------------------- /src/include/cpuset.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_CPUSET_H_ 8 | #define NCCL_CPUSET_H_ 9 | 10 | // Convert local_cpus, e.g. 
0003ff,f0003fff to cpu_set_t 11 | 12 | static int hexToInt(char c) { 13 | int v = c - '0'; 14 | if (v < 0) return -1; 15 | if (v > 9) v = 10 + c - 'a'; 16 | if ((v < 0) || (v > 15)) return -1; 17 | return v; 18 | } 19 | 20 | #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) 21 | 22 | static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { 23 | uint32_t cpumasks[CPU_SET_N_U32]; 24 | int m = CPU_SET_N_U32-1; 25 | cpumasks[m] = 0; 26 | for (int o=0; o=0; o--) { 49 | if (c == 0 && m8[o] == 0) continue; 50 | sprintf(str+c, "%02x", m8[o]); 51 | c+=2; 52 | if (o && o%4 == 0) { 53 | sprintf(str+c, ","); 54 | c++; 55 | } 56 | } 57 | str[c] = '\0'; 58 | return ncclSuccess; 59 | } 60 | 61 | static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) { 62 | int c = 0; 63 | int start = -1; 64 | // Iterate through all possible CPU bits plus one extra position 65 | for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) { 66 | int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask); 67 | // Start of a new range 68 | if (isSet && start == -1) { 69 | start = cpu; 70 | } 71 | // End of a range, add comma between ranges 72 | if (!isSet && start != -1) { 73 | if (cpu-1 == start) { 74 | c += snprintf(str+c, len-c, "%s%d", c ? "," : "", start); 75 | } else { 76 | c += snprintf(str+c, len-c, "%s%d-%d", c ? "," : "", start, cpu-1); 77 | } 78 | if (c >= len-1) break; 79 | start = -1; 80 | } 81 | } 82 | if (c == 0) str[0] = '\0'; 83 | return str; 84 | } 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /src/include/debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_DEBUG_H_ 8 | #define NCCL_INT_DEBUG_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_common.h" 12 | #include 13 | 14 | #include 15 | 16 | // Conform to pthread and NVTX standard 17 | #define NCCL_THREAD_NAMELEN 16 18 | 19 | extern int ncclDebugLevel; 20 | extern FILE *ncclDebugFile; 21 | 22 | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); 23 | 24 | // Let code temporarily downgrade WARN into INFO 25 | extern thread_local int ncclDebugNoWarn; 26 | extern char ncclLastError[]; 27 | 28 | #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 29 | #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) 30 | #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) 31 | #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) 32 | 33 | #ifdef ENABLE_TRACE 34 | #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) 35 | #else 36 | #define TRACE(...) 37 | #endif 38 | 39 | void ncclSetThreadName(pthread_t thread, const char *fmt, ...); 40 | 41 | void ncclResetDebugInit(); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/include/enqueue.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ENQUEUE_H_ 8 | #define NCCL_ENQUEUE_H_ 9 | 10 | #include "comm.h" 11 | #include "group.h" 12 | #include "collectives.h" 13 | #include "utils.h" 14 | 15 | #define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t) 16 | #define NCCL_LL128_ALIGNMENT_PER_WARP 480 17 | #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) 18 | #define NCCL_BYTES_ALIGNMENT 16 19 | 20 | ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize); 21 | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); 22 | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); 23 | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 24 | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); 25 | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); 26 | ncclResult_t ncclLaunchFinish(struct ncclComm* comm); 27 | ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); 28 | ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm); 29 | 30 | static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { 31 | return func == ncclFuncReduceScatter ? nRanks*count : count; 32 | } 33 | static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { 34 | return func == ncclFuncAllGather ? nRanks*count : count; 35 | } 36 | static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { 37 | return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? 
nRanks*count : count; 38 | } 39 | 40 | #endif // End include guard 41 | -------------------------------------------------------------------------------- /src/include/ibvsymbols.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_IBV_SYMBOLS_H_ 2 | #define NCCL_IBV_SYMBOLS_H_ 3 | 4 | #ifdef NCCL_BUILD_RDMA_CORE 5 | #include 6 | #else 7 | #include "ibvcore.h" 8 | #endif 9 | 10 | #include "nccl.h" 11 | 12 | /* IB Verbs Function Pointers*/ 13 | struct ncclIbvSymbols { 14 | int (*ibv_internal_fork_init)(void); 15 | struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); 16 | void (*ibv_internal_free_device_list)(struct ibv_device **list); 17 | const char * (*ibv_internal_get_device_name)(struct ibv_device *device); 18 | struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); 19 | int (*ibv_internal_close_device)(struct ibv_context *context); 20 | int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); 21 | void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); 22 | int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); 23 | int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); 24 | int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); 25 | int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); 26 | struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); 27 | int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); 28 | struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); 29 | struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); 30 | /* DMA-BUF support */ 31 | struct ibv_mr * 
(*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); 32 | int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); 33 | struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); 34 | int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); 35 | struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); 36 | int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); 37 | int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); 38 | const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); 39 | int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); 40 | int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); 41 | }; 42 | 43 | /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ 44 | ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); 45 | 46 | #endif // NCCL_IBV_SYMBOLS_H_ 47 | -------------------------------------------------------------------------------- /src/include/info.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INFO_H_ 8 | #define NCCL_INFO_H_ 9 | 10 | #include "nccl.h" 11 | #include "collectives.h" 12 | #include "core.h" 13 | #include "utils.h" 14 | 15 | // Used to pass NCCL call information between functions 16 | struct ncclInfo { 17 | ncclFunc_t coll; 18 | const char* opName; 19 | // NCCL Coll Args 20 | const void* sendbuff; 21 | void* recvbuff; 22 | size_t count; 23 | ncclDataType_t datatype; 24 | ncclRedOp_t op; 25 | int root; // peer for p2p operations 26 | ncclComm_t comm; 27 | cudaStream_t stream; 28 | // Algorithm details 29 | int chunkSteps; 30 | int sliceSteps; 31 | }; 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/include/ipcsocket.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See COPYRIGHT for license information 5 | */ 6 | 7 | #ifndef NCCL_IPCSOCKET_H 8 | #define NCCL_IPCSOCKET_H 9 | 10 | #include "nccl.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define NCCL_IPC_SOCKNAME_LEN 64 24 | 25 | struct ncclIpcSocket { 26 | int fd; 27 | char socketName[NCCL_IPC_SOCKNAME_LEN]; 28 | volatile uint32_t* abortFlag; 29 | }; 30 | 31 | ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); 32 | ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); 33 | ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); 34 | 35 | ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); 36 | ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); 37 | 38 | ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash); 39 | ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd); 40 | 41 | #endif /* NCCL_IPCSOCKET_H */ 42 | -------------------------------------------------------------------------------- /src/include/mlx5/mlx5dvcore.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_MLX5DV_CORE_H_ 2 | #define NCCL_MLX5DV_CORE_H_ 3 | 4 | /* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without 5 | * explicit including of MLX5 direct verbs header. 
6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "ibvwrap.h" 13 | 14 | enum mlx5dv_reg_dmabuf_access { 15 | MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT = (1<<0), 16 | }; 17 | 18 | #endif // NCCL_MLX5DV_CORE_H_ 19 | -------------------------------------------------------------------------------- /src/include/mlx5/mlx5dvsymbols.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_MLX5DV_SYMBOLS_H_ 2 | #define NCCL_MLX5DV_SYMBOLS_H_ 3 | 4 | #ifdef NCCL_BUILD_MLX5DV 5 | #include 6 | #else 7 | #include "mlx5/mlx5dvcore.h" 8 | #endif 9 | 10 | #include "nccl.h" 11 | 12 | /* MLX5 Direct Verbs Function Pointers*/ 13 | struct ncclMlx5dvSymbols { 14 | bool (*mlx5dv_internal_is_supported)(struct ibv_device *device); 15 | int (*mlx5dv_internal_get_data_direct_sysfs_path)(struct ibv_context *context, char *buf, size_t buf_len); 16 | /* DMA-BUF support */ 17 | struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); 18 | }; 19 | 20 | /* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */ 21 | ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols); 22 | 23 | #endif // NCCL_MLX5DV_SYMBOLS_H_ 24 | -------------------------------------------------------------------------------- /src/include/mlx5/mlx5dvwrap.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. 3 | * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. 4 | * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. 5 | * Copyright (c) 2005 PathScale, Inc. All rights reserved. 6 | * 7 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
8 | * 9 | * See LICENSE.txt for license information 10 | ************************************************************************/ 11 | 12 | #ifndef NCCL_MLX5DVWRAP_H_ 13 | #define NCCL_MLX5DVWRAP_H_ 14 | 15 | #include 16 | #include 17 | #ifdef NCCL_BUILD_MLX5DV 18 | #include 19 | #else 20 | #include "mlx5/mlx5dvcore.h" 21 | #endif 22 | 23 | #include "core.h" 24 | #include "ibvwrap.h" 25 | #include 26 | #include 27 | 28 | typedef enum mlx5dv_return_enum 29 | { 30 | MLX5DV_SUCCESS = 0, //!< The operation was successful 31 | } mlx5dv_return_t; 32 | 33 | ncclResult_t wrap_mlx5dv_symbols(void); 34 | /* NCCL wrappers of MLX5 direct verbs functions */ 35 | bool wrap_mlx5dv_is_supported(struct ibv_device *device); 36 | ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len); 37 | /* DMA-BUF support */ 38 | ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); 39 | struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); 40 | 41 | #endif // NCCL_MLX5DVWRAP_H_ 42 | -------------------------------------------------------------------------------- /src/include/mnnvl.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_MNNVL_H_ 8 | #define NCCL_MNNVL_H_ 9 | 10 | #include "nccl.h" 11 | #include "comm.h" 12 | 13 | ncclResult_t ncclMnnvlCheck(struct ncclComm* comm); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/include/nccl_common.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_DEBUG_H_ 8 | #define NCCL_DEBUG_H_ 9 | 10 | #include 11 | 12 | typedef enum { 13 | NCCL_LOG_NONE = 0, 14 | NCCL_LOG_VERSION = 1, 15 | NCCL_LOG_WARN = 2, 16 | NCCL_LOG_INFO = 3, 17 | NCCL_LOG_ABORT = 4, 18 | NCCL_LOG_TRACE = 5 19 | } ncclDebugLogLevel; 20 | 21 | typedef enum { 22 | NCCL_INIT = 0x1, 23 | NCCL_COLL = 0x2, 24 | NCCL_P2P = 0x4, 25 | NCCL_SHM = 0x8, 26 | NCCL_NET = 0x10, 27 | NCCL_GRAPH = 0x20, 28 | NCCL_TUNING = 0x40, 29 | NCCL_ENV = 0x80, 30 | NCCL_ALLOC = 0x100, 31 | NCCL_CALL = 0x200, 32 | NCCL_PROXY = 0x400, 33 | NCCL_NVLS = 0x800, 34 | NCCL_BOOTSTRAP = 0x1000, 35 | NCCL_REG = 0x2000, 36 | NCCL_PROFILE = 0x4000, 37 | NCCL_RAS = 0x8000, 38 | NCCL_ALL = ~0 39 | } ncclDebugLogSubSys; 40 | 41 | typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); 42 | 43 | // NCCL core profiler callback for network defined events instrumentation 44 | enum { 45 | ncclProfilerNetEventStart = 0, 46 | ncclProfilerNetEventStop, 47 | ncclProfilerNetEventUpdate, 48 | ncclProfilerNetEventUpdateAndStop, 49 | }; 50 | 51 | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* 
pHandle, int64_t pluginId, void* extData); 52 | 53 | #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now 54 | typedef enum { 55 | ncclFuncBroadcast = 0, 56 | ncclFuncReduce = 1, 57 | ncclFuncAllGather = 2, 58 | ncclFuncReduceScatter = 3, 59 | ncclFuncAllReduce = 4, 60 | ncclFuncSendRecv = 5, 61 | ncclFuncSend = 6, 62 | ncclFuncRecv = 7, 63 | ncclNumFuncs = 8 64 | } ncclFunc_t; 65 | 66 | #define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT 67 | #define NCCL_ALGO_UNDEF -1 68 | #define NCCL_ALGO_TREE 0 69 | #define NCCL_ALGO_RING 1 70 | #define NCCL_ALGO_COLLNET_DIRECT 2 71 | #define NCCL_ALGO_COLLNET_CHAIN 3 72 | #define NCCL_ALGO_NVLS 4 73 | #define NCCL_ALGO_NVLS_TREE 5 74 | #define NCCL_ALGO_PAT 6 75 | 76 | #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 77 | #define NCCL_PROTO_UNDEF -1 78 | #define NCCL_PROTO_LL 0 79 | #define NCCL_PROTO_LL128 1 80 | #define NCCL_PROTO_SIMPLE 2 81 | 82 | #define NCCL_ALGO_PROTO_IGNORE -1.0 83 | #endif 84 | -------------------------------------------------------------------------------- /src/include/net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_INT_NET_H_ 8 | #define NCCL_INT_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_net.h" 12 | #include "comm.h" 13 | #include "checks.h" 14 | 15 | typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; 16 | 17 | ncclResult_t ncclNetInit(struct ncclComm* comm); 18 | ncclResult_t ncclNetFinalize(struct ncclComm* comm); 19 | 20 | // Test whether the current GPU support GPU Direct RDMA. 
21 | ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); 22 | 23 | extern ncclNet_t ncclNetIb; 24 | extern ncclNet_t ncclNetSocket; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/include/net_device.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NET_DEVICE_H_ 8 | #define NCCL_NET_DEVICE_H_ 9 | 10 | #define NCCL_NET_DEVICE_INVALID_VERSION 0x0 11 | #define NCCL_NET_MTU_SIZE 4096 12 | 13 | // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin 14 | // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 15 | #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 16 | 17 | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; 18 | 19 | typedef struct { 20 | ncclNetDeviceType netDeviceType; // Network offload type 21 | int netDeviceVersion; // Version number for network offload 22 | void* handle; 23 | size_t size; 24 | int needsProxyProgress; 25 | } ncclNetDeviceHandle_v7_t; 26 | 27 | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; 28 | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; 29 | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; 30 | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtSemanticsCounters.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 NVIDIA Corporation. 
All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /** 10 | * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. 11 | */ 12 | 13 | #ifndef NVTX_SEMANTIC_ID_COUNTERS_V1 14 | #define NVTX_SEMANTIC_ID_COUNTERS_V1 2 15 | 16 | /** 17 | * Flags to extend the semantics of counters. 18 | */ 19 | #define NVTX_COUNTERS_FLAGS_NONE 0 20 | 21 | /** 22 | * Convert the fixed point value to a normalized floating point value. 23 | * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type 24 | * this flag is applied to. 25 | */ 26 | #define NVTX_COUNTERS_FLAG_NORMALIZE (1 << 1) 27 | 28 | /** 29 | * Visual tools should apply scale and limits when graphing. 30 | */ 31 | #define NVTX_COUNTERS_FLAG_LIMIT_MIN (1 << 2) 32 | #define NVTX_COUNTERS_FLAG_LIMIT_MAX (1 << 3) 33 | #define NVTX_COUNTERS_FLAG_LIMITS \ 34 | (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX) 35 | 36 | /** 37 | * Counter time scopes. 38 | */ 39 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT (1 << 5) 40 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST (2 << 5) 41 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT (3 << 5) 42 | #define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START (4 << 5) 43 | 44 | /** 45 | * Counter value types. 46 | */ 47 | #define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10) 48 | /** Delta to previous value of same counter type. */ 49 | #define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA (2 << 10) 50 | 51 | /** 52 | * Datatypes for the `limits` union. 53 | */ 54 | #define NVTX_COUNTERS_LIMIT_I64 0 55 | #define NVTX_COUNTERS_LIMIT_U64 1 56 | #define NVTX_COUNTERS_LIMIT_F64 2 57 | 58 | /** 59 | *\brief Specify counter semantics. 60 | */ 61 | typedef struct nvtxSemanticsCounter_v1 { 62 | /** Header of the semantic extensions (with identifier, version, etc.). 
*/ 63 | struct nvtxSemanticsHeader_v1 header; 64 | 65 | /** Flags to provide more context about the counter value. */ 66 | uint64_t flags; 67 | 68 | /** Unit of the counter value (case-insensitive). */ 69 | const char* unit; 70 | 71 | /** Should be 1 if not used. */ 72 | uint64_t unitScaleNumerator; 73 | 74 | /** Should be 1 if not used. */ 75 | uint64_t unitScaleDenominator; 76 | 77 | /** Determines the used union member. Use defines `NVTX_COUNTER_LIMIT_*`. */ 78 | int64_t limitType; 79 | 80 | /** Graph limits {minimum, maximum}. */ 81 | union limits_t { 82 | int64_t i64[2]; 83 | uint64_t u64[2]; 84 | double d[2]; 85 | } limits; 86 | } nvtxSemanticsCounter_t; 87 | 88 | #endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvToolsExtSemanticsScope.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /** 10 | * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand. 11 | */ 12 | 13 | #ifndef NVTX_SEMANTIC_ID_SCOPE_V1 14 | #define NVTX_SEMANTIC_ID_SCOPE_V1 1 15 | 16 | /** 17 | * \brief Specify the NVTX scope for a payload entry. 18 | * 19 | * This allows the scope to be set for a specific value or counter in a payload. 20 | * The scope must be known at schema registration time. 21 | */ 22 | typedef struct nvtxSemanticsScope_v1 23 | { 24 | struct nvtxSemanticsHeader_v1 header; 25 | 26 | /** Specifies the scope of a payload entry, e.g. a counter or timestamp. 
*/ 27 | uint64_t scopeId; 28 | } nvtxSemanticsScope_t; 29 | 30 | #endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_HELPER_MACROS_H 10 | #define NVTX_EXT_HELPER_MACROS_H 11 | 12 | /* Combine tokens */ 13 | #define _NVTX_EXT_CONCAT(a, b) a##b 14 | #define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b) 15 | 16 | /* Resolves to the number of arguments passed. */ 17 | #define NVTX_EXT_NUM_ARGS(...) \ 18 | NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway) 19 | #define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16 20 | 21 | /* Cast argument(s) to void to prevent unused variable warnings. */ 22 | #define _NVTX_EXT_VOIDIFY1(a1) (void)a1; 23 | #define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2; 24 | #define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3; 25 | #define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4; 26 | 27 | /* Mark function arguments as unused. */ 28 | #define NVTX_EXT_HELPER_UNUSED_ARGS(...) \ 29 | NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) 30 | 31 | #endif /* NVTX_EXT_HELPER_MACROS_H */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtImpl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 11 | #endif 12 | 13 | #ifndef NVTX_EXT_IMPL_H 14 | #define NVTX_EXT_IMPL_H 15 | /* ---- Include required platform headers ---- */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #if defined(_WIN32) 23 | 24 | #include 25 | 26 | #else 27 | #include 28 | 29 | #if defined(__ANDROID__) 30 | #include 31 | #endif 32 | 33 | #if defined(__linux__) || defined(__CYGWIN__) 34 | #include 35 | #endif 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #endif 45 | 46 | /* ---- Define macros used in this file ---- */ 47 | 48 | #ifdef NVTX_DEBUG_PRINT 49 | #ifdef __ANDROID__ 50 | #include 51 | #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); 52 | #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); 53 | #else 54 | #include 55 | #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) 56 | #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) 57 | #endif 58 | #else /* !defined(NVTX_DEBUG_PRINT) */ 59 | #define NVTX_ERR(...) 60 | #define NVTX_INFO(...) 
61 | #endif 62 | 63 | #ifdef __cplusplus 64 | extern "C" { 65 | #endif /* __cplusplus */ 66 | /* 67 | #ifdef __GNUC__ 68 | #pragma GCC visibility push(hidden) 69 | #endif 70 | */ 71 | #define NVTX_EXTENSION_FRESH 0 72 | #define NVTX_EXTENSION_DISABLED 1 73 | #define NVTX_EXTENSION_STARTING 2 74 | #define NVTX_EXTENSION_LOADED 3 75 | 76 | /* Function slots are local to each extension */ 77 | typedef struct nvtxExtGlobals1_t 78 | { 79 | NvtxExtInitializeInjectionFunc_t injectionFnPtr; 80 | } nvtxExtGlobals1_t; 81 | 82 | NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) = 83 | { 84 | (NvtxExtInitializeInjectionFunc_t)0 85 | }; 86 | 87 | #define NVTX_EXT_INIT_GUARD 88 | #include "nvtxExtInit.h" 89 | #undef NVTX_EXT_INIT_GUARD 90 | /* 91 | #ifdef __GNUC__ 92 | #pragma GCC visibility pop 93 | #endif 94 | */ 95 | #ifdef __cplusplus 96 | } /* extern "C" */ 97 | #endif /* __cplusplus */ 98 | 99 | #endif /* NVTX_EXT_IMPL_H */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD 10 | #error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | 17 | #ifdef NVTX_DISABLE 18 | 19 | #include "nvtxExtHelperMacros.h" 20 | 21 | #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ 22 | ret_val fn_name signature { \ 23 | NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ 24 | return ((ret_val)(intptr_t)-1); \ 25 | } 26 | 27 | #else /* NVTX_DISABLE */ 28 | 29 | #define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \ 30 | typedef ret_type ( * fn_name##_impl_fntype )signature; \ 31 | NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ 32 | intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 33 | if (slot != NVTX_EXTENSION_DISABLED) { \ 34 | if (slot != NVTX_EXTENSION_FRESH) { \ 35 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 36 | } else { \ 37 | NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \ 38 | /* Re-read function slot after extension initialization. */ \ 39 | slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ 40 | if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ 41 | return (*(fn_name##_impl_fntype)slot) arg_names; \ 42 | } \ 43 | } \ 44 | } \ 45 | NVTX_EXT_FN_RETURN_INVALID(ret_type) \ 46 | } 47 | 48 | #endif /*NVTX_DISABLE*/ 49 | 50 | /* Non-void functions. */ 51 | #define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); 52 | 53 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain)) 54 | 55 | NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device)) 56 | 57 | #undef NVTX_EXT_FN_RETURN_INVALID 58 | /* END: Non-void functions. */ 59 | 60 | /* void functions. 
*/ 61 | #define NVTX_EXT_FN_RETURN_INVALID(rtype) 62 | #define return 63 | 64 | NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags)) 65 | 66 | #undef return 67 | #undef NVTX_EXT_FN_RETURN_INVALID 68 | /* END: void functions. */ 69 | 70 | #undef NVTX_EXT_FN_IMPL 71 | 72 | #ifdef __cplusplus 73 | } /* extern "C" */ 74 | #endif /* __cplusplus */ 75 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxExtTypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | /* This header defines types which are used by the internal implementation 10 | * of NVTX and callback subscribers. API clients do not use these types, 11 | * so they are defined here instead of in nvToolsExt.h to clarify they are 12 | * not part of the NVTX client API. */ 13 | 14 | #ifndef NVTXEXTTYPES_H 15 | #define NVTXEXTTYPES_H 16 | 17 | #ifndef NVTX_EXT_TYPES_GUARD 18 | #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. 
19 | #endif 20 | 21 | typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); 22 | 23 | typedef struct nvtxExtModuleSegment_t 24 | { 25 | size_t segmentId; 26 | size_t slotCount; 27 | intptr_t* functionSlots; 28 | } nvtxExtModuleSegment_t; 29 | 30 | typedef struct nvtxExtModuleInfo_t 31 | { 32 | uint16_t nvtxVer; 33 | uint16_t structSize; 34 | uint16_t moduleId; 35 | uint16_t compatId; 36 | size_t segmentsCount; 37 | nvtxExtModuleSegment_t* segments; 38 | NvtxExtGetExportFunction_t getExportFunction; 39 | const void* extInfo; 40 | } nvtxExtModuleInfo_t; 41 | 42 | typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); 43 | 44 | #endif /* NVTXEXTTYPES_H */ -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_CUDART 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif /* __cplusplus */ 16 | 17 | typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); 18 | typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); 19 | typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); 20 | typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); 21 | typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); 22 | typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); 23 | 24 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) 25 | { 26 | #ifndef NVTX_DISABLE 27 | nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; 28 | if(local!=0) 29 | (*local)(device, name); 30 | #endif /*NVTX_DISABLE*/ 31 | } 32 | 33 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) 34 | { 35 | #ifndef NVTX_DISABLE 36 | nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; 37 | if(local!=0) 38 | (*local)(device, name); 39 | #endif /*NVTX_DISABLE*/ 40 | } 41 | 42 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) 43 | { 44 | #ifndef NVTX_DISABLE 45 | nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; 46 | if(local!=0) 47 | (*local)(stream, name); 48 | #endif /*NVTX_DISABLE*/ 49 | } 50 | 51 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) 52 | { 53 | #ifndef NVTX_DISABLE 54 | nvtxNameCudaStreamW_impl_fntype local = 
(nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; 55 | if(local!=0) 56 | (*local)(stream, name); 57 | #endif /*NVTX_DISABLE*/ 58 | } 59 | 60 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) 61 | { 62 | #ifndef NVTX_DISABLE 63 | nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; 64 | if(local!=0) 65 | (*local)(event, name); 66 | #endif /*NVTX_DISABLE*/ 67 | } 68 | 69 | NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) 70 | { 71 | #ifndef NVTX_DISABLE 72 | nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; 73 | if(local!=0) 74 | (*local)(event, name); 75 | #endif /*NVTX_DISABLE*/ 76 | } 77 | 78 | #ifdef __cplusplus 79 | } /* extern "C" */ 80 | #endif /* __cplusplus */ 81 | 82 | -------------------------------------------------------------------------------- /src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Licensed under the Apache License v2.0 with LLVM Exceptions. 5 | * See https://llvm.org/LICENSE.txt for license information. 6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 | */ 8 | 9 | #ifndef NVTX_IMPL_GUARD_SYNC 10 | #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
11 | #endif 12 | 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif /* __cplusplus */ 17 | 18 | typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); 19 | typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); 20 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); 21 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); 22 | typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); 23 | typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); 24 | 25 | NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs) 26 | { 27 | #ifndef NVTX_DISABLE 28 | nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; 29 | if(local!=0) 30 | return (*local)(domain, attribs); 31 | else 32 | #endif /*NVTX_DISABLE*/ 33 | return (nvtxSyncUser_t)0; 34 | } 35 | 36 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) 37 | { 38 | #ifndef NVTX_DISABLE 39 | nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; 40 | if(local!=0) 41 | (*local)(handle); 42 | #endif /*NVTX_DISABLE*/ 43 | } 44 | 45 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) 46 | { 47 | #ifndef NVTX_DISABLE 48 | nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; 49 | if(local!=0) 50 | (*local)(handle); 51 | #endif /*NVTX_DISABLE*/ 52 | } 53 | 54 | NVTX_DECLSPEC void NVTX_API 
nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) 55 | { 56 | #ifndef NVTX_DISABLE 57 | nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; 58 | if(local!=0) 59 | (*local)(handle); 60 | #endif /*NVTX_DISABLE*/ 61 | } 62 | 63 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) 64 | { 65 | #ifndef NVTX_DISABLE 66 | nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; 67 | if(local!=0) 68 | (*local)(handle); 69 | #endif /*NVTX_DISABLE*/ 70 | } 71 | 72 | NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) 73 | { 74 | #ifndef NVTX_DISABLE 75 | nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; 76 | if(local!=0) 77 | (*local)(handle); 78 | #endif /*NVTX_DISABLE*/ 79 | } 80 | 81 | #ifdef __cplusplus 82 | } /* extern "C" */ 83 | #endif /* __cplusplus */ 84 | -------------------------------------------------------------------------------- /src/include/p2p.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include 8 | 9 | #ifndef NCCL_P2P_H_ 10 | #define NCCL_P2P_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include "core.h" 16 | 17 | #if CUDART_VERSION < 12030 18 | // MNNVL: FABRIC handle support lifted from CUDA 12.3 19 | #define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128) 20 | #define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL) 21 | #define CU_IPC_HANDLE_SIZE 64 22 | typedef struct CUmemFabricHandle_st { 23 | unsigned char data[CU_IPC_HANDLE_SIZE]; 24 | } CUmemFabricHandle_v1; 25 | typedef CUmemFabricHandle_v1 CUmemFabricHandle; 26 | #endif 27 | 28 | typedef union { 29 | uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support 30 | CUmemFabricHandle handle; 31 | } ncclCuDesc; 32 | 33 | typedef union { 34 | // Legacy CUDA IPC 35 | cudaIpcMemHandle_t devIpc; 36 | // cuMem API support 37 | struct { 38 | ncclCuDesc cuDesc; 39 | CUmemGenericAllocationHandle memHandle; 40 | }; 41 | } ncclIpcDesc; 42 | 43 | enum ncclIpcRegType { 44 | NCCL_IPC_SENDRECV = 0, 45 | NCCL_IPC_COLLECTIVE = 1 46 | }; 47 | 48 | struct ncclIpcImpInfo { 49 | void* rmtRegAddr; 50 | bool legacyIpcCap; 51 | uintptr_t offset; 52 | }; 53 | 54 | struct ncclIpcRegInfo { 55 | int peerRank; 56 | void* baseAddr; 57 | struct ncclProxyConnector* ipcProxyconn; 58 | struct ncclIpcImpInfo impInfo; 59 | }; 60 | 61 | ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr); 62 | ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); 63 | ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); 64 | ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, 
uintptr_t** peerRmtAddrsOut); 65 | ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts); 66 | 67 | ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo); 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /src/include/param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PARAM_H_ 8 | #define NCCL_PARAM_H_ 9 | 10 | #include 11 | 12 | const char* userHomeDir(); 13 | void setEnvFile(const char* fileName); 14 | void initEnv(); 15 | const char *ncclGetEnv(const char *name); 16 | 17 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); 18 | 19 | #define NCCL_PARAM(name, env, deftVal) \ 20 | int64_t ncclParam##name() { \ 21 | constexpr int64_t uninitialized = INT64_MIN; \ 22 | static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ 23 | static int64_t cache = uninitialized; \ 24 | if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ 25 | ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ 26 | } \ 27 | return cache; \ 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/include/plugin/nccl_net.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_NET_H_ 8 | #define NCCL_NET_H_ 9 | 10 | #include "nccl.h" 11 | #include "nccl_common.h" 12 | #include "net_device.h" 13 | #include 14 | 15 | #define NCCL_NET_HANDLE_MAXSIZE 128 16 | //Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties 17 | #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) 18 | #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 19 | 20 | #define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 21 | #define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collnet plugins when size was not dynamically queried 22 | 23 | #define NCCL_PTR_HOST 0x1 24 | #define NCCL_PTR_CUDA 0x2 25 | #define NCCL_PTR_DMABUF 0x4 26 | 27 | // Maximum number of requests per comm object 28 | #define NCCL_NET_MAX_REQUESTS 32 29 | 30 | // Max number of ncclNet objects which can live in the same process 31 | #ifndef NCCL_NET_MAX_PLUGINS 32 | #define NCCL_NET_MAX_PLUGINS 16 33 | #endif 34 | 35 | #include "net/net_v10.h" 36 | #include "net/net_v9.h" 37 | #include "net/net_v8.h" 38 | #include "net/net_v7.h" 39 | #include "net/net_v6.h" 40 | 41 | typedef ncclNet_v10_t ncclNet_t; 42 | typedef ncclCollNet_v10_t ncclCollNet_t; 43 | typedef ncclNetSGE_v10_t ncclNetSGE_t; 44 | typedef ncclNetProperties_v10_t ncclNetProperties_t; 45 | typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; 46 | typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; 47 | 48 | #define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 49 | 50 | #define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10 51 | #define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10 52 | 53 | #endif // end include guard 54 | -------------------------------------------------------------------------------- /src/include/plugin/nccl_profiler.h: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PROFILER_H_ 8 | #define NCCL_PROFILER_H_ 9 | 10 | enum { 11 | ncclProfileGroup = (1 << 0), // group event type 12 | ncclProfileColl = (1 << 1), // host collective call event type 13 | ncclProfileP2p = (1 << 2), // host point-to-point call event type 14 | ncclProfileProxyOp = (1 << 3), // proxy operation event type 15 | ncclProfileProxyStep = (1 << 4), // proxy step event type 16 | ncclProfileProxyCtrl = (1 << 5), // proxy control event type 17 | ncclProfileKernelCh = (1 << 6), // kernel channel event type 18 | ncclProfileNetPlugin = (1 << 7), // network plugin-defined events 19 | }; 20 | 21 | typedef enum { 22 | ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 23 | ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 24 | ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 25 | ncclProfilerProxyOpSendDone = 3, // deprecated in v4 26 | ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 27 | ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 28 | ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 29 | ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 30 | ncclProfilerProxyOpInProgress_v4 = 19, 31 | 32 | /* Legacy proxy profiler states */ 33 | ncclProfilerProxyStepSendGPUWait = 8, 34 | ncclProfilerProxyStepSendPeerWait_v4 = 20, 35 | ncclProfilerProxyStepSendWait = 9, 36 | ncclProfilerProxyStepRecvWait = 10, 37 | ncclProfilerProxyStepRecvFlushWait = 11, 38 | ncclProfilerProxyStepRecvGPUWait = 12, 39 | 40 | /* Legacy proxy control states */ 41 | ncclProfilerProxyCtrlIdle = 13, 42 | ncclProfilerProxyCtrlActive = 14, 43 | ncclProfilerProxyCtrlSleep = 15, 44 
| ncclProfilerProxyCtrlWakeup = 16, 45 | ncclProfilerProxyCtrlAppend = 17, 46 | ncclProfilerProxyCtrlAppendEnd = 18, 47 | 48 | /* Network defined event states */ 49 | ncclProfilerNetPluginUpdate = 21, 50 | 51 | /* Kernel event states */ 52 | ncclProfilerKernelChStop = 22, 53 | } ncclProfilerEventState_t; 54 | 55 | typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; 56 | typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; 57 | typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; 58 | typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; 59 | 60 | #include 61 | #include "profiler/profiler_v4.h" 62 | #include "profiler/profiler_v3.h" 63 | #include "profiler/profiler_v2.h" 64 | #include "profiler/profiler_v1.h" 65 | 66 | typedef ncclProfiler_v4_t ncclProfiler_t; 67 | typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; 68 | typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; 69 | 70 | #define NCCL_PROFILER_NET_VER_BITS (16) 71 | #define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) 72 | #define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) 73 | 74 | typedef enum { 75 | NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), 76 | NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), 77 | } ncclProfilerNetType; 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /src/include/plugin/nccl_tuner.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef NCCL_TUNER_H_ 9 | #define NCCL_TUNER_H_ 10 | 11 | #include "nccl.h" 12 | #include "nccl_common.h" 13 | 14 | #include "tuner/tuner_v4.h" 15 | #include "tuner/tuner_v3.h" 16 | #include "tuner/tuner_v2.h" 17 | 18 | typedef ncclTuner_v4_t ncclTuner_t; 19 | 20 | #define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /src/include/plugin/plugin.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_PLUGIN_H_ 8 | #define NCCL_PLUGIN_H_ 9 | 10 | #include "nccl.h" 11 | 12 | void* ncclOpenNetPluginLib(const char* name); 13 | void* ncclOpenTunerPluginLib(const char* name); 14 | void* ncclOpenProfilerPluginLib(const char* name); 15 | void* ncclGetNetPluginLib(void); 16 | ncclResult_t ncclClosePluginLib(void* handle); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /src/include/plugin/profiler/net_ib.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_IB_H_ 8 | #define NET_IB_H_ 9 | 10 | #include "nccl_profiler.h" 11 | #include "net_ib_v1.h" 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/include/plugin/profiler/net_ib_v1.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_IB_V1_H_ 8 | #define NET_IB_V1_H_ 9 | 10 | #define NCCL_PROFILER_NET_IB_VER 1 11 | 12 | enum { 13 | ncclProfileQp = (1 << 0), 14 | }; 15 | 16 | // The data structure version is encoded in the plugin identifier bitmask and 17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin 18 | // identifier in the event descriptor before calling the profiler startEvent 19 | // function. The profiler should inspect the plugin id to find out the source 20 | // plugin as well as the version of the event struct 21 | typedef struct { 22 | uint8_t type; // event type (plugin defined) 23 | union { 24 | struct { 25 | int device; // network device id 26 | uint64_t wr_id; // work request id 27 | int opcode; // ibv opcode 28 | int qpNum; // QP number 29 | size_t length; // work request data length 30 | } qp; 31 | }; 32 | } ncclProfilerNetIbDescr_v1_t; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/include/plugin/profiler/net_socket.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_SOCKET_H_ 8 | #define NET_SOCKET_H_ 9 | 10 | #include "nccl_profiler.h" 11 | #include "net_socket_v1.h" 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/include/plugin/profiler/net_socket_v1.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NET_SOCKET_V1_H_ 8 | #define NET_SOCKET_V1_H_ 9 | 10 | #define NCCL_PROFILER_NET_SOCKET_VER 1 11 | 12 | enum { 13 | ncclProfileSocket = (1 << 0), 14 | }; 15 | 16 | // The data structure version is encoded in the plugin identifier bitmask and 17 | // passed to NCCL core through the profiler callback. NCCL copies the plugin 18 | // identifier in the event descriptor before calling the profiler startEvent 19 | // function. The profiler should inspect the plugin id to find out the source 20 | // plugin as well as the version of the event struct 21 | typedef struct { 22 | uint8_t type; // event type (plugin defined) 23 | union { 24 | struct { 25 | int fd; 26 | int op; 27 | size_t length; 28 | } sock; 29 | }; 30 | } ncclProfilerNetSockDescr_v1_t; 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/include/plugin/profiler/profiler_v1.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_V1_H_ 8 | #define PROFILER_V1_H_ 9 | 10 | typedef struct { 11 | uint8_t type; // event type descriptor: ncclProfileColl, ... 12 | void* parentObj; // pointer to the profiler parent object (for coll is the group) 13 | int rank; // originating rank 14 | union { 15 | struct { 16 | const char* name; 17 | uint64_t commHash; 18 | uint64_t seqNumber; 19 | uint8_t func; 20 | void const* sendBuff; 21 | void* recvBuff; 22 | size_t count; 23 | int root; 24 | uint8_t datatype; 25 | uint32_t op; 26 | size_t trafficBytes; 27 | uint8_t nMaxChannels; 28 | uint8_t nWarps; 29 | uint8_t algo; 30 | uint8_t proto; 31 | int isCollnet; 32 | int isNvls; 33 | } coll; 34 | 35 | struct { 36 | const char* name; 37 | uint64_t commHash; 38 | uint8_t func; 39 | void* buff; 40 | uint8_t datatype; 41 | size_t count; 42 | int peer; 43 | } p2p; 44 | 45 | struct { 46 | pid_t pid; // pid of the originating process 47 | uint8_t channelId; // channel id for this proxy operation 48 | int peer; // remote rank for send/recv 49 | int nSteps; // number of steps for this proxy operation 50 | int chunkSize; // amount of data transferred by this proxy operation 51 | int isSend; 52 | } proxyOp; 53 | 54 | struct { 55 | int step; 56 | } proxyStep; 57 | }; 58 | } ncclProfilerEventDescr_v1_t; 59 | 60 | typedef union { 61 | struct { 62 | size_t transSize; 63 | int steps; 64 | } proxyOp; 65 | 66 | struct { 67 | int appendedProxyOps; 68 | } proxyCtrl; 69 | } ncclProfilerEventStateArgs_v1_t; 70 | 71 | typedef struct { 72 | const char* name; 73 | 74 | // init - initialize the profiler plugin 75 | // Input 76 | // - context : opaque profiler context object for separating profiler behavior across comms 77 | // Output 78 | // - eActivationMask: bitmask of active events set by the plugin 79 | ncclResult_t (*init)(void** context, int* eActivationMask); 80 | 81 | // 
startEvent - initialize and start a new event for the supplied event descriptor inside the eventset 82 | // Input 83 | // - context: opaque profiler context object 84 | // - eDescr : pointer to ncclProfilerEventDescr_t object 85 | // Output 86 | // - eHandle: return event handle for supplied event descriptor object 87 | ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); 88 | 89 | // stopEvent - stop/finalize an event inside an event set 90 | // Input 91 | // - eHandle: handle to event object 92 | ncclResult_t (*stopEvent)(void* eHandle); 93 | 94 | // recordEventState - record event state transitions and event attribute updates 95 | // Input 96 | // - eHandle : handle to event object created through startEvent 97 | // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition 98 | // - eState : event state transition 99 | ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); 100 | 101 | // finalize - finalize the profiler plugin 102 | // Input 103 | // - context: opaque profiler context object 104 | ncclResult_t (*finalize)(void* context); 105 | } ncclProfiler_v1_t; 106 | 107 | #endif 108 | -------------------------------------------------------------------------------- /src/include/plugin/profiler/profiler_v2.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_V2_H_ 8 | #define PROFILER_V2_H_ 9 | 10 | typedef struct { 11 | uint8_t type; // event type descriptor: ncclProfileColl, ...
12 | void* parentObj; // pointer to the profiler parent object (for coll is the group) 13 | int rank; // originating rank 14 | union { 15 | struct { 16 | const char* name; 17 | uint64_t commHash; 18 | uint64_t seqNumber; 19 | const char* func; 20 | void const* sendBuff; 21 | void* recvBuff; 22 | size_t count; 23 | int root; 24 | const char* datatype; 25 | size_t trafficBytes; 26 | uint8_t nMaxChannels; 27 | uint8_t nWarps; 28 | const char* algo; 29 | const char* proto; 30 | } coll; 31 | 32 | struct { 33 | const char* name; 34 | uint64_t commHash; 35 | const char* func; 36 | void* buff; 37 | const char* datatype; 38 | size_t count; 39 | int peer; 40 | } p2p; 41 | 42 | struct { 43 | pid_t pid; // pid of the originating process 44 | uint8_t channelId; // channel id for this proxy operation 45 | int peer; // remote rank for send/recv 46 | int nSteps; // number of steps for this proxy operation 47 | int chunkSize; // amount of data transferred by this proxy operation 48 | int isSend; 49 | } proxyOp; 50 | 51 | struct { 52 | int step; 53 | } proxyStep; 54 | }; 55 | } ncclProfilerEventDescr_v2_t; 56 | 57 | typedef union { 58 | struct { 59 | size_t transSize; 60 | int steps; 61 | } proxyOp; 62 | 63 | struct { 64 | int appendedProxyOps; 65 | } proxyCtrl; 66 | } ncclProfilerEventStateArgs_v2_t; 67 | 68 | typedef struct { 69 | const char* name; 70 | 71 | // init - initialize the profiler plugin 72 | // Input 73 | // - context : opaque profiler context object for separating profiler behavior across comms 74 | // Output 75 | // - eActivationMask: bitmask of active events set by the plugin 76 | ncclResult_t (*init)(void** context, int* eActivationMask); 77 | 78 | // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset 79 | // Input 80 | // - context: opaque profiler context object 81 | // - eDescr : pointer to ncclProfilerEventDescr_t object 82 | // Output 83 | // - eHandle: return event handle for supplied event descriptor object 84 
| ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); 85 | 86 | // stopEvent - stop/finalize an event inside an event set 87 | // Input 88 | // - eHandle: handle to event object 89 | ncclResult_t (*stopEvent)(void* eHandle); 90 | 91 | // recordEventState - record event state transitions and event attribute updates 92 | // Input 93 | // - eHandle : handle to event object created through startEvent 94 | // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition 95 | // - eState : event state transition 96 | ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); 97 | 98 | // finalize - finalize the profiler plugin 99 | // Input 100 | // - context: opaque profiler context object 101 | ncclResult_t (*finalize)(void* context); 102 | } ncclProfiler_v2_t; 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /src/include/plugin/tuner/tuner_v2.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef TUNER_V2_H_ 9 | #define TUNER_V2_H_ 10 | 11 | // API to be implemented by external tuner 12 | typedef struct { 13 | // Name of the tuner 14 | const char* name; 15 | 16 | // Initializes tuner states. 17 | // Inputs: 18 | // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. 19 | // - nNodes: number of nodes in current communicator. 20 | // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
21 | // Outputs: 22 | // - context: tuner context object 23 | ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); 24 | 25 | // Gets info (algo, protocol, number of ctas and threads) for a given collective. 26 | // Inputs: 27 | // - context: tuner context object 28 | // - collType: collective type, e.g., allreduce, allgather… 29 | // - nBytes: collective size in bytes 30 | // - collNetSupport: whether collnet supports this type 31 | // - nvlsSupport: whether nvlink sharp supports this type 32 | // - numPipeOps: number of operations in the group 33 | // 34 | // Outputs: 35 | // - algorithm: selected algorithm to be used for the given collective 36 | // - protocol: selected protocol to be used for the given collective 37 | // - nChannels: number of channels (hence SMs) to be used. 38 | // 39 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 40 | // default tuning for the given collective. 41 | // Also, the plugin is allowed to not set any output, or set only the 42 | // algorithm and protocol, but not only the algorithm or only the protocol. 43 | // Unset fields will be set automatically by NCCL. 44 | ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, 45 | int collNetSupport, int nvlsSupport, int numPipeOps, 46 | int* algorithm, int* protocol, int* nChannels); 47 | 48 | // Terminates the plugin and cleans up any resources that the plugin allocated. 49 | // context: tuner context object 50 | ncclResult_t (*destroy)(void* context); 51 | } ncclTuner_v2_t; 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /src/include/plugin/tuner/tuner_v3.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc.
and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef TUNER_V3_H_ 9 | #define TUNER_V3_H_ 10 | 11 | // API to be implemented by external tuner 12 | typedef struct { 13 | // Name of the tuner 14 | const char* name; 15 | 16 | // Initializes tuner states. 17 | // Inputs: 18 | // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. 19 | // - nNodes: number of nodes in current communicator. 20 | // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. 21 | // Outputs: 22 | // - context: tuner context object 23 | ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); 24 | 25 | // Gets info (algo, protocol, number of ctas and threads) for a given collective. 26 | // Inputs: 27 | // - context: tuner context object 28 | // - collType: collective type , e.g., allreduce, allgather… 29 | // - nBytes: collective size in bytes 30 | // - numPipeOps: number of operations in the group 31 | // - numAlgo: number of algorithms in collCostTable 32 | // - numProto: number of protocols in collCostTable 33 | // 34 | // Outputs: 35 | // - nChannels: number of channels (hence SMs) to be used. 36 | // 37 | // InOut: 38 | // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. 39 | // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). 40 | // 41 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 42 | // default tuning for the given collective. 43 | // Also, the plugin is allowed to not set any output, or set only the 44 | // algorithm and protocol, but not only the algorithm or only the protocol. 45 | // Unset fields will be set automatically by NCCL. 
46 | ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, 47 | int numPipeOps, float** collCostTable, int numAlgo, int numProto, 48 | int* nChannels); 49 | 50 | // Terminates the plugin and cleans up any resources that the plugin allocated. 51 | // context: tuner context object 52 | ncclResult_t (*destroy)(void* context); 53 | } ncclTuner_v3_t; 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/include/plugin/tuner/tuner_v4.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #ifndef TUNER_V4_H_ 9 | #define TUNER_V4_H_ 10 | 11 | // API to be implemented by external tuner 12 | typedef struct { 13 | // Name of the tuner 14 | const char* name; 15 | 16 | // Initializes tuner states. 17 | // Inputs: 18 | // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. 19 | // - nNodes: number of nodes in current communicator. 20 | // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. 21 | // Outputs: 22 | // - context: tuner context object 23 | ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); 24 | 25 | // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
26 | // Inputs: 27 | // - context: tuner context object 28 | // - collType: collective type , e.g., allreduce, allgather… 29 | // - nBytes: collective size in bytes 30 | // - numPipeOps: number of operations in the group 31 | // - numAlgo: number of algorithms in collCostTable 32 | // - numProto: number of protocols in collCostTable 33 | // - regBuff: can register user buffer 34 | // 35 | // Outputs: 36 | // - nChannels: number of channels (hence SMs) to be used. 37 | // 38 | // InOut: 39 | // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. 40 | // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). 41 | // 42 | // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the 43 | // default tuning for the given collective. 44 | // Also, the plugin is allowed to not set any output, or set only the 45 | // algorithm and protocol, but not only the algorithm or only the protocol. 46 | // Unset fields will be set automatically by NCCL. 47 | ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, 48 | int numPipeOps, float** collCostTable, int numAlgo, int numProto, 49 | int regBuff, int* nChannels); 50 | 51 | // Terminates the plugin and cleans up any resources that the plugin allocated. 52 | // context: tuner context object 53 | ncclResult_t (*destroy)(void* context); 54 | } ncclTuner_v4_t; 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /src/include/profiler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef PROFILER_H_ 8 | #define PROFILER_H_ 9 | 10 | #include 11 | #include "nccl_profiler.h" 12 | 13 | struct ncclProxyArgs; 14 | struct ncclKernelPlan; 15 | struct ncclTaskColl; 16 | struct ncclTaskP2p; 17 | struct ncclInfo; 18 | struct ncclComm; 19 | struct ncclProxyOp; 20 | struct ncclProxyConnector; 21 | 22 | struct ncclProfilerProxy { 23 | bool initialized; 24 | struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/; 25 | struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/; 26 | uint64_t workCounter[MAXCHANNELS]; // host work counter 27 | struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; 28 | struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; 29 | }; 30 | 31 | extern int ncclProfilerEventMask; 32 | 33 | // Plugin Init/Finalize Wrappers 34 | ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); 35 | ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm); 36 | 37 | // Profiler Start/Stop Group Wrappers 38 | ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan); 39 | ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan); 40 | 41 | // Profiler Start/Stop Task Events Wrappers 42 | ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan); 43 | ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan); 44 | 45 | // Proxy Op Start/Stop Event Wrappers 46 | ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args); 47 | ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); 48 | 49 | // Proxy Step Start/Stop Event Wrappers 50 | ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); 51 | ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); 52 | ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, 
int stepId); 53 | 54 | // Proxy Control Start/Stop Events Wrappers 55 | ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); 56 | ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); 57 | 58 | // Kernel Channel Start/Stop Event Wrappers 59 | ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start); 60 | ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop); 61 | 62 | // Record Event Wrappers 63 | ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState); 64 | ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); 65 | ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); 66 | 67 | // Profiler utility functions 68 | ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); 69 | bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); 70 | bool ncclProfilerPluginLoaded(void); 71 | 72 | // Profiler callback for network plugin 73 | ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /src/include/ras.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_RAS_H_ 8 | #define NCCL_RAS_H_ 9 | 10 | #include "socket.h" 11 | 12 | // Structure used to communicate data about NCCL ranks from NCCL threads to RAS. 
13 | struct rasRankInit { 14 | union ncclSocketAddress addr; 15 | pid_t pid; 16 | int cudaDev; 17 | int nvmlDev; 18 | uint64_t hostHash; 19 | uint64_t pidHash; 20 | }; 21 | 22 | ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); 23 | ncclResult_t ncclRasCommFini(const struct ncclComm* comm); 24 | ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks); 25 | 26 | #endif // !NCCL_RAS_H_ 27 | -------------------------------------------------------------------------------- /src/include/register.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_REGISTER_H_ 2 | #define NCCL_REGISTER_H_ 3 | 4 | #include "device.h" 5 | 6 | #include 7 | #include 8 | 9 | int64_t ncclParamLocalRegister(); 10 | int64_t ncclParamGraphRegister(); 11 | 12 | enum { 13 | NET_REG_COMPLETE = 0x01, 14 | NVLS_REG_COMPLETE = 0x02, 15 | NVLS_REG_POSSIBLE = 0x04, 16 | NVLS_REG_NO_SUPPORT = 0x08, 17 | COLLNET_REG_COMPLETE = 0x10, 18 | IPC_REG_COMPLETE = 0x20 19 | }; 20 | 21 | struct ncclPeerRegIpcAddr { 22 | uintptr_t* devPeerRmtAddrs; 23 | uintptr_t* hostPeerRmtAddrs; 24 | }; 25 | 26 | struct ncclRegNetHandles { 27 | void* handle; 28 | struct ncclProxyConnector* proxyConn; 29 | struct ncclRegNetHandles* next; 30 | }; 31 | 32 | struct ncclSymRegTask { 33 | struct ncclSymRegTask *next; 34 | void* buff; 35 | size_t baseSize; 36 | CUmemGenericAllocationHandle memHandle; 37 | struct ncclReg* regHandle; 38 | size_t alignment; 39 | }; 40 | 41 | struct ncclReg { 42 | // common attributes 43 | uintptr_t begAddr, endAddr; // page aligned 44 | int localRefs; 45 | int graphRefs; 46 | uint32_t state; 47 | // net reg 48 | struct ncclRegNetHandles* netHandleHead; 49 | // nvls reg 50 | CUdeviceptr regAddr; 51 | size_t regUCSize, regMCSize; 52 | int dev; 53 | CUmemGenericAllocationHandle mcHandle; 54 | uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */ 55 | // collnet reg 56 | 
void* collnetHandle; 57 | struct ncclProxyConnector* collnetProxyconn; 58 | // general ipc reg 59 | struct ncclPeerRegIpcAddr regIpcAddrs; 60 | struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; 61 | // symmetric reg 62 | void* baseSymPtr; 63 | size_t symSize; 64 | int winFlags; 65 | }; 66 | 67 | struct ncclRegCache { 68 | struct ncclReg **slots; 69 | int capacity, population; 70 | uintptr_t pageSize; 71 | }; 72 | 73 | struct ncclWindow { 74 | struct ncclReg* handle; 75 | }; 76 | 77 | ncclResult_t ncclRegCleanup(struct ncclComm* comm); 78 | ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); 79 | ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); 80 | ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); 81 | ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle); 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /src/include/register_inline.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_REGISTER_INLINE_H_ 2 | #define NCCL_REGISTER_INLINE_H_ 3 | 4 | #include "comm.h" 5 | #include "register.h" 6 | 7 | static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) { 8 | struct ncclRegCache* cache = &comm->regCache; 9 | *outReg = NULL; 10 | for (int slot=0; /*true*/; slot++) { 11 | if (slot == cache->population) return ncclSuccess; 12 | struct ncclReg *reg = cache->slots[slot]; 13 | if ((uintptr_t)data < reg->begAddr) return ncclSuccess; 14 | if ((uintptr_t)data + size <= reg->endAddr) { 15 | *outReg = reg; 16 | return ncclSuccess; 17 | } 18 | } 19 | } 20 | 21 | static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct 
ncclReg** outReg) { 22 | struct ncclReg* regRecord = NULL; 23 | *symPtr = NULL; 24 | *outReg = NULL; 25 | NCCLCHECK(ncclRegFind(comm, data, size, &regRecord)); 26 | if (regRecord && regRecord->baseSymPtr) { 27 | *symPtr = (void*)((uintptr_t)regRecord->baseSymPtr + (uintptr_t)data - (uintptr_t)regRecord->begAddr); 28 | *outReg = regRecord; 29 | } 30 | return ncclSuccess; 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/include/shm.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_SHM_H_ 2 | #define NCCL_SHM_H_ 3 | 4 | #include "comm.h" 5 | 6 | struct shmLegacyIpc { 7 | char shmSuffix[7]; 8 | ncclShmHandle_t handle; 9 | size_t shmSize; 10 | }; 11 | 12 | struct shmCuIpc { 13 | union { 14 | CUmemFabricHandle handle; 15 | CUmemGenericAllocationHandle data; 16 | }; 17 | void *ptr; 18 | size_t size; 19 | }; 20 | 21 | struct shmIpcDesc { 22 | union 23 | { 24 | struct shmLegacyIpc shmli; 25 | struct shmCuIpc shmci; 26 | }; 27 | bool legacy; 28 | }; 29 | 30 | typedef struct shmIpcDesc ncclShmIpcDesc_t; 31 | 32 | ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); 33 | ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); 34 | ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/include/shmutils.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SHMUTILS_H_ 8 | #define NCCL_SHMUTILS_H_ 9 | 10 | #include "nccl.h" 11 | 12 | typedef void* ncclShmHandle_t; 13 | ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); 14 | ncclResult_t ncclShmClose(ncclShmHandle_t handle); 15 | ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); 16 | 17 | struct ncclShmemCollBuff { 18 | volatile size_t *cnt[2]; 19 | volatile void *ptr[2]; 20 | int round; 21 | size_t maxTypeSize; 22 | }; 23 | 24 | ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/include/symmetric.h: -------------------------------------------------------------------------------- 1 | #ifndef NCCL_DEVICE_SYMMETRIC_H_ 2 | #define NCCL_DEVICE_SYMMETRIC_H_ 3 | 4 | #include "nccl.h" 5 | #include "nccl_common.h" 6 | #include "bitops.h" 7 | 8 | constexpr int ncclSymMaxBlocks = 64; 9 | constexpr int ncclSymMaxThreads = 512; 10 | constexpr int ncclSymLLMaxEltSize = 64; 11 | 12 | constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) { 13 | return ncclSymMaxThreads*ncclSymLLMaxEltSize/eltSize; 14 | } 15 | 16 | constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) { 17 | return /*LL Overhead*/2 * maxval(ncclSymMaxThreads*nRanks*8, ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize); 18 | } 19 | 20 | struct alignas(16) ncclSymDevBase { 21 | uint32_t llEpoch[ncclSymMaxBlocks]; 22 | uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks]; 23 | uint32_t barInboxMc[ncclSymMaxBlocks]; 24 | uint32_t barInboxPerPeer[]; 25 | 26 | static constexpr size_t size(int nRanks) { 27 | return 
sizeof(ncclSymDevBase) + 28 | alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) + 29 | ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks); 30 | } 31 | }; 32 | 33 | static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) { 34 | // Get pointer to buffer trailing the header struct. 35 | char* ans = (char*)(base + 1); 36 | // Skip over barInboxPerPeer[] 37 | ans += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16); 38 | // Skip to our block 39 | int epochSize = ncclSymLLEpochSize(nRanks); 40 | ans += block * /*epochs=*/2 * epochSize; 41 | ans += (epoch & 1)*epochSize; 42 | return (uint4*)ans; 43 | } 44 | 45 | struct ncclSymDevComm { 46 | ncclSymDevBase* base; 47 | ncclSymDevBase* baseMc; 48 | uint32_t stride4G; 49 | int nRanks, rank; 50 | uint32_t nRanks_rcp32; // idivRcp32(nRanks) 51 | }; 52 | 53 | struct alignas(16) ncclSymDevArgs { 54 | struct ncclSymDevComm comm; 55 | int rootRank; 56 | uint64_t redOpArg; // must be collectively uniform 57 | size_t nElts; 58 | char* input; 59 | char* output; 60 | }; 61 | 62 | enum ncclSymKernelId { 63 | ncclSymKernelId_AllReduce_AGxLL_R, 64 | ncclSymKernelId_AllReduce_AGxLLMC_R, 65 | ncclSymKernelId_AllReduce_RSxLD_AGxST, 66 | ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC, 67 | 68 | ncclSymKernelId_AllGather_LL, 69 | ncclSymKernelId_AllGather_LLMC, 70 | ncclSymKernelId_AllGather_ST, 71 | ncclSymKernelId_AllGather_STMC, 72 | 73 | ncclSymKernelId_ReduceScatter_LL, 74 | ncclSymKernelId_ReduceScatter_LD, 75 | ncclSymKernelId_ReduceScatter_LDMC, 76 | 77 | ncclSymKernelId_Count 78 | }; 79 | 80 | bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); 81 | 82 | ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps); 83 | 84 | // Generated by src/device/symmetric/generate.py 85 | extern int 
// Prints "<name> stats" followed by one " [slot] total/count = average" entry
// for every timer slot whose count is non-zero, then ends the line.
// Every slot is reset afterwards: both its count and its accumulated time, so
// that the average printed by the next TIME_PRINT only covers the samples
// taken since this one.
#define TIME_PRINT(name) do { \
  printf("%s stats", name); \
  for (int i=0; i<8; i++) { \
    if (counts[i]) printf(" [%d] %g/%llu = %g", i, times[i], (unsigned long long)counts[i], times[i]/counts[i]); \
    counts[i] = 0; \
    times[i] = 0; \
  } \
  printf("\n"); \
} while (0)
Otherwise returns an error and also logs the error. 19 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm); 20 | 21 | // Cleans up NCCL tuner plugin. 22 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm); 23 | #endif 24 | -------------------------------------------------------------------------------- /src/init_nvtx.cc: -------------------------------------------------------------------------------- 1 | #include "nccl.h" 2 | #include "nvtx.h" 3 | 4 | static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { 5 | {"Sum", ncclSum, 0}, 6 | {"Product", ncclProd, 0}, 7 | {"Max", ncclMax, 0}, 8 | {"Min", ncclMin, 0}, 9 | {"Avg", ncclAvg, 0} 10 | }; 11 | 12 | // Must be called before the first call to any reduction operation. 13 | void initNvtxRegisteredEnums() { 14 | // Register schemas and strings 15 | constexpr const nvtxPayloadEnumAttr_t eAttr { 16 | .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | 17 | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, 18 | .name = NULL, 19 | .entries = NvtxEnumRedSchema, 20 | .numEntries = std::extent::value, 21 | .sizeOfEnum = sizeof(ncclRedOp_t), 22 | .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP, 23 | .extension = nullptr 24 | }; 25 | 26 | nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); 27 | } 28 | -------------------------------------------------------------------------------- /src/misc/argcheck.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "argcheck.h" 8 | #include "comm.h" 9 | 10 | ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { 11 | cudaPointerAttributes attr; 12 | cudaError_t err = cudaPointerGetAttributes(&attr, pointer); 13 | if (err != cudaSuccess || attr.devicePointer == NULL) { 14 | WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); 15 | return ncclInvalidArgument; 16 | } 17 | #if CUDART_VERSION >= 10000 18 | if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 19 | #else 20 | if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { 21 | #endif 22 | WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); 23 | return ncclInvalidArgument; 24 | } 25 | return ncclSuccess; 26 | } 27 | 28 | ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { 29 | if (ptr == NULL) { 30 | WARN("%s : %s argument is NULL", opname, ptrname); 31 | return ncclInvalidArgument; 32 | } 33 | return ncclSuccess; 34 | } 35 | 36 | ncclResult_t CommCheck(struct ncclComm* comm, const char* opname, const char* ptrname) { 37 | NCCLCHECK(PtrCheck(comm, opname, ptrname)); 38 | if (comm->startMagic != NCCL_MAGIC || comm->endMagic != NCCL_MAGIC) { 39 | WARN("Error: corrupted comm object detected"); 40 | return ncclInvalidArgument; 41 | } 42 | return ncclSuccess; 43 | } 44 | 45 | ncclResult_t ArgsCheck(struct ncclInfo* info) { 46 | // First, the easy ones 47 | if (info->root < 0 || info->root >= info->comm->nRanks) { 48 | WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); 49 | return ncclInvalidArgument; 50 | } 51 | if (info->datatype < 0 || info->datatype >= ncclNumTypes) { 52 | WARN("%s : invalid type %d", info->opName, 
info->datatype); 53 | return ncclInvalidArgument; 54 | } 55 | 56 | // ncclMaxRedOp < info->op will always be false due to the sizes of 57 | // the datatypes involved, and that's by design. We keep the check though 58 | // just as a reminder. 59 | // coverity[result_independent_of_operands] 60 | if (info->op < 0 || ncclMaxRedOp < info->op) { 61 | WARN("%s : invalid reduction operation %d", info->opName, info->op); 62 | return ncclInvalidArgument; 63 | } 64 | int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); 65 | if (ncclNumOps <= info->op && 66 | (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { 67 | WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); 68 | return ncclInvalidArgument; 69 | } 70 | 71 | if (info->comm->checkPointers) { 72 | if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { 73 | if (info->count >0) 74 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); 75 | } else { 76 | // Check CUDA device pointers 77 | if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { 78 | NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); 79 | } 80 | if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { 81 | NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); 82 | } 83 | } 84 | } 85 | return ncclSuccess; 86 | } 87 | -------------------------------------------------------------------------------- /src/misc/mlx5dvsymbols.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "mlx5/mlx5dvsymbols.h" 5 | 6 | #ifdef NCCL_BUILD_MLX5DV 7 | /* Mlx5dv linking mode. 
Symbols are pointers to linked MLX5 Direct Verbs */ 8 | 9 | #define ASSIGN_SYM(container, symbol, name) container->name= &symbol; 10 | 11 | ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { 12 | ASSIGN_SYM(mlx5dvSymbols, mlx5dv_is_supported, mlx5dv_internal_is_supported); 13 | ASSIGN_SYM(mlx5dvSymbols, mlx5dv_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path); 14 | ASSIGN_SYM(mlx5dvSymbols, mlx5dv_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr); 15 | return ncclSuccess; 16 | } 17 | 18 | #else 19 | /* Mlx5dv dynamic loading mode. Symbols are loaded from shared objects. */ 20 | 21 | #include 22 | #include "core.h" 23 | 24 | // MLX5DV Library versioning 25 | #define MLX5DV_VERSION "MLX5_1.8" 26 | 27 | ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { 28 | static void* mlx5dvhandle = NULL; 29 | void* tmp; 30 | void** cast; 31 | 32 | mlx5dvhandle=dlopen("libmlx5.so", RTLD_NOW); 33 | if (!mlx5dvhandle) { 34 | mlx5dvhandle=dlopen("libmlx5.so.1", RTLD_NOW); 35 | if (!mlx5dvhandle) { 36 | INFO(NCCL_INIT, "Failed to open libmlx5.so[.1]"); 37 | goto teardown; 38 | } 39 | } 40 | 41 | #define LOAD_SYM(handle, symbol, funcptr) do { \ 42 | cast = (void**)&funcptr; \ 43 | tmp = dlvsym(handle, symbol, MLX5DV_VERSION); \ 44 | if (tmp == NULL) { \ 45 | WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), MLX5DV_VERSION); \ 46 | goto teardown; \ 47 | } \ 48 | *cast = tmp; \ 49 | } while (0) 50 | 51 | // Attempt to load a specific symbol version - fail silently 52 | #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ 53 | cast = (void**)&funcptr; \ 54 | *cast = dlvsym(handle, symbol, version); \ 55 | } while (0) 56 | 57 | LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported); 58 | // Cherry-pick the mlx5dv_get_data_direct_sysfs_path API from MLX5 1.25 59 | LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_get_data_direct_sysfs_path", 
mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path, "MLX5_1.25"); 60 | // Cherry-pick the ibv_reg_dmabuf_mr API from MLX5 1.25 61 | LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_reg_dmabuf_mr", mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr, "MLX5_1.25"); 62 | 63 | return ncclSuccess; 64 | 65 | teardown: 66 | mlx5dvSymbols->mlx5dv_internal_is_supported = NULL; 67 | mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path = NULL; 68 | mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr = NULL; 69 | 70 | if (mlx5dvhandle != NULL) dlclose(mlx5dvhandle); 71 | return ncclSystemError; 72 | } 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /src/misc/mlx5dvwrap.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "mlx5/mlx5dvwrap.h" 8 | #include 9 | #include 10 | 11 | #ifdef NCCL_BUILD_MLX5DV 12 | #include 13 | #else 14 | #include "mlx5/mlx5dvcore.h" 15 | #endif 16 | #include "mlx5/mlx5dvsymbols.h" 17 | 18 | static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; 19 | static ncclResult_t initResult; 20 | struct ncclMlx5dvSymbols mlx5dvSymbols; 21 | 22 | ncclResult_t wrap_mlx5dv_symbols(void) { 23 | pthread_once(&initOnceControl, 24 | [](){ initResult = buildMlx5dvSymbols(&mlx5dvSymbols); }); 25 | return initResult; 26 | } 27 | 28 | /* CHECK_NOT_NULL: helper macro to check for NULL symbol */ 29 | #define CHECK_NOT_NULL(container, internal_name) \ 30 | if (container.internal_name == NULL) { \ 31 | WARN("lib wrapper not initialized."); \ 32 | return ncclInternalError; \ 33 | } 34 | 35 | #define MLX5DV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \ 36 | 
CHECK_NOT_NULL(container, internal_name); \ 37 | retval = container.call; \ 38 | if (retval == error_retval) { \ 39 | WARN("Call to " name " failed with error %s", strerror(errno)); \ 40 | return ncclSystemError; \ 41 | } \ 42 | return ncclSuccess; 43 | 44 | #define MLX5DV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ 45 | CHECK_NOT_NULL(container, internal_name); \ 46 | int ret = container.call; \ 47 | if (ret != success_retval) { \ 48 | INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ 49 | return ncclSystemError; \ 50 | } \ 51 | return ncclSuccess; 52 | 53 | bool wrap_mlx5dv_is_supported(struct ibv_device *device) { 54 | if (mlx5dvSymbols.mlx5dv_internal_is_supported == NULL) { 55 | return 0; 56 | } 57 | return mlx5dvSymbols.mlx5dv_internal_is_supported(device); 58 | } 59 | 60 | ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len) { 61 | MLX5DV_INT_CHECK_RET_ERRNO(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len), 0, "mlx5dv_get_data_direct_sysfs_path"); 62 | } 63 | 64 | /* DMA-BUF support */ 65 | ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) { 66 | MLX5DV_PTR_CHECK_ERRNO(mlx5dvSymbols, mlx5dv_internal_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access), *ret, NULL, "mlx5dv_reg_dmabuf_mr"); 67 | } 68 | 69 | struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) { 70 | if (mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr == NULL) { 71 | errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set 72 | return NULL; 73 | } 74 | return mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, 
// Reads `fileName` line by line and exports every "NAME=VALUE" line into the
// process environment.
//
//  - A file that cannot be opened is silently ignored.
//  - Lines starting with '#' and lines without '=' are skipped.
//  - The name is everything before the first '='; the value is everything
//    after it (a trailing newline is removed first).
//  - Name and value are each truncated to 1023 characters.
//  - Variables that are already set are left untouched (setenv overwrite=0).
void setEnvFile(const char* fileName) {
  FILE* conf = fopen(fileName, "r");
  if (conf == NULL) return;

  char name[1024];
  char value[1024];
  char* line = NULL;
  size_t capacity = 0;
  ssize_t len;
  while ((len = getline(&line, &capacity, conf)) != -1) {
    if (line[0] == '#') continue;
    if (line[len-1] == '\n') line[len-1] = '\0';

    // Split at the first '='; a line without one carries no assignment.
    char* eq = strchr(line, '=');
    if (eq == NULL) continue;

    int nameLen = (int)(eq - line);
    if (nameLen > 1023) nameLen = 1023;
    snprintf(name, sizeof(name), "%.*s", nameLen, line);
    snprintf(value, sizeof(value), "%s", eq + 1);
    setenv(name, value, 0);
  }
  free(line);
  fclose(conf);
}
sizeof(confFilePath), "%s/.nccl.conf", userDir); 62 | setEnvFile(confFilePath); 63 | } 64 | } 65 | snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf"); 66 | setEnvFile(confFilePath); 67 | } 68 | 69 | void initEnv() { 70 | static pthread_once_t once = PTHREAD_ONCE_INIT; 71 | pthread_once(&once, initEnvFunc); 72 | } 73 | 74 | void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { 75 | static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 76 | pthread_mutex_lock(&mutex); 77 | if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { 78 | const char* str = ncclGetEnv(env); 79 | int64_t value = deftVal; 80 | if (str && strlen(str) > 0) { 81 | errno = 0; 82 | value = strtoll(str, nullptr, 0); 83 | if (errno) { 84 | value = deftVal; 85 | INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); 86 | } else { 87 | INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value); 88 | } 89 | } 90 | __atomic_store_n(cache, value, __ATOMIC_RELAXED); 91 | } 92 | pthread_mutex_unlock(&mutex); 93 | } 94 | 95 | const char* ncclGetEnv(const char* name) { 96 | initEnv(); 97 | return getenv(name); 98 | } 99 | -------------------------------------------------------------------------------- /src/nccl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${nccl:Prefix} 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: nccl 7 | Description: Optimized primitives for collective multi-GPU communication 8 | Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} 9 | Libs: -L${libdir} -lnccl 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /src/plugin/net/net_v10.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * 
Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "nccl_net.h" 8 | #include "net_device.h" 9 | #include "proxy.h" 10 | 11 | static ncclNet_v10_t* ncclNet_v10; 12 | static ncclCollNet_v10_t* ncclCollNet_v10; 13 | 14 | ncclNet_t* getNcclNet_v10(void* lib) { 15 | ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); 16 | if (ncclNet_v10) { 17 | INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); 18 | return ncclNet_v10; 19 | } 20 | INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); 21 | return nullptr; 22 | } 23 | 24 | ncclCollNet_t* getNcclCollNet_v10(void* lib) { 25 | ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10"); 26 | if (ncclCollNet_v10) { 27 | INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name); 28 | return ncclCollNet_v10; 29 | } 30 | INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); 31 | return nullptr; 32 | } 33 | -------------------------------------------------------------------------------- /src/plugin/profiler/profiler_v4.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #include "comm.h" 8 | #include "nccl_profiler.h" 9 | #include "checks.h" 10 | 11 | static ncclProfiler_v4_t* ncclProfiler_v4; 12 | 13 | ncclProfiler_t* getNcclProfiler_v4(void* lib) { 14 | ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4"); 15 | if (ncclProfiler_v4) { 16 | INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name); 17 | return ncclProfiler_v4; 18 | } 19 | INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4"); 20 | return NULL; 21 | } 22 | -------------------------------------------------------------------------------- /src/plugin/tuner.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | 11 | #include "checks.h" 12 | #include "debug.h" 13 | #include "tuner.h" 14 | #include "plugin.h" 15 | 16 | extern ncclTuner_t* getNcclTuner_v2(void* lib); 17 | extern ncclTuner_t* getNcclTuner_v3(void* lib); 18 | extern ncclTuner_t* getNcclTuner_v4(void* lib); 19 | 20 | pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; 21 | static int tunerPluginRefCount; 22 | static void* tunerPluginLib = nullptr; 23 | static ncclTuner_t* tunerSymbol = nullptr; 24 | 25 | enum { 26 | tunerPluginLoadFailed = -1, 27 | tunerPluginLoadReady = 0, 28 | tunerPluginLoadSuccess = 1, 29 | }; 30 | 31 | #define MAX_PLUGIN_LOAD 4 32 | 33 | static int status = tunerPluginLoadReady; 34 | 35 | ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { 36 | // Initialize to nullptr by default if plugin tuner cannot be loaded. 37 | comm->tuner = nullptr; 38 | if (tunerPluginLoadFailed == status) { 39 | return ncclSuccess; 40 | } 41 | 42 | pthread_mutex_lock(&tunerPluginLock); 43 | if (tunerPluginLoadFailed == status) { 44 | goto exit; 45 | } 46 | 47 | if (tunerPluginLoadSuccess == status) { 48 | comm->tuner = tunerSymbol; 49 | ++tunerPluginRefCount; 50 | goto exit; 51 | } 52 | 53 | tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN")); 54 | if (nullptr == tunerPluginLib) { 55 | tunerPluginLib = ncclGetNetPluginLib(); 56 | if (nullptr == tunerPluginLib) { 57 | goto fail; 58 | } 59 | } 60 | 61 | tunerSymbol = getNcclTuner_v4(tunerPluginLib); 62 | if (tunerSymbol == NULL) { 63 | tunerSymbol = getNcclTuner_v3(tunerPluginLib); 64 | } 65 | if (tunerSymbol == NULL) { 66 | tunerSymbol = getNcclTuner_v2(tunerPluginLib); 67 | } 68 | if (tunerSymbol == NULL) { 69 | goto fail; 70 | } 71 | 72 | comm->tuner = tunerSymbol; 73 | ++tunerPluginRefCount; 74 | status = tunerPluginLoadSuccess; 75 | comm->tunerPluginLoaded = 1; 
76 | 77 | exit: 78 | pthread_mutex_unlock(&tunerPluginLock); 79 | return ncclSuccess; 80 | fail: 81 | tunerPluginLib = nullptr; 82 | status = tunerPluginLoadFailed; 83 | goto exit; 84 | } 85 | 86 | ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { 87 | pthread_mutex_lock(&tunerPluginLock); 88 | if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { 89 | INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); 90 | NCCLCHECK(ncclClosePluginLib(tunerPluginLib)); 91 | tunerPluginLib = nullptr; 92 | tunerSymbol = nullptr; 93 | comm->tuner = nullptr; 94 | status = tunerPluginLoadReady; 95 | comm->tunerPluginLoaded = 0; 96 | } 97 | pthread_mutex_unlock(&tunerPluginLock); 98 | return ncclSuccess; 99 | } 100 | -------------------------------------------------------------------------------- /src/plugin/tuner/tuner_v2.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "debug.h" 10 | #include "checks.h" 11 | #include "nccl_tuner.h" 12 | 13 | static ncclTuner_v2_t* ncclTuner_v2; 14 | static ncclTuner_t ncclTuner; 15 | 16 | static int hasNvlsSupport(float** collCostTable) { 17 | // Requirements for support of different algorithms: 18 | // 19 | // - NVLS intra-node: nvlsSupport 20 | // - NVLS intra+inter-node: collNetSupport 21 | // - NVLSTree intra-node: always disabled 22 | // - NVLSTree inter-node: nvlsSupport 23 | // - Collnet* inter-node: collNetSupport 24 | // 25 | // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 26 | float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; 27 | return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; 28 | } 29 | 30 | static int hasCollNetSupport(float** collCostTable) { 31 | float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; 32 | return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; 33 | } 34 | 35 | static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { 36 | int algorithm = NCCL_ALGO_UNDEF; 37 | int protocol = NCCL_PROTO_UNDEF; 38 | int nvlsSupport = hasNvlsSupport(collCostTable); 39 | int collNetSupport = hasCollNetSupport(collCostTable); 40 | NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); 41 | // set time to 0 below to make sure this algorithm/protocol is selected later on 42 | if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { 43 | float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; 44 | if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; 45 | } 46 | return ncclSuccess; 47 | } 48 | 49 | static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { 50 | NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context)); 51 | ncclTuner.getCollInfo = ncclTuner_getCollInfo; 52 | ncclTuner.destroy = ncclTuner_v2->destroy; 53 | return ncclSuccess; 54 | } 55 | 56 | ncclTuner_t* getNcclTuner_v2(void* lib) { 57 | ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(lib, "ncclTunerPlugin_v2"); 58 | if (ncclTuner_v2) { 59 | ncclTuner.name = ncclTuner_v2->name; 60 | ncclTuner.init = ncclTuner_init; 61 | INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name); 62 | return &ncclTuner; 63 | } 64 | INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); 65 | return NULL; 66 | } 67 | -------------------------------------------------------------------------------- /src/plugin/tuner/tuner_v3.cc: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "debug.h" 10 | #include "checks.h" 11 | #include "nccl_tuner.h" 12 | 13 | static ncclTuner_v3_t* ncclTuner_v3; 14 | static ncclTuner_t ncclTuner; 15 | 16 | static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { 17 | NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); 18 | return ncclSuccess; 19 | } 20 | 21 | static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { 22 | NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context)); 23 | ncclTuner.getCollInfo = ncclTuner_getCollInfo; 24 | ncclTuner.destroy = ncclTuner_v3->destroy; 25 | return ncclSuccess; 26 | } 27 | 28 | ncclTuner_t* getNcclTuner_v3(void* lib) { 29 | ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(lib, "ncclTunerPlugin_v3"); 30 | if (ncclTuner_v3) { 31 | ncclTuner.name = ncclTuner_v3->name; 32 | ncclTuner.init = ncclTuner_init; 33 | INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name); 34 | return &ncclTuner; 35 | } 36 | INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); 37 | return NULL; 38 | } 39 | -------------------------------------------------------------------------------- /src/plugin/tuner/tuner_v4.cc: -------------------------------------------------------------------------------- 1 | 
/************************************************************************* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 4 | * 5 | * See LICENSE.txt for license information 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "debug.h" 10 | #include "nccl_tuner.h" 11 | 12 | static ncclTuner_v4_t* ncclTuner_v4; 13 | 14 | ncclTuner_t* getNcclTuner_v4(void* lib) { 15 | ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4"); 16 | if (ncclTuner_v4) { 17 | INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name); 18 | return ncclTuner_v4; 19 | } 20 | INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); 21 | return NULL; 22 | } 23 | -------------------------------------------------------------------------------- /src/register/sendrecv_reg.cc: -------------------------------------------------------------------------------- 1 | #include "register.h" 2 | #include "transport.h" 3 | 4 | ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue) { 5 | ncclResult_t ret = ncclSuccess; 6 | 7 | *regFlag = 0; 8 | if (comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) { 9 | if (comm->planner.persistent && ncclParamGraphRegister()) { 10 | ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL); 11 | } 12 | if (*regFlag == 0 && ncclParamLocalRegister()) { 13 | ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle); 14 | } 15 | } 16 | return ret; 17 | } 18 | 19 | ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue) { 20 | ncclResult_t ret = ncclSuccess; 21 | uintptr_t offset = 0; 22 | 
uintptr_t* peerRmtAddrs = NULL; 23 | 24 | *regFlag = 0; 25 | if (comm->planner.persistent && ncclParamGraphRegister()) { 26 | ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast(cleanupQueue), NULL); 27 | } 28 | if (*regFlag == 0 && ncclParamLocalRegister()) { 29 | ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs); 30 | } 31 | 32 | if (*regFlag) 33 | *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset); 34 | return ret; 35 | } 36 | -------------------------------------------------------------------------------- /src/transport/generic.cc: -------------------------------------------------------------------------------- 1 | #include "comm.h" 2 | #include "transport.h" 3 | #include "bootstrap.h" 4 | 5 | ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) { 6 | struct ringConnInfo { 7 | bool useNetPXN; 8 | bool useGdr; 9 | }; 10 | struct ringConnInfo* ringInfo = NULL; 11 | ncclResult_t ret = ncclSuccess; 12 | if (comm && comm->nRanks > 1) { 13 | comm->useGdr = true; 14 | comm->useNetPXN = false; 15 | for (int c = 0; c < comm->nChannels; c++) { 16 | struct ncclChannel* channel = comm->channels + c; 17 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); 18 | } 19 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail); 20 | if (ncclParamLocalRegister() || ncclParamGraphRegister()) { 21 | NCCLCHECK(ncclCalloc(&ringInfo, comm->nRanks)); 22 | ringInfo[comm->rank].useGdr = comm->useGdr; 23 | ringInfo[comm->rank].useNetPXN = comm->useNetPXN; 24 | NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, ringInfo, sizeof(struct ringConnInfo)), ret, fail); 25 | for (int i = 0; i < comm->nRanks; ++i) { 26 | if (!ringInfo[i].useGdr) comm->useGdr = false; 27 | if (ringInfo[i].useNetPXN) comm->useNetPXN = true; 28 | if (comm->useGdr == false && 
comm->useNetPXN == true) break; 29 | } 30 | } 31 | INFO(NCCL_INIT, "Connected all rings, use ring PXN %d GDR %d", comm->useNetPXN, comm->useGdr); 32 | } 33 | exit: 34 | free(ringInfo); 35 | return ret; 36 | fail: 37 | goto exit; 38 | } 39 | 40 | ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm) { 41 | ncclResult_t ret = ncclSuccess; 42 | if (comm && comm->nRanks > 1) { 43 | // Connect Trees 44 | for (int c = 0; c < comm->nChannels; c++) { 45 | struct ncclChannel* channel = comm->channels + c; 46 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); 47 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); 48 | } 49 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); 50 | INFO(NCCL_INIT, "Connected all trees"); 51 | } 52 | exit: 53 | return ret; 54 | fail: 55 | goto exit; 56 | } 57 | 58 | ncclResult_t ncclTransportPatConnect(struct ncclComm* comm) { 59 | ncclResult_t ret = ncclSuccess; 60 | if (comm && comm->nRanks > 1) { 61 | for (int mask=1; masknRanks; mask<<=1) { 62 | int prevPeer = (comm->rank + mask) % comm->nRanks; 63 | int nextPeer = (comm->rank + comm->nRanks - mask) % comm->nRanks; 64 | for (int c = 0; c < comm->nChannels; c++) { 65 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &prevPeer, 1, &nextPeer, 0), ret, fail); // ReduceScatter 66 | } 67 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); 68 | for (int c = 0; c < comm->nChannels; c++) { 69 | NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &nextPeer, 1, &prevPeer, 0), ret, fail); // AllGather 70 | } 71 | NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail); 72 | } 73 | INFO(NCCL_INIT, "Connected binomial trees"); 74 | } 75 | exit: 76 | return ret; 77 | fail: 78 | goto exit; 79 | } 80 | 
-------------------------------------------------------------------------------- /src/transport/profiler.cc: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | #include "transport.h" 7 | #include "proxy.h" 8 | #include "profiler.h" 9 | #include "device.h" 10 | 11 | static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { 12 | connection->proxyAppendPtr = &connection->proxyAppend; 13 | connection->shared = 1; 14 | return ncclSuccess; 15 | } 16 | 17 | // The following ncclProxySubArgs are overloaded by the profiler progress function: 18 | // - base : is set to the current value of workCounter[channelId] 19 | // - posted : is set to sub->nsteps to indicate that the profiler has started the event 20 | // - transmitted: is set to sub->nsteps to indicate that the profiler has stopped the event 21 | static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { 22 | if (args->state == ncclProxyOpReady) { 23 | for (int s = 0; s < args->nsubs; s++) { 24 | struct ncclProxySubArgs* sub = args->subs + s; 25 | sub->base = sub->workCounter; 26 | sub->posted = sub->transmitted = 0; 27 | } 28 | args->state = ncclProxyOpProgress; 29 | } 30 | if (args->state == ncclProxyOpProgress) { 31 | for (int s = 0; s < args->nsubs; s++) { 32 | struct ncclProxySubArgs* sub = args->subs + s; 33 | struct ncclDevProfiler* workStarted = (struct ncclDevProfiler *)sub->sendbuff; 34 | struct ncclDevProfiler* workCompleted = (struct ncclDevProfiler *)sub->recvbuff; 35 | if (sub->posted < sub->nsteps && sub->base <= 
workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) { 36 | ncclProfilerStartKernelChEvent(args, s, workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp); 37 | sub->posted = sub->nsteps; 38 | continue; // allow events on every channel to start 39 | } 40 | if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) { 41 | ncclProfilerStopKernelChEvent(args, s, workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp); 42 | sub->transmitted = sub->nsteps; 43 | args->done++; 44 | } 45 | } 46 | if (args->done == args->nsubs) args->state = ncclProxyOpNone; 47 | } 48 | return ncclSuccess; 49 | } 50 | 51 | struct ncclTransport profilerTransport = { 52 | "Prof", 53 | NULL, 54 | { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }, 55 | { NULL, NULL, NULL, NULL, NULL, profilerProxyConnect, NULL, profilerProxyProgress, NULL, NULL } 56 | }; 57 | --------------------------------------------------------------------------------